In [36]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, DoubleType, IntegerType
from pyspark import SparkConf, SparkContext

spark = SparkSession.builder.appName("Teste de Leitura e Escrita com Minio").getOrCreate()

spark

In [157]:
data = [
    {
    "des_tipo_prso":"PROCEDIMENTO COMUM CÍVEL",
    "num_docm_raiz":"07207996",
    "num_docm_veri": "50",
    "num_prso": "999878255655.2020.8.13.15478",
    "risc_prso":"BAIXISSIMO",
    "risc_prso_2":"1"
    }
]

In [158]:
df = spark.createDataFrame(data)
df.show()

+--------------------+-------------+-------------+--------------------+----------+-----------+
|       des_tipo_prso|num_docm_raiz|num_docm_veri|            num_prso| risc_prso|risc_prso_2|
+--------------------+-------------+-------------+--------------------+----------+-----------+
|PROCEDIMENTO COMU...|     07207996|           50|999878255655.2020...|BAIXISSIMO|          1|
+--------------------+-------------+-------------+--------------------+----------+-----------+



In [159]:
type(df)

pyspark.sql.dataframe.DataFrame

In [160]:
from pyspark.sql.functions import col, lit, create_map

In [161]:
df = df.withColumn("num_docm_completo", col("num_docm_raiz") + col("num_docm_veri")).drop("num_docm_raiz", "num_docm_veri")

df.show()

+--------------------+--------------------+----------+-----------+-----------------+
|       des_tipo_prso|            num_prso| risc_prso|risc_prso_2|num_docm_completo|
+--------------------+--------------------+----------+-----------+-----------------+
|PROCEDIMENTO COMU...|999878255655.2020...|BAIXISSIMO|          1|        7208046.0|
+--------------------+--------------------+----------+-----------+-----------------+



In [164]:
df.drop('num_docm_completo', *['risc_prso', 'risc_prso_2']).columns

['des_tipo_prso', 'num_prso']

In [163]:
[f(x) for x in df.drop('num_docm_completo', *['risc_prso', 'risc_prso_2']).columns for f in (lit, col) ]

[Column<'des_tipo_prso'>,
 Column<'des_tipo_prso'>,
 Column<'num_prso'>,
 Column<'num_prso'>]

In [165]:
def MapBy(dataframe, map_by, map_col_name, drop_these):
    cols_to_map = [f(x) for x in df.drop(map_by, *drop_these).columns for f in (lit, col) ]
    result_df = df.withColumn(map_col_name, create_map(cols_to_map))
    result_df = result_df.select([map_by, map_col_name])
    return result_df

In [166]:
df = MapBy(dataframe = df, map_by = 'num_docm_completo', map_col_name = 'map', drop_these= ['risc_prso', 'risc_prso_2'])
df.show(truncate = False)

+-----------------+-------------------------------------------------------------------------------------+
|num_docm_completo|map                                                                                  |
+-----------------+-------------------------------------------------------------------------------------+
|7208046.0        |{des_tipo_prso -> PROCEDIMENTO COMUM CÍVEL, num_prso -> 999878255655.2020.8.13.15478}|
+-----------------+-------------------------------------------------------------------------------------+



In [144]:

cols_to_map = [f(x) for x in df.drop('num_docm_completo').columns for f in (lit, col) ]
cols_to_map

[Column<'des_tipo_prso'>,
 Column<'des_tipo_prso'>,
 Column<'num_prso'>,
 Column<'num_prso'>,
 Column<'risc_prso'>,
 Column<'risc_prso'>,
 Column<'risc_prso_2'>,
 Column<'risc_prso_2'>]

In [88]:
df = df.withColumn("info_processos", create_map(cols_to_map))
df.show()

+--------------------+--------------------+----------+-----------+-----------------+--------------------+
|       des_tipo_prso|            num_prso| risc_prso|risc_prso_2|num_docm_completo|      info_processos|
+--------------------+--------------------+----------+-----------+-----------------+--------------------+
|PROCEDIMENTO COMU...|999878255655.2020...|BAIXISSIMO|          1|        7208046.0|{des_tipo_prso ->...|
+--------------------+--------------------+----------+-----------+-----------------+--------------------+



In [78]:
[
    lit("des_tipo_prso"),col("des_tipo_prso"),
    lit("num_prso"),col("num_prso"),
    lit("risc_prso"),col("risc_prso"),
    lit("risc_prso_2"),col("risc_prso_2")
]

[Column<'des_tipo_prso'>,
 Column<'des_tipo_prso'>,
 Column<'num_prso'>,
 Column<'num_prso'>,
 Column<'risc_prso'>,
 Column<'risc_prso'>,
 Column<'risc_prso_2'>,
 Column<'risc_prso_2'>]

In [56]:

df = df.withColumn("info_processos", create_map([lit("des_tipo_prso"),col("des_tipo_prso"),
                                           lit("num_prso"),col("num_prso"),
                                          lit("risc_prso"),col("risc_prso"),
                                           lit("risc_prso_2"),col("risc_prso_2")]
                                          ))
df.show()

TypeError: Invalid argument, not a string or column: (Column<'des_tipo_prso'>, Column<'des_tipo_prso'>) of type <class 'tuple'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

In [33]:
df.show()

+--------------------+--------------------+----------+-----------+-----------------+--------------------+
|       des_tipo_prso|            num_prso| risc_prso|risc_prso_2|num_docm_completo|      info_processos|
+--------------------+--------------------+----------+-----------+-----------------+--------------------+
|PROCEDIMENTO COMU...|999878255655.2020...|BAIXISSIMO|          1|        7208046.0|{des_tipo_prso ->...|
+--------------------+--------------------+----------+-----------+-----------------+--------------------+



In [34]:
df = df.select("num_docm_completo", "info_processos")
df.show(truncate = False)

+-----------------+--------------------------------------------------------------------------------------------------------------------------------+
|num_docm_completo|info_processos                                                                                                                  |
+-----------------+--------------------------------------------------------------------------------------------------------------------------------+
|7208046.0        |{des_tipo_prso -> PROCEDIMENTO COMUM CÍVEL, num_prso -> 999878255655.2020.8.13.15478, risc_prso -> BAIXISSIMO, risc_prso_2 -> 1}|
+-----------------+--------------------------------------------------------------------------------------------------------------------------------+



In [35]:
df.printSchema()

root
 |-- num_docm_completo: double (nullable = true)
 |-- info_processos: map (nullable = false)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

