In [12]:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
# from pyspark.sql.functions import col, from_json, substring, when, length, split, lit, trim, concat_ws
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType

In [2]:
# File and directory names used by this notebook.
# csv_pj: path of the input CSV that is loaded into Spark further down.
# parquet_dir_name: directory the resulting DataFrame is written to as Parquet.
parquet_dir_name = "output/embeddings_cnae_1m"
csv_pj = "dataset/empresas_estabelecimentos_1m.csv"

In [3]:
# Create (or reuse) the Spark session for this notebook.
# NOTE(review): when a session already exists, getOrCreate() returns it and Spark
# warns that only runtime SQL configurations take effect, so the driver-memory
# setting below is presumably only honoured on a fresh kernel — confirm.
spark = (
    SparkSession.builder
    .appName("gera_embedding_cnae")
    .config("spark.driver.memory", "32g")
    .getOrCreate()
)

23/12/05 16:02:10 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [49]:
# Schema for the company JSON payload (applied to the te_dados_em column).
# Every field is declared as a nullable string.
json_schema_em = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "porteEmpresa",
        "capitalSocial",
        "cpfResponsavel",
        "nomeEmpresarial",
        "naturezaJuridica",
        "qualificacaoResponsavel",
    )
])

# Schema for the establishment JSON payload (applied to the te_dados_es column).
# Every field is declared as a nullable string, including cnaesSecundarias,
# which is later cleaned up and split into an array.
json_schema_es = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "uf",
        "cep",
        "ddd1",
        "ddd2",
        "pais",
        "email",
        "bairro",
        "numero",
        "municipio",
        "telefone1",
        "telefone2",
        "cnaeFiscal",
        "logradouro",
        "complemento",
        "dataCadastro",
        "nomeFantasia",
        "cidadeExterior",
        "tipoLogradouro",
        "cnaesSecundarias",
        "situacaoEspecial",
        "situacaoCadastral",
        "dataSituacaoEspecial",
        "dataSituacaoCadastral",
        "motivoSituacaoCadastral",
        "identificadorMatrizFilial",
    )
])

# Schema used when reading the CSV: four nullable string columns.
# The two te_dados_* columns are read as raw strings and parsed as JSON afterwards.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "nu_cnpj_raiz",
        "te_dados_em",
        "id_estabelecimento",
        "te_dados_es",
    )
])

# Read the CSV with the explicit schema (no type inference).
# multiline is enabled and the escape character is set to the double quote;
# presumably this is needed because the JSON columns are quoted fields that
# contain embedded quotes — confirm against the dataset.
pj = (
    spark.read
    .options(delimiter=",", multiline="true", escape="\"")
    .csv(csv_pj, schema=schema, header=True)
)

# Parse both JSON string columns into structs, keep only the rows whose
# id_estabelecimento starts with '0001' (presumably the head office / "matriz"
# record — confirm), and expose nu_cnpj_raiz under the name `id`.
pj = (
    pj
    .withColumn("te_dados_em", from_json(col("te_dados_em"), json_schema_em))
    .withColumn("te_dados_es", from_json(col("te_dados_es"), json_schema_es))
    # substring() positions are 1-based, so the first four characters start at 1.
    .filter(substring(col("id_estabelecimento"), 1, 4) == '0001')
    .withColumnRenamed("nu_cnpj_raiz", "id")
)

# Keep one row per id, then sort. The sort is applied last because
# dropDuplicates() is not documented to preserve the ordering of its input;
# sorting by id beforehand also cannot influence which of several rows with the
# same id survives, since id is the only sort key.
pj = pj.dropDuplicates(['id']).orderBy(col("id").asc())

#### CNAE processing

In [50]:
# Project the id plus the primary and secondary CNAE codes out of the parsed
# establishment struct. cnaesSecundarias_Limpo is the secondary-codes string
# with the characters [ ] and " removed, ready to be split on commas.
cnae_pj = pj.select(
    col('id'),
    col('te_dados_es.cnaeFiscal').alias('cnaeFiscal'),
    col('te_dados_es.cnaesSecundarias').alias('cnaesSecundarias'),
    translate(col('te_dados_es.cnaesSecundarias'), '[]"', '').alias('cnaesSecundarias_Limpo'),
)

In [51]:
# Build the CNAE vectors.
# Secondary codes: an empty array when the cleaned string is null or empty,
# otherwise the cleaned string split on commas.
secondary_cnaes = when(
    col('cnaesSecundarias_Limpo').isNull() | (col('cnaesSecundarias_Limpo') == ''),
    array()
).otherwise(split(col('cnaesSecundarias_Limpo'), ','))

cnae_pj = (
    cnae_pj
    # vetor_cnae: primary code first, followed by the secondary codes
    # (array_union also removes duplicate entries).
    .withColumn('vetor_cnae', array_union(array(col('cnaeFiscal')), secondary_cnaes))
    # vetor_grupo_cnae: the first three characters of every code in vetor_cnae.
    .withColumn('vetor_grupo_cnae', expr("transform(vetor_cnae, x -> substring(x, 1, 3))"))
    # The raw and cleaned secondary-code strings are no longer needed.
    .drop('cnaesSecundarias', 'cnaesSecundarias_Limpo')
)
cnae_pj.show()

[Stage 57:>                                                         (0 + 1) / 1]

+--------+----------+--------------------+--------------------+
|      id|cnaeFiscal|          vetor_cnae|    vetor_grupo_cnae|
+--------+----------+--------------------+--------------------+
|03230095|   4759899|           [4759899]|               [475]|
|03230096|   4530703|           [4530703]|               [453]|
|03230097|   4755502|           [4755502]|               [475]|
|03230098|   4520003|  [4520003, 4530703]|          [452, 453]|
|03230099|   8630503|[8630503, 8630501...|[863, 863, 863, 8...|
|03230100|   6462000|           [6462000]|               [646]|
|03230101|   9430800|[9430800, 9493600...|     [943, 949, 949]|
|03230102|   9430800|[9430800, 9493600...|     [943, 949, 949]|
|03230103|   4729699|           [4729699]|               [472]|
|03230104|   4619200|           [4619200]|               [461]|
|03230105|   4781400|           [4781400]|               [478]|
|03230106|   2222600|           [2222600]|               [222]|
|03230107|   4722901|           [4722901

                                                                                

In [54]:
# Persist the DataFrame as Parquet for later use; any existing output in the
# target directory is overwritten.
cnae_pj.write.parquet(parquet_dir_name, mode='overwrite')

                                                                                

In [55]:
# Stop the Spark session used by this notebook.
spark.stop()