In [11]:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col, from_json, substring, when, length, split, lit
from pyspark.sql.types import StructType, StructField, StringType
from pyspark import StorageLevel

In [12]:
# Input/output file and directory names
csv_pj = "dataset/empresas_estabelecimentos_30m.csv"
csv_socios = "dataset/socios_30m.csv"
parquet_dir_name = "output/embeddings_nome_30m"

# CEP prefix to investigate
investigaCEP = "13560"

In [13]:
# Get or create the Spark session for this notebook, requesting 128g of
# driver memory and registering the application name "gera_embedding_nome".
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "128g")
    .appName("gera_embedding_nome")
    .getOrCreate()
)

In [14]:
# Schema for the company ("empresa") JSON payload: every field is declared as
# a nullable string, in the order listed here.
json_schema_em = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "porteEmpresa",
        "capitalSocial",
        "cpfResponsavel",
        "nomeEmpresarial",
        "naturezaJuridica",
        "qualificacaoResponsavel",
    )
])

# Schema for the establishment ("estabelecimento") JSON payload: again every
# field is a nullable string, in the order listed here.
json_schema_es = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "uf",
        "cep",
        "ddd1",
        "ddd2",
        "pais",
        "email",
        "bairro",
        "numero",
        "municipio",
        "telefone1",
        "telefone2",
        "cnaeFiscal",
        "logradouro",
        "complemento",
        "dataCadastro",
        "nomeFantasia",
        "cidadeExterior",
        "tipoLogradouro",
        "cnaesSecundarias",
        "situacaoEspecial",
        "situacaoCadastral",
        "dataSituacaoEspecial",
        "dataSituacaoCadastral",
        "motivoSituacaoCadastral",
        "identificadorMatrizFilial",
    )
])

# Schema for the companies/establishments CSV: four nullable string columns.
schema = StructType([
    StructField("nu_cnpj_raiz", StringType(), True),
    StructField("te_dados_em", StringType(), True),
    StructField("id_estabelecimento", StringType(), True),
    StructField("te_dados_es", StringType(), True)
])

# Read the CSV with the explicit schema. The JSON columns are quoted fields
# that use '"' as the escape character and may span several lines.
pj = (
    spark.read
    .option("delimiter", ",")
    .option("multiline", "true")
    .option("escape", "\"")
    .csv(csv_pj, schema=schema, header=True)
)

pj = (
    pj
    # Keep only rows whose establishment id starts with "0001"
    # (presumably the head-office establishment -- TODO confirm).
    # Spark's substring positions are 1-based, so the prefix starts at 1.
    # Filtering first avoids parsing JSON for rows that are discarded anyway.
    .filter(substring(col("id_estabelecimento"), 1, 4) == "0001")
    # Expand the two JSON string columns into structs.
    .withColumn("te_dados_em", from_json(col("te_dados_em"), json_schema_em))
    .withColumn("te_dados_es", from_json(col("te_dados_es"), json_schema_es))
    .withColumnRenamed("nu_cnpj_raiz", "id")
    # One row per id. No orderBy beforehand: sorting by the very key being
    # de-duplicated cannot influence which duplicate is kept, and
    # dropDuplicates does not guarantee output order, so a global sort here
    # would only add an expensive shuffle.
    .dropDuplicates(["id"])
)

In [15]:
# print(pj.count())

In [16]:
# Schema for the partner ("socio") JSON payload: every field is declared as a
# nullable string, in the order listed here.
json_schema_sc = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "pais",
        "entradaSociedade",
        "socioEstrangeiro",
        "qualificacaoSocio",
        "identificadorSocio",
        "cpfRepresentanteLegal",
        "qualificacaoRepresentanteLegal",
    )
])

# Schema for the partners CSV: three nullable string columns.
schema = StructType([
    StructField("nu_cnpj_raiz", StringType(), True),
    StructField("id_socio", StringType(), True),
    StructField("te_dados_sc", StringType(), True)
])

# Read the CSV with the explicit schema. The JSON column is a quoted field
# that uses '"' as the escape character and may span several lines.
socio = (
    spark.read
    .option("delimiter", ",")
    .option("multiline", "true")
    .option("escape", "\"")
    .csv(csv_socios, schema=schema, header=True)
)

# Step 1: expand the JSON string column into a struct.
socio = socio.withColumn("te_dados_sc", from_json(col("te_dados_sc"), json_schema_sc))

# Step 2: when identificadorSocio is '1', keep only the first 8 characters of
# id_socio (converts a CNPJ into its nu_cnpj_raiz); otherwise keep id_socio
# unchanged.
socio = socio.withColumn(
    "id_socio",
    when(
        col("te_dados_sc")["identificadorSocio"] == "1",
        col("id_socio").substr(1, 8),
    ).otherwise(col("id_socio")),
)

# Step 3: rename the columns to edge endpoints (src -> dst) and drop duplicate
# edges. No orderBy beforehand: sorting by the very keys being de-duplicated
# cannot influence which duplicate is kept, and dropDuplicates does not
# guarantee output order, so a global sort here would only add an expensive
# shuffle.
socio = (
    socio
    .withColumnRenamed("nu_cnpj_raiz", "src")
    .withColumnRenamed("id_socio", "dst")
    .dropDuplicates(["src", "dst"])
)

In [17]:
# print(socio.count())

In [18]:
# Load the embeddings stored as Parquet under parquet_dir_name
nomes_pj = spark.read.format("parquet").load(parquet_dir_name)

                                                                                

In [19]:
# print(nomes_pj.count())

In [None]:
# Ids of the rows in pj whose establishment CEP starts with the investigated
# prefix (rows with a null CEP do not match and are dropped by the filter).
cnpjs_filtrados = (
    pj
    .filter(col("te_dados_es.cep").startswith(investigaCEP))
    .select("id")
)

# Keep only the embedding rows whose id is in the filtered set. The join is
# made on the "id" column of cnpjs_filtrados -- the DataFrame actually being
# joined -- instead of referencing pj["id"], which is not an input of this
# join and only resolved because cnpjs_filtrados derives from pj. Joining by
# column name also leaves a single, unambiguous "id" column in the result.
nomes_filtrados = (
    nomes_pj
    .join(cnpjs_filtrados, on="id", how="inner")
    .select("id", "nomeFantasia", "embeddings")
)
nomes_filtrados.show()

[Stage 29:>                                                         (0 + 1) / 1]

In [None]:
# Stop the Spark session used by this notebook.
spark.stop()