In [1]:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, substring, when, length, split, lit
from pyspark.sql.types import StructType, StructField, StringType
from graphframes import *

In [2]:
# Input CSV files read by the loading cells below
csv_pj, csv_socios = (
    'dataset/empresas_estabelecimentos_1m.csv',
    'dataset/socios_1m.csv',
)

In [3]:
# Create the Spark session for this notebook, or reuse the one already running.
# NOTE: when a session already exists, getOrCreate() returns it and only runtime
# SQL configurations take effect (see the WARN line in this cell's output), so
# the driver-memory setting below is honoured only on a fresh start.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "32g")
    .appName("trata_dados_3")
    .getOrCreate()
)

# Checkpoint directory for the Spark context
# (presumably required by the GraphFrames connectedComponents call later on — confirm).
sc = spark.sparkContext
sc.setCheckpointDir('graphframes_cps')

23/12/04 13:49:23 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Schema used to parse the company ("empresa") JSON column te_dados_em.
# Every field is declared as a nullable string.
json_schema_em = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "porteEmpresa",
        "capitalSocial",
        "cpfResponsavel",
        "nomeEmpresarial",
        "naturezaJuridica",
        "qualificacaoResponsavel",
    )
])

# Schema used to parse the establishment ("estabelecimento") JSON column te_dados_es.
# Every field is declared as a nullable string; the order below is the field order
# of the resulting struct.
json_schema_es = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "uf",
        "cep",
        "ddd1",
        "ddd2",
        "pais",
        "email",
        "bairro",
        "numero",
        "municipio",
        "telefone1",
        "telefone2",
        "cnaeFiscal",
        "logradouro",
        "complemento",
        "dataCadastro",
        "nomeFantasia",
        "cidadeExterior",
        "tipoLogradouro",
        "cnaesSecundarias",
        "situacaoEspecial",
        "situacaoCadastral",
        "dataSituacaoEspecial",
        "dataSituacaoCadastral",
        "motivoSituacaoCadastral",
        "identificadorMatrizFilial",
    )
])

# Column layout of the companies/establishments CSV (all nullable strings).
# The two te_dados_* columns hold raw JSON text that is parsed after loading.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("nu_cnpj_raiz", "te_dados_em", "id_estabelecimento", "te_dados_es")
])

# Read the CSV with the explicit schema. Values may span several lines and
# embedded quotes are escaped with a double quote.
pj = (
    spark.read
    .option("delimiter", ",")
    .option("multiline", "true")
    .option("escape", "\"")
    .csv(csv_pj, schema=schema, header=True)
)

# Parse both JSON columns into structs, keep only the rows whose
# id_estabelecimento starts with '0001', and expose the CNPJ root as the
# vertex key "id". substring() positions are 1-based, so the 4-character
# prefix starts at position 1.
pj = (
    pj.withColumn("te_dados_em", from_json(col("te_dados_em"), json_schema_em))
    .withColumn("te_dados_es", from_json(col("te_dados_es"), json_schema_es))
    .filter(substring(col("id_estabelecimento"), 1, 4) == '0001')
    .withColumnRenamed("nu_cnpj_raiz", "id")
)

# Keep a single row per id. No orderBy is done first: sorting by the same key
# that is deduplicated cannot influence which duplicate survives, so it would
# only add a full sort of the dataset.
pj = pj.dropDuplicates(['id'])

# Optional key-uniqueness check:
# count_df = pj.groupBy('id').count()
# repeated_cnpj = count_df.filter(col('count') > 1)
# repeated_cnpj.show()

In [5]:
# Schema used to parse the partner ("socio") JSON column te_dados_sc.
# Every field is declared as a nullable string.
json_schema_sc = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "pais",
        "entradaSociedade",
        "socioEstrangeiro",
        "qualificacaoSocio",
        "identificadorSocio",
        "cpfRepresentanteLegal",
        "qualificacaoRepresentanteLegal",
    )
])

# Column layout of the partners CSV (all nullable strings).
# te_dados_sc holds raw JSON text that is parsed after loading.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("nu_cnpj_raiz", "id_socio", "te_dados_sc")
])

# Read the partners CSV with the explicit schema. Values may span several
# lines and embedded quotes are escaped with a double quote.
socio = (
    spark.read
    .option("delimiter", ",")
    .option("multiline", "true")
    .option("escape", "\"")
    .csv(csv_socios, schema=schema, header=True)
)

# Step 1: expand the JSON text into a struct column.
socio = socio.withColumn("te_dados_sc", from_json(col("te_dados_sc"), json_schema_sc))

# Step 2: when identificadorSocio == '1', keep only the first 8 characters of
# id_socio; every other row keeps id_socio unchanged.
# NOTE(review): presumably '1' marks a legal-entity partner whose identifier is
# reduced to its 8-digit CNPJ root so it matches the company vertex ids — confirm.
socio = socio.withColumn(
    "id_socio",
    when(col("te_dados_sc.identificadorSocio") == '1', col("id_socio").substr(1, 8))
    .otherwise(col("id_socio")),
)

# Rename to the edge column names used by GraphFrame (source and destination).
socio = socio.withColumnRenamed("nu_cnpj_raiz", "src").withColumnRenamed("id_socio", "dst")

# Remove duplicated edges. No orderBy is done first: sorting by the same keys
# that are deduplicated cannot influence which duplicate survives, so it would
# only add a full sort of the dataset.
socio = socio.dropDuplicates(['src', 'dst'])

In [6]:
# Build the natural-person vertices: the distinct edge targets whose
# identificadorSocio is '2'.
# NOTE(review): edge targets with any other identificadorSocio only become
# vertices if they already exist in pj — confirm this is intended.
pf = socio.filter(col("te_dados_sc.identificadorSocio") == '2').select(col("dst").alias("id")).distinct()

# Give pf every non-id column of pj, filled with NULLs cast to pj's column
# types. This is one projection instead of one withColumn call per column,
# and the NULL columns carry the same types as their pj counterparts.
cols_pj = [col_name for col_name in pj.columns if col_name != 'id']
pf = pf.select(
    "id",
    *[lit(None).cast(pj.schema[col_name].dataType).alias(col_name) for col_name in cols_pj],
)

# Tag the vertex type: 1 for rows coming from pj, 2 for rows coming from pf.
pj = pj.withColumn("id_tipoPessoa", lit(1))
pf = pf.withColumn("id_tipoPessoa", lit(2))

# Union by column name, so the result does not depend on "id" being the first
# column of pj (a positional union would silently misalign columns otherwise).
vertex = pj.unionByName(pf).orderBy("id")

# Optional key-uniqueness check:
# count_df = vertex.groupBy('id').count()
# repeated_vertex = count_df.filter(col('count') > 1)
# repeated_vertex.show()

In [7]:
# Build the company -> partner relationship graph:
# vertices come from `vertex`, edges (src -> dst) from `socio`.
g = GraphFrame(vertex, socio)

# Run connected components; the result carries a `component` column per vertex.
result = g.connectedComponents()

23/12/04 13:51:21 WARN BlockManager: Block rdd_57_9 already exists on this machine; not re-adding it
23/12/04 13:51:21 WARN BlockManager: Block rdd_57_12 already exists on this machine; not re-adding it
23/12/04 13:51:21 WARN BlockManager: Block rdd_57_8 already exists on this machine; not re-adding it
                                                                                

In [8]:
# conectados = result.select("id", "component", "id_tipoPessoa").orderBy("component")
# conectados.show()

# Number of vertices in each connected component
component_sizes = result.groupBy('component').count()

# Row of the component with the highest vertex count, and its component id
largest_component = component_sizes.sort(col('count').desc()).head()
largest_component_id = largest_component['component']

# Vertices that belong to that component
largest_component_vertices = result.where(col('component') == largest_component_id)
largest_component_vertices.show()

23/12/04 13:55:18 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+--------------+--------------------+------------------+--------------------+-------------+-----------+
|            id|         te_dados_em|id_estabelecimento|         te_dados_es|id_tipoPessoa|  component|
+--------------+--------------------+------------------+--------------------+-------------+-----------+
|      03235791|{05, 000011547555...|            000127|{SP, 04552000, 11...|            1|25769859916|
|      03410522|{05, 000000403000...|            000150|{SP, 04552000, 11...|            1|25769859916|
|      04038317|{05, 000002813130...|            000178|{SP, 05307190, 11...|            1|25769859916|
|00002117401872|                NULL|              NULL|                NULL|            2|25769859916|
|      04100516|{05, 000000000000...|            000169|{SP, 05307190, 11...|            1|25769859916|
+--------------+--------------------+------------------+--------------------+-------------+-----------+



In [12]:
# Edges whose source is one of the five ids listed in the largest-component table
edges = g.edges.filter(
    col("src").isin('03235791', '03410522', '04038317', '00002117401872', '04100516')
)
edges.show()

[Stage 514:>                                                        (0 + 1) / 1]

+--------+--------------+--------------------+
|     src|           dst|         te_dados_sc|
+--------+--------------+--------------------+
|03235791|00000610508067|{000, 20170822, ,...|
|03235791|00001683293797|{000, 20230102, ,...|
|03235791|00002117401872|{000, 20021018, ,...|
|03235791|00003604616549|{000, 20021018, ,...|
|03235791|00009101021710|{000, 20230102, ,...|
|03235791|00009442262807|{000, 20230102, ,...|
|03235791|00029435367844|{000, 20230102, ,...|
|03235791|      73178600|{000, 19990624, ,...|
|03410522|00000610508067|{000, 20180611, ,...|
|03410522|00002117401872|{000, 20021029, ,...|
|03410522|00003604616549|{000, 20021029, ,...|
|03410522|00009101021710|{000, 20200428, ,...|
|03410522|00010793641705|{000, 20200428, ,...|
|03410522|00013290326837|{000, 20131125, ,...|
|03410522|00022148709895|{000, 20180611, ,...|
|03410522|00034569034802|{000, 20180611, ,...|
|03410522|      73178600|{000, 20061201, ,...|
|04038317|00002117401872|{000, 20080626, ,...|
|04038317|000

                                                                                

In [16]:
# Inspect two specific vertices of the graph.
# The result is stored under its own name so the full `vertex` DataFrame that
# the GraphFrame cell is built from is not overwritten: rebinding `vertex` here
# would make a later re-run of that cell build the graph from only these rows.
vertex_sample = g.vertices.filter("id = '00002117401872' or id ='00001683293797'")
vertex_sample.show()

[Stage 535:>                                                        (0 + 1) / 1]

+--------------+-----------+------------------+-----------+-------------+
|            id|te_dados_em|id_estabelecimento|te_dados_es|id_tipoPessoa|
+--------------+-----------+------------------+-----------+-------------+
|00001683293797|       NULL|              NULL|       NULL|            2|
|00002117401872|       NULL|              NULL|       NULL|            2|
+--------------+-----------+------------------+-----------+-------------+



                                                                                

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()