In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, substring, when, length, split, lit
from pyspark.sql.types import StructType, StructField, StringType
from graphframes import *

In [20]:
spark = SparkSession.builder.appName("trata_dados_2").getOrCreate()

In [21]:
# Schema para Json Empresa
json_schema_em = StructType([
    StructField("porteEmpresa", StringType(), True),
    StructField("capitalSocial", StringType(), True),
    StructField("cpfResponsavel", StringType(), True),
    StructField("nomeEmpresarial", StringType(), True),
    StructField("naturezaJuridica", StringType(), True),
    StructField("qualificacaoResponsavel", StringType(), True)
])

# Schema para Json Estabelecimento
json_schema_es = StructType([
    StructField("uf", StringType(), True),
    StructField("cep", StringType(), True),
    StructField("ddd1", StringType(), True),
    StructField("ddd2", StringType(), True),
    StructField("pais", StringType(), True),
    StructField("email", StringType(), True),
    StructField("bairro", StringType(), True),
    StructField("numero", StringType(), True),
    StructField("municipio", StringType(), True),
    StructField("telefone1", StringType(), True),
    StructField("telefone2", StringType(), True),
    StructField("cnaeFiscal", StringType(), True),
    StructField("logradouro", StringType(), True),
    StructField("complemento", StringType(), True),
    StructField("dataCadastro", StringType(), True),
    StructField("nomeFantasia", StringType(), True),
    StructField("cidadeExterior", StringType(), True),
    StructField("tipoLogradouro", StringType(), True),
    StructField("cnaesSecundarias", StringType(), True),
    StructField("situacaoEspecial", StringType(), True),
    StructField("situacaoCadastral", StringType(), True),
    StructField("dataSituacaoEspecial", StringType(), True),
    StructField("dataSituacaoCadastral", StringType(), True),
    StructField("motivoSituacaoCadastral", StringType(), True),
    StructField("identificadorMatrizFilial", StringType(), True)
])

# Definindo o esquema para leitura do CSV
schema = StructType([
    StructField("nu_cnpj_raiz", StringType(), True),
    StructField("te_dados_em", StringType(), True),
    StructField("id_estabelecimento", StringType(), True),
    StructField("te_dados_es", StringType(), True)
])

# Lendo o CSV com o esquema definido
pj = spark.read \
    .option("delimiter", ",") \
    .option("multiline", "true") \
    .option("escape", "\"") \
    .csv("dataset/empresas_estabelecimentos.csv", schema=schema, header=True)

In [22]:
pj = pj.withColumn("te_dados_em", from_json(col("te_dados_em"), json_schema_em)) \
      .withColumn("te_dados_es", from_json(col("te_dados_es"), json_schema_es)) \
      .filter(substring(col("id_estabelecimento"), 0, 4) == '0001') \
      .withColumnRenamed("nu_cnpj_raiz", "id")

In [23]:
# print (pj.head(2))
pj.show(truncate=False)
# pj.printSchema()

+--------+----------------------------------------------------------------------------------------------+------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id      |te_dados_em                                                                                   |id_estabelecimento|te_dados_es                                                                                                                                                                                                                                     |
+--------+----------------------------------------------------------------------------------------------+------------------+------------------------------------------------------------------------------------------------------------------------------------------------

In [24]:
# Schema para Json Socios
json_schema_sc = StructType([
    StructField("pais", StringType(), True),
    StructField("entradaSociedade", StringType(), True),
    StructField("socioEstrangeiro", StringType(), True),
    StructField("qualificacaoSocio", StringType(), True),
    StructField("identificadorSocio", StringType(), True),
    StructField("cpfRepresentanteLegal", StringType(), True),
    StructField("qualificacaoRepresentanteLegal", StringType(), True)
])

# Definindo o esquema para leitura do CSV
schema = StructType([
    StructField("nu_cnpj_raiz", StringType(), True),
    StructField("id_socio", StringType(), True),
    StructField("te_dados_sc", StringType(), True)
])

# Lendo o CSV com o esquema definido
sc = spark.read \
    .option("delimiter", ",") \
    .option("multiline", "true") \
    .option("escape", "\"") \
    .csv("dataset/socios.csv", schema=schema, header=True)

In [25]:
# tratando o df socios
# Tratamento 1: Expandir o JSON
sc = sc.withColumn("te_dados_sc", from_json(col("te_dados_sc"), json_schema_sc))

sc = sc.withColumn("id_socio", when(sc["te_dados_sc"]["identificadorSocio"] == '1',
                                    sc["id_socio"].substr(1, 8))
                                .otherwise(sc["id_socio"]))

sc = sc.withColumnRenamed("nu_cnpj_raiz", "src").withColumnRenamed("id_socio", "dst")

In [26]:
# print (sc.head(2))
sc.show(truncate=False)
# sc.printSchema()

+--------+--------------+-----------------------------------------+
|src     |dst           |te_dados_sc                              |
+--------+--------------+-----------------------------------------+
|03230097|00000563224703|{000, 20070110, , 49, 2, 00000000000, 00}|
|03230097|00026967782720|{000, 19990615, , 49, 2, 00000000000, 00}|
|03230099|00006039729856|{000, 19990615, , 49, 2, 00000000000, 00}|
|03230099|00010835249840|{000, 19990615, , 22, 2, 00000000000, 00}|
|03230100|00001264437692|{000, 19990617, , 22, 2, 00000000000, 00}|
|03230100|00004944619693|{000, 20070711, , 49, 2, 00000000000, 00}|
|03230101|00033689091187|{000, 20050912, , 16, 2, 00000000000, 00}|
|03230102|00016569903420|{000, 20050912, , 16, 2, 00000000000, 00}|
|03230104|00021801606900|{000, 19990616, , 49, 2, 00000000000, 00}|
|03230104|00029315018915|{000, 19990616, , 22, 2, 00000000000, 00}|
|03230104|00489144      |{000, 19990729, , 22, 1, 21801606900, 05}|
|03230105|00022409363806|{000, 20020416, , 49, 2

In [27]:
# Cria df para vértices do tipo Pessoa Física:
pf = sc.filter(col("te_dados_sc.identificadorSocio") == '2').select(col("dst").alias("id"))


In [28]:
cols_pj = [col_name for col_name in pj.columns if col_name != 'id']
for col_name in cols_pj:
    pf = pf.withColumn(col_name, lit(None))

In [11]:
pf.show(truncate=False)
# pf.printSchema()

+--------------+-----------+------------------+-----------+
|id            |te_dados_em|id_estabelecimento|te_dados_es|
+--------------+-----------+------------------+-----------+
|00000563224703|NULL       |NULL              |NULL       |
|00026967782720|NULL       |NULL              |NULL       |
|00006039729856|NULL       |NULL              |NULL       |
|00010835249840|NULL       |NULL              |NULL       |
|00001264437692|NULL       |NULL              |NULL       |
|00004944619693|NULL       |NULL              |NULL       |
|00033689091187|NULL       |NULL              |NULL       |
|00016569903420|NULL       |NULL              |NULL       |
|00021801606900|NULL       |NULL              |NULL       |
|00029315018915|NULL       |NULL              |NULL       |
|00022409363806|NULL       |NULL              |NULL       |
|00056186843572|NULL       |NULL              |NULL       |
|00006022252563|NULL       |NULL              |NULL       |
|00006185656558|NULL       |NULL        

In [29]:
# cria tipo de pessoa na nos vértices
pj = pj.withColumn("id_tipoPessoa", lit(1))
pf = pf.withColumn("id_tipoPessoa", lit(2))

In [31]:
pf.show(truncate=False)

+--------------+-----------+------------------+-----------+-------------+
|id            |te_dados_em|id_estabelecimento|te_dados_es|id_tipoPessoa|
+--------------+-----------+------------------+-----------+-------------+
|00000563224703|NULL       |NULL              |NULL       |2            |
|00026967782720|NULL       |NULL              |NULL       |2            |
|00006039729856|NULL       |NULL              |NULL       |2            |
|00010835249840|NULL       |NULL              |NULL       |2            |
|00001264437692|NULL       |NULL              |NULL       |2            |
|00004944619693|NULL       |NULL              |NULL       |2            |
|00033689091187|NULL       |NULL              |NULL       |2            |
|00016569903420|NULL       |NULL              |NULL       |2            |
|00021801606900|NULL       |NULL              |NULL       |2            |
|00029315018915|NULL       |NULL              |NULL       |2            |
|00022409363806|NULL       |NULL      

In [32]:
# Criando o grafo de relacionamento empresas->socios
g = GraphFrame(pj.union(pf), sc)

In [33]:
g.vertices.show()

+--------+--------------------+------------------+--------------------+-------------+
|      id|         te_dados_em|id_estabelecimento|         te_dados_es|id_tipoPessoa|
+--------+--------------------+------------------+--------------------+-------------+
|03230095|{05, 000000000000...|            000128|{SP, 13480021, 01...|            1|
|03230096|{05, 000000000000...|            000172|{MG, 35920000, , ...|            1|
|03230097|{01, 000000000000...|            000117|{RJ, 25071181, 21...|            1|
|03230098|{01, 000000003000...|            000161|{RS, 93510078, 51...|            1|
|03230099|{01, 000000000000...|            000106|{SP, 11310011, 01...|            1|
|03230100|{01, 000000005000...|            000100|{MG, 35680258, 37...|            1|
|03230101|{05, 000000000000...|            000147|{TO, 77690000, , ...|            1|
|03230102|{05, 000000000000...|            000191|{PE, 55750000, , ...|            1|
|03230103|{05, 000000000000...|            000136|{PR,

In [34]:
# Filtrando os vértices com comprimento de 'id' maior que 8
vertices_filtrados = g.vertices.filter(g.vertices["id_tipoPessoa"] == 2)

# Exibindo os vértices filtrados
vertices_filtrados.show()

+--------------+-----------+------------------+-----------+-------------+
|            id|te_dados_em|id_estabelecimento|te_dados_es|id_tipoPessoa|
+--------------+-----------+------------------+-----------+-------------+
|00000563224703|       NULL|              NULL|       NULL|            2|
|00026967782720|       NULL|              NULL|       NULL|            2|
|00006039729856|       NULL|              NULL|       NULL|            2|
|00010835249840|       NULL|              NULL|       NULL|            2|
|00001264437692|       NULL|              NULL|       NULL|            2|
|00004944619693|       NULL|              NULL|       NULL|            2|
|00033689091187|       NULL|              NULL|       NULL|            2|
|00016569903420|       NULL|              NULL|       NULL|            2|
|00021801606900|       NULL|              NULL|       NULL|            2|
|00029315018915|       NULL|              NULL|       NULL|            2|
|00022409363806|       NULL|          

In [12]:
g.degrees.show()

+--------------+------+
|            id|degree|
+--------------+------+
|00038512556749|     1|
|00048938050963|     1|
|00005178880765|     1|
|00053830113072|     1|
|      03230874|     1|
|00028828437049|     2|
|00017596401821|     1|
|      03231463|     2|
|00063030152120|     1|
|00018677938818|     1|
|00000366371819|     1|
|00064425037553|     1|
|00000155076108|     1|
|00005475477817|     1|
|      03232065|     2|
|      03232078|     1|
|00002604521814|     1|
|00086491032191|     1|
|00052800415304|     1|
|00062879332249|     1|
+--------------+------+
only showing top 20 rows



In [16]:
Distinct_nome_fantasia = g.vertices.select('te_dados_es.nomeFantasia').distinct()
Distinct_nome_fantasia.show()

+--------------------+
|        nomeFantasia|
+--------------------+
|MERCADINHO SAO PAULO|
|         MTC CEREAIS|
|N.SRA. APARECIDA ...|
|AUTO PECAS BRASIL II|
|GERAL COM E REPRE...|
|       PRAZER DE LER|
|   LANCHONETE VARGAS|
|             KAYALAB|
| LATICINIOS PINDOBAS|
|  CARLOS CONSTRUCOES|
|                D.S.|
|    CONSELHO ESCOLAR|
|   PALMEIRA BARCELOS|
|       PEDRAS HANDER|
|MERCADO CORDEIRO ...|
|  ASSEMBLEIA DE DEUS|
|     CRUZEIRO DO SUL|
|SANTA CASA DO MON...|
| MERCEARIA PROGRESSO|
|     MORDIDA DE AMOR|
+--------------------+
only showing top 20 rows



In [15]:
print (g.vertices.count())

2055175


In [16]:
print (g.edges.count())

1068159


In [18]:
spark.stop()