In [1]:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, substring, when, length, split, lit
from pyspark.sql.types import StructType, StructField, StringType
from graphframes import *

In [3]:
# Dados de carga
csv_pj = 'dataset/empresas_estabelecimentos_1m.csv'
csv_socios = 'dataset/socios_1m.csv'

In [2]:
spark = SparkSession.builder.appName("trata_dados_2").getOrCreate()

23/11/29 15:20:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Schema para Json Empresa
json_schema_em = StructType([
    StructField("porteEmpresa", StringType(), True),
    StructField("capitalSocial", StringType(), True),
    StructField("cpfResponsavel", StringType(), True),
    StructField("nomeEmpresarial", StringType(), True),
    StructField("naturezaJuridica", StringType(), True),
    StructField("qualificacaoResponsavel", StringType(), True)
])

# Schema para Json Estabelecimento
json_schema_es = StructType([
    StructField("uf", StringType(), True),
    StructField("cep", StringType(), True),
    StructField("ddd1", StringType(), True),
    StructField("ddd2", StringType(), True),
    StructField("pais", StringType(), True),
    StructField("email", StringType(), True),
    StructField("bairro", StringType(), True),
    StructField("numero", StringType(), True),
    StructField("municipio", StringType(), True),
    StructField("telefone1", StringType(), True),
    StructField("telefone2", StringType(), True),
    StructField("cnaeFiscal", StringType(), True),
    StructField("logradouro", StringType(), True),
    StructField("complemento", StringType(), True),
    StructField("dataCadastro", StringType(), True),
    StructField("nomeFantasia", StringType(), True),
    StructField("cidadeExterior", StringType(), True),
    StructField("tipoLogradouro", StringType(), True),
    StructField("cnaesSecundarias", StringType(), True),
    StructField("situacaoEspecial", StringType(), True),
    StructField("situacaoCadastral", StringType(), True),
    StructField("dataSituacaoEspecial", StringType(), True),
    StructField("dataSituacaoCadastral", StringType(), True),
    StructField("motivoSituacaoCadastral", StringType(), True),
    StructField("identificadorMatrizFilial", StringType(), True)
])

# Definindo o esquema para leitura do CSV
schema = StructType([
    StructField("nu_cnpj_raiz", StringType(), True),
    StructField("te_dados_em", StringType(), True),
    StructField("id_estabelecimento", StringType(), True),
    StructField("te_dados_es", StringType(), True)
])

# Lendo o CSV com o esquema definido
pj = spark.read \
    .option("delimiter", ",") \
    .option("multiline", "true") \
    .option("escape", "\"") \
    .csv("dataset/empresas_estabelecimentos.csv", schema=schema, header=True)

In [4]:
pj = pj.withColumn("te_dados_em", from_json(col("te_dados_em"), json_schema_em)) \
      .withColumn("te_dados_es", from_json(col("te_dados_es"), json_schema_es)) \
      .filter(substring(col("id_estabelecimento"), 0, 4) == '0001') \
      .withColumnRenamed("nu_cnpj_raiz", "id")

In [5]:
# print (pj.head(2))
pj.show(truncate=False)
# pj.printSchema()

23/11/28 18:52:42 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+--------+----------------------------------------------------------------------------------------------+------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id      |te_dados_em                                                                                   |id_estabelecimento|te_dados_es                                                                                                                                                                                                                                                |
+--------+----------------------------------------------------------------------------------------------+------------------+--------------------------------------------------------------------------------------------------------------------------

In [6]:
# Schema para Json Socios
json_schema_sc = StructType([
    StructField("pais", StringType(), True),
    StructField("entradaSociedade", StringType(), True),
    StructField("socioEstrangeiro", StringType(), True),
    StructField("qualificacaoSocio", StringType(), True),
    StructField("identificadorSocio", StringType(), True),
    StructField("cpfRepresentanteLegal", StringType(), True),
    StructField("qualificacaoRepresentanteLegal", StringType(), True)
])

# Definindo o esquema para leitura do CSV
schema = StructType([
    StructField("nu_cnpj_raiz", StringType(), True),
    StructField("id_socio", StringType(), True),
    StructField("te_dados_sc", StringType(), True)
])

# Lendo o CSV com o esquema definido
sc = spark.read \
    .option("delimiter", ",") \
    .option("multiline", "true") \
    .option("escape", "\"") \
    .csv("dataset/socios.csv", schema=schema, header=True)

In [7]:
# tratando o df socios
# Tratamento 1: Expandir o JSON
sc = sc.withColumn("te_dados_sc", from_json(col("te_dados_sc"), json_schema_sc))

sc = sc.withColumn("id_socio", when(sc["te_dados_sc"]["identificadorSocio"] == '1',
                                    sc["id_socio"].substr(1, 8))
                                .otherwise(sc["id_socio"]))

sc = sc.withColumnRenamed("nu_cnpj_raiz", "src").withColumnRenamed("id_socio", "dst")

In [8]:
# print (sc.head(2))
sc.show(truncate=False)
# sc.printSchema()

+--------+--------------+-----------------------------------------+
|src     |dst           |te_dados_sc                              |
+--------+--------------+-----------------------------------------+
|03250001|00050690981449|{000, 20050912, , 16, 2, 00000000000, 00}|
|03250002|00035046376091|{000, 20000914, , 22, 2, 00000000000, 00}|
|03250002|00082462291091|{000, 19991216, , 49, 2, 00000000000, 00}|
|03250003|00047093161987|{000, 20180829, , 49, 2, 00000000000, 00}|
|03250006|00005932096993|{000, 20060522, , 49, 2, 00000000000, 00}|
|03250006|00006117916922|{000, 20060522, , 22, 2, 00000000000, 00}|
|03250007|00026853060563|{000, 19990630, , 49, 2, 00000000000, 00}|
|03250007|00038269295515|{000, 19990630, , 49, 2, 00000000000, 00}|
|03250009|00012962745865|{000, 19990629, , 49, 2, 00000000000, 00}|
|03250010|00001732487995|{000, 20210416, , 49, 2, 00000000000, 00}|
|03250011|00025678426087|{000, 19990624, , 49, 2, 00000000000, 00}|
|03250011|00049920154091|{000, 19990624, , 22, 2

In [9]:
# Cria df para vértices do tipo Pessoa Física:
pf = sc.filter(col("te_dados_sc.identificadorSocio") == '2').select(col("dst").alias("id"))


In [10]:
cols_pj = [col_name for col_name in pj.columns if col_name != 'id']
for col_name in cols_pj:
    pf = pf.withColumn(col_name, lit(None))

In [11]:
pf.show(truncate=False)
# pf.printSchema()

+--------------+-----------+------------------+-----------+
|id            |te_dados_em|id_estabelecimento|te_dados_es|
+--------------+-----------+------------------+-----------+
|00050690981449|NULL       |NULL              |NULL       |
|00035046376091|NULL       |NULL              |NULL       |
|00082462291091|NULL       |NULL              |NULL       |
|00047093161987|NULL       |NULL              |NULL       |
|00005932096993|NULL       |NULL              |NULL       |
|00006117916922|NULL       |NULL              |NULL       |
|00026853060563|NULL       |NULL              |NULL       |
|00038269295515|NULL       |NULL              |NULL       |
|00012962745865|NULL       |NULL              |NULL       |
|00001732487995|NULL       |NULL              |NULL       |
|00025678426087|NULL       |NULL              |NULL       |
|00049920154091|NULL       |NULL              |NULL       |
|00007247777761|NULL       |NULL              |NULL       |
|00089969189700|NULL       |NULL        

In [12]:
# cria tipo de pessoa na nos vértices
pj = pj.withColumn("id_tipoPessoa", lit(1))
pf = pf.withColumn("id_tipoPessoa", lit(2))

In [13]:
pf.show(truncate=False)

+--------------+-----------+------------------+-----------+-------------+
|id            |te_dados_em|id_estabelecimento|te_dados_es|id_tipoPessoa|
+--------------+-----------+------------------+-----------+-------------+
|00050690981449|NULL       |NULL              |NULL       |2            |
|00035046376091|NULL       |NULL              |NULL       |2            |
|00082462291091|NULL       |NULL              |NULL       |2            |
|00047093161987|NULL       |NULL              |NULL       |2            |
|00005932096993|NULL       |NULL              |NULL       |2            |
|00006117916922|NULL       |NULL              |NULL       |2            |
|00026853060563|NULL       |NULL              |NULL       |2            |
|00038269295515|NULL       |NULL              |NULL       |2            |
|00012962745865|NULL       |NULL              |NULL       |2            |
|00001732487995|NULL       |NULL              |NULL       |2            |
|00025678426087|NULL       |NULL      

In [14]:
# Criando o grafo de relacionamento empresas->socios
g = GraphFrame(pj.union(pf), sc)

In [15]:
g.vertices.show()

+--------+--------------------+------------------+--------------------+-------------+
|      id|         te_dados_em|id_estabelecimento|         te_dados_es|id_tipoPessoa|
+--------+--------------------+------------------+--------------------+-------------+
|03250000|{01, 000000015000...|            000138|{RS, 97770000, 55...|            1|
|03250001|{05, 000000000000...|            000182|{PE, 56460000, , ...|            1|
|03250002|{01, 000000000000...|            000127|{RS, 91220580, 05...|            1|
|03250003|{03, 000000100000...|            000171|{SC, 88075001, 48...|            1|
|03250004|{05, 000000000000...|            000116|{RJ, 23073070, , ...|            1|
|03250005|{05, 000000000000...|            000160|{RS, 98130000, , ...|            1|
|03250006|{01, 000000000000...|            000105|{SC, 88960000, 48...|            1|
|03250007|{05, 000000000000...|            000150|{BA, 44001465, 07...|            1|
|03250008|{01, 000000000000...|            000102|{MT,

In [16]:
# Filtrando os vértices com comprimento de 'id' maior que 8
vertices_filtrados = g.vertices.filter(g.vertices["id_tipoPessoa"] == 2)

# Exibindo os vértices filtrados
vertices_filtrados.show()

+--------------+-----------+------------------+-----------+-------------+
|            id|te_dados_em|id_estabelecimento|te_dados_es|id_tipoPessoa|
+--------------+-----------+------------------+-----------+-------------+
|00050690981449|       NULL|              NULL|       NULL|            2|
|00035046376091|       NULL|              NULL|       NULL|            2|
|00082462291091|       NULL|              NULL|       NULL|            2|
|00047093161987|       NULL|              NULL|       NULL|            2|
|00005932096993|       NULL|              NULL|       NULL|            2|
|00006117916922|       NULL|              NULL|       NULL|            2|
|00026853060563|       NULL|              NULL|       NULL|            2|
|00038269295515|       NULL|              NULL|       NULL|            2|
|00012962745865|       NULL|              NULL|       NULL|            2|
|00001732487995|       NULL|              NULL|       NULL|            2|
|00025678426087|       NULL|          

In [17]:
g.degrees.show()

23/11/28 18:53:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/11/28 18:53:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/11/28 18:54:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/11/28 18:54:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
[Stage 6:>                                                          (0 + 1) / 1]

+--------------+------+
|            id|degree|
+--------------+------+
|      03250015|     1|
|00095818499715|     1|
|      03250329|     2|
|00026923943809|     1|
|      03250402|     2|
|00077271181415|     4|
|00035380802168|     1|
|00034271902187|     1|
|00017914088854|     1|
|00005149062430|     1|
|00001171545800|     1|
|      03251400|     2|
|00044526628620|     1|
|00099513803015|     1|
|      03251565|     2|
|00026132426809|     7|
|00003034530609|     2|
|00030445094400|     1|
|      03251941|     2|
|00095549153904|     1|
+--------------+------+
only showing top 20 rows



                                                                                

In [18]:
Distinct_nome_fantasia = g.vertices.select('te_dados_es.nomeFantasia').distinct()
Distinct_nome_fantasia.show()

23/11/28 18:56:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/11/28 19:00:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/11/28 19:02:57 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

+--------------------+
|        nomeFantasia|
+--------------------+
|     IDEAL MOTO TAXI|
|  SHOPPING DAS LOJAS|
|   PESQUEIRO PARAISO|
|    CONSELHO ESCOLAR|
|             CALMELL|
|   VITALLI ESTOFADOS|
|            METALART|
|CABOS, CORDAS & CIA.|
|     CASA DE RETORNO|
|MULTTECH INFORMATICA|
|POUSADA NOSSA SEN...|
|      MERCADO GRACEL|
|  CAMILOOS CORRETORA|
|BAR E MERCEARIA V...|
|   DELICIAS CASEIRAS|
|GRANJA FRANGO DE ...|
|              AAP/RN|
|  MECANICA DO GERINO|
|  PLANO AGROPECUARIO|
|           CLOVISTUR|
+--------------------+
only showing top 20 rows



                                                                                

In [19]:
print (g.vertices.count())



44071255


                                                                                

In [20]:
print (g.edges.count())

[Stage 15:>                                                         (0 + 1) / 1]

14377190


                                                                                

In [21]:
spark.stop()