In [1]:
'''
Machine Learning para Analise de Dados
CESAR Scholl
Projeto de analise de credito
Recife, 2019


Equipe:
    Claudio Alves Monteiro
    Marcos Antonio Almeida Souto Júnior
    Virgínia Heimann
    Kayo Renato da Silva Nascimento
    Rosely Cabral
'''



'\nMachine Learning para Analise de Dados\nCESAR Scholl\nProjeto de analise de credito\nRecife, 2019\n\n\nEquipe:\n    Claudio Alves Monteiro\n    Marcos Antonio Almeida Souto Júnior\n    Virgínia Heimann\n    Kayo Renato da Silva Nascimento\n    Rosely Cabral\n'

In [2]:
# Spark Session
from pyspark.sql import SparkSession, Row
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()


In [3]:
# import modules
import os
import pandas as pd
from pyspark.sql import functions as SF
import pyspark.sql.types as ST

# paths to spark and python3
os.environ['PYSPARK_SUBMIT_ARGS'] = '--executor-memory 1G pyspark-shell'
os.environ["SPARK_HOME"] = "/home/pacha/spark"
os.environ["PYSPARK_PYTHON"]="/usr/bin/python3"

In [4]:
# import data

df = spark.read.csv('data/jur.csv',
                    sep='\t',
                    encoding='utf-8',
                    header=True,
                    inferSchema=False)


df_cursor = spark.read.csv('data/Base_Des-TRN',
                    sep='\t',
                    encoding='utf-8',
                    header=True,
                    inferSchema=False)


In [5]:
df.printSchema()

root
 |-- CNPJ: string (nullable = true)
 |-- NOME_EMPRESA: string (nullable = true)
 |-- FANTASIA: string (nullable = true)
 |-- NATUREZA_JURIDICA: string (nullable = true)
 |-- ATIVIDADE_PRINCIPAL: string (nullable = true)
 |-- ATIVIDADES_SECUNDARIAS: string (nullable = true)
 |-- CNPJ_FORMATADO: string (nullable = true)
 |-- TIPO: string (nullable = true)
 |-- LOGRADOURO: string (nullable = true)
 |-- NUMERO: string (nullable = true)
 |-- COMPLEMENTO: string (nullable = true)
 |-- BAIRRO: string (nullable = true)
 |-- CEP: string (nullable = true)
 |-- MUNICIPIO: string (nullable = true)
 |-- UF: string (nullable = true)
 |-- TELEFONE_1: string (nullable = true)
 |-- TELEFONE_2: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- ABERTURA: string (nullable = true)
 |-- CAPITAL_SOCIAL: string (nullable = true)
 |-- MOTIVO_SITUACAO: string (nullable = true)
 |-- SITUACAO: string (nullable = true)
 |-- DATA_SITUACAO: string (nullable = true)
 |-- SITUACAO_ESPECIAL: strin

In [6]:
df.show(10, False)

+--------------+-----------------------------------------------+---------------------+-----------------------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+------+------------------------+------+------------------------+------------------+--------+--------------+---+--------------+--------------+------------------------------------+------------------------+--------------+---------------+--------+------------------------+-----------------+----------------------+----+--------------------------------+-------------------------------------------------+------------------+---------------+----------+------------------------+------+----------+-------------+------------+-

In [7]:
df_cursor.show(5)

+--------------+--------+-------+
|          CNPJ|DATA_REF|CS_ALVO|
+--------------+--------+-------+
|11507110000160| 2016-10|      0|
|15062189000114| 2016-07|      0|
| 5841171000167| 2016-08|      0|
| 8792807000116| 2016-08|      0|
|24060073000137| 2017-01|      1|
+--------------+--------+-------+
only showing top 5 rows



In [8]:
#------ tratar inicio do CNPJ sem o 0

@SF.udf('string')
def fill_cnpj(value):
    aux = '00000000000000'+value
    return aux[len(value):]

#------ tratar inicio do CEP sem o 0

@SF.udf('string')
def fill_cep(value):
    aux = '00000000'+value
    return aux[len(value):]



In [9]:
df_cursor = df_cursor.withColumn('CNPJ',fill_cnpj('CNPJ'))
df_cursor.show(5)

+--------------+--------+-------+
|          CNPJ|DATA_REF|CS_ALVO|
+--------------+--------+-------+
|11507110000160| 2016-10|      0|
|15062189000114| 2016-07|      0|
|05841171000167| 2016-08|      0|
|08792807000116| 2016-08|      0|
|24060073000137| 2017-01|      1|
+--------------+--------+-------+
only showing top 5 rows



In [10]:
df = df.withColumn('CEP',fill_cep('CEP'))
#df.show(20)

In [11]:
df = df.withColumn('CNPJ',fill_cnpj('CNPJ'))
#df.show(5)

In [12]:
#------ Convertendo REF_DATE do df_cursor para timestamp

df_cursor = df_cursor.withColumn('REF_DATE',SF.from_unixtime(
            SF.unix_timestamp('DATA_REF','yyyy-MM')).cast('timestamp'))

#df_cursor.show(5)

In [13]:
#------ Convertendo REF_DATE do df para timestamp

df = df.withColumn('REF_DATE',SF.col('REF_DATE').cast('timestamp'))

In [14]:
df = df.withColumnRenamed('CNPJ','CNPJ_DF').withColumnRenamed('REF_DATE','REF_DATE_DF')

In [15]:
#------ Criando o df_base. O df_base é construído a partir do cursor. Todas as entradas do cursor são preservadas 
# e é feito um join com as entradas do df que correspondem à condição dupla: mesmo "CNPJ" e "Data de Referência" do df
# anterior à "Dara de Referência" do cursor  

df_base = df_cursor.join(df,
                 (df_cursor['CNPJ']==df['CNPJ_DF']) & 
                 (df_cursor['REF_DATE'] >= df['REF_DATE_DF']),
                 'left')

<h1>1) Quantidade de EMAILS</h1>

In [16]:
# Considera todas as tuplas (CNPJ, REF_DATE, EMAIL) do df_base. Elimina as duplicadas. Assim, encontra a quantidade de e-mails
# diferentes.

#df_aux = df_base.select('CNPJ','REF_DATE','EMAIL').dropDuplicates().\
#            groupBy('CNPJ','REF_DATE').agg(SF.count('EMAIL').alias('QTD_EMAIL')).sort(SF.desc('QTD_EMAIL'))
#df_aux.show(10)

In [17]:
# Cria um df_final (que será a nossa base de dados para rodar o algoritmo de ML). Nesse momento será o cursor + Qtd de e-mails

#df_final = df_cursor.join(df_aux,['CNPJ','REF_DATE'],'left')
#df_final.show()

<h1>2) Quantidade de Telefones</h1>

In [18]:
#df_aux = df_base.select(['CNPJ','REF_DATE','TELEFONE_1','TELEFONE_2']).dropDuplicates()

In [19]:
#df_aux = df_aux.withColumn('QTD_TLF1',SF.when(SF.col('TELEFONE_1').isNotNull(),1).otherwise(0)).\
#                withColumn('QTD_TLF2',SF.when(SF.col('TELEFONE_2').isNotNull(),1).otherwise(0)).\
#                withColumn('QTD_TELEFONES',SF.col('QTD_TLF1')+SF.col('QTD_TLF2')).\
#                groupBy('CNPJ','REF_DATE').agg(SF.sum('QTD_TELEFONES').alias('QTD_TELEFONES'))
#df_aux.show()

In [20]:
#df_final = df_final.join(df_aux, ['CNPJ','REF_DATE'], 'left')
#df_final.show()

<h1>3) Classificação por CEP : Região, Subregião, Setor, Subsetor, Divisão de Subsetor e Logradouro</h1>

In [21]:
#df_aux = df_base.select(['CNPJ', 'REF_DATE','CEP']).dropDuplicates()

In [22]:
'''
df_aux = df_aux.withColumn('REGIAO',SF.col('CEP').substr(1, 1)).\
         withColumn('SUBREGIAO',SF.col('CEP').substr(1, 2)).\
         withColumn('SETOR',SF.col('CEP').substr(1, 3)).\
         withColumn('SUBSETOR',SF.col('CEP').substr(1, 4)).\
         withColumn('DIV_SUBSETOR',SF.col('CEP').substr(1, 5)).\
         withColumn('LOGRADOURO',SF.col('CEP').substr(1, 9)).\
         drop('CEP')
'''

"\ndf_aux = df_aux.withColumn('REGIAO',SF.col('CEP').substr(1, 1)).         withColumn('SUBREGIAO',SF.col('CEP').substr(1, 2)).         withColumn('SETOR',SF.col('CEP').substr(1, 3)).         withColumn('SUBSETOR',SF.col('CEP').substr(1, 4)).         withColumn('DIV_SUBSETOR',SF.col('CEP').substr(1, 5)).         withColumn('LOGRADOURO',SF.col('CEP').substr(1, 9)).         drop('CEP')\n"

In [23]:
#df_aux.show()

In [24]:
#df_final = df_cursor.join(df_aux,['CNPJ','REF_DATE'],'left')
#df_final.show()

#df_final = df_final.join(df_aux, ['CNPJ','REF_DATE'], 'left')
#df_final.show()

In [25]:
#df_final.groupBy('REGIAO').count().toPandas()

<h1>4) Quantidade de Sócios</h1>

In [26]:
df_aux = df_base.select('CNPJ','REF_DATE','NOME_SOCIO').dropDuplicates().\
            groupBy('CNPJ','REF_DATE').agg(SF.count('NOME_SOCIO').alias('QTD_SOCIOS')).sort(SF.desc('QTD_SOCIOS'))
df_aux.show(10)

+--------------+-------------------+----------+
|          CNPJ|           REF_DATE|QTD_SOCIOS|
+--------------+-------------------+----------+
|31130537000280|2016-10-01 00:00:00|        35|
|10297699000238|2017-04-01 00:00:00|        34|
|09456178000116|2016-09-01 00:00:00|        20|
|61087367000189|2017-03-01 00:00:00|        19|
|04907334000102|2016-09-01 00:00:00|        19|
|09643807000117|2017-04-01 00:00:00|        18|
|10392044000168|2017-01-01 00:00:00|        17|
|75492694000392|2016-11-01 00:00:00|        16|
|48093892000149|2016-11-01 00:00:00|        16|
|09541072000110|2017-04-01 00:00:00|        15|
+--------------+-------------------+----------+
only showing top 10 rows



In [27]:
df_final = df_cursor.join(df_aux, ['CNPJ','REF_DATE'], 'left')
df_final.toPandas()

Unnamed: 0,CNPJ,REF_DATE,DATA_REF,CS_ALVO,QTD_SOCIOS
0,00004330000155,2016-10-01,2016-10,0,1
1,00078090000133,2017-03-01,2017-03,0,2
2,00109982000154,2016-09-01,2016-09,1,2
3,00281129000115,2017-06-01,2017-06,0,3
4,00294916000100,2017-02-01,2017-02,0,0
5,00370197000150,2016-11-01,2016-11,1,3
6,00711050000187,2016-11-01,2016-11,0,0
7,00951519000155,2017-01-01,2017-01,0,0
8,00984383000180,2016-09-01,2016-09,0,1
9,01013673000149,2017-02-01,2017-02,0,2


<h1> 5) Quantidade de Filiais do Grupo</h1>

In [28]:
'''df_aux = df_base.select('CNPJ','REF_DATE').dropDuplicates()
df_aux = df_aux.withColumn('GRUPO_EMP',SF.col('CNPJ').substr(1, 8)).\
         withColumn('FILIAL',SF.col('CNPJ').substr(9, 4)).\
         groupBy('CNPJ','REF_DATE','GRUPO_EMP').agg(SF.max('FILIAL').alias('QTD_FILIAIS')).drop('GRUPO_EMP').sort(SF.desc('QTD_FILIAIS'))



df_aux.show(10)
'''

"df_aux = df_base.select('CNPJ','REF_DATE').dropDuplicates()\ndf_aux = df_aux.withColumn('GRUPO_EMP',SF.col('CNPJ').substr(1, 8)).         withColumn('FILIAL',SF.col('CNPJ').substr(9, 4)).         groupBy('CNPJ','REF_DATE','GRUPO_EMP').agg(SF.max('FILIAL').alias('QTD_FILIAIS')).drop('GRUPO_EMP').sort(SF.desc('QTD_FILIAIS'))\n\n\n\ndf_aux.show(10)\n"

In [29]:
#df_final = df_final.join(df_aux, ['CNPJ','REF_DATE'], 'left')
#df_final.limit(10).toPandas()

In [30]:
#df_final.orderBy("QTD_FILIAIS", ascending=False).show(10)

<h1> 6) Natureza Jurídica </h1>

In [31]:
df_aux = df_base.select('CNPJ','REF_DATE','NATUREZA_JURIDICA').dropDuplicates()
df_aux.limit(20).toPandas()

Unnamed: 0,CNPJ,REF_DATE,NATUREZA_JURIDICA
0,66749870000176,2016-12-01,SOCIEDADE EMPRESARIA LIMITADA
1,22556082000198,2016-07-01,SOCIEDADE EMPRESARIA LIMITADA
2,18969788000104,2016-10-01,SOCIEDADE EMPRESARIA LIMITADA
3,16714499000139,2017-05-01,SOCIEDADE EMPRESARIA LIMITADA
4,21961675000177,2017-01-01,EMPRESARIO (INDIVIDUAL)
5,18865759000100,2017-03-01,EMPRESARIO (INDIVIDUAL)
6,24153373000160,2017-04-01,SOCIEDADE EMPRESARIA LIMITADA
7,95821310000183,2017-01-01,COOPERATIVA
8,20968143000107,2016-12-01,EMPRESARIO (INDIVIDUAL)
9,9065539000101,2017-02-01,EMPRESA INDIVIDUAL DE RESPONSABILIDADE LIMITAD...


In [32]:
df_final = df_final.join(df_aux,['CNPJ','REF_DATE'], 'left' )
df_final.limit(10).toPandas()

Unnamed: 0,CNPJ,REF_DATE,DATA_REF,CS_ALVO,QTD_SOCIOS,NATUREZA_JURIDICA
0,4330000155,2016-10-01,2016-10,0,1,EMPRESA INDIVIDUAL DE RESPONSABILIDADE LIMITAD...
1,78090000133,2017-03-01,2017-03,0,2,SOCIEDADE EMPRESARIA LIMITADA
2,109982000154,2016-09-01,2016-09,1,2,SOCIEDADE EMPRESARIA LIMITADA
3,281129000115,2017-06-01,2017-06,0,3,SOCIEDADE EMPRESARIA LIMITADA
4,294916000100,2017-02-01,2017-02,0,0,EMPRESARIO (INDIVIDUAL)
5,370197000150,2016-11-01,2016-11,1,3,SOCIEDADE EMPRESARIA LIMITADA
6,711050000187,2016-11-01,2016-11,0,0,EMPRESARIO (INDIVIDUAL)
7,951519000155,2017-01-01,2017-01,0,0,EMPRESARIO (INDIVIDUAL)
8,984383000180,2016-09-01,2016-09,0,1,SOCIEDADE EMPRESARIA LIMITADA
9,1013673000149,2017-02-01,2017-02,0,2,SOCIEDADE EMPRESARIA LIMITADA


<h1> Idade da Empresa </h1>

In [33]:
df_aux = df_base.select('CNPJ','REF_DATE','ABERTURA').dropDuplicates()
df_aux = df_aux.withColumn('ABERTURA',SF.col('ABERTURA').cast('timestamp'))
df_aux = df_aux.withColumn('IDADE',SF.col('REF_DATE').cast("long")-SF.col('ABERTURA').cast("long")).drop('ABERTURA')

df_aux.show(5)

+--------------+-------------------+---------+
|          CNPJ|           REF_DATE|    IDADE|
+--------------+-------------------+---------+
|18199343000192|2017-03-01 00:00:00|119070000|
|18919730000156|2017-06-01 00:00:00|116737200|
|11074264000106|2017-04-01 00:00:00|242190000|
|04359955000190|2016-08-01 00:00:00|484369200|
|15583915000144|2016-08-01 00:00:00|132980400|
+--------------+-------------------+---------+
only showing top 5 rows



In [34]:
df_final = df_final.join(df_aux,['CNPJ','REF_DATE'], 'left' )
df_final.limit(10).toPandas()

Unnamed: 0,CNPJ,REF_DATE,DATA_REF,CS_ALVO,QTD_SOCIOS,NATUREZA_JURIDICA,IDADE
0,4330000155,2016-10-01,2016-10,0,1,EMPRESA INDIVIDUAL DE RESPONSABILIDADE LIMITAD...,706244400
1,78090000133,2017-03-01,2017-03,0,2,SOCIEDADE EMPRESARIA LIMITADA,718081200
2,109982000154,2016-09-01,2016-09,1,2,SOCIEDADE EMPRESARIA LIMITADA,698814000
3,281129000115,2017-06-01,2017-06,0,3,SOCIEDADE EMPRESARIA LIMITADA,712465200
4,294916000100,2017-02-01,2017-02,0,0,EMPRESARIO (INDIVIDUAL),701492400
5,370197000150,2016-11-01,2016-11,1,3,SOCIEDADE EMPRESARIA LIMITADA,1351047600
6,711050000187,2016-11-01,2016-11,0,0,EMPRESARIO (INDIVIDUAL),674276400
7,951519000155,2017-01-01,2017-01,0,0,EMPRESARIO (INDIVIDUAL),665204400
8,984383000180,2016-09-01,2016-09,0,1,SOCIEDADE EMPRESARIA LIMITADA,652590000
9,1013673000149,2017-02-01,2017-02,0,2,SOCIEDADE EMPRESARIA LIMITADA,670129200


<h1> Capital Social </h1>

In [35]:
df.select('CAPITAL_SOCIAL').summary().show()

+-------+-------------------+
|summary|     CAPITAL_SOCIAL|
+-------+-------------------+
|  count|             214685|
|   mean| 1675295.2094820316|
| stddev|7.994737990014492E7|
|    min|                0.0|
|    25%|             5000.0|
|    50%|            20000.0|
|    75%|            60000.0|
|    max|          9999944.0|
+-------+-------------------+



In [36]:
df.select('CAPITAL_SOCIAL','NATUREZA_JURIDICA').filter(SF.col('CAPITAL_SOCIAL')==0).show(20,False)

+--------------+-----------------------------+
|CAPITAL_SOCIAL|NATUREZA_JURIDICA            |
+--------------+-----------------------------+
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |SOCIEDADE EMPRESARIA LIMITADA|
|0.0           |EMPRESARIO (INDIVIDUAL)      |
|0.0           |EMPRESARIO (INDIVIDUAL)      |
|0.0         

In [37]:
df_aux = df_base.select('CNPJ','REF_DATE','CAPITAL_SOCIAL').dropDuplicates()
df_aux.show(50)

+--------------+-------------------+--------------+
|          CNPJ|           REF_DATE|CAPITAL_SOCIAL|
+--------------+-------------------+--------------+
|00302268000188|2016-08-01 00:00:00|        5000.0|
|06096520000126|2016-11-01 00:00:00|           0.0|
|24577838000100|2017-05-01 00:00:00|       20000.0|
|24307783000119|2016-12-01 00:00:00|       20000.0|
|03010577000172|2016-10-01 00:00:00|       10000.0|
|00093055000193|2017-04-01 00:00:00|       10000.0|
|24714083000149|2016-11-01 00:00:00|      600000.0|
|09449193000137|2016-09-01 00:00:00|       20000.0|
|15359390000168|2016-08-01 00:00:00|       10000.0|
|03614341000145|2017-03-01 00:00:00|           0.0|
|14449434000188|2016-08-01 00:00:00|       30000.0|
|08511882000161|2017-03-01 00:00:00|        5000.0|
|20329263000156|2016-12-01 00:00:00|       30000.0|
|15268534000170|2016-09-01 00:00:00|       10000.0|
|07755174000103|2016-12-01 00:00:00|       10000.0|
|00308823000189|2016-10-01 00:00:00|      170000.0|
|12035210000

In [38]:
df_final = df_final.join(df_aux,['CNPJ','REF_DATE'], 'left' )
df_final.limit(10).toPandas()

Unnamed: 0,CNPJ,REF_DATE,DATA_REF,CS_ALVO,QTD_SOCIOS,NATUREZA_JURIDICA,IDADE,CAPITAL_SOCIAL
0,4330000155,2016-10-01,2016-10,0,1,EMPRESA INDIVIDUAL DE RESPONSABILIDADE LIMITAD...,706244400,90000.0
1,78090000133,2017-03-01,2017-03,0,2,SOCIEDADE EMPRESARIA LIMITADA,718081200,0.0
2,109982000154,2016-09-01,2016-09,1,2,SOCIEDADE EMPRESARIA LIMITADA,698814000,10000.0
3,281129000115,2017-06-01,2017-06,0,3,SOCIEDADE EMPRESARIA LIMITADA,712465200,30000.0
4,294916000100,2017-02-01,2017-02,0,0,EMPRESARIO (INDIVIDUAL),701492400,0.0
5,370197000150,2016-11-01,2016-11,1,3,SOCIEDADE EMPRESARIA LIMITADA,1351047600,100000.0
6,711050000187,2016-11-01,2016-11,0,0,EMPRESARIO (INDIVIDUAL),674276400,0.0
7,951519000155,2017-01-01,2017-01,0,0,EMPRESARIO (INDIVIDUAL),665204400,0.0
8,984383000180,2016-09-01,2016-09,0,1,SOCIEDADE EMPRESARIA LIMITADA,652590000,100000.0
9,1013673000149,2017-02-01,2017-02,0,2,SOCIEDADE EMPRESARIA LIMITADA,670129200,0.0


<h1> Tipo (Matriz ou Filial) </h1>

In [39]:
df_aux = df_base.select('CNPJ','REF_DATE','TIPO').dropDuplicates()
df_final = df_final.join(df_aux,['CNPJ','REF_DATE'], 'left' )
df_final.limit(10).toPandas()

Unnamed: 0,CNPJ,REF_DATE,DATA_REF,CS_ALVO,QTD_SOCIOS,NATUREZA_JURIDICA,IDADE,CAPITAL_SOCIAL,TIPO
0,4330000155,2016-10-01,2016-10,0,1,EMPRESA INDIVIDUAL DE RESPONSABILIDADE LIMITAD...,706244400,90000.0,MATRIZ
1,78090000133,2017-03-01,2017-03,0,2,SOCIEDADE EMPRESARIA LIMITADA,718081200,0.0,MATRIZ
2,109982000154,2016-09-01,2016-09,1,2,SOCIEDADE EMPRESARIA LIMITADA,698814000,10000.0,MATRIZ
3,281129000115,2017-06-01,2017-06,0,3,SOCIEDADE EMPRESARIA LIMITADA,712465200,30000.0,MATRIZ
4,294916000100,2017-02-01,2017-02,0,0,EMPRESARIO (INDIVIDUAL),701492400,0.0,MATRIZ
5,370197000150,2016-11-01,2016-11,1,3,SOCIEDADE EMPRESARIA LIMITADA,1351047600,100000.0,MATRIZ
6,711050000187,2016-11-01,2016-11,0,0,EMPRESARIO (INDIVIDUAL),674276400,0.0,MATRIZ
7,951519000155,2017-01-01,2017-01,0,0,EMPRESARIO (INDIVIDUAL),665204400,0.0,MATRIZ
8,984383000180,2016-09-01,2016-09,0,1,SOCIEDADE EMPRESARIA LIMITADA,652590000,100000.0,MATRIZ
9,1013673000149,2017-02-01,2017-02,0,2,SOCIEDADE EMPRESARIA LIMITADA,670129200,0.0,MATRIZ


In [40]:
df_final.show(100,False)

+--------------+-------------------+--------+-------+----------+------------------------------------------------------------------------+----------+--------------+------+
|CNPJ          |REF_DATE           |DATA_REF|CS_ALVO|QTD_SOCIOS|NATUREZA_JURIDICA                                                       |IDADE     |CAPITAL_SOCIAL|TIPO  |
+--------------+-------------------+--------+-------+----------+------------------------------------------------------------------------+----------+--------------+------+
|00004330000155|2016-10-01 00:00:00|2016-10 |0      |1         |EMPRESA INDIVIDUAL DE RESPONSABILIDADE LIMITADA (DE NATUREZA EMPRESARIA)|706244400 |90000.0       |MATRIZ|
|00078090000133|2017-03-01 00:00:00|2017-03 |0      |2         |SOCIEDADE EMPRESARIA LIMITADA                                           |718081200 |0.0           |MATRIZ|
|00109982000154|2016-09-01 00:00:00|2016-09 |1      |2         |SOCIEDADE EMPRESARIA LIMITADA                                           |69881400

<h1> Situação </h1>

In [41]:
df_aux = df_base.select('CNPJ','REF_DATE','SITUACAO').dropDuplicates()
df_final = df_final.join(df_aux,['CNPJ','REF_DATE'], 'left' )

<h1> Atividade Principal </h1>

In [42]:
df_aux = df_base.select('CNPJ','REF_DATE','ATIVIDADE_PRINCIPAL').dropDuplicates()
df_final = df_final.join(df_aux,['CNPJ','REF_DATE'], 'left' )
df_final.show(20,False)

+--------------+-------------------+--------+-------+----------+------------------------------------------------------------------------+----------+--------------+------+--------+----------------------------------------------------------------------------------------------------------------------------+
|CNPJ          |REF_DATE           |DATA_REF|CS_ALVO|QTD_SOCIOS|NATUREZA_JURIDICA                                                       |IDADE     |CAPITAL_SOCIAL|TIPO  |SITUACAO|ATIVIDADE_PRINCIPAL                                                                                                         |
+--------------+-------------------+--------+-------+----------+------------------------------------------------------------------------+----------+--------------+------+--------+----------------------------------------------------------------------------------------------------------------------------+
|00004330000155|2016-10-01 00:00:00|2016-10 |0      |1         |EMPRESA INDIVIDUAL DE

<h1> Unidade Federativa </h1>

In [47]:
df.groupby('UF').count().orderBy('count').show(30)

+----+-----+
|  UF|count|
+----+-----+
|  RR|  112|
|  AP|  129|
|  AC|  161|
|  TO|  502|
|  RO|  510|
|  PI|  555|
|  SE|  675|
|  AL|  675|
|  MA|  859|
|  RN|  875|
|  PB| 1115|
|  AM| 1123|
|  PA| 1342|
|  MS| 1602|
|  MT| 1623|
|  CE| 3280|
|  ES| 4203|
|  DF| 4284|
|  GO| 4312|
|  PE| 5080|
|  BA| 7906|
|  SC| 8411|
|null| 9013|
|  RS|13363|
|  PR|14587|
|  MG|20187|
|  RJ|25541|
|  SP|82660|
+----+-----+



In [48]:
df_aux = df_base.select('CNPJ','REF_DATE','UF').dropDuplicates()
df_final = df_final.join(df_aux,['CNPJ','REF_DATE'], 'left' )
df_final.show(20,False)

+--------------+-------------------+--------+-------+----------+------------------------------------------------------------------------+----------+--------------+------+--------+----------------------------------------------------------------------------------------------------------------------------+---+
|CNPJ          |REF_DATE           |DATA_REF|CS_ALVO|QTD_SOCIOS|NATUREZA_JURIDICA                                                       |IDADE     |CAPITAL_SOCIAL|TIPO  |SITUACAO|ATIVIDADE_PRINCIPAL                                                                                                         |UF |
+--------------+-------------------+--------+-------+----------+------------------------------------------------------------------------+----------+--------------+------+--------+----------------------------------------------------------------------------------------------------------------------------+---+
|00004330000155|2016-10-01 00:00:00|2016-10 |0      |1         |EMPRESA I