In [1]:
'''
Machine Learning para Analise de Dados
CESAR Scholl
Projeto de analise de credito
Recife, 2019


Equipe:
    Claudio Alves Monteiro
    Marcos Antonio Almeida Souto Júnior
    Virgínia Heimann
    Kayo Renato
    Rosely
'''

'\nMachine Learning para Analise de Dados\nCESAR Scholl\nProjeto de analise de credito\nRecife, 2019\n\n\nEquipe:\n    Claudio Alves Monteiro\n    Marcos Antonio Almeida Souto Júnior\n    Virgínia Heimann\n    Kayo Renato\n    Rosely\n'

In [2]:
# no Spark SQL importa SparkSession enquanto no RDD importa o SparkContext
from pyspark.sql import SparkSession, Row
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()


In [3]:
# import modules
import os
import pandas as pd
from pyspark.sql import functions as SF
import pyspark.sql.types as ST

# paths to spark and python3
os.environ['PYSPARK_SUBMIT_ARGS'] = '--executor-memory 1G pyspark-shell'
os.environ["SPARK_HOME"] = "/home/pacha/spark"
os.environ["PYSPARK_PYTHON"]="/usr/bin/python3"

In [4]:
# import data

df = spark.read.csv('data/BASE_JUR001.csv',
                    sep='\t',
                    encoding='utf-8',
                    header=True,
                    inferSchema=False)


df_cursor = spark.read.csv('data/Base_Des-TRN',
                    sep='\t',
                    encoding='utf-8',
                    header=True,
                    inferSchema=False)


In [5]:
#=========================#
# descriptive statistics
#========================#

df.groupby('NATUREZA_JURIDICA').count().show(100)

df.groupby('ATIVIDADE_PRINCIPAL').count().show(100)

df.groupby('ATIVIDADES_SECUNDARIAS').count().show(100)

df.groupby('TIPO').count().show(100)

df.groupby('CARGO').count().show(100)

+--------------------+------+
|   NATUREZA_JURIDICA| count|
+--------------------+------+
|SOCIEDADE ANONIMA...|   381|
|SOCIEDADE EMPRESA...|    14|
|         COOPERATIVA|   289|
|EMPRESA INDIVIDUA...|     9|
|EMPRESARIO (INDIV...| 48218|
|SERVICO SOCIAL AU...|    10|
|EMPRESA INDIVIDUA...| 14925|
|SOCIEDADE EMPRESA...|150601|
|SOCIEDADE SIMPLES...|    29|
|ORGANIZACAO RELIG...|     2|
|SOCIEDADE SIMPLES...|   155|
|  ASSOCIACAO PRIVADA|    28|
|   ENTIDADE SINDICAL|     7|
|SOCIEDADE ANONIMA...|     9|
|    FUNDACAO PRIVADA|     8|
+--------------------+------+

+--------------------+-----+
| ATIVIDADE_PRINCIPAL|count|
+--------------------+-----+
|COMERCIO A VAREJO...|    2|
|CORRESPONDENTES D...|    6|
|CRIACAO DE PEIXES...|    1|
|COMERCIO SOB CONS...|    1|
|FABRICACAO DE SUC...|    2|
|COMERCIO VAREJIST...|    6|
|COMERCIO ATACADIS...|    3|
|OPERADORES TURIST...|    4|
|COMERCIO ATACADIS...|    2|
|ALUGUEL DE MAQUIN...|    2|
|INSTALACAO DE POR...|    1|
|RECUPERACAO DE MA...| 

In [6]:
#------ tratar inicio do CNPJ sem o 0

@SF.udf('string')
def fill_cnpj(value):
    aux = '00000000000000'+value
    return aux[len(value):]

In [7]:
df_cursor = df_cursor.withColumn('CNPJ',fill_cnpj('CNPJ'))
df_cursor.show(5)

+--------------+--------+-------+
|          CNPJ|DATA_REF|CS_ALVO|
+--------------+--------+-------+
|11507110000160| 2016-10|      0|
|15062189000114| 2016-07|      0|
|05841171000167| 2016-08|      0|
|08792807000116| 2016-08|      0|
|24060073000137| 2017-01|      1|
+--------------+--------+-------+
only showing top 5 rows



In [8]:
#------ Convertendo REF_DATE do df_cursor para timestamp

df_cursor = df_cursor.withColumn('REF_DATE',SF.from_unixtime(
            SF.unix_timestamp('DATA_REF','yyyy-MM')).cast('timestamp'))

In [9]:
df_cursor.show(5)

+--------------+--------+-------+-------------------+
|          CNPJ|DATA_REF|CS_ALVO|           REF_DATE|
+--------------+--------+-------+-------------------+
|11507110000160| 2016-10|      0|2016-10-01 00:00:00|
|15062189000114| 2016-07|      0|2016-07-01 00:00:00|
|05841171000167| 2016-08|      0|2016-08-01 00:00:00|
|08792807000116| 2016-08|      0|2016-08-01 00:00:00|
|24060073000137| 2017-01|      1|2017-01-01 00:00:00|
+--------------+--------+-------+-------------------+
only showing top 5 rows



In [10]:
#------ Convertendo REF_DATE do df para timestamp

df = df.withColumn('REF_DATE',SF.col('REF_DATE').cast('timestamp'))

In [11]:
df = df.withColumnRenamed('CNPJ','CNPJ_DF').withColumnRenamed('REF_DATE','REF_DATE_DF')

In [12]:
#------ Criando o df_base. Aqui foi ignorada a condição de REF_DATE apenas para testes, por que caso contrário não 
#------ retornaria nenhum valor. QUestionar os professores o que fazer !!!!!

'''
df_base = df_cursor.join(df,
                 (df_cursor['CNPJ']==df['CNPJ_DF']) & 
                 (df_cursor['REF_DATE'] >= df['REF_DATE_DF']),
                 'left')
'''

df_base = df_cursor.join(df,
                 (df_cursor['CNPJ']==df['CNPJ_DF']),
                 'left')


In [13]:
df_base.show(5)

+--------------+--------+-------+-------------------+--------------+--------------------+--------------------+--------------------+--------------------+----------------------+------------------+------+--------------------+------+-----------+--------------------+--------+----------------+---+----------+----------+--------------------+-------------------+--------------+---------------+--------+-------------------+-----------------+----------------------+----+--------------------+-----+--------------+---------------+-------------------+--------------------+------+----------+-------------+------------+--------------------+----+-----+
|          CNPJ|DATA_REF|CS_ALVO|           REF_DATE|       CNPJ_DF|        NOME_EMPRESA|            FANTASIA|   NATUREZA_JURIDICA| ATIVIDADE_PRINCIPAL|ATIVIDADES_SECUNDARIAS|    CNPJ_FORMATADO|  TIPO|          LOGRADOURO|NUMERO|COMPLEMENTO|              BAIRRO|     CEP|       MUNICIPIO| UF|TELEFONE_1|TELEFONE_2|               EMAIL|           ABERTURA|CAPITAL

<h1>1) Quantidade de EMAILS</h1>

In [14]:
df_aux = df_base.select('CNPJ','EMAIL','REF_DATE').dropDuplicates().\
            groupBy('CNPJ','REF_DATE').agg(SF.count('EMAIL').alias('QTD_EMAIL')).sort(SF.desc('QTD_EMAIL'))
df_aux.show(10)

+--------------+-------------------+---------+
|          CNPJ|           REF_DATE|QTD_EMAIL|
+--------------+-------------------+---------+
|02079245000181|2016-10-01 00:00:00|        1|
|17558182000113|2016-10-01 00:00:00|        1|
|19375312000108|2016-12-01 00:00:00|        1|
|06267716000136|2017-01-01 00:00:00|        1|
|04483246000112|2016-12-01 00:00:00|        1|
|20106766000162|2016-09-01 00:00:00|        1|
|16675505000196|2017-04-01 00:00:00|        1|
|21602873000144|2016-11-01 00:00:00|        1|
|04542657000131|2017-01-01 00:00:00|        1|
|16974905000100|2017-05-01 00:00:00|        1|
+--------------+-------------------+---------+
only showing top 10 rows



In [15]:
df_final = df_cursor.join(df_aux,['CNPJ','REF_DATE'],'left')
df_final.show()

+--------------+-------------------+--------+-------+---------+
|          CNPJ|           REF_DATE|DATA_REF|CS_ALVO|QTD_EMAIL|
+--------------+-------------------+--------+-------+---------+
|00004330000155|2016-10-01 00:00:00| 2016-10|      0|        0|
|00078090000133|2017-03-01 00:00:00| 2017-03|      0|        0|
|00109982000154|2016-09-01 00:00:00| 2016-09|      1|        1|
|00281129000115|2017-06-01 00:00:00| 2017-06|      0|        0|
|00294916000100|2017-02-01 00:00:00| 2017-02|      0|        0|
|00370197000150|2016-11-01 00:00:00| 2016-11|      1|        0|
|00711050000187|2016-11-01 00:00:00| 2016-11|      0|        1|
|00951519000155|2017-01-01 00:00:00| 2017-01|      0|        0|
|00984383000180|2016-09-01 00:00:00| 2016-09|      0|        0|
|01013673000149|2017-02-01 00:00:00| 2017-02|      0|        0|
|01024580000110|2016-09-01 00:00:00| 2016-09|      0|        0|
|01163316000167|2017-01-01 00:00:00| 2017-01|      0|        1|
|01252815000120|2016-07-01 00:00:00| 201

<h1>2) Quantidade de Telefones</h1>

In [16]:
df_aux = df_base.select(['CNPJ','REF_DATE','TELEFONE_1','TELEFONE_2']).dropDuplicates()

In [17]:
df_aux = df_aux.withColumn('QTD_TLF1',SF.when(SF.col('TELEFONE_1').isNotNull(),1).otherwise(0)).\
                withColumn('QTD_TLF2',SF.when(SF.col('TELEFONE_2').isNotNull(),1).otherwise(0)).\
                withColumn('QTD_TELEFONES',SF.col('QTD_TLF1')+SF.col('QTD_TLF2')).\
                groupBy('CNPJ','REF_DATE').agg(SF.sum('QTD_TELEFONES').alias('QTD_TELEFONES'))
df_aux.show()

+--------------+-------------------+-------------+
|          CNPJ|           REF_DATE|QTD_TELEFONES|
+--------------+-------------------+-------------+
|00231887000129|2017-05-01 00:00:00|            0|
|00256837000104|2017-01-01 00:00:00|            0|
|00501368000133|2016-09-01 00:00:00|            0|
|00560191000146|2017-01-01 00:00:00|            2|
|00642972000180|2017-06-01 00:00:00|            0|
|00663069000103|2016-07-01 00:00:00|            0|
|00691020000156|2017-05-01 00:00:00|            0|
|00701583000188|2016-12-01 00:00:00|            1|
|00931390000113|2016-07-01 00:00:00|            0|
|01276895000154|2017-03-01 00:00:00|            0|
|01351627000150|2016-10-01 00:00:00|            0|
|01685806000123|2017-04-01 00:00:00|            2|
|01716865000111|2016-11-01 00:00:00|            1|
|01840520000175|2016-12-01 00:00:00|            0|
|01928075000108|2016-08-01 00:00:00|            2|
|01940503000100|2016-10-01 00:00:00|            1|
|02231876000174|2017-04-01 00:0

In [18]:
df_final = df_final.join(df_aux, ['CNPJ','REF_DATE'], 'left')
df_final.show()

+--------------+-------------------+--------+-------+---------+-------------+
|          CNPJ|           REF_DATE|DATA_REF|CS_ALVO|QTD_EMAIL|QTD_TELEFONES|
+--------------+-------------------+--------+-------+---------+-------------+
|00004330000155|2016-10-01 00:00:00| 2016-10|      0|        0|            0|
|00078090000133|2017-03-01 00:00:00| 2017-03|      0|        0|            1|
|00109982000154|2016-09-01 00:00:00| 2016-09|      1|        1|            2|
|00281129000115|2017-06-01 00:00:00| 2017-06|      0|        0|            0|
|00294916000100|2017-02-01 00:00:00| 2017-02|      0|        0|            0|
|00370197000150|2016-11-01 00:00:00| 2016-11|      1|        0|            0|
|00711050000187|2016-11-01 00:00:00| 2016-11|      0|        1|            2|
|00951519000155|2017-01-01 00:00:00| 2017-01|      0|        0|            0|
|00984383000180|2016-09-01 00:00:00| 2016-09|      0|        0|            1|
|01013673000149|2017-02-01 00:00:00| 2017-02|      0|        0| 

<h1>3) Classificação por CEP : Região, Subregião, Setor, Subsetor, Divisão de Subsetor e Logradouro</h1>

In [19]:
df_aux = df_base.select(['CNPJ', 'REF_DATE','CEP']).dropDuplicates()

In [20]:
df_aux = df_aux.withColumn('REGIAO',SF.col('CEP').substr(1, 1)).\
         withColumn('SUBREGIAO',SF.col('CEP').substr(2, 1)).\
         withColumn('SETOR',SF.col('CEP').substr(3, 1)).\
         withColumn('SUBSETOR',SF.col('CEP').substr(4, 1)).\
         withColumn('DIV_SUBSETOR',SF.col('CEP').substr(5, 1)).\
         withColumn('LOGRADOURO',SF.col('CEP').substr(6, 3)).\
         drop('CEP')

In [21]:
df_aux.show()

+--------------+-------------------+------+---------+-----+--------+------------+----------+
|          CNPJ|           REF_DATE|REGIAO|SUBREGIAO|SETOR|SUBSETOR|DIV_SUBSETOR|LOGRADOURO|
+--------------+-------------------+------+---------+-----+--------+------------+----------+
|24168501000140|2017-03-01 00:00:00|     1|        4|    8|       0|           0|       370|
|07488555000173|2016-12-01 00:00:00|     3|        8|    4|       1|           1|       120|
|13378985000135|2016-11-01 00:00:00|     0|        3|    1|       7|           8|       000|
|03058631000150|2016-11-01 00:00:00|     8|        5|    9|       8|           0|       000|
|20447042000182|2017-01-01 00:00:00|     8|        8|    0|       4|           0|       445|
|15110694000197|2017-02-01 00:00:00|     9|        4|    9|       3|           0|       075|
|13002350000139|2017-02-01 00:00:00|     8|        0|    0|       3|           5|       000|
|27554813000180|2016-09-01 00:00:00|     0|        5|    4|       3|  

In [22]:
df_final = df_final.join(df_aux, ['CNPJ','REF_DATE'], 'left')
df_final.show()

+--------------+-------------------+--------+-------+---------+-------------+------+---------+-----+--------+------------+----------+
|          CNPJ|           REF_DATE|DATA_REF|CS_ALVO|QTD_EMAIL|QTD_TELEFONES|REGIAO|SUBREGIAO|SETOR|SUBSETOR|DIV_SUBSETOR|LOGRADOURO|
+--------------+-------------------+--------+-------+---------+-------------+------+---------+-----+--------+------------+----------+
|00004330000155|2016-10-01 00:00:00| 2016-10|      0|        0|            0|     1|        3|    0|       9|           2|       500|
|00078090000133|2017-03-01 00:00:00| 2017-03|      0|        0|            1|     9|        1|    2|       6|           0|       000|
|00109982000154|2016-09-01 00:00:00| 2016-09|      1|        1|            2|     1|        1|    4|       4|           6|       130|
|00281129000115|2017-06-01 00:00:00| 2017-06|      0|        0|            0|     0|        3|    2|       2|           1|       200|
|00294916000100|2017-02-01 00:00:00| 2017-02|      0|        0

<h1>4) Quantidade de Sócios</h1>

In [23]:
df_aux = df_base.select('CNPJ','NOME_SOCIO','REF_DATE').dropDuplicates().\
            groupBy('CNPJ','REF_DATE').agg(SF.count('NOME_SOCIO').alias('QTD_SOCIOS')).sort(SF.desc('QTD_SOCIOS'))
df_aux.show(10)

+--------------+-------------------+----------+
|          CNPJ|           REF_DATE|QTD_SOCIOS|
+--------------+-------------------+----------+
|31130537000280|2016-10-01 00:00:00|        35|
|10297699000238|2017-04-01 00:00:00|        34|
|09456178000116|2016-09-01 00:00:00|        20|
|61087367000189|2017-03-01 00:00:00|        19|
|04907334000102|2016-09-01 00:00:00|        19|
|09643807000117|2017-04-01 00:00:00|        18|
|10392044000168|2017-01-01 00:00:00|        17|
|75492694000392|2016-11-01 00:00:00|        16|
|48093892000149|2016-11-01 00:00:00|        16|
|09541072000110|2017-04-01 00:00:00|        15|
+--------------+-------------------+----------+
only showing top 10 rows



In [24]:
df_final = df_final.join(df_aux, ['CNPJ','REF_DATE'], 'left')
df_final.show()

+--------------+-------------------+--------+-------+---------+-------------+------+---------+-----+--------+------------+----------+----------+
|          CNPJ|           REF_DATE|DATA_REF|CS_ALVO|QTD_EMAIL|QTD_TELEFONES|REGIAO|SUBREGIAO|SETOR|SUBSETOR|DIV_SUBSETOR|LOGRADOURO|QTD_SOCIOS|
+--------------+-------------------+--------+-------+---------+-------------+------+---------+-----+--------+------------+----------+----------+
|00004330000155|2016-10-01 00:00:00| 2016-10|      0|        0|            0|     1|        3|    0|       9|           2|       500|         1|
|00078090000133|2017-03-01 00:00:00| 2017-03|      0|        0|            1|     9|        1|    2|       6|           0|       000|         2|
|00109982000154|2016-09-01 00:00:00| 2016-09|      1|        1|            2|     1|        1|    4|       4|           6|       130|         2|
|00281129000115|2017-06-01 00:00:00| 2017-06|      0|        0|            0|     0|        3|    2|       2|           1|       2

<h1> 5) Quantidade de Filiais do Grupo</h1>

In [25]:
df_aux = df_base.select('CNPJ','REF_DATE').dropDuplicates()
df_aux = df_aux.withColumn('GRUPO_EMP',SF.col('CNPJ').substr(1, 8)).\
         withColumn('FILIAL',SF.col('CNPJ').substr(9, 4)).\
         groupBy('CNPJ','REF_DATE','GRUPO_EMP').agg(SF.max('FILIAL').alias('QTD_FILIAIS')).drop('GRUPO_EMP').sort(SF.desc('QTD_FILIAIS'))



df_aux.show(10)

+--------------+-------------------+-----------+
|          CNPJ|           REF_DATE|QTD_FILIAIS|
+--------------+-------------------+-----------+
|03777341044880|2016-07-01 00:00:00|       0448|
|09967852014934|2016-10-01 00:00:00|       0149|
|09967852013105|2017-02-01 00:00:00|       0131|
|30147995009135|2017-03-01 00:00:00|       0091|
|47866934008825|2017-04-01 00:00:00|       0088|
|93209765004880|2016-12-01 00:00:00|       0048|
|02223966004615|2017-01-01 00:00:00|       0046|
|09160226003905|2016-09-01 00:00:00|       0039|
|03667884003065|2016-08-01 00:00:00|       0030|
|62413877002962|2017-01-01 00:00:00|       0029|
+--------------+-------------------+-----------+
only showing top 10 rows



In [26]:
df_final = df_final.join(df_aux, ['CNPJ','REF_DATE'], 'left')
df_final.show()

+--------------+-------------------+--------+-------+---------+-------------+------+---------+-----+--------+------------+----------+----------+-----------+
|          CNPJ|           REF_DATE|DATA_REF|CS_ALVO|QTD_EMAIL|QTD_TELEFONES|REGIAO|SUBREGIAO|SETOR|SUBSETOR|DIV_SUBSETOR|LOGRADOURO|QTD_SOCIOS|QTD_FILIAIS|
+--------------+-------------------+--------+-------+---------+-------------+------+---------+-----+--------+------------+----------+----------+-----------+
|00004330000155|2016-10-01 00:00:00| 2016-10|      0|        0|            0|     1|        3|    0|       9|           2|       500|         1|       0001|
|00078090000133|2017-03-01 00:00:00| 2017-03|      0|        0|            1|     9|        1|    2|       6|           0|       000|         2|       0001|
|00109982000154|2016-09-01 00:00:00| 2016-09|      1|        1|            2|     1|        1|    4|       4|           6|       130|         2|       0001|
|00281129000115|2017-06-01 00:00:00| 2017-06|      0|     