In [1]:
'''
Machine Learning para Analise de Dados
CESAR Scholl
Projeto de analise de credito
Recife, 2019


Equipe:
    Claudio Alves Monteiro
    Marcos Antonio Almeida Souto Júnior
    Virgínia Heimann
    Kayo Renato da Silva Nascimento
    Rosely Cabral
'''



'\nMachine Learning para Analise de Dados\nCESAR Scholl\nProjeto de analise de credito\nRecife, 2019\n\n\nEquipe:\n    Claudio Alves Monteiro\n    Marcos Antonio Almeida Souto Júnior\n    Virgínia Heimann\n    Kayo Renato da Silva Nascimento\n    Rosely Cabral\n'

In [1]:
# Spark Session
from pyspark.sql import SparkSession, Row
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()


In [3]:
# import modules
import os
import pandas as pd
from pyspark.sql import functions as SF
import pyspark.sql.types as ST

# paths to spark and python3
os.environ['PYSPARK_SUBMIT_ARGS'] = '--executor-memory 1G pyspark-shell'
os.environ["SPARK_HOME"] = "/home/pacha/spark"
os.environ["PYSPARK_PYTHON"]="/usr/bin/python3"

In [4]:
# import data

df = spark.read.csv('data/jur.csv',
                    sep='\t',
                    encoding='utf-8',
                    header=True,
                    inferSchema=False)


df_cursor = spark.read.csv('data/Base_Des-TRN',
                    sep='\t',
                    encoding='utf-8',
                    header=True,
                    inferSchema=False)


In [5]:
df.printSchema()

root
 |-- CNPJ: string (nullable = true)
 |-- NOME_EMPRESA: string (nullable = true)
 |-- FANTASIA: string (nullable = true)
 |-- NATUREZA_JURIDICA: string (nullable = true)
 |-- ATIVIDADE_PRINCIPAL: string (nullable = true)
 |-- ATIVIDADES_SECUNDARIAS: string (nullable = true)
 |-- CNPJ_FORMATADO: string (nullable = true)
 |-- TIPO: string (nullable = true)
 |-- LOGRADOURO: string (nullable = true)
 |-- NUMERO: string (nullable = true)
 |-- COMPLEMENTO: string (nullable = true)
 |-- BAIRRO: string (nullable = true)
 |-- CEP: string (nullable = true)
 |-- MUNICIPIO: string (nullable = true)
 |-- UF: string (nullable = true)
 |-- TELEFONE_1: string (nullable = true)
 |-- TELEFONE_2: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- ABERTURA: string (nullable = true)
 |-- CAPITAL_SOCIAL: string (nullable = true)
 |-- MOTIVO_SITUACAO: string (nullable = true)
 |-- SITUACAO: string (nullable = true)
 |-- DATA_SITUACAO: string (nullable = true)
 |-- SITUACAO_ESPECIAL: strin

In [6]:
df.show(5)

+--------------+--------------------+--------------------+--------------------+--------------------+----------------------+------------------+------+----------------+------+-----------+----------------+--------+--------------+---+--------------+--------------+--------------------+--------------------+--------------+---------------+--------+--------------------+-----------------+----------------------+----+--------------------+-----+--------------+---------------+----------+--------------------+------+----------+-------------+------------+--------------------+----+-----+
|          CNPJ|        NOME_EMPRESA|            FANTASIA|   NATUREZA_JURIDICA| ATIVIDADE_PRINCIPAL|ATIVIDADES_SECUNDARIAS|    CNPJ_FORMATADO|  TIPO|      LOGRADOURO|NUMERO|COMPLEMENTO|          BAIRRO|     CEP|     MUNICIPIO| UF|    TELEFONE_1|    TELEFONE_2|               EMAIL|            ABERTURA|CAPITAL_SOCIAL|MOTIVO_SITUACAO|SITUACAO|       DATA_SITUACAO|SITUACAO_ESPECIAL|DATA_SITUACAO_ESPECIAL| EFR|          NOME

In [8]:
#------ tratar inicio do CNPJ sem o 0

@SF.udf('string')
def fill_cnpj(value):
    aux = '00000000000000'+value
    return aux[len(value):]

#------ tratar inicio do CEP sem o 0

@SF.udf('string')
def fill_cep(value):
    aux = '00000000'+value
    return aux[len(value):]



In [9]:
df_cursor = df_cursor.withColumn('CNPJ',fill_cnpj('CNPJ'))
#df_cursor.show(5)

In [10]:
df = df.withColumn('CEP',fill_cep('CEP'))
#df.show(20)

In [11]:
df = df.withColumn('CNPJ',fill_cnpj('CNPJ'))
#df.show(5)

In [12]:
#------ Convertendo REF_DATE do df_cursor para timestamp

df_cursor = df_cursor.withColumn('REF_DATE',SF.from_unixtime(
            SF.unix_timestamp('DATA_REF','yyyy-MM')).cast('timestamp'))

#df_cursor.show(5)

In [13]:
#------ Convertendo REF_DATE do df para timestamp

df = df.withColumn('REF_DATE',SF.col('REF_DATE').cast('timestamp'))

In [14]:
df = df.withColumnRenamed('CNPJ','CNPJ_DF').withColumnRenamed('REF_DATE','REF_DATE_DF')

In [15]:
#------ Criando o df_base. O df_base é construído a partir do cursor. Todas as entradas do cursor são preservadas 
# e é feito um join com as entradas do df que correspondem à condição dupla: mesmo "CNPJ" e "Data de Referência" do df
# anterior à "Dara de Referência" do cursor  

df_base = df_cursor.join(df,
                 (df_cursor['CNPJ']==df['CNPJ_DF']) & 
                 (df_cursor['REF_DATE'] >= df['REF_DATE_DF']),
                 'left')

<h1>1) Quantidade de EMAILS</h1>

In [16]:
# Considera todas as tuplas (CNPJ, REF_DATE, EMAIL) do df_base. Elimina as duplicadas. Assim, encontra a quantidade de e-mails
# diferentes.

df_aux = df_base.select('CNPJ','REF_DATE','EMAIL').dropDuplicates().\
            groupBy('CNPJ','REF_DATE').agg(SF.count('EMAIL').alias('QTD_EMAIL')).sort(SF.desc('QTD_EMAIL'))
df_aux.show(10)

+--------------+-------------------+---------+
|          CNPJ|           REF_DATE|QTD_EMAIL|
+--------------+-------------------+---------+
|16675505000196|2017-04-01 00:00:00|        1|
|09399733000115|2017-01-01 00:00:00|        1|
|24346190000161|2017-03-01 00:00:00|        1|
|23677964000174|2016-12-01 00:00:00|        1|
|12537413000199|2016-09-01 00:00:00|        1|
|12237397000119|2016-08-01 00:00:00|        1|
|26191111000116|2017-05-01 00:00:00|        1|
|26193135000104|2016-10-01 00:00:00|        1|
|20106766000162|2016-09-01 00:00:00|        1|
|01516980000142|2016-12-01 00:00:00|        1|
+--------------+-------------------+---------+
only showing top 10 rows



In [17]:
# Cria um df_final (que será a nossa base de dados para rodar o algoritmo de ML). Nesse momento será o cursor + Qtd de e-mails

df_final = df_cursor.join(df_aux,['CNPJ','REF_DATE'],'left')
df_final.show()

+--------------+-------------------+--------+-------+---------+
|          CNPJ|           REF_DATE|DATA_REF|CS_ALVO|QTD_EMAIL|
+--------------+-------------------+--------+-------+---------+
|00004330000155|2016-10-01 00:00:00| 2016-10|      0|        0|
|00078090000133|2017-03-01 00:00:00| 2017-03|      0|        0|
|00109982000154|2016-09-01 00:00:00| 2016-09|      1|        1|
|00281129000115|2017-06-01 00:00:00| 2017-06|      0|        0|
|00294916000100|2017-02-01 00:00:00| 2017-02|      0|        0|
|00370197000150|2016-11-01 00:00:00| 2016-11|      1|        0|
|00711050000187|2016-11-01 00:00:00| 2016-11|      0|        1|
|00951519000155|2017-01-01 00:00:00| 2017-01|      0|        0|
|00984383000180|2016-09-01 00:00:00| 2016-09|      0|        0|
|01013673000149|2017-02-01 00:00:00| 2017-02|      0|        0|
|01024580000110|2016-09-01 00:00:00| 2016-09|      0|        0|
|01163316000167|2017-01-01 00:00:00| 2017-01|      0|        1|
|01252815000120|2016-07-01 00:00:00| 201

<h1>2) Quantidade de Telefones</h1>

In [18]:
df_aux = df_base.select(['CNPJ','REF_DATE','TELEFONE_1','TELEFONE_2']).dropDuplicates()

In [19]:
df_aux = df_aux.withColumn('QTD_TLF1',SF.when(SF.col('TELEFONE_1').isNotNull(),1).otherwise(0)).\
                withColumn('QTD_TLF2',SF.when(SF.col('TELEFONE_2').isNotNull(),1).otherwise(0)).\
                withColumn('QTD_TELEFONES',SF.col('QTD_TLF1')+SF.col('QTD_TLF2')).\
                groupBy('CNPJ','REF_DATE').agg(SF.sum('QTD_TELEFONES').alias('QTD_TELEFONES'))
df_aux.show()

+--------------+-------------------+-------------+
|          CNPJ|           REF_DATE|QTD_TELEFONES|
+--------------+-------------------+-------------+
|49020969000113|2016-09-01 00:00:00|            0|
|02709787000190|2017-03-01 00:00:00|            0|
|03349324000128|2017-04-01 00:00:00|            0|
|04261023000100|2016-08-01 00:00:00|            0|
|48087589000133|2016-08-01 00:00:00|            0|
|55815658000143|2017-04-01 00:00:00|            0|
|62136569000136|2017-05-01 00:00:00|            0|
|50657246000109|2016-11-01 00:00:00|            0|
|66603796000185|2017-02-01 00:00:00|            0|
|07177514000166|2017-02-01 00:00:00|            0|
|01689785000114|2017-02-01 00:00:00|            0|
|07497152000190|2017-01-01 00:00:00|            0|
|68811017000162|2017-05-01 00:00:00|            0|
|08981904000157|2016-09-01 00:00:00|            0|
|39810270000101|2017-04-01 00:00:00|            0|
|03406077000154|2016-07-01 00:00:00|            0|
|05198620000109|2017-03-01 00:0

In [20]:
df_final = df_final.join(df_aux, ['CNPJ','REF_DATE'], 'left')
df_final.show()

+--------------+-------------------+--------+-------+---------+-------------+
|          CNPJ|           REF_DATE|DATA_REF|CS_ALVO|QTD_EMAIL|QTD_TELEFONES|
+--------------+-------------------+--------+-------+---------+-------------+
|00004330000155|2016-10-01 00:00:00| 2016-10|      0|        0|            0|
|00078090000133|2017-03-01 00:00:00| 2017-03|      0|        0|            1|
|00109982000154|2016-09-01 00:00:00| 2016-09|      1|        1|            2|
|00281129000115|2017-06-01 00:00:00| 2017-06|      0|        0|            0|
|00294916000100|2017-02-01 00:00:00| 2017-02|      0|        0|            0|
|00370197000150|2016-11-01 00:00:00| 2016-11|      1|        0|            0|
|00711050000187|2016-11-01 00:00:00| 2016-11|      0|        1|            2|
|00951519000155|2017-01-01 00:00:00| 2017-01|      0|        0|            0|
|00984383000180|2016-09-01 00:00:00| 2016-09|      0|        0|            1|
|01013673000149|2017-02-01 00:00:00| 2017-02|      0|        0| 

<h1>3) Classificação por CEP : Região, Subregião, Setor, Subsetor, Divisão de Subsetor e Logradouro</h1>

In [21]:
df_aux = df_base.select(['CNPJ', 'REF_DATE','CEP']).dropDuplicates()

In [22]:
df_aux = df_aux.withColumn('REGIAO',SF.col('CEP').substr(1, 1)).\
         withColumn('SUBREGIAO',SF.col('CEP').substr(1, 2)).\
         withColumn('SETOR',SF.col('CEP').substr(1, 3)).\
         withColumn('SUBSETOR',SF.col('CEP').substr(1, 4)).\
         withColumn('DIV_SUBSETOR',SF.col('CEP').substr(1, 5)).\
         withColumn('LOGRADOURO',SF.col('CEP').substr(1, 9)).\
         drop('CEP')

In [23]:
df_aux.show()

+--------------+-------------------+------+---------+-----+--------+------------+----------+
|          CNPJ|           REF_DATE|REGIAO|SUBREGIAO|SETOR|SUBSETOR|DIV_SUBSETOR|LOGRADOURO|
+--------------+-------------------+------+---------+-----+--------+------------+----------+
|24168501000140|2017-03-01 00:00:00|     1|       14|  148|    1480|       14800|  14800370|
|07488555000173|2016-12-01 00:00:00|     3|       38|  384|    3841|       38411|  38411120|
|13378985000135|2016-11-01 00:00:00|     0|       03|  031|    0317|       03178|  03178000|
|03058631000150|2016-11-01 00:00:00|     8|       85|  859|    8598|       85980|  85980000|
|20447042000182|2017-01-01 00:00:00|     8|       88|  880|    8804|       88040|  88040445|
|15110694000197|2017-02-01 00:00:00|     9|       94|  949|    9493|       94930|  94930075|
|13002350000139|2017-02-01 00:00:00|     8|       80|  800|    8003|       80035|  80035000|
|27554813000180|2016-09-01 00:00:00|     0|       05|  054|    0543|  

In [24]:
df_final = df_final.join(df_aux, ['CNPJ','REF_DATE'], 'left')
df_final.show()

+--------------+-------------------+--------+-------+---------+-------------+------+---------+-----+--------+------------+----------+
|          CNPJ|           REF_DATE|DATA_REF|CS_ALVO|QTD_EMAIL|QTD_TELEFONES|REGIAO|SUBREGIAO|SETOR|SUBSETOR|DIV_SUBSETOR|LOGRADOURO|
+--------------+-------------------+--------+-------+---------+-------------+------+---------+-----+--------+------------+----------+
|00004330000155|2016-10-01 00:00:00| 2016-10|      0|        0|            0|     1|       13|  130|    1309|       13092|  13092500|
|00078090000133|2017-03-01 00:00:00| 2017-03|      0|        0|            1|     9|       91|  912|    9126|       91260|  91260000|
|00109982000154|2016-09-01 00:00:00| 2016-09|      1|        1|            2|     1|       11|  114|    1144|       11446|  11446130|
|00281129000115|2017-06-01 00:00:00| 2017-06|      0|        0|            0|     0|       03|  032|    0322|       03221|  03221200|
|00294916000100|2017-02-01 00:00:00| 2017-02|      0|        0

In [25]:
df_final.groupBy('REGIAO').count().toPandas()

Unnamed: 0,REGIAO,count
0,7.0,3928
1,3.0,6109
2,8.0,6995
3,0.0,20257
4,,3
5,5.0,2574
6,6.0,2555
7,9.0,4286
8,1.0,7980
9,4.0,2851


<h1>4) Quantidade de Sócios</h1>

In [26]:
df_aux = df_base.select('CNPJ','NOME_SOCIO','REF_DATE').dropDuplicates().\
            groupBy('CNPJ','REF_DATE').agg(SF.count('NOME_SOCIO').alias('QTD_SOCIOS')).sort(SF.desc('QTD_SOCIOS'))
df_aux.show(10)

+--------------+-------------------+----------+
|          CNPJ|           REF_DATE|QTD_SOCIOS|
+--------------+-------------------+----------+
|31130537000280|2016-10-01 00:00:00|        35|
|10297699000238|2017-04-01 00:00:00|        34|
|09456178000116|2016-09-01 00:00:00|        20|
|61087367000189|2017-03-01 00:00:00|        19|
|04907334000102|2016-09-01 00:00:00|        19|
|09643807000117|2017-04-01 00:00:00|        18|
|10392044000168|2017-01-01 00:00:00|        17|
|75492694000392|2016-11-01 00:00:00|        16|
|48093892000149|2016-11-01 00:00:00|        16|
|09541072000110|2017-04-01 00:00:00|        15|
+--------------+-------------------+----------+
only showing top 10 rows



In [27]:
df_final = df_final.join(df_aux, ['CNPJ','REF_DATE'], 'left')
df_final.show()

+--------------+-------------------+--------+-------+---------+-------------+------+---------+-----+--------+------------+----------+----------+
|          CNPJ|           REF_DATE|DATA_REF|CS_ALVO|QTD_EMAIL|QTD_TELEFONES|REGIAO|SUBREGIAO|SETOR|SUBSETOR|DIV_SUBSETOR|LOGRADOURO|QTD_SOCIOS|
+--------------+-------------------+--------+-------+---------+-------------+------+---------+-----+--------+------------+----------+----------+
|00004330000155|2016-10-01 00:00:00| 2016-10|      0|        0|            0|     1|       13|  130|    1309|       13092|  13092500|         1|
|00078090000133|2017-03-01 00:00:00| 2017-03|      0|        0|            1|     9|       91|  912|    9126|       91260|  91260000|         2|
|00109982000154|2016-09-01 00:00:00| 2016-09|      1|        1|            2|     1|       11|  114|    1144|       11446|  11446130|         2|
|00281129000115|2017-06-01 00:00:00| 2017-06|      0|        0|            0|     0|       03|  032|    0322|       03221|  032212

<h1> 5) Quantidade de Filiais do Grupo</h1>

In [28]:
df_aux = df_base.select('CNPJ','REF_DATE').dropDuplicates()
df_aux = df_aux.withColumn('GRUPO_EMP',SF.col('CNPJ').substr(1, 8)).\
         withColumn('FILIAL',SF.col('CNPJ').substr(9, 4)).\
         groupBy('CNPJ','REF_DATE','GRUPO_EMP').agg(SF.max('FILIAL').alias('QTD_FILIAIS')).drop('GRUPO_EMP').sort(SF.desc('QTD_FILIAIS'))



df_aux.show(10)

+--------------+-------------------+-----------+
|          CNPJ|           REF_DATE|QTD_FILIAIS|
+--------------+-------------------+-----------+
|03777341044880|2016-07-01 00:00:00|       0448|
|09967852014934|2016-10-01 00:00:00|       0149|
|09967852013105|2017-02-01 00:00:00|       0131|
|30147995009135|2017-03-01 00:00:00|       0091|
|47866934008825|2017-04-01 00:00:00|       0088|
|93209765004880|2016-12-01 00:00:00|       0048|
|02223966004615|2017-01-01 00:00:00|       0046|
|09160226003905|2016-09-01 00:00:00|       0039|
|03667884003065|2016-08-01 00:00:00|       0030|
|62413877002962|2017-01-01 00:00:00|       0029|
+--------------+-------------------+-----------+
only showing top 10 rows



In [29]:
df_final = df_final.join(df_aux, ['CNPJ','REF_DATE'], 'left')
df_final.limit(10).toPandas()

Unnamed: 0,CNPJ,REF_DATE,DATA_REF,CS_ALVO,QTD_EMAIL,QTD_TELEFONES,REGIAO,SUBREGIAO,SETOR,SUBSETOR,DIV_SUBSETOR,LOGRADOURO,QTD_SOCIOS,QTD_FILIAIS
0,4330000155,2016-10-01,2016-10,0,0,0,1,13,130,1309,13092,13092500,1,1
1,78090000133,2017-03-01,2017-03,0,0,1,9,91,912,9126,91260,91260000,2,1
2,109982000154,2016-09-01,2016-09,1,1,2,1,11,114,1144,11446,11446130,2,1
3,281129000115,2017-06-01,2017-06,0,0,0,0,3,32,322,3221,3221200,3,1
4,294916000100,2017-02-01,2017-02,0,0,0,0,3,33,336,3361,3361000,0,1
5,370197000150,2016-11-01,2016-11,1,0,0,7,70,703,7035,70351,70351535,3,1
6,711050000187,2016-11-01,2016-11,0,1,2,3,31,317,3174,31742,31742173,0,1
7,951519000155,2017-01-01,2017-01,0,0,0,1,13,135,1357,13575,13575170,0,1
8,984383000180,2016-09-01,2016-09,0,0,1,8,85,856,8566,85660,85660000,1,1
9,1013673000149,2017-02-01,2017-02,0,0,0,2,22,224,2244,22440,22440032,2,1


In [30]:
df_final.orderBy("QTD_FILIAIS", ascending=False).show(10)

+--------------+-------------------+--------+-------+---------+-------------+------+---------+-----+--------+------------+----------+----------+-----------+
|          CNPJ|           REF_DATE|DATA_REF|CS_ALVO|QTD_EMAIL|QTD_TELEFONES|REGIAO|SUBREGIAO|SETOR|SUBSETOR|DIV_SUBSETOR|LOGRADOURO|QTD_SOCIOS|QTD_FILIAIS|
+--------------+-------------------+--------+-------+---------+-------------+------+---------+-----+--------+------------+----------+----------+-----------+
|03777341044880|2016-07-01 00:00:00| 2016-07|      0|        1|            1|     8|       89|  890|    8901|       89015|  89015201|         0|       0448|
|09967852014934|2016-10-01 00:00:00| 2016-10|      0|        1|            2|     0|       00|  000|    0000|       00000|  00000000|        14|       0149|
|09967852013105|2017-02-01 00:00:00| 2017-02|      0|        0|            0|     0|       06|  064|    0646|       06460|  06460040|        14|       0131|
|30147995009135|2017-03-01 00:00:00| 2017-03|      0|     

<h1> 6) Natureza Jurídica </h1>

In [31]:
df_aux = df_base.select('CNPJ','REF_DATE','NATUREZA_JURIDICA')
df_aux.limit(20).toPandas()

Unnamed: 0,CNPJ,REF_DATE,NATUREZA_JURIDICA
0,11507110000160,2016-10-01,EMPRESARIO (INDIVIDUAL)
1,15062189000114,2016-07-01,SOCIEDADE EMPRESARIA LIMITADA
2,15062189000114,2016-07-01,SOCIEDADE EMPRESARIA LIMITADA
3,5841171000167,2016-08-01,SOCIEDADE EMPRESARIA LIMITADA
4,5841171000167,2016-08-01,SOCIEDADE EMPRESARIA LIMITADA
5,5841171000167,2016-08-01,SOCIEDADE EMPRESARIA LIMITADA
6,5841171000167,2016-08-01,SOCIEDADE EMPRESARIA LIMITADA
7,5841171000167,2016-08-01,SOCIEDADE EMPRESARIA LIMITADA
8,5841171000167,2016-08-01,SOCIEDADE EMPRESARIA LIMITADA
9,5841171000167,2016-08-01,SOCIEDADE EMPRESARIA LIMITADA


In [32]:
df_final = df_final.join(df_aux,['CNPJ','REF_DATE'], 'left' )
df_final.limit(10).toPandas()

Unnamed: 0,CNPJ,REF_DATE,DATA_REF,CS_ALVO,QTD_EMAIL,QTD_TELEFONES,REGIAO,SUBREGIAO,SETOR,SUBSETOR,DIV_SUBSETOR,LOGRADOURO,QTD_SOCIOS,QTD_FILIAIS,NATUREZA_JURIDICA
0,4330000155,2016-10-01,2016-10,0,0,0,1,13,130,1309,13092,13092500,1,1,EMPRESA INDIVIDUAL DE RESPONSABILIDADE LIMITAD...
1,78090000133,2017-03-01,2017-03,0,0,1,9,91,912,9126,91260,91260000,2,1,SOCIEDADE EMPRESARIA LIMITADA
2,78090000133,2017-03-01,2017-03,0,0,1,9,91,912,9126,91260,91260000,2,1,SOCIEDADE EMPRESARIA LIMITADA
3,109982000154,2016-09-01,2016-09,1,1,2,1,11,114,1144,11446,11446130,2,1,SOCIEDADE EMPRESARIA LIMITADA
4,109982000154,2016-09-01,2016-09,1,1,2,1,11,114,1144,11446,11446130,2,1,SOCIEDADE EMPRESARIA LIMITADA
5,281129000115,2017-06-01,2017-06,0,0,0,0,3,32,322,3221,3221200,3,1,SOCIEDADE EMPRESARIA LIMITADA
6,281129000115,2017-06-01,2017-06,0,0,0,0,3,32,322,3221,3221200,3,1,SOCIEDADE EMPRESARIA LIMITADA
7,281129000115,2017-06-01,2017-06,0,0,0,0,3,32,322,3221,3221200,3,1,SOCIEDADE EMPRESARIA LIMITADA
8,294916000100,2017-02-01,2017-02,0,0,0,0,3,33,336,3361,3361000,0,1,EMPRESARIO (INDIVIDUAL)
9,370197000150,2016-11-01,2016-11,1,0,0,7,70,703,7035,70351,70351535,3,1,SOCIEDADE EMPRESARIA LIMITADA


In [33]:
df_aux.show(10)

+--------------+-------------------+--------------------+
|          CNPJ|           REF_DATE|   NATUREZA_JURIDICA|
+--------------+-------------------+--------------------+
|11507110000160|2016-10-01 00:00:00|EMPRESARIO (INDIV...|
|15062189000114|2016-07-01 00:00:00|SOCIEDADE EMPRESA...|
|15062189000114|2016-07-01 00:00:00|SOCIEDADE EMPRESA...|
|05841171000167|2016-08-01 00:00:00|SOCIEDADE EMPRESA...|
|05841171000167|2016-08-01 00:00:00|SOCIEDADE EMPRESA...|
|05841171000167|2016-08-01 00:00:00|SOCIEDADE EMPRESA...|
|05841171000167|2016-08-01 00:00:00|SOCIEDADE EMPRESA...|
|05841171000167|2016-08-01 00:00:00|SOCIEDADE EMPRESA...|
|05841171000167|2016-08-01 00:00:00|SOCIEDADE EMPRESA...|
|05841171000167|2016-08-01 00:00:00|SOCIEDADE EMPRESA...|
+--------------+-------------------+--------------------+
only showing top 10 rows



In [34]:
'''from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator

indexer = StringIndexer(inputCol="NATUREZA_JURIDICA", outputCol="NATUREZA_JURIDICA_Index")
indexed = indexer.fit(df_final).transform(df_final)
indexed.select(indexed.NATUREZA_JURIDICA, indexed.NATUREZA_JURIDICA_Index).show()

encoder = OneHotEncoderEstimator(inputCols=["NATUREZA_JURIDICA_Index"],
                                 outputCols=["NATUREZA_JURIDICA_Vec"])

model = encoder.fit(indexed)
encoded = model.transform(indexed)
encoded.select(encoded.NATUREZA_JURIDICA_Vec).show()
'''

'from pyspark.ml.feature import StringIndexer\nfrom pyspark.ml.feature import OneHotEncoderEstimator\n\nindexer = StringIndexer(inputCol="NATUREZA_JURIDICA", outputCol="NATUREZA_JURIDICA_Index")\nindexed = indexer.fit(df_final).transform(df_final)\nindexed.select(indexed.NATUREZA_JURIDICA, indexed.NATUREZA_JURIDICA_Index).show()\n\nencoder = OneHotEncoderEstimator(inputCols=["NATUREZA_JURIDICA_Index"],\n                                 outputCols=["NATUREZA_JURIDICA_Vec"])\n\nmodel = encoder.fit(indexed)\nencoded = model.transform(indexed)\nencoded.select(encoded.NATUREZA_JURIDICA_Vec).show()\n'

In [35]:
#df_final = df_final.drop('NATUREZA_JURIDICA', 'NATUREZA_JURIDICA_Index')
#df_final.limit(10).toPandas()

In [36]:
#indexed.limit(20).toPandas()