<a href="https://colab.research.google.com/github/claudioalvesmonteiro/machinelearning-CNPJ/blob/master/modelos/MODELO_GradientBoostedTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

   # Projeto Machine Learning para Análise de Crédito
   
   ## Modelo Gradient Boosted Tree
   
    Equipe:
    Claudio Alves Monteiro
    Marcos Antonio Almeida Souto Júnior
    Virgínia Heimann
    Kayo Renato da Silva Nascimento
    Rosely Cabral
   

   


In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://mirror.nbtelecom.com.br/apache/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz
!tar xf spark-2.4.3-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os

# Export the locations of the JVM and of the Spark distribution unpacked under /content.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.3-bin-hadoop2.7",
})

In [0]:
import findspark
# init() has to run before the pyspark import below: it makes the Spark
# installation referenced by SPARK_HOME importable from this kernel.
findspark.init()
from pyspark.sql import SparkSession
# Local-mode session ("local[*]"); getOrCreate() reuses an already running session.
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
# import modules
import os
import pandas as pd
from pyspark.sql import functions as SF
import pyspark.sql.types as ST

## Importar dados

In [0]:
# Mount Google Drive under /content/drive; the dataset is read from a Drive folder below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Load the modelling dataset from the mounted Drive folder
path = 'drive/My Drive/Colab Notebooks/ProjetoCESARSchool/codes/data/'

# The header row supplies the column names; column types are inferred by Spark
df = (
    spark.read
    .option('sep', ',')
    .option('encoding', 'utf-8')
    .option('header', True)
    .option('inferSchema', True)
    .csv(path + 'model_data5.csv')
)

df.show(5)

+---+--------------+--------+-------+----------+--------------------+--------------------+----------+------------------------+-------------------------+-------------------------------+--------------+----------------+--------+------------------+-------------------+-----------------------+------------------+--------------------+-------------------------+----------------------+----------------------------+--------------------+-------------------+-------------------+-------------------+-------------------+
|_c0|          CNPJ|DATA_REF|CS_ALVO|QTD_SOCIOS|      CAPITAL_SOCIAL|               IDADE|tipo_index|natureza_socied_limitada|natureza_empresario_indiv|natureza_empresa_indiv_limitada|situacao_ativa|situacao_baixada|UF_index|atividade_alimento|          populacao|municipio_metropolitano|      dist_capital|      area_municipio|taxa_atividade_18anosmais|porcent_pop_saneamento|porcent_mulheres10a17_filhos|mortalidade_infantil|               IDHM|         IDHM_renda| taxa_analfabetismo|     

In [0]:
# The target column CS_ALVO is referred to as 'label' by the modelling cells
df = df.withColumnRenamed(existing='CS_ALVO', new='label')

## Construção do Modelo

In [0]:
# ENCODE 
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator

# Remove a previously created 'UF_encoded' column, if any
# (presumably so this cell can be re-run on an already encoded frame)
df = df.drop('UF_encoded')

# One-hot encode the indexed UF column into the vector column 'UF_encoded'
encoder = OneHotEncoderEstimator(
    inputCols=['UF_index'],
    outputCols=['UF_encoded'],
)

model = encoder.fit(df)
df = model.transform(df)

In [0]:
# Cast these two columns to double, replacing them in place
for numeric_col in ('populacao', 'taxa_analfabetismo'):
    df = df.withColumn(numeric_col, SF.col(numeric_col).cast('double'))

df.show(5)

+---+--------------+--------+-----+----------+--------------------+--------------------+----------+------------------------+-------------------------+-------------------------------+--------------+----------------+--------+------------------+-------------------+-----------------------+------------------+--------------------+-------------------------+----------------------+----------------------------+--------------------+-------------------+-------------------+-------------------+-------------------+--------------+
|_c0|          CNPJ|DATA_REF|label|QTD_SOCIOS|      CAPITAL_SOCIAL|               IDADE|tipo_index|natureza_socied_limitada|natureza_empresario_indiv|natureza_empresa_indiv_limitada|situacao_ativa|situacao_baixada|UF_index|atividade_alimento|          populacao|municipio_metropolitano|      dist_capital|      area_municipio|taxa_atividade_18anosmais|porcent_pop_saneamento|porcent_mulheres10a17_filhos|mortalidade_infantil|               IDHM|         IDHM_renda| taxa_analfabe

## Gradient-boosted tree classifier

In [0]:
# import 
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
#======== VECTOR ASSEMBLER

from pyspark.ml.feature import VectorAssembler




# Predictor columns combined into the feature vector
inputcol = [
    'QTD_SOCIOS',
    'CAPITAL_SOCIAL',
    'IDADE',
    'tipo_index',
    'natureza_socied_limitada',
    'natureza_empresario_indiv',
    'situacao_ativa',
    'situacao_baixada',
    'UF_encoded',
    'populacao',
    'municipio_metropolitano',
    'dist_capital',
    'area_municipio',
    'taxa_atividade_18anosmais',
    'porcent_pop_saneamento',
    'porcent_mulheres10a17_filhos',
    'mortalidade_infantil',
    'IDHM',
    'IDHM_renda',
    'taxa_analfabetismo',
    'porcent_pobres',
]

# handleInvalid="keep": rows with invalid entries are kept rather than dropped or rejected
assembler = VectorAssembler(inputCols=inputcol, outputCol="features", handleInvalid="keep")

# Keep only the assembled feature vector and the target column
dataModel = assembler.transform(df).select('features', 'label')

dataModel.show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                                                                                                                                                                                                                                  |label|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed makes the split repeatable, so the metrics computed from it
# do not change on every run.
(trainingData, testData) = dataModel.randomSplit([0.7, 0.3], seed=42)

In [0]:
def countGenTab(df, column):
    """Return a frequency table for the values of a categorical column.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Data to summarise.
    column : str
        Name of the column whose distinct values are counted.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value, with the columns ``column``, ``count``
        and ``prop`` (percentage of all rows, rounded to 2 decimals), sorted
        by ``count`` in descending order.
    """
    # groupBy().count() names its output column 'count', so no alias is needed
    counts = df.groupBy(column).count().orderBy('count', ascending=False)
    # bring the aggregated table (one row per category) to pandas
    tab = counts.toPandas()
    # share of each category, as a percentage of the total row count
    tab['prop'] = (tab['count'] / tab['count'].sum() * 100).round(2)
    return tab

  
countGenTab(trainingData, 'label')

Unnamed: 0,label,count,prop
0,0,40190,88.69
1,1,5127,11.31


In [0]:
## BALANCING: oversample label == 1 rows until both classes have the same size

n_negative = trainingData.where(SF.col('label') == 0).count()
n_positive = trainingData.where(SF.col('label') == 1).count()
dif = n_negative - n_positive

# rows where the target equals 1
alvo = trainingData.filter(trainingData.label == 1)

# Guard: when the classes are already balanced (e.g. this cell is run a second
# time) or there are no positive rows, takeSample() returns an empty list and
# createDataFrame() cannot infer a schema from it.
if dif > 0 and n_positive > 0:
    # draw `dif` positive rows with replacement; reusing the existing schema keeps
    # column order and types identical, which union() relies on (it matches by position)
    amostra = spark.createDataFrame(alvo.rdd.takeSample(True, dif, seed=0),
                                    schema=trainingData.schema)
    trainingData = trainingData.union(amostra)

countGenTab(trainingData, 'label')

Unnamed: 0,label,count,prop
0,0,40190,50.0
1,1,40190,50.0


In [0]:
# Configure the gradient-boosted tree classifier (maxIter=30);
# the fixed seed makes training repeatable.
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=30, seed=42)

# Fit on the training set.
model = gbt.fit(trainingData)

# Score the held-out test set.
predictions = model.transform(testData)

In [0]:
# Preview five scored rows: predicted class, true label and feature vector.
predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|    0|(32,[0,1,2,3,4,6,...|
|       0.0|    0|(32,[0,1,2,3,4,6,...|
|       0.0|    0|(32,[0,1,2,3,4,6,...|
|       0.0|    0|(32,[0,1,2,3,4,6,...|
|       0.0|    0|(32,[0,1,2,3,4,6,...|
+----------+-----+--------------------+
only showing top 5 rows



In [0]:
# Accuracy on the test set, computed from the (prediction, label) columns.
# Note: this is the accuracy itself, not the test error (error = 1 - accuracy).
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
accuracy

0.6110481294445017

## Evaluate Model

In [0]:
# Contingency table of predicted class vs. true label, collected to pandas for display
crosst = predictions.crosstab('prediction', 'label').toPandas()
crosst

Unnamed: 0,prediction_label,0,1
0,1.0,6555,1221
1,0.0,10637,993


In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics

pred = predictions.select('prediction', 'label')

# MulticlassMetrics expects an RDD of (prediction, label) pairs of doubles.
# The label column holds integers (see the predictions preview), so both
# columns are cast and each Row is turned into a plain tuple first.
metricsp = MulticlassMetrics(
    pred.select(SF.col('prediction').cast('double'),
                SF.col('label').cast('double')).rdd.map(tuple))

# Confusion-matrix cells, taking label == 1 as the positive class
tp = pred[(pred.label == 1) & (pred.prediction == 1)].count()
tn = pred[(pred.label == 0) & (pred.prediction == 0)].count()
fp = pred[(pred.label == 0) & (pred.prediction == 1)].count()
fn = pred[(pred.label == 1) & (pred.prediction == 0)].count()
print("True Positives: %f" % tp)
print("True Negatives: %f" % tn)
print("False Positives: %f" % fp)
print("False Negatives: %f" % fn)
print("Total: %d" % pred.count())

# Guard the ratios: with no actual positives (recall) or no predicted
# positives (precision) the denominator is zero.
r = float(tp) / (tp + fn) if (tp + fn) else 0.0
print("recall %f" % r)

p = float(tp) / (tp + fp) if (tp + fp) else 0.0
print("precision %f" % p)

True Positives: 1221.000000
True Negatives: 10637.000000
False Positives: 6555.000000
False Negatives: 993.000000
Total: 19406
recall 0.551491
precision 0.157022


In [0]:
# Number of top-ranked test rows to inspect; defined once so the limit and the
# denominator below cannot drift apart.
# NOTE(review): the origin of 6527 is not stated in the notebook — confirm.
top_n = 6527

# NOTE(review): 'probability' is sorted as a whole column; if it is a vector,
# the ranking presumably follows its first element first — confirm this is the
# intended ordering.
teste = predictions.select('label', 'rawPrediction',
                          'prediction', 'probability').sort(SF.col("probability").desc()).limit(top_n)

# Percentage of the selected rows whose true label is 1
inad = teste.where(SF.col('label')==1).count()/top_n*100

inad

7.660487206986365