<a href="https://colab.research.google.com/github/claudioalvesmonteiro/machinelearning-CNPJ/blob/master/modelos/CLUSTER_PCA_GradientBoostedTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

   # Projeto Machine Learning para Análise de Crédito
   
   ## Modelo PCA Gradient Boosted Tree
   
    Equipe:
    Claudio Alves Monteiro
    Marcos Antonio Almeida Souto Júnior
    Virgínia Heimann
    Kayo Renato da Silva Nascimento
    Rosely Cabral
   

   


In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://mirror.nbtelecom.com.br/apache/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz
!tar xf spark-2.4.3-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.3-bin-hadoop2.7"

In [0]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
# import modules
import os
import pandas as pd
from pyspark.sql import functions as SF
import pyspark.sql.types as ST

## Importar dados

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# import data
path = 'drive/My Drive/Colab Notebooks/ProjetoCESARSchool/codes/data/'

model_data = 'model_data4'

df = spark.read.csv(path + model_data + '.csv',
                    sep=',',
                    encoding='utf-8',
                    header=True,
                    inferSchema=True)

df.show(5)

+---+----+--------+-------+----------+------------+--------------+-------------------------+----------------------+----------------------------+--------------------+-----+----------+------------------+--------------+-----------------------+---------+-------------------+------------------+------------------+----------+------------------------+-------------------------+-------------------------------+--------------+----------------+--------+--------------+------------------+
|_c0|UF12|DATA_REF|CS_ALVO|QTD_SOCIOS|dist_capital|area_municipio|taxa_atividade_18anosmais|porcent_pop_saneamento|porcent_mulheres10a17_filhos|mortalidade_infantil| IDHM|IDHM_renda|taxa_analfabetismo|porcent_pobres|municipio_metropolitano|populacao|  municipio_nome_uf|log_capital_social|         log_idade|tipo_index|natureza_socied_limitada|natureza_empresario_indiv|natureza_empresa_indiv_limitada|situacao_ativa|situacao_baixada|UF_index|    UF_encoded|atividade_alimento|
+---+----+--------+-------+----------+------

In [0]:
# ENCODE 
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator

df = df.drop('UF_encoded')

encoder = OneHotEncoderEstimator(inputCols=['UF_index'],
                                 outputCols=['UF_encoded'])

model = encoder.fit(df)
df = model.transform(df)

In [0]:
# TRANSFORM TYPE
from pyspark.sql.types import DoubleType, IntegerType
df = df.withColumn("taxa_analfabet",df["taxa_analfabetismo"].cast(DoubleType()))
df = df.withColumn("pop",df["populacao"].cast(IntegerType()))

# Principal Componenet Analysis PCA

In [0]:
#=========================== VECTORIZE DATA

from pyspark.ml.feature import VectorAssembler

# selecionar features
pcasel =  ['dist_capital',
             'area_municipio',
             'taxa_atividade_18anosmais',
             'porcent_pop_saneamento',
             'porcent_mulheres10a17_filhos',
             'mortalidade_infantil',
             'IDHM',
             'IDHM_renda',
             'taxa_analfabet',
             'porcent_pobres',
             'municipio_metropolitano',
             'pop']

# vetorizar 
assembler = VectorAssembler(
    inputCols = pcasel,
    outputCol = "features_pca",
 handleInvalid='skip')

df = assembler.transform(df)

In [0]:
# criar modelo
from pyspark.ml.feature import PCA

pca = PCA(k=3, inputCol="features_pca", outputCol="pca_output")
model = pca.fit(df)

data = model.transform(df)
data.show(10, truncate=False)

+---+----+--------+-------+----------+------------+--------------+-------------------------+----------------------+----------------------------+--------------------+-----+----------+------------------+--------------+-----------------------+---------+-------------------+------------------+------------------+----------+------------------------+-------------------------+-------------------------------+--------------+----------------+--------+------------------+--------------+--------------+------+-------------------------------------------------------------------------+----------------------------------------------------------+
|_c0|UF12|DATA_REF|CS_ALVO|QTD_SOCIOS|dist_capital|area_municipio|taxa_atividade_18anosmais|porcent_pop_saneamento|porcent_mulheres10a17_filhos|mortalidade_infantil|IDHM |IDHM_renda|taxa_analfabetismo|porcent_pobres|municipio_metropolitano|populacao|municipio_nome_uf  |log_capital_social|log_idade         |tipo_index|natureza_socied_limitada|natureza_empresario_ind

# Clustering with K-MEANS

In [0]:
#======== VECTOR ASSEMBLER

inputcol = ['QTD_SOCIOS',
           'log_capital_social',
           'log_idade',
           'tipo_index',
           'natureza_socied_limitada',
           'natureza_empresario_indiv',
           'natureza_empresa_indiv_limitada',
           'situacao_ativa',
           'situacao_baixada',
           'atividade_alimento',
           'UF_encoded',
           'pca_output'
           ]

assembler = VectorAssembler(
    inputCols = inputcol,
    outputCol = "features",
 handleInvalid='skip')

data = assembler.transform(data)

In [0]:
# funcao para contagem de valores categoricos
def countGenTab(df, column):
    # groupby, count and sort pyspark
    tab = df.select(column).groupby(column).count().alias('count').sort('count', ascending=False)
    # transform to pandas df
    tab = tab.toPandas()
    # calculate proportion
    tab['prop'] = round(tab['count']/sum(tab['count'])*100, 2)
    return tab

In [0]:
from pyspark.ml.clustering import KMeans

# Trains a k-means model.
kmeans = KMeans().setK(5).setSeed(1)
model = kmeans.fit(data)

# Make predictions
clusterized = model.transform(data)

clusterized.show(5)

+---+----+--------+-------+----------+------------+--------------+-------------------------+----------------------+----------------------------+--------------------+-----+----------+------------------+--------------+-----------------------+---------+-------------------+------------------+------------------+----------+------------------------+-------------------------+-------------------------------+--------------+----------------+--------+------------------+--------------+--------------+------+--------------------+--------------------+--------------------+----------+
|_c0|UF12|DATA_REF|CS_ALVO|QTD_SOCIOS|dist_capital|area_municipio|taxa_atividade_18anosmais|porcent_pop_saneamento|porcent_mulheres10a17_filhos|mortalidade_infantil| IDHM|IDHM_renda|taxa_analfabetismo|porcent_pobres|municipio_metropolitano|populacao|  municipio_nome_uf|log_capital_social|         log_idade|tipo_index|natureza_socied_limitada|natureza_empresario_indiv|natureza_empresa_indiv_limitada|situacao_ativa|situacao_

In [0]:
countGenTab(clusterized, 'prediction')

Unnamed: 0,prediction,count,prop
0,0,44305,44.87
1,1,22735,23.02
2,4,14306,14.49
3,3,9244,9.36
4,2,8152,8.26


# Gradient-boosted tree classifier

In [0]:
# import 
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [0]:
clusters = [0, 1, 2, 3, 4]

list_accuracy =[]
list_inad = []

tp=0
tn=0
fp=0
fn=0

for i in clusters:
  dt = clusterized.filter(clusterized.prediction == i)
  
  #======== VECTOR ASSEMBLER

  inputcol = ['QTD_SOCIOS', 
             'log_capital_social', 'log_idade', 'tipo_index', 
             'natureza_socied_limitada', 'natureza_empresario_indiv',
             'situacao_ativa',
             'situacao_baixada', 'UF_encoded', 'pop',
             'municipio_metropolitano','dist_capital','area_municipio',
             'taxa_atividade_18anosmais','porcent_pop_saneamento',
             'porcent_mulheres10a17_filhos','mortalidade_infantil','IDHM',
             'IDHM_renda','taxa_analfabet','porcent_pobres']

  assembler = VectorAssembler(
      inputCols = inputcol,
      outputCol = "features",
      handleInvalid='skip')

  dt = dt.drop('features', 'prediction')

  dataModel = assembler.transform(dt)

  dataModel = dataModel.withColumnRenamed('CS_ALVO', 'label')

  dataModel = dataModel.select('features', 'label')
  
  # Split the data into training and test sets (30% held out for testing)
  (trainingData, testData) = dataModel.randomSplit([0.7, 0.3])
  
  
  #=========== GBT MODEL
  
  gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)

  # Train model. 
  model = gbt.fit(trainingData)

  # Make predictions.
  predictions = model.transform(testData)
  
  #============ EVALUATE

  # Select (prediction, true label) and compute test error
  evaluator = MulticlassClassificationEvaluator(
      labelCol="label", predictionCol="prediction", metricName="accuracy")
  accuracy = evaluator.evaluate(predictions)
  
  list_accuracy.append(accuracy)
  
  teste = predictions.select('label', 'rawPrediction',
                          'prediction', 'probability').sort(SF.col("probability").desc()).limit(6527)

  inad = teste.where(SF.col('label')==1).count()/(dataModel.count()*0.1)*100

  list_inad.append(inad)
  
  #============ EVAL 2
  
  pred = predictions.select('prediction', 'label')
  
  tp = tp + pred[(pred.label == 1) & (pred.prediction == 1)].count()
  tn = tn + pred[(pred.label == 0) & (pred.prediction == 0)].count()
  fp = fp + pred[(pred.label == 0) & (pred.prediction == 1)].count()
  fn = fn + pred[(pred.label == 1) & (pred.prediction == 0)].count()
  
  

In [0]:
# ACCURACY
sum(list_accuracy) / float(len(list_accuracy))

0.6429053867214456

In [0]:
# INAD
sum(list_inad) / float(len(list_inad))

141.6639170010946

In [0]:
print("True Positives: %f" % tp)
print("True Negatives: %f" % tn)
print("False Positives: %f" % fp)
print("False Negatives: %f" % fn)

r = float(tp)/(tp + fn)
print("recall %f" % r)

p = float(tp) / (tp + fp)
print("precision %f" % p)

True Positives: 10678.000000
True Negatives: 8126.000000
False Positives: 6127.000000
False Negatives: 4717.000000
recall 0.693602
precision 0.635406


In [0]:
print(clusters)
print(list_accuracy)
print(list_inad)

[0, 1, 2, 3, 4]
[0.6290952810339645, 0.6189359519273047, 0.6558521560574949, 0.6743432889528608, 0.6363002556356031]
[53.06398826317571, 138.94875742247638, 173.45436702649656, 175.78970142795325, 167.06277086537116]
