<a href="https://colab.research.google.com/github/claudioalvesmonteiro/machinelearning-CNPJ/blob/master/modelos/MODELO_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

   # Projeto Machine Learning para Análise de Crédito
   
   ## Modelo Random Forests
   
    Equipe:
    Claudio Alves Monteiro
    Marcos Antonio Almeida Souto Júnior
    Virgínia Heimann
    Kayo Renato da Silva Nascimento
    Rosely Cabral
   
   


In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://mirror.nbtelecom.com.br/apache/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz
!tar xf spark-2.4.3-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os

# Set JAVA_HOME and SPARK_HOME to the Java 8 JDK path and the unpacked
# Spark 2.4.3 directory in a single update.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.3-bin-hadoop2.7",
})

In [0]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

# Configure the builder for the "local[*]" master, then create the session
# (or reuse an already-running one).
session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()

In [0]:
# import modules
import os
import pandas as pd
from pyspark.sql import functions as SF
import pyspark.sql.types as ST

## Importar dados

In [0]:
from google.colab import drive

# Mount Google Drive at the location the data path below is relative to.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Load the project dataset from the mounted Drive folder.
path = 'drive/My Drive/Colab Notebooks/ProjetoCESARSchool/'

# Reader options: comma-separated UTF-8 file with a header row; column
# types are inferred from the data.
csv_options = dict(
    sep=',',
    encoding='utf-8',
    header=True,
    inferSchema=True,
)

df = spark.read.csv(path + 'data.csv', **csv_options)

## FEATURE

In [0]:
## LOG
# Add a natural-log version of each listed column. Wherever the logarithm
# comes back NULL, the new column is set to 0 instead.
log_feature_map = (
    ('CAPITAL_SOCIAL', 'log_capital_social'),
    ('IDADE', 'log_idade'),
)

for source_col, log_col in log_feature_map:
    log_value = SF.log(source_col)
    df = df.withColumn(
        log_col,
        SF.when(log_value.isNull(), 0).otherwise(log_value),
    )

df.show(10)

+---+-------+----------+-----------+--------------+----------+--------------+------------------+------------------+
|_c0|CS_ALVO|QTD_SOCIOS|      IDADE|CAPITAL_SOCIAL|tipo_index|natureza_index|log_capital_social|         log_idade|
+---+-------+----------+-----------+--------------+----------+--------------+------------------+------------------+
|  0|      0|         1| 7.062444E8|       90000.0|       0.0|           2.0|11.407564949312402|20.375471911189948|
|  1|      0|         2| 7.180812E8|           0.0|       0.0|           0.0|               0.0|20.392093212540093|
|  2|      1|         2|  6.98814E8|       10000.0|       0.0|           0.0| 9.210340371976184|20.364895170368083|
|  3|      0|         3| 7.124652E8|       30000.0|       0.0|           0.0|10.308952660644293|   20.384241626809|
|  4|      0|         0| 7.014924E8|           0.0|       0.0|           1.0|               0.0|20.368720623520804|
|  5|      1|         3|1.3510476E9|      100000.0|       0.0|          

In [0]:
# import packages
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler

# One-hot encode the indexed 'natureza' category into a vector column.
categorical_inputs = ['natureza_index']
encoded_outputs = ['natureza_encoded']

encoder = OneHotEncoderEstimator(
    inputCols=categorical_inputs,
    outputCols=encoded_outputs,
)

# Fit on the full frame, then apply the fitted encoder to the same frame.
model = encoder.fit(df)
encoded = model.transform(df)

encoded.show(5)

+-------+----------+-----------+--------------+----------+------------------+------------------+----------------+
|CS_ALVO|QTD_SOCIOS|      IDADE|CAPITAL_SOCIAL|tipo_index|log_capital_social|         log_idade|natureza_encoded|
+-------+----------+-----------+--------------+----------+------------------+------------------+----------------+
|      0|         1| 7.062444E8|       90000.0|       0.0|11.407564949312402|20.375471911189948|       (2,[],[])|
|      0|         2| 7.180812E8|           0.0|       0.0|               0.0|20.392093212540093|   (2,[0],[1.0])|
|      1|         2|  6.98814E8|       10000.0|       0.0| 9.210340371976184|20.364895170368083|   (2,[0],[1.0])|
|      0|         3| 7.124652E8|       30000.0|       0.0|10.308952660644293|   20.384241626809|   (2,[0],[1.0])|
|      0|         0| 7.014924E8|           0.0|       0.0|               0.0|20.368720623520804|   (2,[1],[1.0])|
|      1|         3|1.3510476E9|      100000.0|       0.0|11.512925464970229|21.02414612

## Random Forest

In [0]:
# import 
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
#======== VECTOR ASSEMBLER
# Combine the predictor columns into a single "features" vector column and
# keep it next to the class column for modelling.

from pyspark.ml.feature import VectorAssembler

# Predictor columns fed to the model.
inputcol = ['tipo_index', 'natureza_encoded', 'QTD_SOCIOS', 'IDADE', 'CAPITAL_SOCIAL']

assembler = VectorAssembler(
    inputCols = inputcol,
    outputCol = "features")

output = assembler.transform(encoded)

dataModel = output.select(['features', 'CS_ALVO'])

# The training, preview and evaluation cells all read the class column as
# "target". Renaming it to 'label' left those cells pointing at a column
# that does not exist, so the name used here must be 'target'.
dataModel = dataModel.withColumnRenamed('CS_ALVO', 'target')

dataModel.show(5, truncate=False)

+--------------------------------------------------------------------------------+------+
|features                                                                        |target|
+--------------------------------------------------------------------------------+------+
|[0.0,1.0,20.375471911189948,7.062444E8,0.0,11.407564949312402,0.0,0.0,90000.0]  |0     |
|(9,[1,2,3,6],[2.0,20.392093212540093,7.180812E8,1.0])                           |0     |
|[1.0,2.0,20.364895170368083,6.98814E8,0.0,9.210340371976184,1.0,0.0,10000.0]    |1     |
|[0.0,3.0,20.384241626809,7.124652E8,0.0,10.308952660644293,1.0,0.0,30000.0]     |0     |
|(9,[2,3,7],[20.368720623520804,7.014924E8,1.0])                                 |0     |
|[1.0,3.0,21.024146128464423,1.3510476E9,0.0,11.512925464970229,1.0,0.0,100000.0]|1     |
|(9,[2,3,7],[20.329150673833833,6.742764E8,1.0])                                 |0     |
|(9,[2,3,7],[20.315604919813186,6.652044E8,1.0])                                 |0     |
|[0.0,1.0,

In [0]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed makes the split repeatable, so metrics can be compared
# between runs of the notebook.
(trainingData, testData) = dataModel.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Train a RandomForest model.
# The seed fixes the forest's random sampling so that retraining on the same
# data gives the same model.
rf = RandomForestClassifier(labelCol="target", featuresCol="features", numTrees=5, seed=42)

# RF in a Pipeline (single stage).
pipeline = Pipeline(stages=[rf])

# Train model on the training split only.
model = pipeline.fit(trainingData)

# Make predictions on the held-out test split.
predictions = model.transform(testData)

In [0]:
# Preview a few predictions next to the true class and the input vector.
preview_columns = ["prediction", "target", "features"]
predictions.select(*preview_columns).show(5)

+----------+------+--------------------+
|prediction|target|            features|
+----------+------+--------------------+
|       1.0|     1|(9,[0,1,2,3],[1.0...|
|       1.0|     1|(9,[0,1,2,3],[1.0...|
|       1.0|     1|(9,[0,1,2,3],[1.0...|
|       1.0|     1|(9,[0,1,2,3],[1.0...|
|       1.0|     1|(9,[0,1,2,3],[1.0...|
+----------+------+--------------------+
only showing top 5 rows



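## Evaluate Model

> **Note:** the saved outputs in this notebook appear to come from an earlier version of the feature cell. The printed feature vectors have 9 entries, and their first entry equals `target` in every displayed row, which suggests the class column was among the features in that run. Treat the accuracy of 1.0 shown below as unverified until the notebook is re-run from a fresh kernel.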


In [0]:
# Compare the predicted class with the true class on the test predictions
# and report accuracy.
evaluator_params = {
    "labelCol": "target",
    "predictionCol": "prediction",
    "metricName": "accuracy",
}
evaluator = MulticlassClassificationEvaluator(**evaluator_params)

accuracy = evaluator.evaluate(predictions)
accuracy

1.0