UNIVERSIDADE ESTADUAL DO CEARÁ \
MESTRADO ACADÊMICO EM CIÊNCIA DA COMPUTAÇÃO \
MINERAÇÃO MASSIVA DE DADOS

Daniel Gleison Moreira Lira \
daniel.gleison@aluno.uece.br

---
# Mecanismo de predição de diagnóstico de diabetes utilizando aprendizado de máquina e processamento distribuído
---


## Dataset

http://archive.ics.uci.edu/ml/datasets/Early+stage+diabetes+risk+prediction+dataset.# \
Date created: 2020-07-12

### Associated Tasks:
Classification

### Predicted attribute:
diabetes diagnostic

### Number of Instances:
520

### Number of Attributes:
17 (1 Integer, 16 String)

### Attribute Information:

Age 1.20-65 \
Sex 1. Male, 2.Female \
Polyuria 1.Yes, 2.No. \
Polydipsia 1.Yes, 2.No. \
sudden weight loss 1.Yes, 2.No. \
weakness 1.Yes, 2.No. \
Polyphagia 1.Yes, 2.No. \
Genital thrush 1.Yes, 2.No. \
visual blurring 1.Yes, 2.No. \
Itching 1.Yes, 2.No. \
Irritability 1.Yes, 2.No. \
delayed healing 1.Yes, 2.No. \
partial paresis 1.Yes, 2.No. \
muscle stiffness 1.Yes, 2.No. \
Alopecia 1.Yes, 2.No. \
Obesity 1.Yes, 2.No. \
Class 1.Positive, 2.Negative. 

### Missing Attribute Values: 
Yes

### Class Distribution: 
2 Classes \
320 Positive and 200 Negative

---

In [1]:
# Spark Lib
import findspark
findspark.init()

In [2]:
# Load libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnull, when, count, col, regexp_replace
from pyspark.sql.types import *

from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.util import MLUtils

from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.feature import VectorAssembler, VectorIndexer


from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LinearSVC, OneVsRest

from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

from pyspark.ml.linalg import Vectors
from pyspark.mllib.util import MLUtils

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

import time
start_time = time.time()
%matplotlib inline

#Extras
##from pyspark.sql.functions import isnull, when, count, col
from pyspark.ml.classification import DecisionTreeClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator

## Criação do ambiente Spark

In [3]:
# Create (or reuse) a local Spark session that uses every available core
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("PredictionDiabetes")
    .getOrCreate()
)
spark

## Importação do dataset

In [4]:
# Read the CSV file (first row is the header) and let Spark infer column types
data_path = './data/'
data_name = 'diabetes_data_upload.csv'
csv_reader = spark.read.format('csv').options(sep=',', header='true', inferschema='true')
df_original = csv_reader.load(data_path + data_name)

In [5]:
# Report the shape of the loaded dataset
print(f"Quantidade de linhas: {df_original.count()}")
print(f"Quantidade de colunas: {len(df_original.columns)}")

Quantidade de linhas: 520
Quantidade de colunas: 17


## Análise exploratória dos dados

In [6]:
# Register the dataset as a SQL view and count the positive diagnoses
df_original.createOrReplaceTempView("tab_original")
positive_query = 'select count(class) from tab_original where class = "Positive"'
df_sql = spark.sql(positive_query)
df_sql.show()

+------------+
|count(class)|
+------------+
|         320|
+------------+



In [7]:
# Count the negative diagnoses through the same SQL view
negative_query = 'select count(class) from tab_original where class = "Negative"'
df_sql = spark.sql(negative_query)
df_sql.show()

+------------+
|count(class)|
+------------+
|         200|
+------------+



In [None]:
# Class distribution: number of rows per value of 'class'
df_original.groupBy('class').count().show()

In [None]:
# Display every column of the dataset as a pandas DataFrame
df_original.select("*").toPandas()

In [None]:
# Summary statistics, transposed so that each attribute becomes a row
df_original.describe().toPandas().transpose()

In [None]:
# Print the first record vertically (one attribute per line, untruncated)
df_original.show(vertical=True,truncate=False, n=1)

In [None]:
# Group the column names of the original dataset by their Spark data type
from collections import defaultdict

data_types = defaultdict(list)
for field in df_original.schema.fields:
    type_name = str(field.dataType)
    data_types[type_name].append(field.name)
data_types

### Identificação de valores ausentes

In [None]:
# Count, for each column, how many rows hold a null value
null_counts = [
    count(when(isnull(name), name)).alias(name)
    for name in df_original.columns
]
df_original.select(null_counts).toPandas()

### Matriz de correlação

In [None]:
# Pearson correlation between the numeric attributes.
# The categorical attributes are still text at this point, and DataFrame.corr()
# raises on non-numeric columns in recent pandas releases, so the frame is
# restricted to numeric columns first.
df_original.toPandas().select_dtypes(include='number').corr()

## Transformação do dataset

### Indexação dos atributos de entrada

In [None]:
# Index every categorical input attribute with StringIndexer, adding one
# 'index_<name>' column per attribute.
# The first column is skipped (it is fed to the assembler as-is later on) and
# the last column ('class') is the target, which is encoded separately.
df_indexado = df_original

# Named so it does not rebind `col`, which is imported from pyspark.sql.functions.
categorical_columns = df_original.columns[1:-1]

for column_name in categorical_columns:
    indexer = StringIndexer(inputCol=column_name, outputCol='index_' + column_name).fit(df_original)
    df_indexado = indexer.transform(df_indexado)

df_indexado.toPandas()

### Indexação da classe

In [None]:
# Inspect the target column before it is encoded
df_indexado.select('class').toPandas()

In [None]:
# Encode the target as an integer: 'Positive' -> 1, 'Negative' -> 0
class_as_text = regexp_replace(regexp_replace('class', 'Positive', '1'), 'Negative', '0')
df_indexado = df_indexado.withColumn('class', class_as_text)
df_indexado = df_indexado.withColumn('class', df_indexado['class'].cast('Integer'))
df_indexado.select('class').toPandas()

In [None]:
# Encode gender as an integer: 'Male' -> 1, 'Female' -> 0
gender_as_text = regexp_replace(regexp_replace('gender', 'Male', '1'), 'Female', '0')
df_indexado = df_indexado.withColumn('gender', gender_as_text)
df_indexado = df_indexado.withColumn('gender', df_indexado['gender'].cast('Integer'))
df_indexado.select('gender').toPandas()

In [None]:
# Rename the target column to 'label', the name the classifiers and evaluators below are configured with
df_indexado = df_indexado.withColumnRenamed ('class', 'label')

In [None]:
# Class distribution after encoding: number of rows per label value
df_indexado.groupBy('label').count().show()

In [None]:
# Group the column names of the indexed dataset by their Spark data type
from collections import defaultdict

data_types = defaultdict(list)
for field in df_indexado.schema.fields:
    type_name = str(field.dataType)
    data_types[type_name].append(field.name)
data_types

### Exclusão de atributos

In [None]:
# Print the first indexed record vertically to review the available columns
df_indexado.show(vertical=True,truncate=False, n=1)

In [None]:
# Drop the raw categorical columns now that their 'index_*' counterparts exist
raw_columns = [
    'Gender',
    'Polyuria',
    'Polydipsia',
    'sudden weight loss',
    'weakness',
    'Polyphagia',
    'Genital thrush',
    'visual blurring',
    'Itching',
    'Irritability',
    'delayed healing',
    'partial paresis',
    'muscle stiffness',
    'Alopecia',
    'Obesity',
]
df_selecionado = df_indexado.drop(*raw_columns)

df_selecionado.show(vertical=True, truncate=False, n=1)

In [None]:
# Group the column names of the selected dataset by their Spark data type
from collections import defaultdict

data_types = defaultdict(list)
for field in df_selecionado.schema.fields:
    type_name = str(field.dataType)
    data_types[type_name].append(field.name)
data_types

In [None]:
# Show only the rows whose label is 1
label_column = df_selecionado['label']
df = df_selecionado.filter(label_column == 1).sort(label_column)
df.toPandas()

## Seleção dos atributos

In [None]:
# Display the selected attributes as a pandas DataFrame
df_selecionado.toPandas()

## Criação da matriz de classificação

In [None]:
# Assemble every column except the target into a single 'features' vector.
ignore = ['label']
# Not named `list`: rebinding the builtin breaks list(...) and
# defaultdict(list) in other cells when the notebook is re-run.
feature_columns = [name for name in df_selecionado.columns if name not in ignore]

assembler = VectorAssembler(
            inputCols=feature_columns,
            outputCol='features')

df_transformado = assembler.transform(df_selecionado).select('label', 'features')
df_transformado.show(truncate=False, n=5)

## Divisão do dataset para treinamento e teste

In [None]:
# Randomly split the assembled dataset into training (70%) and test (30%) sets.
train_sample = 0.7
test_sample = 0.3
seed = 1234  # passed to randomSplit as its seed

(train, test) = df_transformado.randomSplit([train_sample, test_sample], seed)

print('Percentual da base de treinamento', train_sample*100, '%')
print('Percentual da base de teste', test_sample*100, '%')
print('Quantidade de registros da base de treinamento:', train.count())
# This line reports the TEST set size (the original message said "treinamento").
print('Quantidade de registros da base de teste:', test.count())

In [None]:
# Class distribution inside the training set
train.groupby('label').count().show()

In [None]:
# Class distribution inside the test set
test.groupby('label').count().show()

## Treinamento, teste e avaliação dos modelos de predição

### Decision Tree (DT)

In [None]:
# Train the Decision Tree classifier and record how long the fit takes
start_time = time.time()
trainer_dt = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
    probabilityCol='probability',
    rawPredictionCol='rawPrediction',
    maxDepth=5,
    maxBins=32,
    minInstancesPerNode=1,
    minInfoGain=0.0,
    maxMemoryInMB=256,
    cacheNodeIds=False,
    checkpointInterval=10,
    impurity='gini',
    seed=None,
)
model_dt = trainer_dt.fit(train)
time_dt_train = time.time() - start_time

In [None]:
# Apply the trained Decision Tree to the test set and time the call.
# NOTE(review): transform() is presumably lazy in Spark, so this interval may
# not include the actual prediction work — confirm before comparing timings.
start_time = time.time()
result_dt = model_dt.transform(test)
time_dt_pred = time.time() - start_time

In [None]:
# Accuracy of the Decision Tree on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='label',
    predictionCol='prediction',
)
accuracy_dt = 100 * evaluator.evaluate(result_dt)

In [None]:
# Weighted recall of the Decision Tree on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='weightedRecall',
    labelCol='label',
    predictionCol='prediction',
)
recall_dt = 100 * evaluator.evaluate(result_dt)

In [None]:
# Weighted precision of the Decision Tree on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='weightedPrecision',
    labelCol='label',
    predictionCol='prediction',
)
precision_dt = 100 * evaluator.evaluate(result_dt)

In [None]:
# F1 score of the Decision Tree on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='f1',
    labelCol='label',
    predictionCol='prediction',
)
f1_dt = 100 * evaluator.evaluate(result_dt)

In [None]:
# Confusion matrix of the Decision Tree predictions.
# labels=[0, 1] pins the class order, so the matrix is always 2x2 and ravel()
# yields (tn, fp, fn, tp). The matrix is computed once and reused, instead of
# being recomputed without the labels argument.
y_true = result_dt.select("label").toPandas()
y_pred = result_dt.select("prediction").toPandas()
mc_dt = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn_dt, fp_dt, fn_dt, tp_dt = mc_dt.ravel()
print(mc_dt)

In [None]:
# Annotated heatmap of the Decision Tree confusion matrix:
# each cell shows its name, absolute count and share of all test rows
group_names = ['Verdadeiro Negativo','Falso Positivo','Falso Negativo','Verdadeiro Positivo']
group_counts = [f'{value:0.0f}' for value in mc_dt.flatten()]
group_percentages = [f'{value:.2%}' for value in mc_dt.flatten() / np.sum(mc_dt)]
labels = [
    f'{name}\n{total}\n{share}'
    for name, total, share in zip(group_names, group_counts, group_percentages)
]
labels = np.asarray(labels).reshape(2, 2)
sns.heatmap(mc_dt, annot=labels, fmt='', cmap='Blues')

In [None]:
# Summarise the Decision Tree metrics, error counts and timings in a one-row table
dt_row = (
    round(accuracy_dt, 2), round(recall_dt, 2), round(precision_dt, 2), round(f1_dt, 2),
    int(fp_dt), int(fn_dt),
    round(time_dt_train, 2), round(time_dt_pred, 2),
)
dt_columns = [
    'acurácia', 'recall', 'precisão', 'f1 score',
    'falso positivo', 'falso negativo',
    'tempo treinamento', 'tempo predição',
]
evaluator_dt = spark.createDataFrame([dt_row], dt_columns)
print("Resultados do modelo Decision Tree:")
evaluator_dt.show()

In [None]:
# Display the Decision Tree predictions for the test set
result_dt.toPandas()

In [None]:
# List the test rows that the Decision Tree misclassified
result_dt.createOrReplaceTempView("tab_result_dt")
misclassified_query = 'select * from tab_result_dt where label != Prediction'
dt_sql = spark.sql(misclassified_query)
dt_sql.toPandas()

### Random Forest (RF)

In [None]:
# Train the Random Forest classifier (50 trees) and record how long the fit takes
start_time = time.time()
trainer = RandomForestClassifier(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
    probabilityCol='probability',
    rawPredictionCol='rawPrediction',
    maxDepth=5,
    maxBins=32,
    minInstancesPerNode=1,
    minInfoGain=0.0,
    numTrees=50,
    featureSubsetStrategy='auto',
    seed=None,
    subsamplingRate=1.0,
    maxMemoryInMB=256,
    cacheNodeIds=False,
    checkpointInterval=10,
    impurity='gini',
)
model_rf = trainer.fit(train)
time_rf_train = time.time() - start_time

In [None]:
# Apply the trained Random Forest to the test set and time the call.
# NOTE(review): transform() is presumably lazy in Spark, so this interval may
# not include the actual prediction work — confirm before comparing timings.
start_time = time.time()
result_rf = model_rf.transform(test)
time_rf_pred = time.time() - start_time

In [None]:
# Accuracy of the Random Forest on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='label',
    predictionCol='prediction',
)
accuracy_rf = 100 * evaluator.evaluate(result_rf)

In [None]:
# Weighted recall of the Random Forest on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='weightedRecall',
    labelCol='label',
    predictionCol='prediction',
)
recall_rf = 100 * evaluator.evaluate(result_rf)

In [None]:
# Weighted precision of the Random Forest on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='weightedPrecision',
    labelCol='label',
    predictionCol='prediction',
)
precision_rf = 100 * evaluator.evaluate(result_rf)

In [None]:
# F1 score of the Random Forest on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='f1',
    labelCol='label',
    predictionCol='prediction',
)
f1_rf = 100 * evaluator.evaluate(result_rf)

In [None]:
# Confusion matrix of the Random Forest predictions.
# labels=[0, 1] pins the class order (as in the Decision Tree section), so the
# matrix is always 2x2 and ravel() yields (tn, fp, fn, tp). Computed only once.
y_true = result_rf.select("label").toPandas()
y_pred = result_rf.select("prediction").toPandas()
mc_rf = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn_rf, fp_rf, fn_rf, tp_rf = mc_rf.ravel()
print(mc_rf)

In [None]:
# Annotated heatmap of the Random Forest confusion matrix.
# Counts and percentages both come from mc_rf (the counts were previously
# taken from the Decision Tree matrix, mc_dt).
group_names = ['Verdadeiro Negativo','Falso Positivo','Falso Negativo','Verdadeiro Positivo']
group_counts = ['{0:0.0f}'.format(value) for value in mc_rf.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in mc_rf.flatten()/np.sum(mc_rf)]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(mc_rf, annot=labels, fmt='', cmap='Blues')

In [None]:
# Summarise the Random Forest metrics, error counts and timings in a one-row table
rf_row = (
    round(accuracy_rf, 2), round(recall_rf, 2), round(precision_rf, 2), round(f1_rf, 2),
    int(fp_rf), int(fn_rf),
    round(time_rf_train, 2), round(time_rf_pred, 2),
)
rf_columns = [
    'acurácia', 'recall', 'precisão', 'f1 score',
    'falso positivo', 'falso negativo',
    'tempo treinamento', 'tempo predição',
]
evaluator_rf = spark.createDataFrame([rf_row], rf_columns)
print("Resultados do modelo Random Forest:")
evaluator_rf.show()

In [None]:
# Display the Random Forest predictions for the test set
result_rf.toPandas()

### Neural Network Perceptron (NNP)

In [None]:
# Train the multilayer perceptron and record how long the fit takes
start_time = time.time()
# Layer sizes from input to output; the first entry (16) has to match the
# length of the assembled 'features' vector
layers = [16, 5, 5, 2]
trainer = MultilayerPerceptronClassifier(
    featuresCol='features',
    labelCol='label',
    maxIter=100,
    layers=layers,
    blockSize=128,
    seed=1234,
)
model_nnp = trainer.fit(train)
time_nnp_train = time.time() - start_time

In [None]:
# Apply the trained perceptron to the test set and time the call.
# NOTE(review): transform() is presumably lazy in Spark, so this interval may
# not include the actual prediction work — confirm before comparing timings.
start_time =  time.time()
result_nnp = model_nnp.transform(test)
time_nnp_pred = time.time() - start_time

In [None]:
# Accuracy of the perceptron on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='label',
    predictionCol='prediction',
)
accuracy_nnp = 100 * evaluator.evaluate(result_nnp)

In [None]:
# Weighted recall of the perceptron on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='weightedRecall',
    labelCol='label',
    predictionCol='prediction',
)
recall_nnp = 100 * evaluator.evaluate(result_nnp)

In [None]:
# Weighted precision of the perceptron on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='weightedPrecision',
    labelCol='label',
    predictionCol='prediction',
)
precision_nnp = 100 * evaluator.evaluate(result_nnp)

In [None]:
# F1 score of the perceptron on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='f1',
    labelCol='label',
    predictionCol='prediction',
)
f1_nnp = 100 * evaluator.evaluate(result_nnp)

In [None]:
# Confusion matrix of the perceptron predictions.
# labels=[0, 1] pins the class order (as in the Decision Tree section), so the
# matrix is always 2x2 and ravel() yields (tn, fp, fn, tp). Computed only once.
y_true = result_nnp.select("label").toPandas()
y_pred = result_nnp.select("prediction").toPandas()
mc_nnp = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn_nnp, fp_nnp, fn_nnp, tp_nnp = mc_nnp.ravel()
print(mc_nnp)

In [None]:
# Annotated heatmap of the perceptron confusion matrix.
# Counts and percentages both come from mc_nnp (the counts were previously
# taken from the Decision Tree matrix, mc_dt).
group_names = ['Verdadeiro Negativo','Falso Positivo','Falso Negativo','Verdadeiro Positivo']
group_counts = ['{0:0.0f}'.format(value) for value in mc_nnp.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in mc_nnp.flatten()/np.sum(mc_nnp)]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(mc_nnp, annot=labels, fmt='', cmap='Blues')

In [None]:
# Summarise the perceptron metrics, error counts and timings in a one-row table
nnp_row = (
    round(accuracy_nnp, 2), round(recall_nnp, 2), round(precision_nnp, 2), round(f1_nnp, 2),
    int(fp_nnp), int(fn_nnp),
    round(time_nnp_train, 2), round(time_nnp_pred, 2),
)
nnp_columns = [
    'acurácia', 'recall', 'precisão', 'f1 score',
    'falso positivo', 'falso negativo',
    'tempo treinamento', 'tempo predição',
]
evaluator_nnp = spark.createDataFrame([nnp_row], nnp_columns)
print("Resultados do modelo Neural Network Perceptron:")
evaluator_nnp.show()

In [None]:
# Display the perceptron predictions for the test set
result_nnp.toPandas()

### Naive Bayes (NB)

In [None]:
# Train the multinomial Naive Bayes classifier and record how long the fit takes
start_time = time.time()
trainer = NaiveBayes(
    smoothing=1.0,
    modelType='multinomial',
)
model_nb = trainer.fit(train)
time_nb_train = time.time() - start_time

In [None]:
# Apply the trained Naive Bayes model to the test set and time the call.
# NOTE(review): transform() is presumably lazy in Spark, so this interval may
# not include the actual prediction work — confirm before comparing timings.
start_time =  time.time()
result_nb = model_nb.transform(test)
time_nb_pred = time.time() - start_time

In [None]:
# Accuracy of Naive Bayes on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='label',
    predictionCol='prediction',
)
accuracy_nb = 100 * evaluator.evaluate(result_nb)

In [None]:
# Weighted recall of Naive Bayes on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='weightedRecall',
    labelCol='label',
    predictionCol='prediction',
)
recall_nb = 100 * evaluator.evaluate(result_nb)

In [None]:
# Weighted precision of Naive Bayes on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='weightedPrecision',
    labelCol='label',
    predictionCol='prediction',
)
precision_nb = 100 * evaluator.evaluate(result_nb)

In [None]:
# F1 score of Naive Bayes on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='f1',
    labelCol='label',
    predictionCol='prediction',
)
f1_nb = 100 * evaluator.evaluate(result_nb)

In [None]:
# Confusion matrix of the Naive Bayes predictions.
# labels=[0, 1] pins the class order (as in the Decision Tree section), so the
# matrix is always 2x2 and ravel() yields (tn, fp, fn, tp). Computed only once.
y_true = result_nb.select("label").toPandas()
y_pred = result_nb.select("prediction").toPandas()
mc_nb = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn_nb, fp_nb, fn_nb, tp_nb = mc_nb.ravel()
print(mc_nb)

In [None]:
# Annotated heatmap of the Naive Bayes confusion matrix.
# Counts and percentages both come from mc_nb (the counts were previously
# taken from the Decision Tree matrix, mc_dt).
group_names = ['Verdadeiro Negativo','Falso Positivo','Falso Negativo','Verdadeiro Positivo']
group_counts = ['{0:0.0f}'.format(value) for value in mc_nb.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in mc_nb.flatten()/np.sum(mc_nb)]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(mc_nb, annot=labels, fmt='', cmap='Blues')

In [None]:
# Summarise the Naive Bayes metrics, error counts and timings in a one-row table
nb_row = (
    round(accuracy_nb, 2), round(recall_nb, 2), round(precision_nb, 2), round(f1_nb, 2),
    int(fp_nb), int(fn_nb),
    round(time_nb_train, 2), round(time_nb_pred, 2),
)
nb_columns = [
    'acurácia', 'recall', 'precisão', 'f1 score',
    'falso positivo', 'falso negativo',
    'tempo treinamento', 'tempo predição',
]
evaluator_nb = spark.createDataFrame([nb_row], nb_columns)
print("Resultados do modelo Naive Bayes:")
evaluator_nb.show()

In [None]:
# Display the Naive Bayes predictions for the test set
result_nb.toPandas()

### Logistic Regression (LR)

In [None]:
# Train the Logistic Regression classifier and record how long the fit takes
start_time = time.time()
trainer = LogisticRegression(
    maxIter=10,
    tol=1E-6,
    fitIntercept=True,
)
model_lr = trainer.fit(train)
time_lr_train = time.time() - start_time

In [None]:
# Apply the trained Logistic Regression model to the test set and time the call.
# NOTE(review): transform() is presumably lazy in Spark, so this interval may
# not include the actual prediction work — confirm before comparing timings.
start_time =  time.time()
result_lr = model_lr.transform(test)
time_lr_pred = time.time() - start_time

In [None]:
# Accuracy of Logistic Regression on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='label',
    predictionCol='prediction',
)
accuracy_lr = 100 * evaluator.evaluate(result_lr)

In [None]:
# Weighted recall of Logistic Regression on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='weightedRecall',
    labelCol='label',
    predictionCol='prediction',
)
recall_lr = 100 * evaluator.evaluate(result_lr)

In [None]:
# Weighted precision of Logistic Regression on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='weightedPrecision',
    labelCol='label',
    predictionCol='prediction',
)
precision_lr = 100 * evaluator.evaluate(result_lr)

In [None]:
# F1 score of Logistic Regression on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='f1',
    labelCol='label',
    predictionCol='prediction',
)
f1_lr = 100 * evaluator.evaluate(result_lr)

In [None]:
# Confusion matrix of the Logistic Regression predictions.
# labels=[0, 1] pins the class order (as in the Decision Tree section), so the
# matrix is always 2x2 and ravel() yields (tn, fp, fn, tp). Computed only once.
y_true = result_lr.select("label").toPandas()
y_pred = result_lr.select("prediction").toPandas()
mc_lr = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn_lr, fp_lr, fn_lr, tp_lr = mc_lr.ravel()
print(mc_lr)

In [None]:
# Annotated heatmap of the Logistic Regression confusion matrix.
# Counts and percentages both come from mc_lr (the counts were previously
# taken from the Decision Tree matrix, mc_dt).
group_names = ['Verdadeiro Negativo','Falso Positivo','Falso Negativo','Verdadeiro Positivo']
group_counts = ['{0:0.0f}'.format(value) for value in mc_lr.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in mc_lr.flatten()/np.sum(mc_lr)]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(mc_lr, annot=labels, fmt='', cmap='Blues')

In [None]:
# Summarise the Logistic Regression metrics, error counts and timings in a one-row table
lr_row = (
    round(accuracy_lr, 2), round(recall_lr, 2), round(precision_lr, 2), round(f1_lr, 2),
    int(fp_lr), int(fn_lr),
    round(time_lr_train, 2), round(time_lr_pred, 2),
)
lr_columns = [
    'acurácia', 'recall', 'precisão', 'f1 score',
    'falso positivo', 'falso negativo',
    'tempo treinamento', 'tempo predição',
]
evaluator_lr = spark.createDataFrame([lr_row], lr_columns)
print("Resultados do modelo Logistic Regression:")
evaluator_lr.show()

In [None]:
# Display the Logistic Regression predictions for the test set
result_lr.toPandas()

### Support Vector Machines (SVM)

In [None]:
# Train the linear SVM classifier and record how long the fit takes
start_time = time.time()
trainer = LinearSVC(
    featuresCol='features',
    labelCol='label',
    maxIter=100,
    regParam=0.1,
)
model_svm = trainer.fit(train)
time_svm_train = time.time() - start_time

In [None]:
# Apply the trained linear SVM to the test set and time the call.
# NOTE(review): transform() is presumably lazy in Spark, so this interval may
# not include the actual prediction work — confirm before comparing timings.
start_time =  time.time()
result_svm = model_svm.transform(test)
time_svm_pred = time.time() - start_time

In [None]:
# Accuracy of the linear SVM on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='label',
    predictionCol='prediction',
)
accuracy_svm = 100 * evaluator.evaluate(result_svm)

In [None]:
# Weighted recall of the linear SVM on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='weightedRecall',
    labelCol='label',
    predictionCol='prediction',
)
recall_svm = 100 * evaluator.evaluate(result_svm)

In [None]:
# Weighted precision of the linear SVM on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='weightedPrecision',
    labelCol='label',
    predictionCol='prediction',
)
precision_svm = 100 * evaluator.evaluate(result_svm)

In [None]:
# F1 score of the linear SVM on the test set, as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='f1',
    labelCol='label',
    predictionCol='prediction',
)
f1_svm = 100 * evaluator.evaluate(result_svm)

In [None]:
# Confusion matrix of the linear SVM predictions.
# labels=[0, 1] pins the class order (as in the Decision Tree section), so the
# matrix is always 2x2 and ravel() yields (tn, fp, fn, tp). Computed only once.
y_true = result_svm.select("label").toPandas()
y_pred = result_svm.select("prediction").toPandas()
mc_svm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn_svm, fp_svm, fn_svm, tp_svm = mc_svm.ravel()
print(mc_svm)

In [None]:
# Annotated heatmap of the linear SVM confusion matrix.
# Counts and percentages both come from mc_svm (the counts were previously
# taken from the Decision Tree matrix, mc_dt).
group_names = ['Verdadeiro Negativo','Falso Positivo','Falso Negativo','Verdadeiro Positivo']
group_counts = ['{0:0.0f}'.format(value) for value in mc_svm.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in mc_svm.flatten()/np.sum(mc_svm)]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(mc_svm, annot=labels, fmt='', cmap='Blues')

In [None]:
# Summarise the linear SVM metrics, error counts and timings in a one-row table
svm_row = (
    round(accuracy_svm, 2), round(recall_svm, 2), round(precision_svm, 2), round(f1_svm, 2),
    int(fp_svm), int(fn_svm),
    round(time_svm_train, 2), round(time_svm_pred, 2),
)
svm_columns = [
    'acurácia', 'recall', 'precisão', 'f1 score',
    'falso positivo', 'falso negativo',
    'tempo treinamento', 'tempo predição',
]
evaluator_svm = spark.createDataFrame([svm_row], svm_columns)
print("Resultados do modelo Suport Vector Machines:")
evaluator_svm.show()

In [None]:
# Display the linear SVM predictions for the test set
result_svm.toPandas()

## Resultados

In [None]:
# Display names of the models; the ranking tables below refer to them by position (0 = DT ... 5 = SVM)
models = ['Decision Tree','Random Forest','Neural Network Perceptron','Naive Bayes','Logistic Regression','Suport Vector Machines']

### Accuracy

In [None]:
# Rank the models by accuracy (highest first).
print('Ranking Accuracy %')

# The SVM row uses accuracy_svm (it previously repeated accuracy_nb), and the
# rows are no longer stored in a variable named `list`, which shadowed the builtin.
accuracy_rows = [
    (models[0], accuracy_dt),
    (models[1], accuracy_rf),
    (models[2], accuracy_nnp),
    (models[3], accuracy_nb),
    (models[4], accuracy_lr),
    (models[5], accuracy_svm),
]
df_acuracia = spark.createDataFrame(accuracy_rows, ['Modelo', 'Acuracia'])
df_acuracia.sort(df_acuracia.Acuracia.desc()).toPandas()


### Recall

In [None]:
# Rank the models by weighted recall (highest first).
# The title now names the metric, matching the accuracy and precision rankings.
print('Ranking Recall %')

# Rows are not stored in a variable named `list`, which shadowed the builtin.
recall_rows = [
    (models[0], recall_dt),
    (models[1], recall_rf),
    (models[2], recall_nnp),
    (models[3], recall_nb),
    (models[4], recall_lr),
    (models[5], recall_svm),
]
df_recall = spark.createDataFrame(recall_rows, ['Modelo', 'Recall'])
df_recall.sort(df_recall.Recall.desc()).toPandas()

### Precision

In [None]:
# Rank the models by weighted precision (highest first).
print('Ranking Precision %')

# Rows are not stored in a variable named `list`, which shadowed the builtin.
precision_rows = [
    (models[0], precision_dt),
    (models[1], precision_rf),
    (models[2], precision_nnp),
    (models[3], precision_nb),
    (models[4], precision_lr),
    (models[5], precision_svm),
]
df_precision = spark.createDataFrame(precision_rows, ['Modelo', 'Precisao'])
df_precision.sort(df_precision.Precisao.desc()).toPandas()

### F1 score

In [None]:
# Rank the models by F1 score (highest first).
# Rows are not stored in a variable named `list`, which shadowed the builtin.
f1_rows = [
    (models[0], f1_dt),
    (models[1], f1_rf),
    (models[2], f1_nnp),
    (models[3], f1_nb),
    (models[4], f1_lr),
    (models[5], f1_svm),
]
df_f1 = spark.createDataFrame(f1_rows, ['Modelo', 'F1'])
df_f1.sort(df_f1.F1.desc()).toPandas()

### Tempo de Treinamento

In [None]:
# Rank the models by training time in seconds (fastest first).
# Rows are not stored in a variable named `list`, which shadowed the builtin.
train_time_rows = [
    (models[0], time_dt_train),
    (models[1], time_rf_train),
    (models[2], time_nnp_train),
    (models[3], time_nb_train),
    (models[4], time_lr_train),
    (models[5], time_svm_train),
]
df_time_train = spark.createDataFrame(train_time_rows, ['Modelo', 'Tempo_Treinamento'])
df_time_train.sort(df_time_train.Tempo_Treinamento.asc()).toPandas()

### Tempo de Predição

In [None]:
# Rank the models by prediction time in seconds (fastest first).
# Each row uses its own model's timing; the Naive Bayes, Logistic Regression and
# SVM rows previously all repeated time_nnp_pred. Rows are also no longer
# stored in a variable named `list`, which shadowed the builtin.
pred_time_rows = [
    (models[0], time_dt_pred),
    (models[1], time_rf_pred),
    (models[2], time_nnp_pred),
    (models[3], time_nb_pred),
    (models[4], time_lr_pred),
    (models[5], time_svm_pred),
]
df_time_pred = spark.createDataFrame(pred_time_rows, ['Modelo', 'Tempo_Predicao'])
df_time_pred.sort(df_time_pred.Tempo_Predicao.asc()).toPandas()

### Falso Positivo

In [None]:
# Rank the models by number of false positives (fewest first).
# Rows are not stored in a variable named `list`, which shadowed the builtin.
false_positive_rows = [
    (models[0], int(fp_dt)),
    (models[1], int(fp_rf)),
    (models[2], int(fp_nnp)),
    (models[3], int(fp_nb)),
    (models[4], int(fp_lr)),
    (models[5], int(fp_svm)),
]
df_fp = spark.createDataFrame(false_positive_rows, ['Modelo', 'Falso_Positivo'])
df_fp.sort(df_fp.Falso_Positivo.asc()).toPandas()

### Falso Negativo

In [None]:
# Rank the models by number of false negatives (fewest first).
# Rows are not stored in a variable named `list`, which shadowed the builtin.
false_negative_rows = [
    (models[0], int(fn_dt)),
    (models[1], int(fn_rf)),
    (models[2], int(fn_nnp)),
    (models[3], int(fn_nb)),
    (models[4], int(fn_lr)),
    (models[5], int(fn_svm)),
]
df_fn = spark.createDataFrame(false_negative_rows, ['Modelo', 'Falso_Negativo'])
df_fn.sort(df_fn.Falso_Negativo.asc()).toPandas()

### Comparativo

In [None]:
# Join the accuracy, error-count and timing tables on the model name and
# show the combined comparison ordered by accuracy (highest first)
df = (
    df_acuracia
    .join(df_fp, "Modelo")
    .join(df_fn, "Modelo")
    .join(df_time_train, "Modelo")
    .join(df_time_pred, "Modelo")
)
df.sort(df.Acuracia.desc()).toPandas()

## Exportação dos modelos para o disco

In [None]:
# Persist the Decision Tree model under ./model_dt
# NOTE(review): save() presumably fails if the path already exists from a
# previous run — confirm; model_dt.write().overwrite().save(...) would allow re-export.
model_dt.save('model_dt')

In [None]:
# Persist the Random Forest model under ./model_rf
# NOTE(review): save() presumably fails if the path already exists from a
# previous run — confirm; model_rf.write().overwrite().save(...) would allow re-export.
model_rf.save('model_rf')

In [None]:
# Persist the multilayer perceptron model under ./model_nnp
# NOTE(review): save() presumably fails if the path already exists from a
# previous run — confirm; model_nnp.write().overwrite().save(...) would allow re-export.
model_nnp.save('model_nnp')

In [None]:
# Persist the linear SVM model under ./model_svm
# NOTE(review): save() presumably fails if the path already exists from a
# previous run — confirm; model_svm.write().overwrite().save(...) would allow re-export.
model_svm.save('model_svm')

In [None]:
# Persist the Naive Bayes model under ./model_nb
# NOTE(review): save() presumably fails if the path already exists from a
# previous run — confirm; model_nb.write().overwrite().save(...) would allow re-export.
model_nb.save('model_nb')

In [None]:
# Persist the Logistic Regression model under ./model_lr
# NOTE(review): save() presumably fails if the path already exists from a
# previous run — confirm; model_lr.write().overwrite().save(...) would allow re-export.
model_lr.save('model_lr')

---

In [None]:
# Shut down the Spark session created at the top of the notebook
spark.stop()