# Avaliação dos algoritmos
* Naive Bayes: 93.0
* Árvore de decisão: 98.20
* Random forest: 98.40
* Regras: 97.7
* KNN: 98.60
* Regressão logística: 94.60
* SVM: 98.80
* Redes neurais: 99.6

## Tuning dos parâmetros com GridSearch
### Preparação dos dados

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [4]:
import pickle

# Read the pickled credit data and unpack it into the train/test feature and
# label arrays. Only unpickle files from a trusted source: pickle.load can
# execute arbitrary code embedded in the file.
with open('credit.pkl', 'rb') as f:
    dados_credit = pickle.load(f)
x_credit_treinamento, y_credit_treinamento, x_credit_teste, y_credit_teste = dados_credit

In [5]:
# With k-fold cross-validation a separate train/test split is no longer needed,
# so the two feature arrays are stacked row-wise (axis 0 is the default) into one.
x_credit = np.concatenate([x_credit_treinamento, x_credit_teste])
x_credit.shape

(2000, 3)

In [6]:
# Stack the label arrays in the same order (train first, then test) as the features.
y_credit = np.concatenate([y_credit_treinamento, y_credit_teste])
y_credit.shape

(2000,)

### Árvore de decisão

In [7]:
# Candidate hyperparameter values for the decision-tree grid search.
parametros = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10],
)

In [8]:
# Exhaustive search over the decision-tree grid with GridSearchCV's default
# cross-validation settings; fit() returns the fitted search object.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=parametros,
).fit(x_credit, y_credit)

# Keep the winning combination and its mean cross-validated score, then show both.
melhores_parametros, melhor_resultado = grid_search.best_params_, grid_search.best_score_
print(melhores_parametros, melhor_resultado, sep='\n')

{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'best'}
0.983


### Random forest

In [9]:
# Candidate hyperparameter values for the random-forest grid search.
parametros = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[10, 40, 100, 150],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10],
)

In [10]:
# Exhaustive search over the random-forest grid with GridSearchCV's default
# cross-validation settings; fit() returns the fitted search object.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=parametros,
).fit(x_credit, y_credit)

# Keep the winning combination and its mean cross-validated score, then show both.
melhores_parametros, melhor_resultado = grid_search.best_params_, grid_search.best_score_
print(melhores_parametros, melhor_resultado, sep='\n')

{'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 40}
0.9865


### KNN

In [11]:
# Candidate values for the KNN grid search: neighbourhood size and the
# Minkowski power parameter p.
parametros = dict(
    n_neighbors=[3, 5, 10, 20],
    p=[1, 2],
)

In [12]:
# Exhaustive search over the KNN grid with GridSearchCV's default
# cross-validation settings; fit() returns the fitted search object.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=parametros,
).fit(x_credit, y_credit)

# Keep the winning combination and its mean cross-validated score, then show both.
melhores_parametros, melhor_resultado = grid_search.best_params_, grid_search.best_score_
print(melhores_parametros, melhor_resultado, sep='\n')

{'n_neighbors': 20, 'p': 1}
0.9800000000000001


### Regressão logística

In [13]:
# Candidate values for the logistic-regression grid search: stopping tolerance,
# inverse regularisation strength C, and optimisation solver.
parametros = dict(
    tol=[0.0001, 0.00001, 0.000001],
    C=[1.0, 1.5, 2.0],
    solver=['lbfgs', 'sag', 'saga'],
)

In [14]:
# Exhaustive search over the logistic-regression grid with GridSearchCV's
# default cross-validation settings; fit() returns the fitted search object.
grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=parametros,
).fit(x_credit, y_credit)

# Keep the winning combination and its mean cross-validated score, then show both.
melhores_parametros, melhor_resultado = grid_search.best_params_, grid_search.best_score_
print(melhores_parametros, melhor_resultado, sep='\n')

{'C': 1.0, 'solver': 'lbfgs', 'tol': 0.0001}
0.9484999999999999


### SVM

In [15]:
# Candidate values for the SVM grid search: stopping tolerance, regularisation
# parameter C, and kernel type.
parametros = dict(
    tol=[0.001, 0.0001, 0.00001],
    C=[1.0, 1.5, 2.0],
    kernel=['rbf', 'linear', 'poly', 'sigmoid'],
)

In [16]:
# Exhaustive search over the SVM grid with GridSearchCV's default
# cross-validation settings; fit() returns the fitted search object.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=parametros,
).fit(x_credit, y_credit)

# Keep the winning combination and its mean cross-validated score, then show both.
melhores_parametros, melhor_resultado = grid_search.best_params_, grid_search.best_score_
print(melhores_parametros, melhor_resultado, sep='\n')

{'C': 1.5, 'kernel': 'rbf', 'tol': 0.001}
0.9829999999999999


### Redes neurais

In [17]:
# Candidate values for the MLP grid search: hidden-layer activation, weight
# optimiser, and mini-batch size.
# The hyperbolic-tangent activation is spelled 'tanh'. The previous 'tahn' is
# not a valid MLPClassifier activation, so every candidate using it failed to
# fit and tanh was never actually evaluated by the search.
parametros = {'activation': ['relu', 'logistic', 'tanh'],
              'solver': ['adam', 'sgd'],
              'batch_size': [10, 15]
             }

In [18]:
# Exhaustive search over the MLP grid with GridSearchCV's default
# cross-validation settings; fit() returns the fitted search object.
grid_search = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=parametros,
).fit(x_credit, y_credit)

# Keep the winning combination and its mean cross-validated score, then show both.
melhores_parametros, melhor_resultado = grid_search.best_params_, grid_search.best_score_
print(melhores_parametros, melhor_resultado, sep='\n')

{'activation': 'relu', 'batch_size': 10, 'solver': 'adam'}
0.9964999999999999


# Validação cruzada

In [19]:
from sklearn.model_selection import cross_val_score, KFold

In [20]:
%%time
resultados_arvore = []
resultados_random_forest = []
resultados_knn = []
resultados_logistica = []
resultados_svm = []
resultados_rede_neural = []
for i in range(30):
    print(i)
    kfold = KFold(n_splits=10, shuffle=True, random_state=i)
    arvore = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=1,min_samples_split=5, splitter='best')
    scores = cross_val_score(arvore,x_credit,y_credit,cv = kfold)
    resultados_arvore.append(scores.mean())
    
    random_forest = RandomForestClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, n_estimators=10 )
    scores = cross_val_score(random_forest,x_credit,y_credit, cv = kfold)
    resultados_random_forest.append(scores.mean())
    
    knn = KNeighborsClassifier()
    scores = cross_val_score(knn,x_credit,y_credit, cv = kfold)
    resultados_knn.append(scores.mean())
    
    logistica = LogisticRegression(C = 1.0, solver='lbfgs', tol = 0.0001)
    scores = cross_val_score(logistica,x_credit,y_credit, cv=kfold)
    resultados_logistica.append(scores.mean())
    
    svm = SVC(kernel='rbf',C=2.0)
    scores = cross_val_score(svm, x_credit,y_credit,cv=kfold)
    resultados_svm.append(scores.mean())
    
    rede_neural = MLPClassifier(activation='relu',batch_size=56, solver = 'adam')
    scores = cross_val_score(rede_neural,x_credit,y_credit,cv=kfold)
    resultados_rede_neural.append(scores.mean())
    
    
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
CPU times: user 24min 40s, sys: 39.4 s, total: 25min 20s
Wall time: 24min 27s


In [41]:
# Display the per-repetition mean cross-validation scores collected for the neural network.
resultados_rede_neural

[0.9964999999999999,
 0.998,
 0.9969999999999999,
 0.9969999999999999,
 0.9969999999999999,
 0.9975000000000002,
 0.9970000000000001,
 0.9970000000000001,
 0.9969999999999999,
 0.9970000000000001,
 0.9959999999999999,
 0.9964999999999999,
 0.998,
 0.9974999999999999,
 0.9964999999999999,
 0.9974999999999999,
 0.9964999999999999,
 0.9970000000000001,
 0.9974999999999999,
 0.9974999999999999,
 0.9970000000000001,
 0.9974999999999999,
 0.9974999999999999,
 0.9969999999999999,
 0.9970000000000001,
 0.9969999999999999,
 0.9974999999999999,
 0.9970000000000001,
 0.998,
 0.9970000000000001]

In [26]:
# Gather the per-repetition mean scores into a table with one column per model.
colunas = ['Arvore', 'Random forest', 'KNN', 'Logistica', 'SVM', 'Rede Neural']
valores = [resultados_arvore, resultados_random_forest, resultados_knn,
           resultados_logistica, resultados_svm, resultados_rede_neural]
resultados = pd.DataFrame(dict(zip(colunas, valores)))

In [27]:
# Display the table of per-repetition mean scores (one column per model).
resultados    

Unnamed: 0,Arvore,Random forest,KNN,Logistica,SVM,Rede Neural
0,0.9865,0.983,0.9815,0.9475,0.9845,0.9975
1,0.9835,0.986,0.98,0.9465,0.984,0.9975
2,0.9905,0.9835,0.9795,0.947,0.9865,0.997
3,0.987,0.98,0.978,0.946,0.985,0.9965
4,0.988,0.985,0.982,0.9465,0.985,0.9975
5,0.9885,0.9815,0.978,0.9465,0.9845,0.9975
6,0.988,0.985,0.9805,0.947,0.986,0.9965
7,0.9875,0.9865,0.98,0.948,0.985,0.9975
8,0.986,0.988,0.9795,0.9465,0.984,0.997
9,0.9875,0.986,0.982,0.9465,0.9845,0.9975


In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) of the scores for each model.
resultados.describe()

Unnamed: 0,Arvore,Random forest,KNN,Logistica,SVM,Rede Neural
count,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.987167,0.983867,0.98005,0.94695,0.985083,0.9972
std,0.001783,0.002259,0.001533,0.000687,0.00128,0.000484
min,0.983,0.98,0.977,0.9455,0.982,0.996
25%,0.986125,0.982,0.979,0.9465,0.984125,0.997
50%,0.9875,0.9835,0.98,0.947,0.985,0.9975
75%,0.988,0.986,0.981,0.9475,0.986375,0.9975
max,0.9905,0.988,0.9825,0.9485,0.9875,0.998


In [30]:
# Variance of the per-repetition mean scores for each model.
resultados.var()

Arvore           3.178161e-06
Random forest    5.102299e-06
KNN              2.350862e-06
Logistica        4.715517e-07
SVM              1.639368e-06
Rede Neural      2.344828e-07
dtype: float64

In [33]:
# Coefficient of variation per model, in percent: standard deviation divided by the mean.
resultados.std().div(resultados.mean()).mul(100)

Arvore           0.180592
Random forest    0.229587
KNN              0.156446
Logistica        0.072517
SVM              0.129977
Rede Neural      0.048559
dtype: float64