In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_predict, cross_val_score, KFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

## Preparacao dos dados

In [7]:
import pickle
with open('credit.pkl', 'rb') as f:
  x_credit_treinamento, y_credit_treinamento, x_credit_teste, y_credit_teste =pickle.load(f)

In [8]:
x_credit_treinamento.shape, y_credit_treinamento.shape

((1500, 3), (1500,))

In [9]:
x_credit_teste.shape, y_credit_teste.shape

((500, 3), (500,))

In [10]:
# como faremos a selecao ods parametros por KFold, vamos juntar as bass
x_credit = np.concatenate((x_credit_treinamento, x_credit_teste), axis=0)
x_credit.shape

(2000, 3)

In [11]:
y_credit = np.concatenate((y_credit_treinamento, y_credit_teste),axis=0 )
y_credit.shape

(2000,)

## Validação Cruzada

In [12]:
#  a cada ve que se executa uma validação cruzada, há 10 KFolds (amostras)

In [13]:
resultados_arvore = []
resultados_random_forest = []
resultados_knn = []
resultados_logistica = []
resultados_svm = []
resultados_rede_neural = []

for i in range(30): # logo, serão 300 testes (10*30)
    kfold = KFold(n_splits=10, shuffle=True, random_state=i)

## Decision Trees
    arvore = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, splitter='best') # criterios conhecidos no ajuste dos parametros
    scores = cross_val_score(arvore, x_credit, y_credit, cv=kfold)

    # print(scores)
    # print(scores.mean())
    resultados_arvore.append(scores.mean())

## Random Forest
    random_forest = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, splitter='best') # criterios conhecidos no ajuste dos parametros
    scores = cross_val_score(random_forest, x_credit, y_credit, cv=kfold)

    resultados_random_forest.append(scores.mean())

## KNN
    knn = KNeighborsClassifier()
    scores = cross_val_score(random_forest, x_credit, y_credit, cv=kfold)
    resultados_knn.append(scores.mean())

## Logistica
    logistica = LogisticRegression(C = 1.0, solver = 'lbfgs', tol = 0.0001)
    scores = cross_val_score(logistica, x_credit, y_credit, cv=kfold)
    resultados_logistica.append(scores.mean())

## SVM
    svm = SVC(kernel='rbf', C=2.0)
    scores = cross_val_score(svm, x_credit, y_credit, cv=kfold)
    resultados_svm.append(scores.mean())

## Redes Neurais
# não vou rodar pra não ficar muito pesado, mas a ideia é a mesma


In [14]:
# média de cada linha dos resultados -- print(scores)
print('Decision Tree:')
resultados_arvore

Decision Tree:


[0.9864999999999998,
 0.986,
 0.9905000000000002,
 0.9875,
 0.9879999999999999,
 0.9890000000000001,
 0.9880000000000001,
 0.9875,
 0.9865,
 0.9869999999999999,
 0.9865,
 0.9899999999999999,
 0.9884999999999999,
 0.9864999999999998,
 0.9835,
 0.986,
 0.9855,
 0.9904999999999999,
 0.9875,
 0.9869999999999999,
 0.983,
 0.9870000000000001,
 0.9889999999999999,
 0.9869999999999999,
 0.9865,
 0.9880000000000001,
 0.9879999999999999,
 0.986,
 0.986,
 0.9875]

In [15]:
print('Random Forest:')
resultados_random_forest

Random Forest:


[0.9869999999999999,
 0.985,
 0.9905000000000002,
 0.9869999999999999,
 0.9884999999999999,
 0.9884999999999999,
 0.9880000000000001,
 0.9875,
 0.9870000000000001,
 0.9869999999999999,
 0.9860000000000001,
 0.9894999999999999,
 0.9884999999999999,
 0.9864999999999998,
 0.9845,
 0.9859999999999998,
 0.9855,
 0.9904999999999999,
 0.9875,
 0.9869999999999999,
 0.985,
 0.9865,
 0.9884999999999999,
 0.9875,
 0.9870000000000001,
 0.9880000000000001,
 0.9875,
 0.986,
 0.9869999999999999,
 0.9884999999999999]

In [16]:
print('KNN:')
resultados_knn

KNN:


[0.9864999999999998,
 0.986,
 0.9905000000000002,
 0.9875,
 0.9884999999999999,
 0.9884999999999999,
 0.9884999999999999,
 0.9875,
 0.9865,
 0.9869999999999999,
 0.9860000000000001,
 0.9894999999999999,
 0.9889999999999999,
 0.9864999999999998,
 0.9825000000000002,
 0.9864999999999998,
 0.9855,
 0.9904999999999999,
 0.9875,
 0.9869999999999999,
 0.984,
 0.9865,
 0.9884999999999999,
 0.9869999999999999,
 0.9865,
 0.9884999999999999,
 0.9879999999999999,
 0.986,
 0.9864999999999998,
 0.9880000000000001]

In [17]:
print('Regressão Logística:')
resultados_logistica

Regressão Logística:


[0.9475,
 0.9465,
 0.9470000000000001,
 0.946,
 0.9465,
 0.9465,
 0.9469999999999998,
 0.9480000000000001,
 0.9465,
 0.9465,
 0.9475,
 0.9479999999999998,
 0.9475,
 0.9475,
 0.9484999999999999,
 0.9475,
 0.946,
 0.9470000000000001,
 0.9465,
 0.9464999999999998,
 0.9465,
 0.9469999999999998,
 0.9455,
 0.9465,
 0.9470000000000001,
 0.9469999999999998,
 0.9475,
 0.9465,
 0.9480000000000001,
 0.9465]

In [18]:

print('SVM:')
resultados_svm

SVM:


[0.9845,
 0.984,
 0.9864999999999998,
 0.985,
 0.985,
 0.9845,
 0.986,
 0.9849999999999998,
 0.984,
 0.9845,
 0.984,
 0.9845,
 0.9855,
 0.9855,
 0.983,
 0.9865,
 0.986,
 0.9865,
 0.985,
 0.9844999999999999,
 0.984,
 0.985,
 0.9865,
 0.9875,
 0.9869999999999999,
 0.982,
 0.984,
 0.9865,
 0.9834999999999999,
 0.9865]

#### Avaliando os resultados

In [20]:
resultados = pd.DataFrame({'Decision Tree': resultados_arvore, 'Random Forest': resultados_random_forest, 'KNN': resultados_knn,
                          'Logistica': resultados_logistica, 'SVM': resultados_svm})

In [21]:
resultados

Unnamed: 0,Decision Tree,Random Forest,KNN,Logistica,SVM
0,0.9865,0.987,0.9865,0.9475,0.9845
1,0.986,0.985,0.986,0.9465,0.984
2,0.9905,0.9905,0.9905,0.947,0.9865
3,0.9875,0.987,0.9875,0.946,0.985
4,0.988,0.9885,0.9885,0.9465,0.985
5,0.989,0.9885,0.9885,0.9465,0.9845
6,0.988,0.988,0.9885,0.947,0.986
7,0.9875,0.9875,0.9875,0.948,0.985
8,0.9865,0.987,0.9865,0.9465,0.984
9,0.987,0.987,0.987,0.9465,0.9845


In [22]:
resultados.describe()

Unnamed: 0,Decision Tree,Random Forest,KNN,Logistica,SVM
count,30.0,30.0,30.0,30.0,30.0
mean,0.9872,0.9873,0.987233,0.94695,0.985083
std,0.001695,0.001466,0.001695,0.000687,0.00128
min,0.983,0.9845,0.9825,0.9455,0.982
25%,0.9865,0.9865,0.9865,0.9465,0.984125
50%,0.987,0.987,0.987,0.947,0.985
75%,0.988,0.988375,0.9885,0.9475,0.986375
max,0.9905,0.9905,0.9905,0.9485,0.9875


In [23]:
resultados.var()

Decision Tree    2.872414e-06
Random Forest    2.148276e-06
KNN              2.874713e-06
Logistica        4.715517e-07
SVM              1.639368e-06
dtype: float64

In [25]:
cv = resultados.std() / resultados.mean() * 100
cv

Decision Tree    0.171679
Random Forest    0.148455
KNN              0.171742
Logistica        0.072517
SVM              0.129977
dtype: float64