Estaremos trabalhando no conjunto de dados: **Breast Cancer Wisconsin**

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the Breast Cancer Wisconsin dataset bundled with scikit-learn.
# (load_breast_cancer is imported in the imports cell at the top of the notebook.)
data = load_breast_cancer()

In [3]:
# Convert to a DataFrame: the feature matrix with the target appended as the last column
column_names = np.append(data['feature_names'], ['target'])
values = np.column_stack([data['data'], data['target']])
df = pd.DataFrame(values, columns=column_names)

In [4]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [5]:
# Keep only the first ten columns as model features
features = df.columns[:10].tolist()
features

['mean radius',
 'mean texture',
 'mean perimeter',
 'mean area',
 'mean smoothness',
 'mean compactness',
 'mean concavity',
 'mean concave points',
 'mean symmetry',
 'mean fractal dimension']

In [6]:
# Combine the selected feature columns with the target column.
# NOTE: this rebinds `data`, which until now held the object returned by
# load_breast_cancer(); re-running the DataFrame-construction cell after this
# one will no longer see the original dataset object.
data = df[[*features, 'target']]
data.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.0


In [7]:
# Split into features (X) and target (y), then into train and test subsets
y = data['target']
X = data.drop(columns=['target'])

# random_state pins the shuffle so the split is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [8]:
# Create the KNN estimator with its default settings; both searches below tune it
knn = KNeighborsClassifier()

# GridSearch CV

In [9]:
# Hyperparameter grid: 9 * 2 * 5 * 2 = 180 combinations,
# each evaluated with 5-fold cross-validation
param_grid = dict(
    n_neighbors=np.arange(1, 10),
    weights=['uniform', 'distance'],
    leaf_size=[1, 3, 5, 7, 10],
    algorithm=['auto', 'kd_tree'],
)

model = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)

In [10]:
%%time
# Fit the KNN grid search (grid defined above, 5-fold cross-validation)
model.fit(X_train, y_train)

CPU times: total: 8 s
Wall time: 8.77 s


Então... Como sabemos quais são os melhores hiperparâmetros? Para isso, teremos que analisar os seguintes atributos do objeto ajustado:
  * best_params_
  * best_score_
  * cv_results_
 
**Esclarecimento:** Recomenda-se consultar a documentação associada para aprofundar o tema.

Links de interesse:
* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [24]:
# Report the best hyperparameter combination and its mean cross-validated score
print(f"Melhores parametros: {model.best_params_!s}")
print(f"Melhor Score: {model.best_score_!s}\n")

Melhores parametros: {'algorithm': 'kd_tree', 'leaf_size': 7, 'n_neighbors': 6, 'weights': 'distance'}
Melhor Score: 0.8849247606019152



In [25]:
# Cross-validation results, one row per evaluated candidate.
# NOTE(review): the saved output of this cell lists leaf_size values (2, 4, 6, 9)
# that are not in param_grid, so it was presumably produced after `model` had been
# rebound to the RandomizedSearchCV; restart the kernel and run all cells in order.
scores = pd.DataFrame.from_dict(model.cv_results_)
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003754,0.000940,0.003407,0.000497,auto,6,1,distance,"{'algorithm': 'auto', 'leaf_size': 6, 'n_neigh...",0.895349,0.858824,0.858824,0.823529,0.800000,0.847305,0.032792,83
1,0.003996,0.000011,0.003526,0.000445,kd_tree,4,8,distance,"{'algorithm': 'kd_tree', 'leaf_size': 4, 'n_ne...",0.895349,0.882353,0.941176,0.823529,0.870588,0.882599,0.038020,4
2,0.003599,0.000489,0.006999,0.000625,kd_tree,6,3,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 6, 'n_ne...",0.895349,0.894118,0.941176,0.811765,0.870588,0.882599,0.042162,4
3,0.003400,0.000490,0.006799,0.000749,kd_tree,7,9,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 7, 'n_ne...",0.883721,0.870588,0.941176,0.823529,0.870588,0.877921,0.037685,32
4,0.004199,0.000399,0.004002,0.000002,auto,2,7,distance,"{'algorithm': 'auto', 'leaf_size': 2, 'n_neigh...",0.895349,0.882353,0.941176,0.823529,0.858824,0.880246,0.039040,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.004109,0.000214,0.007728,0.000594,kd_tree,1,1,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 1, 'n_ne...",0.895349,0.858824,0.858824,0.823529,0.800000,0.847305,0.032792,83
96,0.004474,0.000524,0.008359,0.000217,auto,4,2,uniform,"{'algorithm': 'auto', 'leaf_size': 4, 'n_neigh...",0.895349,0.835294,0.847059,0.835294,0.811765,0.844952,0.027685,99
97,0.003591,0.000467,0.003773,0.000448,kd_tree,7,7,distance,"{'algorithm': 'kd_tree', 'leaf_size': 7, 'n_ne...",0.895349,0.882353,0.941176,0.823529,0.858824,0.880246,0.039040,28
98,0.003316,0.000410,0.006749,0.000676,auto,9,8,uniform,"{'algorithm': 'auto', 'leaf_size': 9, 'n_neigh...",0.860465,0.870588,0.905882,0.823529,0.870588,0.866211,0.026332,71


In [26]:
# Predictions on the held-out test set using the fitted search object
prediction = model.predict(X_test)

In [27]:
# Accuracy on the test set
print('Acuracia:', accuracy_score(y_true=y_test, y_pred=prediction))

Acuracia: 0.9090909090909091


In [28]:
# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=prediction)
print("Matriz de Confusão:", cm, sep="\n")

Matriz de Confusão:
[[47  7]
 [ 6 83]]



[=============================BASE DADOS=========================]

> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

[================TREINO=================|=========TESTE==========]

GridSearch

[================TREINO=================]

> model = GridSearchCV(knn, param_grid=param_grid, cv=5)

CV

[===V===|=======|=======|=======|=======]
[=======|===V===|=======|=======|=======]
[=======|=======|===V===|=======|=======]
[=======|=======|=======|===V===|=======]
[=======|=======|=======|=======|===V===]


# Random Search

In [29]:
# Search space for the randomized search.
# stats.randint(low, high) draws integers from the half-open range [low, high),
# so n_neighbors and leaf_size are both sampled from 1..9.
# `stats` is imported explicitly (from scipy import stats) rather than reached
# through `sp.stats`, which only works if some other import happened to load
# the scipy.stats submodule first.
param_dist = {'n_neighbors': stats.randint(1, 10),
              'weights': ['uniform', 'distance'],
              'leaf_size': stats.randint(1, 10),
              'algorithm': ['auto', 'kd_tree']}

# NOTE: this rebinds `model`, replacing the GridSearchCV object fitted earlier.
# n_iter=100 candidates are sampled; random_state=0 makes the sampling reproducible.
model = RandomizedSearchCV(knn, param_dist, n_iter=100, random_state=0, cv=5)

In [30]:
%%time
# Fit the randomized search (100 sampled candidates, 5-fold cross-validation)
model.fit(X_train, y_train)

CPU times: total: 3.8 s
Wall time: 5.17 s


In [31]:
# Report the best sampled hyperparameters and their mean cross-validated score
print(f"Melhores parametros: {model.best_params_!s}")
print(f"Melhor Score: {model.best_score_!s}\n")

Melhores parametros: {'algorithm': 'kd_tree', 'leaf_size': 7, 'n_neighbors': 6, 'weights': 'distance'}
Melhor Score: 0.8849247606019152



In [32]:
# Inspect the cross-validation results, one row per sampled candidate
scores = pd.DataFrame.from_dict(model.cv_results_)
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005122,0.001292,0.004814,0.000437,auto,6,1,distance,"{'algorithm': 'auto', 'leaf_size': 6, 'n_neigh...",0.895349,0.858824,0.858824,0.823529,0.800000,0.847305,0.032792,83
1,0.003990,0.000016,0.004379,0.000465,kd_tree,4,8,distance,"{'algorithm': 'kd_tree', 'leaf_size': 4, 'n_ne...",0.895349,0.882353,0.941176,0.823529,0.870588,0.882599,0.038020,4
2,0.003602,0.000492,0.007056,0.000967,kd_tree,6,3,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 6, 'n_ne...",0.895349,0.894118,0.941176,0.811765,0.870588,0.882599,0.042162,4
3,0.003758,0.000397,0.007004,0.000010,kd_tree,7,9,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 7, 'n_ne...",0.883721,0.870588,0.941176,0.823529,0.870588,0.877921,0.037685,32
4,0.004894,0.000827,0.004265,0.000618,auto,2,7,distance,"{'algorithm': 'auto', 'leaf_size': 2, 'n_neigh...",0.895349,0.882353,0.941176,0.823529,0.858824,0.880246,0.039040,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.004124,0.000258,0.006797,0.000377,kd_tree,1,1,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 1, 'n_ne...",0.895349,0.858824,0.858824,0.823529,0.800000,0.847305,0.032792,83
96,0.003734,0.000387,0.006398,0.000487,auto,4,2,uniform,"{'algorithm': 'auto', 'leaf_size': 4, 'n_neigh...",0.895349,0.835294,0.847059,0.835294,0.811765,0.844952,0.027685,99
97,0.003200,0.000400,0.003998,0.000639,kd_tree,7,7,distance,"{'algorithm': 'kd_tree', 'leaf_size': 7, 'n_ne...",0.895349,0.882353,0.941176,0.823529,0.858824,0.880246,0.039040,28
98,0.003495,0.000428,0.006415,0.000507,auto,9,8,uniform,"{'algorithm': 'auto', 'leaf_size': 9, 'n_neigh...",0.860465,0.870588,0.905882,0.823529,0.870588,0.866211,0.026332,71


In [20]:
# Predictions on the held-out test set using the fitted randomized search.
# NOTE(review): this cell's saved execution count is lower than the cells above it,
# so it was run out of order; restart the kernel and run all cells to confirm.
prediction = model.predict(X_test)

In [21]:
# Accuracy on the test set
print(accuracy_score(y_true=y_test, y_pred=prediction))

0.9090909090909091


In [23]:
# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=prediction)
print("Matriz de confusão:", cm, sep="\n")

Matriz de confusão:
[[47  7]
 [ 6 83]]


O que podemos interpretar do processo realizado? O Random Search encontrou hiperparâmetros semelhantes aos do Grid Search? Foi mais rápido?

**Esclarecimento:** Recomenda-se alterar os hiperparâmetros para ambos os tipos de métodos, para avaliar e comparar diferenças significativas.