Estaremos trabajando en el dataset: **Breast Cancer Wisconsin**

In [72]:
#Importamos librerias y el Dataset
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [73]:
# Load the Breast Cancer Wisconsin dataset that ships with scikit-learn.
# `load_breast_cancer` is imported in the imports cell at the top of the
# notebook so that every dependency is declared in one place.
data = load_breast_cancer()

In [74]:
# Build a single DataFrame: the feature matrix with the target appended as
# the last column, labelled with the feature names plus 'target'.
df = pd.DataFrame(
    np.column_stack((data['data'], data['target'])),
    columns=np.concatenate((data['feature_names'], ['target'])),
)

In [75]:
# Preview the first five rows of the full frame.
df.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [76]:
# There are many attributes, so keep only the first ten columns
# (the "mean ..." measurements shown in the output below).
features = df.columns[:10].tolist()
features

['mean radius',
 'mean texture',
 'mean perimeter',
 'mean area',
 'mean smoothness',
 'mean compactness',
 'mean concavity',
 'mean concave points',
 'mean symmetry',
 'mean fractal dimension']

In [77]:
# Keep the selected features and add the target column back.
# NOTE: this rebinds `data`, which until now held the object returned by
# load_breast_cancer(); re-running the cell that builds `df` after this one
# will no longer find data['data'] / data['feature_names'].
data = df.loc[:, [*features, 'target']]
data.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.0


In [78]:
# Separate the label column (y) from the predictors (X).
y = data['target']
X = data.drop(columns=['target'])

# Hold out a test set. No test_size is passed, so the library's default
# proportion applies; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [79]:
# Create the KNN classifier with its default settings; the hyperparameter
# searches below receive this object as their estimator.
# NOTE(review): the classifier is fit on unscaled features whose magnitudes
# differ widely in the preview above ('mean area' ~1e3 vs 'mean smoothness'
# ~1e-1). A distance-based model is presumably dominated by the large ones;
# consider standardizing the features and confirm the effect on the scores.
knn = KNeighborsClassifier()

# GridSearch CV

In [80]:
# Hyperparameter grid for the exhaustive search:
# 9 x 2 x 5 x 2 = 180 candidate combinations.
# NOTE(review): in the cv_results_ tables further down, rows that share
# n_neighbors and weights show identical scores regardless of leaf_size and
# algorithm, so those two presumably only affect speed - confirm before
# trimming them from the grid.
param_grid = dict(
    n_neighbors=np.arange(start=1, stop=10),
    weights=['uniform', 'distance'],
    leaf_size=[1, 3, 5, 7, 10],
    algorithm=['auto', 'kd_tree'],
)

# Wrap the classifier in an exhaustive search over the grid defined above,
# scoring every combination with 5-fold cross-validation.
model = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
)

In [81]:
# Run the grid search on the training split (5 folds per candidate).
# The fitted search object is the cell's displayed value.
model.fit(X=X_train, y=y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['auto', 'kd_tree'],
                         'leaf_size': [1, 3, 5, 7, 10],
                         'n_neighbors': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                         'weights': ['uniform', 'distance']})

Entonces... ¿cómo sabemos cuáles son los mejores hiperparámetros? Para ello, tendremos que analizar los siguientes atributos del objeto ya entrenado:
 * best_params_
 * best_score_
 * cv_results_
 
Aclaración: Se recomienda profundizar en la documentación asociada.

Link de interés:
* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [82]:
# Report the winning hyperparameter combination and its mean CV score.
print(f"Mejores parametros: {model.best_params_!s}")
print(f"Mejor Score: {model.best_score_!s}\n")

Mejores parametros: {'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 6, 'weights': 'distance'}
Mejor Score: 0.8849247606019152



In [83]:
# Tabulate the per-candidate cross-validation results (timings, per-fold
# scores, mean/std and rank) for inspection.
scores = pd.DataFrame.from_dict(model.cv_results_)
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.007593,0.009537,0.004868,0.006361,auto,1,1,uniform,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.895349,0.858824,0.858824,0.823529,0.800000,0.847305,0.032792,141
1,0.005942,0.006094,0.002776,0.004214,auto,1,1,distance,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.895349,0.858824,0.858824,0.823529,0.800000,0.847305,0.032792,141
2,0.000799,0.001599,0.005664,0.006083,auto,1,2,uniform,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.895349,0.835294,0.847059,0.835294,0.811765,0.844952,0.027685,171
3,0.003124,0.006249,0.003135,0.006270,auto,1,2,distance,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.895349,0.858824,0.858824,0.823529,0.800000,0.847305,0.032792,141
4,0.006239,0.007641,0.004437,0.006145,auto,1,3,uniform,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.895349,0.894118,0.941176,0.811765,0.870588,0.882599,0.042162,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,0.002801,0.000399,0.002800,0.000397,kd_tree,10,7,distance,"{'algorithm': 'kd_tree', 'leaf_size': 10, 'n_n...",0.895349,0.882353,0.941176,0.823529,0.858824,0.880246,0.039040,51
176,0.002790,0.000395,0.004798,0.000396,kd_tree,10,8,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 10, 'n_n...",0.860465,0.870588,0.905882,0.823529,0.870588,0.866211,0.026332,121
177,0.003001,0.000628,0.002397,0.000486,kd_tree,10,8,distance,"{'algorithm': 'kd_tree', 'leaf_size': 10, 'n_n...",0.895349,0.882353,0.941176,0.823529,0.870588,0.882599,0.038020,11
178,0.003391,0.000534,0.004832,0.000421,kd_tree,10,9,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 10, 'n_n...",0.883721,0.870588,0.941176,0.823529,0.870588,0.877921,0.037685,61


In [84]:
# With the search fitted, predict the labels of the held-out test set.
prediction = model.predict(X=X_test)

In [85]:
# Accuracy: fraction of test samples whose predicted label matches the truth.
print('Exactitud:', accuracy_score(y_true=y_test, y_pred=prediction))

Exactitud: 0.9090909090909091


In [86]:
# Confusion matrix of true labels against predicted labels on the test set.
cm = confusion_matrix(y_true=y_test, y_pred=prediction)
print("Matriz de confusión:", cm, sep="\n")

Matriz de confusión:
[[47  7]
 [ 6 83]]


# Random Search

In [87]:
# Search space for the randomized search. Integer-valued hyperparameters are
# given as distributions to sample from instead of explicit lists.
# `stats` is imported explicitly (`from scipy import stats` in the imports
# cell): a bare `import scipy as sp` does not by itself guarantee that the
# `sp.stats` submodule is loaded; it only worked here as a side effect of
# other imports.
param_dist = {'n_neighbors': stats.randint(1, 10),  # integers 1..9 (upper bound exclusive)
              'weights': ['uniform', 'distance'],
              'leaf_size': stats.randint(1, 10),    # integers 1..9 (upper bound exclusive)
              'algorithm': ['auto', 'kd_tree']}

# Wrap the classifier in a randomized search: 100 sampled candidates,
# 5-fold cross-validation, fixed seed so the sampled candidates repeat.
# Note that this rebinds `model`, replacing the grid-search object.
model = RandomizedSearchCV(
    estimator=knn,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    random_state=0,
)

In [88]:
# Run the randomized search on the training split (5 folds per candidate).
# The fitted search object is the cell's displayed value.
model.fit(X=X_train, y=y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(), n_iter=100,
                   param_distributions={'algorithm': ['auto', 'kd_tree'],
                                        'leaf_size': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000232D301BDC0>,
                                        'n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000232D32B4820>,
                                        'weights': ['uniform', 'distance']},
                   random_state=0)

In [89]:
# Report the best sampled hyperparameter combination and its mean CV score.
print(f"Mejores parametros: {model.best_params_!s}")
print(f"Mejor Score: {model.best_score_!s}\n")

Mejores parametros: {'algorithm': 'kd_tree', 'leaf_size': 7, 'n_neighbors': 6, 'weights': 'distance'}
Mejor Score: 0.8849247606019152



In [90]:
#Analizamos qué obtuvimos
scores = pd.DataFrame(model.cv_results_)
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003124,0.006248,0.003131,0.006262,auto,6,1,distance,"{'algorithm': 'auto', 'leaf_size': 6, 'n_neigh...",0.895349,0.858824,0.858824,0.823529,0.800000,0.847305,0.032792,83
1,0.003120,0.006241,0.007557,0.007010,kd_tree,4,8,distance,"{'algorithm': 'kd_tree', 'leaf_size': 4, 'n_ne...",0.895349,0.882353,0.941176,0.823529,0.870588,0.882599,0.038020,4
2,0.009375,0.007654,0.003125,0.006250,kd_tree,6,3,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 6, 'n_ne...",0.895349,0.894118,0.941176,0.811765,0.870588,0.882599,0.042162,4
3,0.004166,0.006086,0.004988,0.009976,kd_tree,7,9,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 7, 'n_ne...",0.883721,0.870588,0.941176,0.823529,0.870588,0.877921,0.037685,32
4,0.009370,0.007651,0.000000,0.000000,auto,2,7,distance,"{'algorithm': 'auto', 'leaf_size': 2, 'n_neigh...",0.895349,0.882353,0.941176,0.823529,0.858824,0.880246,0.039040,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.006240,0.007643,0.001302,0.002604,kd_tree,1,1,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 1, 'n_ne...",0.895349,0.858824,0.858824,0.823529,0.800000,0.847305,0.032792,83
96,0.000000,0.000000,0.009378,0.007657,auto,4,2,uniform,"{'algorithm': 'auto', 'leaf_size': 4, 'n_neigh...",0.895349,0.835294,0.847059,0.835294,0.811765,0.844952,0.027685,99
97,0.003139,0.006278,0.000000,0.000000,kd_tree,7,7,distance,"{'algorithm': 'kd_tree', 'leaf_size': 7, 'n_ne...",0.895349,0.882353,0.941176,0.823529,0.858824,0.880246,0.039040,28
98,0.007451,0.006753,0.003218,0.002776,auto,9,8,uniform,"{'algorithm': 'auto', 'leaf_size': 9, 'n_neigh...",0.860465,0.870588,0.905882,0.823529,0.870588,0.866211,0.026332,71


In [91]:
# Predict the labels of the held-out test set with the randomized-search model.
prediction = model.predict(X=X_test)

In [92]:
# Accuracy: fraction of test samples whose predicted label matches the truth.
print('Exactitud:', accuracy_score(y_true=y_test, y_pred=prediction))

Exactitud: 0.9090909090909091


In [93]:
# Confusion matrix of true labels against predicted labels on the test set.
cm = confusion_matrix(y_true=y_test, y_pred=prediction)
print("Matriz de confusión:", cm, sep="\n")

Matriz de confusión:
[[47  7]
 [ 6 83]]


¿Qué podemos interpretar del proceso realizado? ¿Encontró algo parecido al método de Grid Search? ¿Fue más rápido?

**Aclaración:** Se recomienda cambiar los hiperparámetros en ambos métodos para evaluar y comparar si aparecen diferencias significativas.