### En este notebook vamos a ver las diferentes herramientas que hemos aprendido para evaluar el rendimiento del clasificador y la selección de aquel modelo con el mayor poder predictivo.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

In [None]:
# Read the raw file (it has no header row). '?' entries are parsed as NaN and
# every row that contains a missing value is discarded.
mam_df = pd.read_csv(
    '../../datasets/mammographic_masses.data',
    header=None,
    na_values='?',
).dropna()

In [None]:
# Preview the first rows of the loaded frame.
mam_df.head()

In [None]:
# Columns 1-4 are used as predictors and column 5 as the target;
# column 0 is left out of the model.
features = mam_df.iloc[:, 1:5]
target = mam_df.iloc[:, 5]
X = features.values
y = target.values

In [None]:
# Inspect the raw (unprocessed) feature matrix.
X

Hacemos las transformaciones necesarias sobre nuestros datos

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# One-hot encode columns 1-3 of X and pass column 0 through untouched.
#
# `OneHotEncoder(categorical_features=...)` was removed in scikit-learn 0.22
# (and `sparse=` in 1.4), so the column selection is delegated to a
# ColumnTransformer:
#   * encoded columns come first and the passthrough column last, which is the
#     same layout the legacy `categorical_features` option produced;
#   * `sparse_threshold=0` always returns a dense array, like `sparse=False`;
#   * `handle_unknown='ignore'` encodes a category that was not seen during
#     fit as all zeros instead of raising, so `transform` on a test split or a
#     CV fold does not fail on a rare category.
oHe = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(handle_unknown='ignore'), [1, 2, 3])],
    remainder='passthrough',
    sparse_threshold=0,
)
scaler = MinMaxScaler()

# Encode, then rescale every resulting column to [0, 1].
X_proc = oHe.fit_transform(X)
X_proc = scaler.fit_transform(X_proc)

In [None]:
# Inspect the encoded and scaled feature matrix.
X_proc

### Método holdout

In [None]:
# We use a k-nearest-neighbours classifier whose votes are weighted by distance.
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(weights='distance')

# Fit on the full processed dataset and score on exactly the same samples
# that were used for fitting.
resubstitution_score = clf.fit(X_proc, y).score(X_proc, y)
print(resubstitution_score)

El resultado anterior es poco realista y nada generalizable, ya que hemos evaluado la predicción sobre el mismo dataset que usamos para el ajuste. Como hemos visto, para estimar la capacidad de generalización del modelo tenemos que dejar una parte del dataset fuera del ajuste (**método holdout**).

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
holdout_split = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = holdout_split

In [None]:
# Build processed train/test matrices by concatenating the encoder output with
# a separately min-max-scaled copy of column 0. The transformers are fitted on
# the training part and only applied to the test part.
# NOTE(review): the encoder output presumably already carries column 0, so that
# feature may end up duplicated here -- confirm. Both arrays are reassigned in
# the next cell, so this result is not used downstream.
X_train_proc=np.concatenate((oHe.fit_transform(X_train), 
                       scaler.fit_transform(X_train[:,0].reshape(-1,1))), axis=1)
X_test_proc=np.concatenate((oHe.transform(X_test), 
                       scaler.transform(X_test[:,0].reshape(-1,1))), axis=1)

In [None]:
# Fit the encoder and the scaler on the training part only ...
X_train_proc = scaler.fit_transform(oHe.fit_transform(X_train))

# ... and apply the already-fitted transformers to the held-out part.
X_test_proc = scaler.transform(oHe.transform(X_test))

In [None]:
# Inspect the processed training matrix.
X_train_proc

In [None]:
# Refit on the training part, then compare the score on seen vs. unseen samples.
clf.fit(X_train_proc, y_train)
train_score = clf.score(X_train_proc, y_train)
test_score = clf.score(X_test_proc, y_test)
print(" El rendimiento sobre el training:", train_score)
print(" El rendimiento sobre el test:", test_score)

¿Cómo dependen los resultados de los parámetros del algoritmo y el tamaño de la partición?

In [None]:
# Holdout train/test score as a function of the number of neighbours.
train_scores = []
test_scores = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# The preprocessing does not depend on k, so it is done once, outside the loop.
# The transformers are fitted on the training part and only applied to the test part.
X_train_proc = scaler.fit_transform(oHe.fit_transform(X_train))
X_test_proc = scaler.transform(oHe.transform(X_test))

k_range = np.arange(1, 20, 1)
for neighs in k_range:
    clf = KNeighborsClassifier(weights='distance', n_neighbors=neighs)
    clf.fit(X_train_proc, y_train)

    train_scores.append(clf.score(X_train_proc, y_train))
    test_scores.append(clf.score(X_test_proc, y_test))

# Plot against the actual k values; plotting the bare lists would put the list
# index (0..18) on the x axis, one less than the corresponding k.
plt.plot(k_range, train_scores, label='Training score')
plt.plot(k_range, test_scores, label='Test score')
plt.xlabel('n_neighbors')
plt.ylabel('Score')
plt.legend(loc='best')
plt.show()

Podríamos hacer lo mismo usando la funcionalidad `validation_curve` de scikit

In [None]:
from sklearn.model_selection import validation_curve

# 10-fold cross-validated accuracy for every candidate n_neighbors in k_range.
train_scores, test_scores = validation_curve(
    KNeighborsClassifier(weights='distance'),
    X_proc,
    y,
    param_name="n_neighbors",
    param_range=k_range,
    scoring="accuracy",
    cv=10,
    n_jobs=1,
)

# Aggregate along axis 1 to get one mean / standard deviation per parameter value.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

In [None]:
# Validation curve: mean score per n_neighbors with a +/- one-std band,
# drawn on a logarithmic x axis.
lw = 2
plt.title("Validation Curve with KNN")
plt.xlabel("n_neighbors")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)

validation_curves = (
    (train_scores_mean, train_scores_std, "Training score", "darkorange"),
    (test_scores_mean, test_scores_std, "Cross-validation score", "navy"),
)
for curve_mean, curve_std, curve_label, curve_color in validation_curves:
    plt.semilogx(k_range, curve_mean, label=curve_label,
                 color=curve_color, lw=lw)
    plt.fill_between(k_range, curve_mean - curve_std,
                     curve_mean + curve_std, alpha=0.2,
                     color=curve_color, lw=lw)

# Zoom in on the range where the scores actually lie.
plt.ylim(0.6,1)
plt.legend(loc="best")

In [None]:
# Holdout train/test error as a function of the fraction of data used for training.
train_scores = []
test_scores = []
clf = KNeighborsClassifier(weights='distance')

sizes = np.linspace(.1, 0.9, 10)

for size in sizes:
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=size, random_state=0)

    # The split changes on every iteration, so the transformers are refitted
    # on each new training part and only applied to the matching test part.
    X_train_proc = scaler.fit_transform(oHe.fit_transform(X_train))
    X_test_proc = scaler.transform(oHe.transform(X_test))

    clf.fit(X_train_proc, y_train)

    train_scores.append(clf.score(X_train_proc, y_train))
    test_scores.append(clf.score(X_test_proc, y_test))

# Plot the error (1 - score). Labels and a legend make the two curves
# distinguishable when the figure is read on its own.
plt.plot(sizes, 1 - np.asarray(train_scores), label='Training error')
plt.plot(sizes, 1 - np.asarray(test_scores), label='Test error')
plt.xlabel('Training set fraction')
plt.ylabel('Error (1 - score)')
plt.legend(loc='best')
plt.show()

Podríamos hacer lo mismo usando la funcionalidad `learning_curve` de scikit

In [None]:
from sklearn.model_selection import learning_curve

# 10-fold cross-validated accuracy for 5 training-set sizes between 10 % and 100 %.
train_sizes, train_scores, test_scores = learning_curve(
    KNeighborsClassifier(weights='distance'),
    X_proc,
    y,
    train_sizes=np.linspace(.1, 1.0, 5),
    scoring="accuracy",
    cv=10,
    n_jobs=1,
)

# Aggregate along axis 1 to get one mean / standard deviation per training size.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

In [None]:
# Learning curve: mean score per training size with a +/- one-std band.
plt.grid()

learning_curves = (
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
)

# Draw both bands first, then both lines on top of them.
for curve_mean, curve_std, curve_color, curve_label in learning_curves:
    plt.fill_between(train_sizes, curve_mean - curve_std,
                     curve_mean + curve_std, alpha=0.1,
                     color=curve_color)
for curve_mean, curve_std, curve_color, curve_label in learning_curves:
    plt.plot(train_sizes, curve_mean, 'o-', color=curve_color,
             label=curve_label)

plt.legend(loc="best")

### Cross-validation

In [None]:
# Define a stratified 10-fold cross-validation.
from sklearn.model_selection import StratifiedKFold

# `random_state` is deliberately not passed: without `shuffle=True` it has no
# effect, and scikit-learn >= 0.24 raises a ValueError for that combination.
# The folds stay deterministic because no shuffling takes place.
skf = StratifiedKFold(n_splits=10)

# Show the size of each fold and the class balance of its training part
# (labels are compared against 0 and 1).
print('{} {}'.format('Training set observations', 'Testing set observations'))
for train_index, test_index in skf.split(X_proc, y):
    train_labels = y[train_index]
    print('Num obs training: {0}, con {1} de la calse negativa y {2} de la clase positiva. Num obs en test es: {3}'.format(
        len(train_index),
        sum(train_labels == 0),
        sum(train_labels == 1),
        len(test_index)))

In [None]:
from sklearn.model_selection import cross_val_score

# Per-fold accuracy of a fresh KNN under the stratified 10-fold scheme,
# followed by the average over folds.
# NOTE(review): this is evaluated on the raw matrix X, not on X_proc --
# confirm that skipping the encoding/scaling here is intended.
clf = KNeighborsClassifier(weights='distance')
scores = cross_val_score(clf, X, y, scoring='accuracy', cv=skf)
print(scores)
print(scores.mean())

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

# Collect the cross-validated prediction for every sample and score them all
# at once against the true labels.
y_pred = cross_val_predict(clf, X, y, cv=skf)
pooled_accuracy = accuracy_score(y, y_pred)
print(pooled_accuracy)

Hagamos lo mismo que antes, pero de una forma más compacta usando la funcionalidad `Pipeline`

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Chain encoding, scaling and the classifier into a single estimator.
pipeline_steps = [('oHe', oHe), ('scaler', scaler), ('clf', clf)]
pip = Pipeline(steps=pipeline_steps)

In [None]:
# Cross-validate the whole pipeline on the raw features: per-fold accuracy,
# then the average over folds.
scores = cross_val_score(pip, X, y, scoring='accuracy', cv=skf)
print(scores)
print(scores.mean())

In [None]:
# Mean cross-validated accuracy of the pipeline for each k from 1 to 19.
k_range = [*range(1, 20)]
k_scores = []
for k in k_range:
    # A new classifier and pipeline are built for every candidate k.
    clf = KNeighborsClassifier(n_neighbors=k, weights='distance')
    pip = Pipeline(steps=[('oHe', oHe), ('scaler', scaler), ('clf', clf)])
    scores = cross_val_score(pip, X, y, scoring='accuracy', cv=skf)
    k_scores.append(scores.mean())
print(k_scores)

In [None]:
# Cross-validated accuracy against the number of neighbours.
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.plot(k_range, k_scores)
pass  # keeps the cell from echoing the last expression

## Mejoras a cross-validation

**Repeated cross-validation**

- Repetir cross-validation varias veces (con **diferentes particiones aleatorias** de los datos) y promediar los resultados
- Estimaciones más fiables puesto que se **reduce la varianza** asociada con un solo intento de cross-validation


In [None]:
from sklearn.model_selection import RepeatedKFold

# 10-fold cross-validation repeated 5 times with a fixed seed.
rcv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=0)

clf = KNeighborsClassifier(weights='distance')
pip = Pipeline(steps=[('oHe', oHe), ('scaler', scaler), ('clf', clf)])

# Number of individual scores, then their average.
scores = cross_val_score(pip, X, y, scoring='accuracy', cv=rcv)
print(len(scores))
print(scores.mean())

In [None]:
# Mean repeated-CV accuracy of the pipeline for every k in k_range.
k_scores = []

for k in k_range:
    # A new classifier and pipeline are built for every candidate k.
    clf = KNeighborsClassifier(n_neighbors=k, weights='distance')
    pip = Pipeline(steps=[('oHe', oHe), ('scaler', scaler), ('clf', clf)])
    scores = cross_val_score(pip, X, y, scoring='accuracy', cv=rcv)
    k_scores.append(scores.mean())

print(k_scores)

In [None]:
# Repeated-CV accuracy against the number of neighbours.
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.plot(k_range, k_scores)
pass  # keeps the cell from echoing the last expression

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Build the grid of parameter values the classifier will be tried with.
param_grid = {'n_neighbors': k_range}
print(param_grid)

In [None]:
# Create the GridSearchCV object: accuracy-scored search over param_grid
# using the stratified folds.
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=skf,
)

In [None]:
# Fit on the data; the search runs cross-validation internally.
grid.fit(X, y)

In [None]:
# `grid_scores_` was removed from scikit-learn (0.20); the per-candidate
# results live in `cv_results_`. Show the parameters together with the mean
# and standard deviation of the test score.
pd.DataFrame(grid.cv_results_).loc[:, ['params', 'mean_test_score', 'std_test_score']]

In [None]:
# Mean cross-validated accuracy found by the grid search for every k.
grid_mean_scores = grid.cv_results_['mean_test_score']
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.plot(k_range, grid_mean_scores)
pass  # keeps the cell from echoing the last expression

In [None]:
# Inspect the best model: its score first, then the parameters that produced it.
print(grid.best_score_, grid.best_params_, sep='\n')

### Buscando varios parámetros simultáneamente

In [None]:
# Define the candidate parameter values again: k from 1 to 19 and both
# weighting schemes.
weight_options = ['uniform', 'distance']
k_range = [k for k in range(1, 20)]

In [None]:
# The grid now spans two parameters; the `clf__` prefix addresses the
# pipeline step named 'clf'.
param_grid = dict(clf__n_neighbors=k_range, clf__weights=weight_options)
print(param_grid)

In [None]:
# Create the GridSearchCV object for the whole pipeline and fit it on the raw features.
grid = GridSearchCV(
    estimator=pip,
    param_grid=param_grid,
    scoring='accuracy',
    cv=skf,
)
grid.fit(X,y)

In [None]:
# One row per parameter combination with its mean test score.
grid_results = pd.DataFrame(grid.cv_results_)
grid_results[['params', 'mean_test_score']]

In [None]:
# Inspect the best model: its score first, then the parameters that produced it.
print(grid.best_score_, grid.best_params_, sep='\n')

### `RandomizedSearchCV` para reducir la carga computacional

- Buscar sobre todas las posibles combinaciones de los diferentes hiperparámetros puede ser muy costoso computacionalmente hablando.
- `RandomizedSearchCV` evalúa solo un subconjunto aleatorio de esas combinaciones, en tantas iteraciones como uno desee

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Parameter values the randomized search samples from; the `clf__` prefix
# addresses the pipeline step named 'clf'.
param_dist = dict(clf__n_neighbors=k_range, clf__weights=weight_options)

In [None]:
# n_iter controls how many parameter combinations are sampled and evaluated.
rand = RandomizedSearchCV(pip, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=0)
rand.fit(X, y)

# `grid_scores_` was removed from scikit-learn (0.20); the per-candidate
# results live in `cv_results_`. Show the sampled parameters together with the
# mean and standard deviation of the test score.
pd.DataFrame(rand.cv_results_).loc[:, ['params', 'mean_test_score', 'std_test_score']]

In [None]:
# Inspect the best model: its score first, then the parameters that produced it.
print(rand.best_score_, rand.best_params_, sep='\n')