# Validación y elección de modelo

¿No estamos satisfechos con el modelo? Hay 2 posibles puntos a mejorar:

* La complejidad del modelo
* El número de datos

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## Cross-Validation

Estamos usando demasiado el conjunto de prueba para tomar decisiones: eso provoca fuga de información (*data leakage*) y la evaluación final deja de ser imparcial.

Queremos revisar si debemos subir o bajar la complejidad del modelo.

Validación cruzada:
* Entrenamiento-Validación-Prueba (holdout)
* K-fold
* Leave-one-out

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

## Datos falsos...

In [None]:
def make_data(N, err=1.0, rseed=1):
    """Build a reproducible synthetic 1-D regression dataset.

    Parameters
    ----------
    N : int
        Number of samples to generate.
    err : float, default 1.0
        Scale of the standard-normal noise added to the target. When it is
        not strictly positive the target is returned noise-free.
    rseed : int, default 1
        Seed for the private ``RandomState`` used for all draws.

    Returns
    -------
    X : ndarray of shape (N, 1)
        Squared uniform draws from ``[0, 1)``.
    y : ndarray of shape (N,)
        ``10 - 1 / (x + 0.1)`` evaluated on ``X``, plus the optional noise.
    """
    generator = np.random.RandomState(rseed)

    # Squaring the uniform draws concentrates the samples near zero.
    X = generator.rand(N, 1) ** 2
    clean_target = 10 - 1. / (X.ravel() + 0.1)

    if err > 0:
        # The noise is drawn after the features, from the same generator.
        return X, clean_target + err * generator.randn(N)
    return X, clean_target

In [None]:
# Draw 40 synthetic samples (default noise scale and seed) and take a first look.
X2, y2 = make_data(40)
plt.scatter(X2[:, 0], y2)  # X2 has a single column, so this is the flat feature vector
plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

def PolynomialRegression(degree=2, **kwargs):
    """Return a pipeline that fits a linear regression on polynomial features.

    Parameters
    ----------
    degree : int, default 2
        Degree passed to ``PolynomialFeatures``.
    **kwargs
        Forwarded unchanged to ``LinearRegression``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two-step pipeline: polynomial expansion followed by the regressor.
    """
    feature_expansion = PolynomialFeatures(degree)
    regressor = LinearRegression(**kwargs)
    return make_pipeline(feature_expansion, regressor)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Dense, evenly spaced grid (as a single-column matrix) on which to draw each fitted curve.
X2_test = np.linspace(-0.1, 1.1, 500).reshape(-1, 1)

# Training points in the background.
plt.scatter(X2.ravel(), y2, color='black', alpha=0.3)
# Current axis limits; the value is not used again in the visible notebook.
axis = plt.axis()

# Fit one model per degree and overlay its prediction curve.
for degree in (1, 5, 15):
    fitted = PolynomialRegression(degree).fit(X2, y2)
    y_test = fitted.predict(X2_test)
    curve_label = 'grado={0}'.format(degree)
    plt.plot(X2_test.ravel(), y_test, label=curve_label)

# Fixed viewing window so that wild high-degree curves do not dominate the scale.
plt.xlim(-0.1, 1.0)
plt.ylim(-2, 13)
plt.legend(loc='best')
plt.show()

In [None]:
from sklearn.model_selection import validation_curve

max_grados = 20
# Polynomial degrees swept by the validation curve: 0, 1, ..., max_grados - 1.
grados = range(max_grados)

# 5-fold cross-validated scores for every degree. The scorer is the negated
# MSE, so values closer to zero are better.
train_scores, val_scores = validation_curve(
    PolynomialRegression(),
    X2,
    y2,
    param_name="polynomialfeatures__degree",
    param_range=grados,
    cv=5,
    scoring="neg_mean_squared_error",
)

# Average over the folds (axis 1) to get one score per degree.
train_medias = np.mean(train_scores, axis=1)
val_medias = np.mean(val_scores, axis=1)

plt.plot(grados, train_medias, 'g', label='score entrenamiento')
plt.plot(grados, val_medias, 'r', label='score validacion')
plt.ylim([-12, 0])
plt.xlim([0, max_grados])
plt.legend()
plt.show()

In [None]:
# Mean validation score per swept degree (averaged over the CV folds).
# The argmax position equals the degree because the sweep starts at degree 0.
val_medias = val_scores.mean(axis=1)
print("El valor máximo es de:", np.max(val_medias), ", alcanzado en el grado: ", np.argmax(val_medias))

## Curva de aprendizaje

Tal vez para el modelo que queremos necesitamos más datos.

In [None]:
# The standalone `sklearn.learning_curve` module was removed in scikit-learn 0.20;
# `learning_curve` lives in `sklearn.model_selection`, like the other helpers
# imported in this notebook.
from sklearn.model_selection import learning_curve

# One panel per polynomial degree, side by side.
fig, ax = plt.subplots(1, 3, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)

for i, degree in enumerate([2, 5, 9]):
    # N holds the training-set sizes that were evaluated; train_lc / val_lc hold
    # the per-fold scores for each of those sizes (7-fold CV, negated MSE).
    N, train_lc, val_lc = learning_curve(PolynomialRegression(degree),
                                         X2, y2, cv=7,
                                         train_sizes=np.linspace(0.3, 1, 25),
                                         scoring='neg_mean_squared_error')

    # Fold-averaged curves as a function of the training-set size.
    ax[i].plot(N, np.mean(train_lc, 1), color='blue', label='score ent')
    ax[i].plot(N, np.mean(val_lc, 1), color='red', label='score val')
    # Dashed reference line: mean of the training and validation scores at the
    # largest training size.
    ax[i].hlines(np.mean([train_lc[-1], val_lc[-1]]), N[0], N[-1],
                 color='gray', linestyle='dashed')

    ax[i].set_ylim(-10, 0)
    ax[i].set_xlim(N[0], N[-1])
    ax[i].set_xlabel('indiv entrenamiento')
    ax[i].set_ylabel('score')
    ax[i].set_title('grado = {0}'.format(degree), size=14)
    ax[i].legend(loc='best')
plt.show()

## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Hyper-parameter grid: 3 x 3 = 9 candidate combinations.
grid_parametros = dict(
    n_estimators=[50, 100, 200],
    min_samples_split=[2, 5, 10],
)

# Base estimator with default settings; no random seed is fixed here.
rf = RandomForestRegressor()

# Exhaustive search over the grid. No `cv` or `scoring` is passed, so the
# library defaults apply.
rf_cv = GridSearchCV(estimator=rf, param_grid=grid_parametros)

In [None]:
# Run the grid search on the synthetic data.
rf_cv.fit(X2, y2)

In [None]:
# Parameter combination selected by the fitted search.
rf_cv.best_params_