### GridSearch & Pipelines
GridSearch is an optimization tool that we use when tuning hyperparameters. We define the grid of parameters that we want to search through, and we select the best combination of parameters for our data.

# 1 - One way
Itera un algoritmo sobre un conjunto de hiperparametros

In [1]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [17]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV, train_test_split

iris = datasets.load_iris()

X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                   random_state=42)
svc = svm.SVC()

parameters = {
    'kernel':['linear', 'rbf', 'sigmoid'],
    'C':[0.001, 0.01, 0.1, 0.5, 1., 5., 10., 100.],
    'gamma':['scale', 'auto'],
    'coef0':[-10., -1., 0., .5, 1., 10., 100.]
}
grid = GridSearchCV(estimator=svc, param_grid=parameters,
                   n_jobs=-1, cv=10, refit=False)

grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0, 100.0],
                         'coef0': [-10.0, -1.0, 0.0, 0.5, 1.0, 10.0, 100.0],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'sigmoid']},
             refit=False)

In [19]:
print('Best estimator: ', grid.best_estimator_)
print('Best estimator: ', grid.best_params_)

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [21]:
#best_estimator = grid.best_estimator_
best_estimator.score(X_train, y_train)

0.9666666666666667

# 2: Almost-Pro way

La forma pro es la que hace esto mismo y va recogiendo los errores de entrenamiento, de validación y tiene la capacidad de parar el proceso cuando se requiera además de guardar el modelo en local una vez terminado si es mejor que el que había anteriormente y de cargar el modelo anterior y seguir reentrenando.

In [125]:
pipe = Pipeline(steps=[('classifier', LogisticRegression())])

logistic_params = {
    'classifier': [LogisticRegression()],
    'classifier__penalty':('l1', 'l2'),
    'classifier__C':np.arange(0,4,0.5)
}

random_forest_params = {
    'classifier':[RandomForestClassifier()],
    'classifier__n_estimators':(10,100,500,1000),
    'classifier__max_features':(1,2,3)
}

svm_params = {
    'classifier':[svm.SVC()],
    'classifier__kernel':('linear', 'rbf', 'sigmoid')
}

search_space = (logistic_params,svm_params)

grid = GridSearchCV(pipe,
                   search_space,
                   cv=10,
                   n_jobs=2)

grid.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('classifier', LogisticRegression())]),
             n_jobs=2,
             param_grid=({'classifier': [LogisticRegression(C=1.5)],
                          'classifier__C': array([0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5]),
                          'classifier__penalty': ('l1', 'l2')},
                         {'classifier': [SVC()],
                          'classifier__kernel': ('linear', 'rbf', 'sigmoid')}))

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest

In [80]:
reg_log = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('reglog', LogisticRegression())
])


svc = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('selectkbest', SelectKBest()),
    ('svc', svm.SVC())
])

rand_forest = RandomForestClassifier()

reg_log_param = {
    'imputer__strategy':('mean', 'median', 'most_frequent'),
    'reglog__penalty': ('l1', 'l2'),
    'reglog__C': np.arange(0,4,.5)
}

svc_param = {
    'selectkbest__k':(1,2,3),
    'svc__C':np.arange(0.1, 0.9, 0.1),
    'svc__kernel': ('linear', 'rbf', 'poly')
}

rand_forest_params = {
    'n_estimators':(10, 100, 500, 1000),
    'max_features':(1,2,3)
}

gs_reg_log = GridSearchCV(reg_log,
                         reg_log_param,
                         cv=10,
                         scoring='accuracy',
                         n_jobs=-1,
                         verbose=1)

gs_svm = GridSearchCV(svc,
                     svc_param,
                     cv=10,
                     scoring='accuracy',
                     n_jobs=-1,
                     verbose=1)

gs_rand_forest = GridSearchCV(rand_forest,
                     rand_forest_params,
                     cv=10,
                     scoring='accuracy',
                     n_jobs=-1,
                     verbose=1)


grids = {
    'gs_reg_log':gs_reg_log,
    'gs_svm':gs_svm,
    'gs_rand_forest':gs_rand_forest
}


#gs_reg_log.fit(X_train, y_train)

#print('Best estimator: ', gs_reg_log.best_estimator_)
#print('Best params: ', gs_reg_log.best_params_)
#print('Best score: ', gs_reg_log.best_score_)

In [81]:
%%time
for nombre, grid_search in grids.items():
    grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:    7.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:    2.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   21.7s


Wall time: 1min 16s


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  1.1min finished


In [82]:
import pandas as pd

In [83]:
best_grids = [(i, j.best_score_) for i, j in grids.items()]

best_grids = pd.DataFrame(best_grids, columns=['Grid', 'Best score'])
best_grids.sort_values(by='Best score', ascending=False, inplace=True)

In [84]:
best_model_of_all = grids[best_grids.iloc[0,0]]

In [85]:
print('Best estimator: ', best_model_of_all.best_estimator_)
print('Best params: ', best_model_of_all.best_params_)
print('Best score: ', best_model_of_all.best_score_)

Best estimator:  Pipeline(steps=[('imputer', SimpleImputer()), ('selectkbest', SelectKBest(k=3)),
                ('svc', SVC(C=0.2, kernel='linear'))])
Best params:  {'selectkbest__k': 3, 'svc__C': 0.2, 'svc__kernel': 'linear'}
Best score:  0.9666666666666666


In [86]:
my_model = best_model_of_all.best_estimator_

my_model.score(X, y)

0.98

In [87]:
my_model.predict(X_train)

array([0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 1, 2,
       1, 0, 2, 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 2, 2,
       1, 1, 2, 1, 0, 1, 2, 0, 0, 1, 2, 0, 2, 0, 0, 2, 1, 2, 2, 2, 2, 1,
       0, 0, 2, 2, 0, 0, 0, 1, 2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2,
       1, 1, 1, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2,
       1, 1, 2, 2, 0, 1, 2, 0, 1, 2])

# 3 Another way

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
import numpy as np

In [33]:
reg_log = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('reglog', LogisticRegression())
])

reg_log_param = {
    'imputer__strategy':('mean', 'median', 'most_frequent'),
    'reglog__penalty': ('l1', 'l2'),
    'reglog__C': np.arange(0,4,.5)
}

gs_reg_log = GridSearchCV(reg_log,
                         reg_log_param,
                         cv=10,
                         scoring='accuracy',
                         n_jobs=-1,
                         verbose=1)

gs_reg_log.fit(X_train, y_train)

print('Best estimator: ', gs_reg_log.best_estimator_)
print('Best params: ', gs_reg_log.best_params_)
print('Best score: ', gs_reg_log.best_score_)

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:    1.4s


Best estimator:  Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('reglog', LogisticRegression(C=0.5))])
Best params:  {'imputer__strategy': 'mean', 'reglog__C': 0.5, 'reglog__penalty': 'l2'}
Best score:  0.9416666666666667


[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:    3.5s finished


In [88]:
import pickle

In [90]:
# Escribir
with open('finished_model.model', 'wb') as archivo_salida:
    pickle.dump(my_model, archivo_salida)

In [91]:
# Leer
with open('finished_model.model', 'rb') as archivo_entrada:
    pipeline_importado = pickle.load(archivo_entrada)

In [92]:
pipeline_importado

Pipeline(steps=[('imputer', SimpleImputer()), ('selectkbest', SelectKBest(k=3)),
                ('svc', SVC(C=0.2, kernel='linear'))])

In [105]:
new_flowers = np.array([[0,0,5.1, 2.3],
                       [99990,0,3.9,1.2]])

In [106]:
pipeline_importado.predict(new_flowers)

array([2, 1])

In [98]:
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  