### Imports

In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
from sklearn.datasets import fetch_openml
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from scipy.ndimage.interpolation import shift
from sklearn.pipeline import Pipeline

<IPython.core.display.Javascript object>

In [3]:
X, y = fetch_openml("mnist_784", return_X_y=True)
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

<IPython.core.display.Javascript object>

In [4]:
print("X_train.shape: {}, X_test.shape: {}".format(X_train.shape, X_test.shape))

X_train.shape: (60000, 784), X_test.shape: (10000, 784)


<IPython.core.display.Javascript object>

In [5]:
pipeline_steps = [("scaler", StandardScaler()), ("model", RandomForestClassifier())]
pipeline = Pipeline(pipeline_steps)

<IPython.core.display.Javascript object>

In [6]:
rf_param_grid = {"model__criterion": ["entropy"], "model__n_estimators": [10, 30]}
gridsearcher = GridSearchCV(
    pipeline, rf_param_grid, cv=5, n_jobs=-1, scoring="accuracy"
)

<IPython.core.display.Javascript object>

In [7]:
# %%timeit
gridsearcher.fit(X_train, y_train)

print(
    "Grid Search Parameters: {}\n\n Best score: {}".format(
        gridsearcher.best_params_, gridsearcher.best_score_
    )
)

Grid Search Parameters: {'model__criterion': 'entropy', 'model__n_estimators': 30}

 Best score: 0.9618


<IPython.core.display.Javascript object>

#### Data Augmentation

In [8]:
def shift_image(image, dx=1, dy=1, dims=(28, 28)):
    image = image.reshape(dims)
    shifted_image = shift(image, [dx, dy], cval=0, mode="constant")
    return shifted_image.reshape([-1])

<IPython.core.display.Javascript object>

In [9]:
X_train_aug = [Xt for Xt in X_train]
y_train_aug = [yt for yt in y_train]

for dx, dy in [(0, 1), (1, 0)]:
    for img, lbl in zip(X_train, y_train):
        X_train_aug.append(shift_image(img, dx, dy))
        y_train_aug.append(lbl)

len(X_train_aug)

180000

<IPython.core.display.Javascript object>

In [10]:
# %%timeit ~ 15 mts on a 16Gig
gridsearcher.fit(X_train_aug, y_train_aug)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
   ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'model__criterion': ['entropy'], 'model__n_estimators': [10, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

<IPython.core.display.Javascript object>

In [11]:
print(
    "Grid Search Parameters: {}\n\n Best score: {}".format(
        gridsearcher.best_params_, gridsearcher.best_score_
    )
)

Grid Search Parameters: {'model__criterion': 'entropy', 'model__n_estimators': 30}

 Best score: 0.96755


<IPython.core.display.Javascript object>