# Ensemble methods

This notebook is based on the topics in chapter 7. <br>
We will work through exercises 8 and 9 using the MNIST dataset. <br>

## 1. General preparations
### 1.1 Data loading

In [1]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of the 'mnist_784' dataset from OpenML.
mnist = fetch_openml(name='mnist_784', version=1)

### 1.2 Test/Train split

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split

# Everything from row 60,000 onwards is held out as the test set;
# labels are converted to unsigned 8-bit integers.
X_test = mnist['data'][60000:]
y_test = mnist['target'][60000:].astype(np.uint8)

# The first 60,000 rows are divided into a training part and a validation
# part (20% of the rows), using a fixed seed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    mnist['data'][:60000],
    mnist['target'][:60000].astype(np.uint8),
    test_size=0.2,
    random_state=42,
)

In [3]:
# Display the shapes of the training features and training labels.
X_train.shape, y_train.shape

((48000, 784), (48000,))

In [4]:
# Display the shapes of the validation features and validation labels.
X_val.shape, y_val.shape

((12000, 784), (12000,))

### 1.3 Preprocessing pipeline

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Preprocessing consists of a single standardisation step. It is wrapped in
# its own Pipeline so it can be used as one stage of a larger pipeline.
preprocessing_pipeline = Pipeline([('std_scaler', StandardScaler())])

## 2. Exercise 8: Ensemble classification

Load the MNIST data and split it into a training set, a validation set, and a test set. Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM classifier. Next, try to combine them into an ensemble that outperforms each individual classifier on the validation set, using soft or hard voting. Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers? <br><br>

### 2.1 SVM

In [6]:
from sklearn.svm import SVC

# Baseline SVM with an RBF kernel. probability=True makes the fitted model
# expose class-probability estimates in addition to hard predictions.
svc_classifier = SVC(kernel='rbf', probability=True)

# Standardise the inputs first, then hand them to the classifier.
svc_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', svc_classifier),
])

svc_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('std_scaler', StandardScaler())])),
                ('classifier', SVC(probability=True))])

In [7]:
# Evaluate the fitted baseline pipeline on the validation split.
score = svc_pipeline.score(X=X_val, y=y_val)
print(f"SVC score on validation set is {score}")

SVC score on validation set is 0.9635


In [11]:
import os

import joblib

# Make sure the output directory exists before writing the model file.
os.makedirs("models/", exist_ok=True)

# Persist the whole fitted pipeline (scaler + SVC) rather than the bare
# classifier: the SVC was trained on standardised features, so a reloaded
# model needs the fitted scaler to give correct predictions on raw pixels.
joblib.dump(svc_pipeline, 'models/07_svc_base_mnist.pkl')

['models/07_svc_base_mnist.pkl']

In [12]:
from sklearn.model_selection import GridSearchCV

# Hyperparameters to explore on the pipeline's 'classifier' step:
# 2 kernels x 3 values of C, always with probability estimates enabled.
svc_params_grid = dict(
    classifier__kernel=['rbf', 'sigmoid'],
    classifier__C=[1, 0.1, 1.5],
    classifier__probability=[True],
)

# Exhaustive search with 3-fold cross-validation, using all available cores
# and recording the training scores as well.
svc_grid_search = GridSearchCV(
    estimator=svc_pipeline,
    param_grid=svc_params_grid,
    cv=3,
    n_jobs=-1,
    return_train_score=True,
)
svc_grid_search.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessing',
                                        Pipeline(steps=[('std_scaler',
                                                         StandardScaler())])),
                                       ('classifier', SVC(probability=True))]),
             n_jobs=-1,
             param_grid={'classifier__C': [1, 0.1, 1.5],
                         'classifier__kernel': ['rbf', 'sigmoid'],
                         'classifier__probability': [True]},
             return_train_score=True)

In [13]:
import joblib

# Save the best estimator found by the grid search to disk.
joblib.dump(
    value=svc_grid_search.best_estimator_,
    filename='models/07_svc_search_mnist.pkl',
)

['models/07_svc_search_mnist.pkl']

In [14]:
# Evaluate the best estimator from the grid search on the validation split.
score = svc_grid_search.best_estimator_.score(X=X_val, y=y_val)
print(f"SVC score on validation set is {score}")

SVC score on validation set is 0.9659166666666666


In [15]:
# Display the hyperparameter combination selected by the grid search.
svc_grid_search.best_params_

{'classifier__C': 1.5,
 'classifier__kernel': 'rbf',
 'classifier__probability': True}

In [17]:
# Rebuild the SVM with the hyperparameters the grid search selected
# (RBF kernel, C=1.5), keeping probability estimates enabled.
svc_classifier = SVC(kernel='rbf', C=1.5, probability=True)

# Same two-stage layout as before: standardise, then classify.
svc_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', svc_classifier),
])

svc_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('std_scaler', StandardScaler())])),
                ('classifier', SVC(C=1.5, probability=True))])

In [18]:
# Evaluate the retrained (C=1.5) pipeline on the validation split.
score = svc_pipeline.score(X=X_val, y=y_val)
print(f"SVC score on validation set is {score}")

SVC score on validation set is 0.9659166666666666


In [None]:
import joblib

# Save the pipeline that was retrained with the tuned hyperparameters.
# The grid search's best_estimator_ is already stored in
# 'models/07_svc_search_mnist.pkl'; dumping it again here would leave this
# file holding a duplicate instead of the retrained model.
joblib.dump(svc_pipeline, 'models/07_svc_after_search_mnist.pkl')