# Ensemble methods

This notebook is based on the topics in chapter 7. <br>
We will work through exercises 8 and 9 using the MNIST dataset. <br>

## 1. General preparations
### 1.1 Data loading

In [1]:
from sklearn.datasets import fetch_openml

# Fetch MNIST (784 pixel features per image) from OpenML.
# as_frame=False pins the return type to NumPy arrays; without it, newer
# scikit-learn versions return a pandas DataFrame/Series, which changes how the
# positional slicing below and the fitted estimators behave.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

### 1.2 Test/Train split

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split

# The first 60,000 samples are used for training/validation, the remainder for testing.
# Labels are cast to unsigned 8-bit integers.
X_train = mnist['data'][:60000]
y_train = mnist['target'][:60000].astype(np.uint8)
X_test = mnist['data'][60000:]
y_test = mnist['target'][60000:].astype(np.uint8)

# Hold out 20% of the training samples as a validation set (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Sanity check: dimensions of the training features and labels after the split.
(X_train.shape, y_train.shape)

((48000, 784), (48000,))

In [4]:
# Sanity check: dimensions of the validation features and labels after the split.
(X_val.shape, y_val.shape)

((12000, 784), (12000,))

### 1.3 Preprocessing pipeline

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Preprocessing consists of a single standardisation step.
# NOTE: this one instance is used as the first step of every model pipeline
# in this notebook, so each pipeline fit re-fits this same scaler object.
preprocessing_pipeline = Pipeline([('std_scaler', StandardScaler())])

## 2. Exercise 8: Ensemble classification

Load the MNIST data and split it into a training set, a validation set, and a test set. Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM classifier. Next, try to combine them into an ensemble that outperforms each individual classifier on the validation set, using soft or hard voting. Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers? <br><br>


### 2.1 SVM

In [6]:
from sklearn.svm import SVC

# Baseline RBF-kernel SVM; probability=True is set so the model can expose
# class probabilities later on.
svc_classifier = SVC(kernel='rbf', probability=True)

# Scale the inputs, then classify.
svc_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', svc_classifier),
])

svc_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('std_scaler', StandardScaler())])),
                ('classifier', SVC(probability=True))])

In [7]:
# Mean accuracy of the baseline SVC pipeline on the held-out validation split.
score = svc_pipeline.score(X=X_val, y=y_val)
print(f"SVC score on validation set is {score}")

SVC score on validation set is 0.9635


In [11]:
import os

import joblib

# Make sure the output directory exists before persisting anything.
os.makedirs("models/", exist_ok=True)

# Only the classifier step is persisted (not the scaler): it was fitted on the
# pipeline's standardised features, so a reloaded model needs inputs scaled the same way.
joblib.dump(value=svc_classifier, filename='models/07_svc_base_mnist.pkl')

['models/07_svc_base_mnist.pkl']

In [12]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters; the 'classifier__' prefix routes each entry to the
# 'classifier' step of svc_pipeline.
svc_params_grid = {
    'classifier__kernel': ['rbf', 'sigmoid'],
    'classifier__C': [1, 0.1, 1.5],
    'classifier__probability': [True],
}

# Exhaustive 3-fold cross-validated search over the 6 combinations above.
svc_grid_search = GridSearchCV(
    estimator=svc_pipeline,
    param_grid=svc_params_grid,
    cv=3,
    return_train_score=True,
    n_jobs=-1,
)
svc_grid_search.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessing',
                                        Pipeline(steps=[('std_scaler',
                                                         StandardScaler())])),
                                       ('classifier', SVC(probability=True))]),
             n_jobs=-1,
             param_grid={'classifier__C': [1, 0.1, 1.5],
                         'classifier__kernel': ['rbf', 'sigmoid'],
                         'classifier__probability': [True]},
             return_train_score=True)

In [13]:
import joblib

# Persist the best pipeline found by the grid search (scaler + SVC).
joblib.dump(
    value=svc_grid_search.best_estimator_,
    filename='models/07_svc_search_mnist.pkl',
)

['models/07_svc_search_mnist.pkl']

In [14]:
# Validation accuracy of the best pipeline selected by the grid search.
score = svc_grid_search.best_estimator_.score(X=X_val, y=y_val)
print(f"SVC score on validation set is {score}")

SVC score on validation set is 0.9659166666666666


In [15]:
# Hyper-parameter combination selected by the SVC grid search.
svc_grid_search.best_params_

{'classifier__C': 1.5,
 'classifier__kernel': 'rbf',
 'classifier__probability': True}

In [17]:
# Rebuild the SVC with the hyper-parameters chosen by the grid search (rbf kernel, C=1.5).
svc_classifier = SVC(C=1.5, kernel='rbf', probability=True)

svc_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', svc_classifier),
])

svc_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('std_scaler', StandardScaler())])),
                ('classifier', SVC(C=1.5, probability=True))])

In [18]:
# Mean accuracy of the re-trained (C=1.5) SVC pipeline on the validation split.
score = svc_pipeline.score(X=X_val, y=y_val)
print(f"SVC score on validation set is {score}")

SVC score on validation set is 0.9659166666666666


In [None]:
import joblib

# Persist the SVC that was re-trained with the hyper-parameters picked by the grid
# search. The grid-search estimator itself is stored separately in
# 'models/07_svc_search_mnist.pkl', so dumping it here again would only duplicate it.
# Only the classifier step is saved: it expects inputs that were already standardised.
joblib.dump(svc_classifier, 'models/07_svc_after_search_mnist.pkl')

### 2.2 Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with library defaults, a fixed seed and all CPU cores.
forest_classifier = RandomForestClassifier(n_jobs=-1, random_state=42)

forest_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', forest_classifier),
])

forest_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('std_scaler', StandardScaler())])),
                ('classifier',
                 RandomForestClassifier(n_jobs=-1, random_state=42))])

In [7]:
# Mean accuracy of the baseline random forest pipeline on the validation split.
score = forest_pipeline.score(X=X_val, y=y_val)
print(f"Random forest score on validation set is {score}")

Random forest score on validation set is 0.9694166666666667


In [9]:
import joblib

# Persist the fitted baseline forest (classifier step only, without the scaler).
joblib.dump(value=forest_classifier, filename='models/07_forest_base_mnist.pkl')

['models/07_forest_base_mnist.pkl']

In [8]:
import joblib
from sklearn.model_selection import GridSearchCV

# Search space for the forest; keys are routed to the 'classifier' pipeline step.
forest_params_grid = {
    'classifier__n_estimators': [500, 600, 1000],
    'classifier__max_depth': [20, 30, 40],
    'classifier__min_samples_leaf': [2, 5],
    'classifier__n_jobs': [-1],
    'classifier__random_state': [42],
}

# Exhaustive 3-fold cross-validated search over the 18 combinations above.
forest_grid_search = GridSearchCV(
    estimator=forest_pipeline,
    param_grid=forest_params_grid,
    cv=3,
    return_train_score=True,
    n_jobs=-1,
)
forest_grid_search.fit(X_train, y_train)

# Store the best pipeline (scaler + forest) found by the search.
joblib.dump(
    value=forest_grid_search.best_estimator_,
    filename='models/07_forest_search_mnist.pkl',
)

['models/07_forest_search_mnist.pkl']

In [9]:
# Validation accuracy of the best random forest pipeline selected by the grid search.
score = forest_grid_search.best_estimator_.score(X_val, y_val)
# Message fixed: the original read "Forest score score" (duplicated word).
print(f"Random forest score on validation set is {score}")

Forest score score on validation set is 0.9681666666666666


In [10]:
# Hyper-parameter combination selected by the random forest grid search.
forest_grid_search.best_params_

{'classifier__max_depth': 40,
 'classifier__min_samples_leaf': 2,
 'classifier__n_estimators': 1000,
 'classifier__n_jobs': -1,
 'classifier__random_state': 42}

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Rebuild the forest with the hyper-parameters chosen by the grid search.
forest_classifier = RandomForestClassifier(
    n_estimators=1000,
    max_depth=40,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
)

forest_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', forest_classifier),
])

forest_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('std_scaler', StandardScaler())])),
                ('classifier',
                 RandomForestClassifier(max_depth=40, min_samples_leaf=2,
                                        n_estimators=1000, n_jobs=-1,
                                        random_state=42))])

In [12]:
# Mean accuracy of the re-trained random forest pipeline on the validation split.
score = forest_pipeline.score(X=X_val, y=y_val)
print(f"Random forest score on validation set is {score}")

Random forest score on validation set is 0.9681666666666666


In [13]:
import joblib

# Persist the re-trained forest (classifier step only, without the scaler).
joblib.dump(
    value=forest_classifier,
    filename='models/07_forest_after_search_mnist.pkl',
)

['models/07_forest_after_search_mnist.pkl']

### 2.3 Extremely randomized trees (Extra-Trees)

In [7]:
import joblib
from sklearn.ensemble import ExtraTreesClassifier

# Baseline Extra-Trees model with library defaults, a fixed seed and all CPU cores.
extra_forest_classifier = ExtraTreesClassifier(n_jobs=-1, random_state=42)

extra_forest_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', extra_forest_classifier),
])

extra_forest_pipeline.fit(X_train, y_train)

# Persist the fitted classifier step (without the scaler).
joblib.dump(
    value=extra_forest_classifier,
    filename='models/07_extra_forest_base_mnist.pkl',
)

['models/07_extra_forest_base_mnist.pkl']

In [8]:
# Mean accuracy of the baseline Extra-Trees pipeline on the validation split.
score = extra_forest_pipeline.score(X=X_val, y=y_val)
print(f"Extra trees score on validation set is {score}")

Extra trees score on validation set is 0.97225


In [9]:
import joblib
from sklearn.model_selection import GridSearchCV

# Search space for Extra-Trees; keys are routed to the 'classifier' pipeline step.
extra_forest_params_grid = {
    'classifier__n_estimators': [500, 600, 1000],
    'classifier__max_depth': [20, 30, 40],
    'classifier__min_samples_leaf': [2, 5],
    'classifier__n_jobs': [-1],
    'classifier__random_state': [42],
}

# Exhaustive 3-fold cross-validated search over the 18 combinations above.
extra_forest_grid_search = GridSearchCV(
    estimator=extra_forest_pipeline,
    param_grid=extra_forest_params_grid,
    cv=3,
    return_train_score=True,
    n_jobs=-1,
)
extra_forest_grid_search.fit(X_train, y_train)

# Store the best pipeline (scaler + Extra-Trees) found by the search.
joblib.dump(
    value=extra_forest_grid_search.best_estimator_,
    filename='models/07_extra_forest_search_mnist.pkl',
)

['models/07_extra_forest_search_mnist.pkl']

In [10]:
# Validation accuracy of the best Extra-Trees pipeline selected by the grid search.
score = extra_forest_grid_search.best_estimator_.score(X_val, y_val)
# Message fixed: the original said "Forest score score", which mislabelled this
# Extra-Trees result as a forest result and duplicated a word.
print(f"Extra trees score on validation set is {score}")

Forest score score on validation set is 0.9701666666666666


In [11]:
# Hyper-parameter combination selected by the Extra-Trees grid search.
extra_forest_grid_search.best_params_

{'classifier__max_depth': 40,
 'classifier__min_samples_leaf': 2,
 'classifier__n_estimators': 1000,
 'classifier__n_jobs': -1,
 'classifier__random_state': 42}

In [12]:
# Rebuild Extra-Trees with the hyper-parameters chosen by the grid search.
extra_forest_classifier = ExtraTreesClassifier(
    n_estimators=1000,
    max_depth=40,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
)

extra_forest_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', extra_forest_classifier),
])

extra_forest_pipeline.fit(X_train, y_train)

# Persist the re-trained classifier step (without the scaler).
joblib.dump(
    value=extra_forest_classifier,
    filename='models/07_extra_forest_after_search_mnist.pkl',
)

['models/07_extra_forest_after_search_mnist.pkl']

In [13]:
# Mean accuracy of the re-trained Extra-Trees pipeline on the validation split.
score = extra_forest_pipeline.score(X=X_val, y=y_val)
print(f"Extra trees score on validation set is {score}")

Extra trees score on validation set is 0.9701666666666666


### 2.4 Putting it all together into an ensemble

#### 2.4.1 Hard Voting classifier

In [15]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC

# Fresh, unfitted copies of the three tuned models; they become the ensemble members.
svc_model = SVC(C=1.5, kernel='rbf', probability=True)

random_forest_model = RandomForestClassifier(
    n_estimators=1000,
    max_depth=40,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
)

extra_trees_model = ExtraTreesClassifier(
    n_estimators=1000,
    max_depth=40,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
)

In [21]:
import joblib
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

# Hard voting: each member casts one vote and the majority class wins.
ensemble_classifier = VotingClassifier(
    estimators=[
        ('svc', svc_model),
        ('forest', random_forest_model),
        ('extra_trees', extra_trees_model),
    ],
    voting='hard',
    n_jobs=-1,
)

# Same layout as the single-model pipelines: scale first, then classify.
ensemble_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', ensemble_classifier),
])

ensemble_pipeline.fit(X_train, y_train)

# Persist the fitted voting classifier (classifier step only, without the scaler).
joblib.dump(
    value=ensemble_classifier,
    filename='models/07_ensemble_classifier_mnist.pkl',
)

['models/07_ensemble_classifier_mnist.pkl']

In [22]:
# Mean accuracy of the hard-voting ensemble pipeline on the validation split.
score = ensemble_pipeline.score(X=X_val, y=y_val)
print(f"The ensemble scored {score}")

The ensemble scored 0.9708333333333333


#### 2.4.2 Scoring on the test set

In [25]:
# Standardise the test features once with the already-fitted shared scaler, so the
# ensemble and each of its fitted members are scored on identical inputs.
X_test_trans = preprocessing_pipeline.transform(X_test)

# Accuracy of the voting ensemble as a whole.
score_ensemble = ensemble_classifier.score(X=X_test_trans, y=y_test)

# Accuracy of each fitted member, looked up by the name given in the VotingClassifier.
score_svc = ensemble_classifier.named_estimators_['svc'].score(
    X=X_test_trans, y=y_test
)
score_forest = ensemble_classifier.named_estimators_['forest'].score(
    X=X_test_trans, y=y_test
)
score_extra = ensemble_classifier.named_estimators_['extra_trees'].score(
    X=X_test_trans, y=y_test
)

print(f"The ensemble scored {score_ensemble}; SVC {score_svc}; Forest {score_forest}; Extra trees {score_extra}")

The ensemble scored 0.9712; SVC 0.9663; Forest 0.968; Extra trees 0.9703
