## Fetch and normalize data


In [1]:
import numpy as np
import pandas as pd
import joblib
from typing import Type
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Download the MNIST digit dataset from OpenML (needs network access or a
# local scikit-learn data cache).
mnist = fetch_openml("mnist_784", parser="auto", version="1")

# Annotate with the instance types: `Type[X]` would describe the class object
# X itself, not a value of that class.
data: pd.DataFrame = mnist["data"]
# presumably a Series (single target column) - verify against fetch_openml's as_frame behaviour
labels: pd.Series = mnist["target"]

# Min-max scale every feature column.
# NOTE(review): the scaler is fitted on the complete dataset, i.e. before the
# data is split into training/validation/test sets, so held-out samples
# influence the scaling. Left unchanged because the models stored under
# model/ were presumably trained on features scaled this way - confirm before
# changing.
scaler = MinMaxScaler().fit(data)
scaled_X: np.ndarray = np.array(scaler.transform(data))

y: np.ndarray = labels.to_numpy()

## Preprocessing


### Split data into training, validation and testing sets.


In [2]:
# Two-stage split with a shared seed: first carve the test set off the full
# data, then take the validation set out of the remaining training portion.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_X, y, test_size=1/7, random_state=42
)
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train, y_train, test_size=1/6, random_state=42
)

# Report the number of samples in each split as a small aligned table.
print(f"{'':<20}X size\ty size")
print(f"{'training size:':<20}{len(X_train)}\t{len(y_train)}")
print(f"{'validation size:':<20}{len(X_validation)}\t{len(y_validation)}")
print(f"{'test size:':<20}{len(X_test)}\t{len(y_test)}")

                    X size	y size
training size:      50000	50000
validation size:    10000	10000
test size:          10000	10000


## Soft voting


### Training

In [3]:
# Training of the soft-voting ensemble is disabled (commented out): the
# fitted ensemble is instead loaded from "model/voting_clf.pkl" in the
# "Prepare estimators" cell further down. Uncomment this cell to retrain the
# ensemble and overwrite that file (presumably slow - TODO confirm run time).
# from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import  KNeighborsClassifier
# from sklearn.ensemble import VotingClassifier
# from sklearn.ensemble import ExtraTreesClassifier

# estimators = [
#   ('svc', SVC(probability=True)),
#   ('rf_clf', RandomForestClassifier(random_state=42)),
#   ('knn_clf', KNeighborsClassifier()),
#   ('extra_tree_clf', ExtraTreesClassifier(random_state=42))
# ]

# voting_clf = VotingClassifier(estimators=estimators, voting='soft')
# voting_clf.fit(X_train, y_train)

# joblib.dump(voting_clf, "model/voting_clf.pkl")

In [4]:
# Training of the four individual classifiers is disabled (commented out):
# the fitted models are loaded from the "model/*.pkl" files in the
# "Prepare estimators" cell further down, where each is scored on its own
# next to the ensemble. Uncomment to retrain and overwrite those files.
# svc = SVC(probability=True)
# rf_clf = RandomForestClassifier(random_state=42)
# knn_clf = KNeighborsClassifier()
# extra_tree_clf = ExtraTreesClassifier(random_state=42)

# svc.fit(X_train, y_train)
# rf_clf.fit(X_train, y_train)
# knn_clf.fit(X_train, y_train)
# extra_tree_clf.fit(X_train, y_train)


# joblib.dump(svc, "model/svc.pkl", compress=3)
# joblib.dump(rf_clf, "model/random_forest.pkl", compress=3)
# joblib.dump(knn_clf, "model/knn.pkl", compress=3)
# joblib.dump(extra_tree_clf, "model/extra_tree.pkl", compress=3)

### Compare ensemble with individual classifiers


#### Prepare estimators


In [5]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Load the previously fitted models from disk instead of refitting them.
# NOTE: joblib.load unpickles arbitrary Python objects - only load model
# files that come from a trusted source.
# The annotations name the instance types; `Type[X]` would describe the
# class X itself rather than a fitted estimator object.
voting_clf: VotingClassifier = joblib.load("model/voting_clf.pkl")
svc: SVC = joblib.load("model/svc.pkl")
rf_clf: RandomForestClassifier = joblib.load("model/random_forest.pkl")
knn_clf: KNeighborsClassifier = joblib.load("model/knn.pkl")
extra_tree_clf: ExtraTreesClassifier = joblib.load("model/extra_tree.pkl")

#### Score comparison on validation set


In [6]:
# Score the ensemble and each individual classifier on the validation split.
# The targets on the left are listed in the same order as the models on the
# right, so each name receives the score of its own model.
(
    voting_clf_validation_score,
    svc_validation_score,
    rf_clf_validation_score,
    knn_clf_validation_score,
    extra_tree_clf_validation_score,
) = (
    model.score(X_validation, y_validation)
    for model in (voting_clf, svc, rf_clf, knn_clf, extra_tree_clf)
)

In [7]:
# Print one line per model for the validation split: the label left-justified
# in a 40-character column, followed by the score.
print("\n".join(
    f"{label:<40}{score}"
    for label, score in (
        ("Soft voting ensemble score:", voting_clf_validation_score),
        ("SVC score:", svc_validation_score),
        ("RandomForest score:", rf_clf_validation_score),
        ("KNN score:", knn_clf_validation_score),
        ("ExtraTree score:", extra_tree_clf_validation_score),
    )
))

Soft voting ensemble score:             0.9809
SVC score:                              0.9788
RandomForest score:                     0.9692
KNN score:                              0.9702
ExtraTree score:                        0.9715


#### Score comparison on test set


In [8]:
# Score the ensemble and each individual classifier on the held-out test
# split. The targets on the left are listed in the same order as the models
# on the right, so each name receives the score of its own model.
(
    voting_clf_test_score,
    svc_test_score,
    rf_clf_test_score,
    knn_clf_test_score,
    extra_tree_clf_test_score,
) = (
    model.score(X_test, y_test)
    for model in (voting_clf, svc, rf_clf, knn_clf, extra_tree_clf)
)

In [9]:
# Print one line per model for the test split: the label left-justified in a
# 40-character column, followed by the score.
print("\n".join(
    f"{label:<40}{score}"
    for label, score in (
        ("Soft voting ensemble score:", voting_clf_test_score),
        ("SVC score:", svc_test_score),
        ("RandomForest score:", rf_clf_test_score),
        ("KNN score:", knn_clf_test_score),
        ("ExtraTree score:", extra_tree_clf_test_score),
    )
))

Soft voting ensemble score:             0.977
SVC score:                              0.976
RandomForest score:                     0.9644
KNN score:                              0.9672
ExtraTree score:                        0.9691


## Try to improve ensemble learning performance


### Bagging


### Boosting


### Stacking
