# Working with mnist dataset using ensemble classifiers

In [2]:
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [3]:
X_mnist, y_mnist = fetch_openml('mnist_784', return_X_y=True, as_frame=False)

In [4]:
X_mnist.shape

(70000, 784)

In [5]:
y_mnist.shape

(70000,)

In [6]:
#1) split the data into training and test sets
X_train, y_train = X_mnist[:60000], y_mnist[:60000]
X_test, y_test = X_mnist[60_000:], y_mnist[60000:]

In [7]:
# 2) Use principal component analysis (PCA) to reduce the dimensions of the system and preserve 90% of
#the training set’s variance.
pca = PCA(0.90)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [9]:
print(X_train.shape)
print(X_test.shape)

(60000, 87)
(10000, 87)


In [34]:
# 3) Train a Decision Tree with maximum depth equal to 10, a Random Forest with 50 estimators, an AdaBoost with 50 estimators, a LinearSVC with maximum iterations equal to 500, and a Logistic Regression
#with maximum iterations equal to 500 classifier on the training set and calculate the score of each one of
#the estimators on the test set.
dec_trees_clf = DecisionTreeClassifier(max_depth=10, random_state=42)
random_forest_clf = RandomForestClassifier(n_estimators=50, random_state=42)
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=50, random_state=42)
svm_clf = LinearSVC(max_iter=500, tol=20, random_state=42)
lr_pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500, random_state=42))



In [35]:
# combine them in a list of tuples with the name and the estimator
named_estimators = [("random_forest", random_forest_clf),
                    ("svm", svm_clf),
                    ("dec_trees", dec_trees_clf),
                    ("adaboost", ada_clf),
                    ("Logistic_regression", lr_pipe)]

In [36]:
# train and calculate the score of each one of the estimators on the test set
scores = {}
for named_estimator in named_estimators:
    name, estimator = named_estimator
    print("Training the", estimator)
    estimator.fit(X_train, y_train)
    score = estimator.score(X_test, y_test)
    scores[name] = score

Training the RandomForestClassifier(n_estimators=50, random_state=42)
Training the LinearSVC(max_iter=500, random_state=42, tol=20)
Training the DecisionTreeClassifier(max_depth=10, random_state=42)
Training the AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   random_state=42)
Training the Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(max_iter=500, random_state=42))])


In [37]:
print("Scores:")
for clf_name, clf_score in scores.items():
    print (f"{clf_name}: {clf_score:.4f}")

Scores:
random_forest: 0.9468
svm: 0.6247
dec_trees: 0.7970
adaboost: 0.7152
Logistic_regression: 0.9193


In [38]:
#4) create the stacking classifier with 3-fold cross validation and 
# a Random Forest Classifier as the final estimator and train it.
stacking_clf = StackingClassifier(
    estimators=named_estimators,
    final_estimator=RandomForestClassifier(random_state=43),
    cv=3
    )
stacking_clf.fit(X_train, y_train)

StackingClassifier(cv=3,
                   estimators=[('random_forest',
                                RandomForestClassifier(n_estimators=50,
                                                       random_state=42)),
                               ('svm',
                                LinearSVC(max_iter=500, random_state=42,
                                          tol=20)),
                               ('dec_trees',
                                DecisionTreeClassifier(max_depth=10,
                                                       random_state=42)),
                               ('adaboost',
                                AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                                                   random_state=42)),
                               ('Logistic_regression',
                                Pipeline(steps=[('standardscaler',
                                                 StandardScaler()),
                     

In [39]:
# 5) What is the score of the Stacking Classifier on the test set? How much better does it perform compared
#to the individual classifiers?
stacking_score = stacking_clf.score(X_test, y_test)
print (f"Stacking score: {stacking_score}")

Stacking score: 0.9559


In [40]:
print("The stacking classifier performs better by:")
for clf_name, clf_score in scores.items():
    print (f"{100*(stacking_score-clf_score)/stacking_score:.1f}% than the {clf_name}")

The stacking classifier performs better by:
1.0% than the random_forest
34.6% than the svm
16.6% than the dec_trees
25.2% than the adaboost
3.8% than the Logistic_regression
