# Exercice chapitre 7

In [1]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import style
style.use("default")
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import preprocessing

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

pd.options.mode.chained_assignment = None

## Data Preparation

In [2]:
# Load the MNIST dataset from OpenML.
# as_frame=False forces plain NumPy arrays. Recent scikit-learn versions
# default to as_frame="auto" and may return a pandas DataFrame that already
# carries its own column labels; the DataFrame construction further down
# assigns its own "pixel<i>" column names and expects an unlabelled array.
mnist = fetch_openml("mnist_784", version=1, as_frame=False)

X, y = mnist["data"], mnist["target"]

In [13]:
# Wrap the pixel matrix in a DataFrame, one named column per pixel
pixel_columns = [f"pixel{i}" for i in range(X.shape[1])]
df = pd.DataFrame(X, index=list(range(len(X))), columns=pixel_columns)

# Append the labels as the last column
df["target"] = y

df.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9


In [14]:
# Keep half of the rows to speed up model training.
# random_state pins the subsample so every downstream score is reproducible.
# NOTE: this cell overwrites `df`, so re-running it alone halves the data again.
df = df.sample(frac=0.50, random_state=42).reset_index(drop=True)
print(df.shape)

# Hold out 20% of the subsample as the test set
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:-1], df.iloc[:, -1],
    test_size=0.2, random_state=42, shuffle=True,
)
# Split the remaining rows 75/25 into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.25, random_state=100, shuffle=True,
)

print(X_train.shape, X_test.shape, X_val.shape)
print(y_train.shape, y_test.shape, y_val.shape)

(35000, 785)
(21000, 784) (7000, 784) (7000, 784)
(21000,) (7000,) (7000,)


## 8. Model Training



In [15]:
# Base classifiers, keyed by display name.
# An RBF-kernel SVM (SVC(kernel='rbf', gamma="auto")) was left out because it
# takes too long to train here.
models = {
    # Few iterations and a very loose tolerance; presumably chosen to keep
    # training fast -- TODO confirm.
    "Linear SVM": LinearSVC(max_iter=100, tol=20, random_state=42),
    # Seeded like the other estimators so the fitted tree is reproducible.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Neural Network": MLPClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# Fit every base classifier on the training split
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained")

Linear SVM trained
Decision Tree trained
Neural Network trained
Random Forest trained


In [16]:
# Report each base classifier's accuracy on the test split
for label, clf in models.items():
    test_accuracy = round(clf.score(X_test, y_test), 4)
    print(f"{label} Accuracy: {test_accuracy}")

Linear SVMAccuracy: 0.857
Decision TreeAccuracy: 0.8404
Neural NetworkAccuracy: 0.9514
Random ForestAccuracy: 0.9607


In [23]:
# Build a voting classifier from fresh, unfitted copies of the base models
named_models = [
    ("Linear SVM", LinearSVC(max_iter=100, tol=20, random_state=42)),
    # Seeded like the other estimators so the ensemble score is reproducible.
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Neural Network", MLPClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
]

# Fit the ensemble on the training split and score it on the test split
# (no `voting` argument is passed, so the library default applies)
voting_clf = VotingClassifier(estimators=named_models)
voting_clf.fit(X_train, y_train)
print("Score:", voting_clf.score(X_test, y_test))

Score: 0.9552857142857143


In [24]:
# Remove the Linear SVM and Decision Tree from the fitted voters.
# Filtering by estimator type instead of deleting by position keeps this cell
# idempotent: running it a second time leaves the remaining voters untouched
# rather than deleting them as well.
voting_clf.estimators_ = [
    fitted
    for fitted in voting_clf.estimators_
    if not isinstance(fitted, (LinearSVC, DecisionTreeClassifier))
]
print(voting_clf.estimators_)
voting_clf.score(X_test, y_test)

[MLPClassifier(random_state=42), RandomForestClassifier(random_state=42)]


0.9544285714285714

## 9. Stacking Ensemble

In [26]:
y_val_dic = {}

# Collect each base model's predictions on the validation set.
# The models were already fitted on (X_train, y_train) when `models` was
# built, so they are reused here instead of being trained a second time.
for name, model in models.items():
    y_val_dic[name] = model.predict(X_val)
    print(name + " prediction made.")

Linear SVM prediction made.
Decision Tree prediction made.
Neural Network prediction made.
Random Forest prediction made.


In [50]:
# Gather the base-model predictions in a DataFrame (one column per model)
# and append the true validation labels as the last column
y_vals = pd.DataFrame(y_val_dic).assign(true_val=y_val.values)

# Blender inputs are the prediction columns; its target is the true label
blender_inputs = y_vals.drop(columns="true_val")
blender_target = y_vals["true_val"]

# Train the blender on the base-model predictions
stack_model = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
stack_model.fit(blender_inputs, blender_target)

RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)

In [51]:
# Repeat the same procedure on the test set
y_val_dic2 = {}

# The base models are deliberately NOT refitted here: the blender was trained
# on predictions from the already-fitted models, so it must be fed
# predictions from those same models.
for name, model in models.items():
    y_val_dic2[name] = model.predict(X_test)
    print(name + " prediction made.")

# One column per base model, in the same order as the blender's training data
y_vals2 = pd.DataFrame.from_dict(y_val_dic2)

# Let the blender pick the final label from the base-model predictions
y_pred = stack_model.predict(y_vals2)

# Compare with the true labels (accuracy_score expects y_true first).
# Check this against the individual models and the voting ensemble before
# concluding that stacking helps; any gain is expected to be small.
print(accuracy_score(y_test, y_pred))

Linear SVM prediction made.
Decision Tree prediction made.
Neural Network prediction made.
Random Forest prediction made.
0.9564285714285714
