# Classic classification algorithms

We are going to classify our datasets using multiple classification algorithms and evaluate their performances.

In [1]:
# Packages to install
packages_to_install = ['scikit-learn', 'imblearn', 'deslib']

# Check if they are already installed
import importlib
for package in packages_to_install:
    try:
        importlib.import_module(package)
        print(f"{package} is already installed.")
    except ImportError:
        print(f"{package} is not installed. Installing...")
        !pip install {package}

scikit-learn is not installed. Installing...
imblearn is already installed.
deslib is not installed. Installing...
Collecting deslib
  Downloading DESlib-0.3.7-py3-none-any.whl (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.6/172.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: deslib
Successfully installed deslib-0.3.7


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

## Load Dataset

In [92]:
x1 = pd.read_csv("stand_norm_e1.txt", header=None, delimiter=" ")
x2 = pd.read_csv("stand_norm_e2.txt", header=None, delimiter=" ", names=[8, 9, 10, 11, 12, 13, 14])
y = pd.read_csv("y2_e1.txt", header=None, delimiter=" ", names=["label"])
x = pd.concat([x1, x2], axis=1)
x = x[1:]
y = y[1:]

x.describe()

Unnamed: 0,0,1,2,3,4,5,6,8,9,10,11,12,13,14
count,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0
mean,7.7158,105.17934,151.438104,550.188281,1858.821366,0.639516,0.606294,13.894588,151.443231,149.896051,400.836349,1220.861916,0.639516,0.726532
std,13.661444,118.380416,198.978664,596.648195,4357.46017,0.298755,0.355149,25.014059,184.578246,182.014356,427.81722,2481.003246,0.298755,0.40259
min,0.00403,0.497232,1.136887,1.550758,2.000634,0.0,0.009388,0.008162,1.07604,0.955428,3.392103,7.449574,0.0,0.006647
25%,0.651308,28.135965,36.20507,160.794149,572.195981,0.407475,0.32402,0.59351,22.236411,36.182643,125.101548,436.947264,0.407475,0.37392
50%,2.563679,73.496119,79.066449,341.937147,1092.008474,0.664512,0.461491,3.255613,83.524212,84.391739,255.453341,768.756167,0.664512,0.567023
75%,9.841018,145.229559,183.952667,722.194942,2030.703113,0.859797,0.844648,17.586211,223.05074,191.915684,518.186018,1335.167249,0.859797,1.063422
max,513.804645,4126.79115,3471.70362,6547.571997,304245.165,1.932471,1.933941,540.411313,3139.233639,2126.014583,6557.959843,92455.13546,1.932471,1.990678


In [93]:
# Splitting the dataset into training and test set.
x_train, x_test, y_train, y_test = train_test_split(x.values, y.values, test_size = 0.25, random_state=0)

# Feature Scaling
# Adjust the mean to 0 and the standard deviation to 1
st_x = StandardScaler()
x_train = st_x.fit_transform(x_train)
x_test = st_x.transform(x_test)


In [5]:
# Oversample to have same number of samples of each class
smote = SMOTE()
x_train_sampled, y_train_sampled = smote.fit_resample(x_train, y_train)

## Feature extraction : CNN
We are going to extract features from the datasets using the CNN method, knowing that it has the best performances

In [21]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout
from tensorflow.keras.utils import to_categorical

# Vérification et reformattage de y_train_sampled pour la classification multiclasse
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_sampled.ravel())
y_train_categorical = to_categorical(y_train_encoded)

# Mise à jour de la dernière couche Dense pour qu'elle corresponde au nombre de classes
#num_classes = y_train_categorical.shape[0]
num_classes = y_train_categorical.shape[1]

In [22]:
# Reshape x_train_sampled et x_test_scaled pour l'entrée du CNN
x_train_sampled_cnn = np.expand_dims(x_train_sampled, axis=2)
x_test_scaled_cnn = np.expand_dims(x_test, axis=2)

In [61]:
# Modèle CNN avec trois couches convolutionnelles
cnn_model = Sequential([
    # Augmente la taille de l'input si nécessaire ou ajuste les paramètres
    Conv1D(128, 3, activation='relu', padding='same', input_shape=(x_train_sampled.shape[1], 1)),
    MaxPooling1D(2),
    # Ajout d'une seconde couche convolutive avec padding pour conserver la dimension
    Conv1D(128, 3, activation='relu', padding='same'),
    MaxPooling1D(2),
    Conv1D(128, 3, activation='relu', padding='same'),
    MaxPooling1D(2),
    Flatten(name='flatten_layer'),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compilation du modèle avec la métrique correcte
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [62]:
# Entraînement du modèle
cnn_model.fit(x_train_sampled_cnn, y_train_categorical, epochs=20, batch_size=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7cd36cc93c40>

In [100]:
# Entraînement du modele avec 30 assages complets de l'ensemble de données d'entraînement &
# 32 batch_size (..) données d'entraînement seront divisées en lots de 32 échantillons.
# Pr chaque lot, les poids du modèle seront MAJ
cnn_model.fit(x_train_sampled_cnn, y_train_categorical, epochs=30, batch_size=32, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7cd366826b00>

# Traitemet CNN 21.05.2024 proposé

In [26]:
!pip install opencv-python-headless dlib numpy tensorflow imbalanced-learn



In [27]:
!wget http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
!bzip2 -dk shape_predictor_68_face_landmarks.dat.bz2
!wget http://dlib.net/files/dlib_face_recognition_resnet_model_v1.dat.bz2
!bzip2 -dk dlib_face_recognition_resnet_model_v1.dat.bz2


--2024-05-21 18:47:56--  http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
Resolving dlib.net (dlib.net)... 107.180.26.78
Connecting to dlib.net (dlib.net)|107.180.26.78|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 64040097 (61M)
Saving to: ‘shape_predictor_68_face_landmarks.dat.bz2’


2024-05-21 18:47:59 (22.9 MB/s) - ‘shape_predictor_68_face_landmarks.dat.bz2’ saved [64040097/64040097]

--2024-05-21 18:48:32--  http://dlib.net/files/dlib_face_recognition_resnet_model_v1.dat.bz2
Resolving dlib.net (dlib.net)... 107.180.26.78
Connecting to dlib.net (dlib.net)|107.180.26.78|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21428389 (20M)
Saving to: ‘dlib_face_recognition_resnet_model_v1.dat.bz2’


2024-05-21 18:48:33 (26.3 MB/s) - ‘dlib_face_recognition_resnet_model_v1.dat.bz2’ saved [21428389/21428389]



In [99]:
from tensorflow.keras.models import Model
# New modèle pour extraire les caractéristiques de la couche Flatten
feature_extractor = Model(inputs=cnn_model.input, outputs=cnn_model.get_layer('flatten_layer').output)

# extrait les caractéristiques pour les ensembles d'entraînement et de test
x_train_sampled_feat_extracted = feature_extractor.predict(x_train_sampled_cnn)
x_test_feat_extracted = feature_extractor.predict(x_test_scaled_cnn)





##  Ancien traitement : TODO - regarder et supprimer si besoin

In [65]:
# Predict the test set results
y_test_pred_proba = cnn_model.predict(x_test_scaled_cnn)
y_test_pred = np.argmax(y_test_pred_proba, axis=1)
y_test_true = label_encoder.transform(y_test.ravel())



In [19]:
# cnn_model récuperer la couche flatten pour retourner les features
x_train_sampled_feat_extracted = cnn_model.fit_transform(x_train_sampled, y_train_categorical)
x_test_feat_extracted = cnn.transform(x_test)

AttributeError: 'Sequential' object has no attribute 'fit_transform'

In [68]:
# Print accuracy
def print_accuracy(y_true, y_pred):
    print("Accuracy:", accuracy_score(y_true, y_pred))

# Print confusion matrix and classification report
def get_confusion_matrix_and_results(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    cr = classification_report(y_true, y_pred)
    print(cr)
    print(cm)

# Print accuracy and confusion matrix
print_accuracy(y_test_true, y_test_pred)
get_confusion_matrix_and_results(y_test_true, y_test_pred)

Accuracy: 0.7742741180143615
              precision    recall  f1-score   support

           0       0.87      0.84      0.85      2500
           1       0.49      0.55      0.52       692
           2       0.13      0.18      0.15        11

    accuracy                           0.77      3203
   macro avg       0.50      0.52      0.51      3203
weighted avg       0.78      0.77      0.78      3203

[[2098  391   11]
 [ 310  380    2]
 [   7    2    2]]


In [69]:
# La méthode score (print_score) n'existe pas pour les objets de type Sequential dans TensorFlow/Keras.
# Dans Keras, l'évaluation des modèles se fait généralement avec les méthodes
# evaluate ou predict pour obtenir des métriques


# Évaluation du modèle
loss, accuracy = cnn_model.evaluate(x_train_sampled, y_train_categorical, verbose=1)

# Affichage de la précision du modèle
print("Accuracy:", accuracy)

Accuracy: 0.9766338467597961


## Features Extraction : LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [16]:
lda = LinearDiscriminantAnalysis(n_components=None)
x_train_sampled_feat_extracted = lda.fit_transform(x_train_sampled, y_train_sampled)
x_test_feat_extracted = lda.transform(x_test)

NameError: name 'LinearDiscriminantAnalysis' is not defined

## Functions print_score, print_accuracy and get_confusion_matrix_and_results

In [95]:
def print_score(model):
    score = model.score(x_train_sampled_feat_extracted, y_train_sampled.ravel())
    print("Score : " + str(score))


def print_accuracy(y):
    print("Accuracy:", accuracy_score(y_test, y))


def get_confusion_matrix_and_results(y):
    # Create confusion matrix
    cm = confusion_matrix(y_test, y)
    cr = classification_report(y_test, y)

    print(cr)
    print(cm)

## Random Forest Algorithm

In [96]:
from sklearn.ensemble import RandomForestClassifier

In [97]:
random_forest = RandomForestClassifier(n_estimators= 20, criterion="entropy", class_weight="balanced")
# random_forest.fit(x_train_sampled, y_train_sampled.ravel())
random_forest.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())

print_score(random_forest)

Score : 0.999689633767846


In [98]:
# Predicting the test set result
y_pred = random_forest.predict(x_test_feat_extracted)

get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.87      0.86      0.87      2500
         1.0       0.52      0.55      0.53       692
         2.0       0.09      0.09      0.09        11

    accuracy                           0.79      3203
   macro avg       0.49      0.50      0.50      3203
weighted avg       0.79      0.79      0.79      3203

[[2150  344    6]
 [ 310  378    4]
 [   9    1    1]]


## Logistic Regression Algorithm

In [75]:
from sklearn.linear_model import LogisticRegression

In [76]:
logistic_regression = LogisticRegression(class_weight="balanced")

logistic_regression.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())

print_score(logistic_regression)

Score : 0.9796931808104993


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [78]:
# Prediction on the test set
y_pred = logistic_regression.predict(x_test_feat_extracted)

get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.87      0.85      0.86      2500
         1.0       0.52      0.55      0.54       692
         2.0       0.14      0.18      0.16        11

    accuracy                           0.79      3203
   macro avg       0.51      0.53      0.52      3203
weighted avg       0.79      0.79      0.79      3203

[[2136  354   10]
 [ 306  384    2]
 [   7    2    2]]


## K-Nearest Neighbours (KNN)

In [79]:
from sklearn.neighbors import KNeighborsClassifier

In [80]:
knn = KNeighborsClassifier(n_neighbors=20, weights="uniform")

knn.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())


print_score(knn)

Score : 0.9487452336614348


In [81]:
# Prediction on the test set
y_pred = knn.predict(x_test_feat_extracted)


get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.88      0.80      0.84      2500
         1.0       0.47      0.62      0.54       692
         2.0       0.11      0.18      0.13        11

    accuracy                           0.76      3203
   macro avg       0.49      0.54      0.50      3203
weighted avg       0.79      0.76      0.77      3203

[[2012  473   15]
 [ 262  428    2]
 [   8    1    2]]


## Decision Tree

In [82]:
from sklearn.tree import DecisionTreeClassifier

In [83]:
decision_tree = DecisionTreeClassifier(class_weight="balanced")

decision_tree.fit(x_train_sampled_feat_extracted, y_train_sampled)

print_score(decision_tree)

Score : 1.0


In [84]:
y_pred = decision_tree.predict(x_test_feat_extracted)


get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.86      0.80      0.83      2500
         1.0       0.43      0.53      0.48       692
         2.0       0.05      0.09      0.06        11

    accuracy                           0.74      3203
   macro avg       0.45      0.47      0.46      3203
weighted avg       0.76      0.74      0.75      3203

[[2002  482   16]
 [ 319  369    4]
 [   7    3    1]]


# Best classifier selection

In [85]:
from sklearn.ensemble import VotingClassifier

In [86]:
voting_clf = VotingClassifier(estimators=[('rf', random_forest), ('knn', knn), ('arbre decisionnel', decision_tree), ('logistic regression', logistic_regression)], voting='soft')
voting_clf.fit(x_train_sampled_feat_extracted, y_train_sampled)

print_score(voting_clf)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Score : 0.9980934645739115


In [87]:
y_pred = voting_clf.predict(x_test_feat_extracted)

get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.88      0.85      0.86      2500
         1.0       0.52      0.59      0.55       692
         2.0       0.13      0.18      0.15        11

    accuracy                           0.79      3203
   macro avg       0.51      0.54      0.52      3203
weighted avg       0.80      0.79      0.79      3203

[[2123  367   10]
 [ 284  405    3]
 [   7    2    2]]


# Boosting
We are going to create a strong model with the Boosting technique using our previous simple models.

In [88]:
from sklearn.ensemble import AdaBoostClassifier

In [89]:
base_models = [
    random_forest,
    logistic_regression,
    decision_tree
]

In [90]:
adaboost_classifiers = []
for base_model in base_models:
    adaboost_classifier = AdaBoostClassifier(base_model, n_estimators=50, learning_rate=1)
    adaboost_classifiers.append(adaboost_classifier)


In [91]:
# Train and evaluate each AdaBoost classifier
for i, adaboost_classifier in enumerate(adaboost_classifiers):
    print("Base model used :", base_models[i])

    # Train the AdaBoost classifier
    adaboost_classifier.fit(x_train_sampled_feat_extracted, y_train_sampled)

    print_score(adaboost_classifier)

    # Make predictions on the test data
    y_pred = adaboost_classifier.predict(x_test_feat_extracted)

    get_confusion_matrix_and_results(y_pred)

    print("")

Base model used : RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       n_estimators=20)
Score : 1.0
              precision    recall  f1-score   support

         0.0       0.88      0.86      0.87      2500
         1.0       0.54      0.58      0.56       692
         2.0       0.14      0.18      0.16        11

    accuracy                           0.80      3203
   macro avg       0.52      0.54      0.53      3203
weighted avg       0.80      0.80      0.80      3203

[[2149  342    9]
 [ 286  403    3]
 [   7    2    2]]

Base model used : LogisticRegression(class_weight='balanced')
Score : 0.9643965593686264
              precision    recall  f1-score   support

         0.0       0.88      0.83      0.85      2500
         1.0       0.49      0.58      0.53       692
         2.0       0.20      0.18      0.19        11

    accuracy                           0.77      3203
   macro avg       0.52      0.53      0.52      3203
weighted 

## DESlib library
We are using the DESlib library to select the best classifier.

In [None]:
# from deslib.des.des_clustering import DESClustering

In [None]:
# des = DESClustering(pool_classifiers=[knn, random_forest, decision_tree])
# des.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())

# print_score(des)

In [None]:
# y_pred = des.predict(x_test_feat_extracted)


# get_confusion_matrix_and_results(y_pred)