# Classic classification algorithms

We are going to classify our datasets using multiple classification algorithms and evaluate their performances.

In [1]:
# Packages to install
packages_to_install = ['scikit-learn', 'imblearn', 'deslib']

# Check if they are already installed
import importlib
for package in packages_to_install:
    try:
        importlib.import_module(package)
        print(f"{package} is already installed.")
    except ImportError:
        print(f"{package} is not installed. Installing...")
        !pip install {package}

scikit-learn is not installed. Installing...
imblearn is already installed.
deslib is not installed. Installing...
Collecting deslib
  Downloading DESlib-0.3.7-py3-none-any.whl (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.6/172.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: deslib
Successfully installed deslib-0.3.7


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

## Load Dataset

In [7]:
x1 = pd.read_csv("./Ressources/Datasets/stand_norm_e1.txt", header=None, delimiter=" ")
x2 = pd.read_csv("./Ressources/Datasets/stand_norm_e2.txt", header=None, delimiter=" ", names=[8, 9, 10, 11, 12, 13, 14])
y = pd.read_csv("./Ressources/Datasets/y2_e1.txt", header=None, delimiter=" ", names=["label"])
x = pd.concat([x1, x2], axis=1)
x = x[1:]
y = y[1:]

x.describe()

Unnamed: 0,0,1,2,3,4,5,6,8,9,10,11,12,13,14
count,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0
mean,7.7158,105.17934,151.438104,550.188281,1858.821366,0.639516,0.606294,13.894588,151.443231,149.896051,400.836349,1220.861916,0.639516,0.726532
std,13.661444,118.380416,198.978664,596.648195,4357.46017,0.298755,0.355149,25.014059,184.578246,182.014356,427.81722,2481.003246,0.298755,0.40259
min,0.00403,0.497232,1.136887,1.550758,2.000634,0.0,0.009388,0.008162,1.07604,0.955428,3.392103,7.449574,0.0,0.006647
25%,0.651308,28.135965,36.20507,160.794149,572.195981,0.407475,0.32402,0.59351,22.236411,36.182643,125.101548,436.947264,0.407475,0.37392
50%,2.563679,73.496119,79.066449,341.937147,1092.008474,0.664512,0.461491,3.255613,83.524212,84.391739,255.453341,768.756167,0.664512,0.567023
75%,9.841018,145.229559,183.952667,722.194942,2030.703113,0.859797,0.844648,17.586211,223.05074,191.915684,518.186018,1335.167249,0.859797,1.063422
max,513.804645,4126.79115,3471.70362,6547.571997,304245.165,1.932471,1.933941,540.411313,3139.233639,2126.014583,6557.959843,92455.13546,1.932471,1.990678


In [8]:
# Splitting the dataset into training and test set.
x_train, x_test, y_train, y_test = train_test_split(x.values, y.values, test_size = 0.25, random_state=0)

# Feature Scaling
# Adjust the mean to 0 and the standard deviation to 1
st_x = StandardScaler()
x_train = st_x.fit_transform(x_train)
x_test = st_x.transform(x_test)


In [9]:
# Oversample to have same number of samples of each class
smote = SMOTE()
x_train_sampled, y_train_sampled = smote.fit_resample(x_train, y_train)

## Feature extraction : CNN
We are going to extract features from the datasets using the CNN method, knowing that it has the best performances

In [10]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Vérification et reformattage de y_train_sampled pour la classification multiclasse
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_sampled.ravel())
y_train_categorical = to_categorical(y_train_encoded)

# Mise à jour de la dernière couche Dense pour qu'elle corresponde au nombre de classes
#num_classes = y_train_categorical.shape[0]
num_classes = y_train_categorical.shape[1]


KeyboardInterrupt



In [80]:
cnn_model = Sequential([
    # Augmente la taille de l'input si nécessaire ou ajuster les paramètres
    Conv1D(128, 3, activation='relu', padding='same', input_shape=(x_train_sampled.shape[1], 1)),
    MaxPooling1D(2),
    # Ajout d'une seconde couche convolutive avec padding pour conserver la dimension
    Conv1D(128, 3, activation='relu', padding='same'),
    MaxPooling1D(2),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compilation du modèle avec la métrique correcte
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [81]:
# Entraînement du modèle
cnn_model.fit(x_train_sampled, y_train_categorical, epochs=20, batch_size=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7a842c0b0c40>

In [84]:
# La méthode score (print_score) n'existe pas pour les objets de type Sequential dans TensorFlow/Keras.
# Dans Keras, l'évaluation des modèles se fait généralement avec les méthodes
# evaluate ou predict pour obtenir des métriques


# Évaluation du modèle
loss, accuracy = cnn_model.evaluate(x_train_sampled, y_train_categorical, verbose=1)

# Affichage de la précision du modèle
print("Accuracy:", accuracy)

Accuracy: 0.9116498231887817


## Features Extraction : LDA

In [85]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [86]:
lda = LinearDiscriminantAnalysis(n_components=None)
x_train_sampled_feat_extracted = lda.fit_transform(x_train_sampled, y_train_sampled)
x_test_feat_extracted = lda.transform(x_test)

In [87]:
def print_score(model):
    score = model.score(x_train_sampled_feat_extracted, y_train_sampled.ravel())
    print("Score : " + str(score))


def print_accuracy(y):
    print("Accuracy:", accuracy_score(y_test, y))


def get_confusion_matrix_and_results(y):
    # Create confusion matrix
    cm = confusion_matrix(y_test, y)
    cr = classification_report(y_test, y)

    print(cr)
    print(cm)

## Random Forest Algorithm

In [88]:
from sklearn.ensemble import RandomForestClassifier

In [89]:
random_forest = RandomForestClassifier(n_estimators= 20, criterion="entropy", class_weight="balanced")
# random_forest.fit(x_train_sampled, y_train_sampled.ravel())
random_forest.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())

print_score(random_forest)

Score : 0.9946279524063222


In [90]:
# Predicting the test set result
y_pred = random_forest.predict(x_test_feat_extracted)

get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.85      0.63      0.72      2511
         1.0       0.29      0.45      0.35       682
         2.0       0.00      0.10      0.01        10

    accuracy                           0.59      3203
   macro avg       0.38      0.39      0.36      3203
weighted avg       0.73      0.59      0.64      3203

[[1578  727  206]
 [ 274  304  104]
 [   6    3    1]]


## Logistic Regression Algorithm

In [91]:
from sklearn.linear_model import LogisticRegression

In [92]:
logistic_regression = LogisticRegression(class_weight="balanced")

logistic_regression.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())

print_score(logistic_regression)

Score : 0.5879950275261943


In [93]:
# Prediction on the test set
y_pred = logistic_regression.predict(x_test_feat_extracted)

get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.88      0.65      0.75      2511
         1.0       0.34      0.40      0.37       682
         2.0       0.01      0.30      0.01        10

    accuracy                           0.60      3203
   macro avg       0.41      0.45      0.38      3203
weighted avg       0.76      0.60      0.67      3203

[[1644  531  336]
 [ 215  272  195]
 [   5    2    3]]


## K-Nearest Neighbours (KNN)

In [94]:
from sklearn.neighbors import KNeighborsClassifier

In [95]:
knn = KNeighborsClassifier(n_neighbors=20, weights="uniform")

knn.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())


print_score(knn)

Score : 0.7187444503640561


In [96]:
# Prediction on the test set
y_pred = knn.predict(x_test_feat_extracted)


get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.88      0.65      0.75      2511
         1.0       0.34      0.45      0.39       682
         2.0       0.00      0.10      0.00        10

    accuracy                           0.61      3203
   macro avg       0.41      0.40      0.38      3203
weighted avg       0.76      0.61      0.67      3203

[[1630  593  288]
 [ 223  310  149]
 [   6    3    1]]


## Decision Tree

In [97]:
from sklearn.tree import DecisionTreeClassifier

In [98]:
decision_tree = DecisionTreeClassifier(class_weight="balanced")

decision_tree.fit(x_train_sampled_feat_extracted, y_train_sampled)

print_score(decision_tree)

Score : 1.0


In [99]:
y_pred = decision_tree.predict(x_test_feat_extracted)


get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.83      0.57      0.68      2511
         1.0       0.25      0.45      0.32       682
         2.0       0.00      0.10      0.01        10

    accuracy                           0.54      3203
   macro avg       0.36      0.37      0.34      3203
weighted avg       0.71      0.54      0.60      3203

[[1439  893  179]
 [ 281  304   97]
 [   6    3    1]]


# Best classifier selection

In [100]:
from sklearn.ensemble import VotingClassifier

In [101]:
voting_clf = VotingClassifier(estimators=[('rf', random_forest), ('knn', knn), ('arbre decisionnel', decision_tree), ('logistic regression', logistic_regression)], voting='soft')
voting_clf.fit(x_train_sampled_feat_extracted, y_train_sampled)

print_score(voting_clf)

Score : 0.9946279524063222


In [102]:
y_pred = voting_clf.predict(x_test_feat_extracted)

get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.85      0.62      0.72      2511
         1.0       0.29      0.45      0.35       682
         2.0       0.00      0.10      0.01        10

    accuracy                           0.58      3203
   macro avg       0.38      0.39      0.36      3203
weighted avg       0.73      0.58      0.64      3203

[[1550  762  199]
 [ 265  308  109]
 [   6    3    1]]


# Boosting
We are going to create a strong model with the Boosting technique using our previous simple models.

In [103]:
from sklearn.ensemble import AdaBoostClassifier

In [75]:
base_models = [
    random_forest,
    logistic_regression,
    decision_tree
]

In [76]:
adaboost_classifiers = []
for base_model in base_models:
    adaboost_classifier = AdaBoostClassifier(base_model, n_estimators=50, learning_rate=1)
    adaboost_classifiers.append(adaboost_classifier)


In [77]:
# Train and evaluate each AdaBoost classifier
for i, adaboost_classifier in enumerate(adaboost_classifiers):
    print("Base model used :", base_models[i])

    # Train the AdaBoost classifier
    adaboost_classifier.fit(x_train_sampled_feat_extracted, y_train_sampled)

    print_score(adaboost_classifier)

    # Make predictions on the test data
    y_pred = adaboost_classifier.predict(x_test_feat_extracted)

    get_confusion_matrix_and_results(y_pred)

    print("")

Base model used : RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       n_estimators=20)
Score : 1.0
              precision    recall  f1-score   support

         0.0       0.86      0.62      0.72      2511
         1.0       0.29      0.45      0.36       682
         2.0       0.00      0.10      0.01        10

    accuracy                           0.58      3203
   macro avg       0.38      0.39      0.36      3203
weighted avg       0.73      0.58      0.64      3203

[[1562  733  216]
 [ 256  306  120]
 [   8    1    1]]

Base model used : LogisticRegression(class_weight='balanced')
Score : 0.5763185935002664
              precision    recall  f1-score   support

         0.0       0.88      0.66      0.76      2511
         1.0       0.34      0.36      0.35       682
         2.0       0.01      0.30      0.01        10

    accuracy                           0.60      3203
   macro avg       0.41      0.44      0.37      3203
weighted 

## DESlib library
We are using the DESlib library to select the best classifier.

In [None]:
# from deslib.des.des_clustering import DESClustering

In [None]:
# des = DESClustering(pool_classifiers=[knn, random_forest, decision_tree])
# des.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())

# print_score(des)

In [None]:
# y_pred = des.predict(x_test_feat_extracted)


# get_confusion_matrix_and_results(y_pred)