# Classic classification algorithms

We are going to classify our datasets using multiple classification algorithms and evaluate their performances.

In [1]:
# Packages to install
packages_to_install = ['scikit-learn==1.2.2', 'imblearn', 'deslib', 'joblib==1.4.2']

# Check if they are already installed
import importlib
for package in packages_to_install:
    try:
        importlib.import_module(package)
        print(f"{package} is already installed.")
    except ImportError:
        print(f"{package} is not installed. Installing...")
        !pip install {package}

scikit-learn==1.2.2 is not installed. Installing...



[notice] A new release of pip available: 22.2.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


imblearn is already installed.
deslib is not installed. Installing...
Collecting deslib
  Using cached DESlib-0.3.7-py3-none-any.whl (172 kB)
Installing collected packages: deslib
Successfully installed deslib-0.3.7
joblib==1.4.2 is not installed. Installing...



[notice] A new release of pip available: 22.2.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.2.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import joblib




In [3]:
ressources_path = "../Ressources/Datasets/"
models_path = "../Interface/Server/models/"

## Load Dataset

In [4]:
x1 = pd.read_csv(ressources_path + "stand_norm_e1.txt", header=None, delimiter=" ")
x2 = pd.read_csv(ressources_path + "stand_norm_e2.txt", header=None, delimiter=" ", names=[8, 9, 10, 11, 12, 13, 14])
y = pd.read_csv(ressources_path + "y2_e1.txt", header=None, delimiter=" ", names=["label"])
x = pd.concat([x1, x2], axis=1)
x = x[1:]
y = y[1:]

x.describe()

Unnamed: 0,0,1,2,3,4,5,6,8,9,10,11,12,13,14
count,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0,12809.0
mean,7.7158,105.17934,151.438104,550.188281,1858.821366,0.639516,0.606294,13.894588,151.443231,149.896051,400.836349,1220.861916,0.639516,0.726532
std,13.661444,118.380416,198.978664,596.648195,4357.46017,0.298755,0.355149,25.014059,184.578246,182.014356,427.81722,2481.003246,0.298755,0.40259
min,0.00403,0.497232,1.136887,1.550758,2.000634,0.0,0.009388,0.008162,1.07604,0.955428,3.392103,7.449574,0.0,0.006647
25%,0.651308,28.135965,36.20507,160.794149,572.195981,0.407475,0.32402,0.59351,22.236411,36.182643,125.101548,436.947264,0.407475,0.37392
50%,2.563679,73.496119,79.066449,341.937147,1092.008474,0.664512,0.461491,3.255613,83.524212,84.391739,255.453341,768.756167,0.664512,0.567023
75%,9.841018,145.229559,183.952667,722.194942,2030.703113,0.859797,0.844648,17.586211,223.05074,191.915684,518.186018,1335.167249,0.859797,1.063422
max,513.804645,4126.79115,3471.70362,6547.571997,304245.165,1.932471,1.933941,540.411313,3139.233639,2126.014583,6557.959843,92455.13546,1.932471,1.990678


In [5]:
# Splitting the dataset into training and test set.
x_train, x_test, y_train, y_test = train_test_split(x.values, y.values, test_size = 0.25, random_state=0)

# Feature Scaling
# Adjust the mean to 0 and the standard deviation to 1
st_x = StandardScaler()
x_train = st_x.fit_transform(x_train)
x_test = st_x.transform(x_test)


In [6]:
# Oversample to have same number of samples of each class
smote = SMOTE()
x_train_sampled, y_train_sampled = smote.fit_resample(x_train, y_train)

## Feature extraction : CNN
We are going to extract features from the datasets using the CNN method, knowing that it has the best performances

In [7]:
# Vérification et reformattage de y_train_sampled pour la classification multiclasse
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_sampled.ravel())
y_train_categorical = to_categorical(y_train_encoded)

# Mise à jour de la dernière couche Dense pour qu'elle corresponde au nombre de classes
#num_classes = y_train_categorical.shape[0]
num_classes = y_train_categorical.shape[1]

In [8]:
# Reshape x_train_sampled et x_test_scaled pour l'entrée du CNN
x_train_sampled_cnn = np.expand_dims(x_train_sampled, axis=2)
x_test_scaled_cnn = np.expand_dims(x_test, axis=2)

In [9]:
# Modèle CNN avec trois couches convolutionnelles
cnn_model = Sequential([
    # Augmente la taille de l'input si nécessaire ou ajuste les paramètres
    Conv1D(128, 3, activation='relu', padding='same', input_shape=(x_train_sampled.shape[1], 1)),
    MaxPooling1D(2),
    # Ajout d'une seconde couche convolutive avec padding pour conserver la dimension
    Conv1D(128, 3, activation='relu', padding='same'),
    MaxPooling1D(2),
    Conv1D(128, 3, activation='relu', padding='same'),
    MaxPooling1D(2),
    Flatten(name='flatten_layer'),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compilation du modèle avec la métrique correcte
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])






In [10]:
# Entraînement du modèle
cnn_model.fit(x_train_sampled_cnn, y_train_categorical, epochs=20, batch_size=20, verbose=1)

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x1fe462d2140>

In [11]:
# Entraînement du modele avec 30 assages complets de l'ensemble de données d'entraînement &
# 32 batch_size (..) données d'entraînement seront divisées en lots de 32 échantillons.
# Pr chaque lot, les poids du modèle seront MAJ
cnn_model.fit(x_train_sampled_cnn, y_train_categorical, epochs=30, batch_size=32, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x1fe47b320b0>

In [14]:
cnn_model.save(models_path + 'cnn_model.keras')

# Traitement CNN 21.05.2024 proposé

In [15]:
# New modèle pour extraire les caractéristiques de la couche Flatten
feature_extractor = Model(inputs=cnn_model.input, outputs=cnn_model.get_layer('flatten_layer').output)

# extrait les caractéristiques pour les ensembles d'entraînement et de test
x_train_sampled_feat_extracted = feature_extractor.predict(x_train_sampled_cnn)
x_test_feat_extracted = feature_extractor.predict(x_test_scaled_cnn)




In [16]:
feature_extractor.save(models_path + "feature_extractor.keras")

In [17]:
# La méthode score (print_score) n'existe pas pour les objets de type Sequential dans TensorFlow/Keras.
# Dans Keras, l'évaluation des modèles se fait généralement avec les méthodes
# evaluate ou predict pour obtenir des métriques


# Évaluation du modèle
loss, accuracy = cnn_model.evaluate(x_train_sampled, y_train_categorical, verbose=1)

# Affichage de la précision du modèle
print("Accuracy:", accuracy)

Accuracy: 0.9790724515914917


## Features Extraction : LDA

In [18]:
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [19]:
# lda = LinearDiscriminantAnalysis(n_components=None)
# x_train_sampled_feat_extracted = lda.fit_transform(x_train_sampled, y_train_sampled)
# x_test_feat_extracted = lda.transform(x_test)

## Functions print_score, print_accuracy and get_confusion_matrix_and_results

In [20]:
def print_score(model):
    score = model.score(x_train_sampled_feat_extracted, y_train_sampled.ravel())
    print("Score : " + str(score))


def print_accuracy(y):
    print("Accuracy:", accuracy_score(y_test, y))


def get_confusion_matrix_and_results(y):
    # Create confusion matrix
    cm = confusion_matrix(y_test, y)
    cr = classification_report(y_test, y)

    print(cr)
    print(cm)

## Random Forest Algorithm

In [21]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators= 20, criterion="entropy", class_weight="balanced")
# random_forest.fit(x_train_sampled, y_train_sampled.ravel())
random_forest.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())

print_score(random_forest)

Score : 0.9999113239336703


In [22]:
# Predicting the test set result
y_pred = random_forest.predict(x_test_feat_extracted)

get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.86      0.87      0.86      2500
         1.0       0.52      0.51      0.51       692
         2.0       0.14      0.18      0.16        11

    accuracy                           0.79      3203
   macro avg       0.51      0.52      0.51      3203
weighted avg       0.79      0.79      0.79      3203

[[2166  325    9]
 [ 339  350    3]
 [   7    2    2]]


## Logistic Regression Algorithm

In [23]:
from sklearn.linear_model import LogisticRegression

logistic_regression = LogisticRegression(class_weight="balanced")

logistic_regression.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())

print_score(logistic_regression)

Score : 0.981821406402412


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# Prediction on the test set
y_pred = logistic_regression.predict(x_test_feat_extracted)

get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.87      0.85      0.86      2500
         1.0       0.50      0.54      0.52       692
         2.0       0.15      0.18      0.17        11

    accuracy                           0.78      3203
   macro avg       0.51      0.52      0.51      3203
weighted avg       0.78      0.78      0.78      3203

[[2113  377   10]
 [ 318  373    1]
 [   8    1    2]]


## K-Nearest Neighbours (KNN)

In [25]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=20, weights="uniform")

knn.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())


print_score(knn)

Score : 0.9469717123348408


In [26]:
# Prediction on the test set
y_pred = knn.predict(x_test_feat_extracted)


get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.89      0.80      0.84      2500
         1.0       0.48      0.65      0.55       692
         2.0       0.15      0.27      0.19        11

    accuracy                           0.76      3203
   macro avg       0.51      0.57      0.53      3203
weighted avg       0.80      0.76      0.78      3203

[[1995  490   15]
 [ 242  448    2]
 [   6    2    3]]


## Decision Tree

In [27]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(class_weight="balanced")

decision_tree.fit(x_train_sampled_feat_extracted, y_train_sampled)

print_score(decision_tree)

Score : 1.0


In [28]:
y_pred = decision_tree.predict(x_test_feat_extracted)


get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.86      0.81      0.83      2500
         1.0       0.44      0.51      0.47       692
         2.0       0.05      0.09      0.07        11

    accuracy                           0.75      3203
   macro avg       0.45      0.47      0.46      3203
weighted avg       0.76      0.75      0.75      3203

[[2033  452   15]
 [ 335  354    3]
 [   8    2    1]]


# Best classifier selection

In [29]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=[('rf', random_forest), ('knn', knn), ('arbre decisionnel', decision_tree), ('logistic regression', logistic_regression)], voting='soft')
voting_clf.fit(x_train_sampled_feat_extracted, y_train_sampled)

print_score(voting_clf)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Score : 0.9982264786734061


In [30]:
y_pred = voting_clf.predict(x_test_feat_extracted)

get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.87      0.85      0.86      2500
         1.0       0.51      0.55      0.53       692
         2.0       0.15      0.18      0.17        11

    accuracy                           0.78      3203
   macro avg       0.51      0.53      0.52      3203
weighted avg       0.79      0.78      0.79      3203

[[2123  368    9]
 [ 307  383    2]
 [   8    1    2]]


# Boosting
We are going to create a strong model with the Boosting technique using our previous simple models.

In [31]:
from sklearn.ensemble import AdaBoostClassifier

base_models = [
    random_forest,
    logistic_regression,
    decision_tree
]

model_names = [
    "random_forest",
    "logistic_regression",
    "decision_tree"
]

adaboost_classifiers = []
for base_model in base_models:
    adaboost_classifier = AdaBoostClassifier(base_model, n_estimators=50, learning_rate=1)
    adaboost_classifiers.append(adaboost_classifier)

In [32]:
# Train and evaluate each AdaBoost classifier
for i, adaboost_classifier in enumerate(adaboost_classifiers):
    print("Base model used :", base_models[i])

    # Train the AdaBoost classifier
    adaboost_classifier.fit(x_train_sampled_feat_extracted, y_train_sampled)

    joblib.dump(adaboost_classifier, models_path + model_names[i] + "_model.joblib")

    print_score(adaboost_classifier)

    # Make predictions on the test data
    y_pred = adaboost_classifier.predict(x_test_feat_extracted)

    get_confusion_matrix_and_results(y_pred)

    print("")

Base model used : RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       n_estimators=20)
Score : 1.0
              precision    recall  f1-score   support

         0.0       0.87      0.86      0.87      2500
         1.0       0.54      0.56      0.55       692
         2.0       0.17      0.18      0.17        11

    accuracy                           0.80      3203
   macro avg       0.53      0.54      0.53      3203
weighted avg       0.80      0.80      0.80      3203

[[2161  331    8]
 [ 301  389    2]
 [   8    1    2]]

Base model used : LogisticRegression(class_weight='balanced')
Score : 0.9556619668351511
              precision    recall  f1-score   support

         0.0       0.85      0.88      0.87      2500
         1.0       0.53      0.47      0.50       692
         2.0       0.29      0.18      0.22        11

    accuracy                           0.79      3203
   macro avg       0.56      0.51      0.53      3203
weighted 

## Test loading models and make predictions

In [33]:
test_data = pd.read_csv(ressources_path + "test.txt", header=None, delimiter=" ")


test_feature_extractor = load_model(models_path + "feature_extractor.keras")

extracted_data = test_feature_extractor.predict(test_data)

print(extracted_data)


[[ 387.33035  198.19728    0.      ...    0.         0.         0.     ]
 [1132.1938     0.         0.      ...    0.         0.         0.     ]
 [2092.4973     0.         0.      ...    0.         0.         0.     ]
 ...
 [  33.45929 1777.031      0.      ...    0.         0.         0.     ]
 [   0.       654.44885    0.      ...    0.         0.         0.     ]
 [   0.         0.         0.      ...    0.         0.         0.     ]]


In [36]:
test_classifier_model = joblib.load(models_path + "random_forest_model.joblib")

pred = test_classifier_model.predict(extracted_data)
print(pred)

[1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1.]
