# Classic classification algorithms

We are going to classify our datasets using multiple classification algorithms and evaluate their performances.

In [357]:
# Packages to install
packages_to_install = ['scikit-learn', 'imblearn', 'deslib']

# Check if they are already installed
import importlib
for package in packages_to_install:
    try:
        importlib.import_module(package)
        print(f"{package} is already installed.")
    except ImportError:
        print(f"{package} is not installed. Installing...")
        !pip install {package}

scikit-learn is not installed. Installing...
imblearn is already installed.
deslib is already installed.



[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [358]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

## Load Dataset

In [359]:
x1 = pd.read_csv("./Ressources/Datasets/stand_norm_e1.txt", header=None, delimiter=" ")
x2 = pd.read_csv("./Ressources/Datasets/stand_norm_e2.txt", header=None, delimiter=" ", names=[8, 9, 10, 11, 12, 13, 14])
y = pd.read_csv("./Ressources/Datasets/y2_e1.txt", header=None, delimiter=" ", names=["label"])
x = pd.concat([x1, x2], axis=1)
x = x[1:]

x.describe()

Unnamed: 0,0,1,2,3,4,5,6,8,9,10,11,12,13,14
count,12810.0,12810.0,12810.0,12810.0,12810.0,12810.0,12810.0,12810.0,12810.0,12810.0,12810.0,12810.0,12810.0,12810.0
unique,12792.0,12810.0,12809.0,12810.0,12810.0,12724.0,12710.0,12792.0,12810.0,12809.0,12810.0,12810.0,12724.0,12717.0
top,0.590555,220.110174,21.125644,714.120693,3719.146472,0.0,0.705523,0.476584,345.489585,8.802629,353.379868,679.766033,0.0,0.425668
freq,2.0,1.0,2.0,1.0,1.0,6.0,2.0,2.0,1.0,2.0,1.0,1.0,6.0,2.0


In [360]:
# Splitting the dataset into training and test set.  
x_train, x_test, y_train, y_test = train_test_split(x.values, y.values, test_size = 0.25, random_state=0)

# Feature Scaling
# Adjust the mean to 0 and the standard deviation to 1
st_x = StandardScaler()    
x_train = st_x.fit_transform(x_train)    
x_test = st_x.transform(x_test)  


In [361]:
# Oversample to have same number of samples of each class
smote = SMOTE()
x_train_sampled, y_train_sampled = smote.fit_resample(x_train, y_train)

## Feature extraction : CNN
We are going to extract features from the datasets using the CNN method, knowing that it has the best performances

In [362]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout, Input
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Dense, Flatten, Dropout

In [363]:
# cnn_model = Sequential()
# # Augmenter la taille de l'input si nécessaire ou ajuster les paramètres
# cnn_model.add(Conv1D(filters=128, kernel_size=3, activation='relu', padding='same', input_shape=(x_train_sampled.shape[1], 1)))
# cnn_model.add(MaxPooling1D(pool_size=2))
# # Ajout d'une seconde couche convolutive avec padding pour conserver la dimension
# cnn_model.add(Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'))
# cnn_model.add(MaxPooling1D(pool_size=2))
# cnn_model.add(Flatten())
# cnn_model.add(Dense(100, activation='relu'))
# cnn_model.add(Dropout(0.5))
# cnn_model.add(Dense(y_train_sampled.shape[0], activation='softmax'))

In [364]:
# # Compilation du modèle
# cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['Accuracy'])

In [365]:
# # Entraînement du modèle
# cnn_model.fit(x_train_sampled, y_train_sampled, epochs=20, batch_size=20, verbose=1)

# print_score(cnn_model)

## Features Extraction : LDA

In [366]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [367]:
lda = LinearDiscriminantAnalysis(n_components=None)
x_train_sampled_feat_extracted = lda.fit_transform(x_train_sampled, y_train_sampled)
x_test_feat_extracted = lda.transform(x_test)

In [368]:
def print_score(model):
    score = model.score(x_train_sampled_feat_extracted, y_train_sampled.ravel())
    print("Score : " + str(score))


def print_accuracy(y):
    print("Accuracy:", accuracy_score(y_test, y))


def get_confusion_matrix_and_results(y):
    # Create confusion matrix
    cm = confusion_matrix(y_test, y)
    cr = classification_report(y_test, y)

    print(cr)
    print(cm)

## Random Forest Algorithm

In [369]:
from sklearn.ensemble import RandomForestClassifier

In [370]:
random_forest = RandomForestClassifier(n_estimators= 20, criterion="entropy", class_weight="balanced")  
# random_forest.fit(x_train_sampled, y_train_sampled.ravel())
random_forest.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())

print_score(random_forest)

Score : 0.9951163203693838


In [371]:
# Predicting the test set result  
y_pred = random_forest.predict(x_test_feat_extracted)

get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.86      0.64      0.74      2511
         1.0       0.32      0.49      0.39       682
         2.0       0.00      0.10      0.01        10

    accuracy                           0.61      3203
   macro avg       0.39      0.41      0.38      3203
weighted avg       0.74      0.61      0.66      3203

[[1612  715  184]
 [ 249  334   99]
 [   5    4    1]]


## Logistic Regression Algorithm

In [372]:
from sklearn.linear_model import LogisticRegression

In [373]:
logistic_regression = LogisticRegression(class_weight="balanced")

logistic_regression.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())

print_score(logistic_regression)

Score : 0.5947877819215059


In [374]:
# Prediction on the test set
y_pred = logistic_regression.predict(x_test_feat_extracted)

get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.88      0.66      0.75      2511
         1.0       0.34      0.40      0.37       682
         2.0       0.01      0.30      0.01        10

    accuracy                           0.60      3203
   macro avg       0.41      0.45      0.38      3203
weighted avg       0.76      0.60      0.67      3203

[[1647  529  335]
 [ 217  276  189]
 [   5    2    3]]


## K-Nearest Neighbours (KNN)

In [375]:
from sklearn.neighbors import KNeighborsClassifier

In [376]:
knn = KNeighborsClassifier(n_neighbors=20, weights="uniform")

knn.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())


print_score(knn)

Score : 0.7212306872669153


In [377]:
# Prediction on the test set
y_pred = knn.predict(x_test_feat_extracted)


get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.88      0.64      0.74      2511
         1.0       0.36      0.51      0.42       682
         2.0       0.00      0.10      0.00        10

    accuracy                           0.61      3203
   macro avg       0.42      0.42      0.39      3203
weighted avg       0.77      0.61      0.67      3203

[[1611  612  288]
 [ 207  345  130]
 [   5    4    1]]


## Decision Tree

In [378]:
from sklearn.tree import DecisionTreeClassifier

In [379]:
decision_tree = DecisionTreeClassifier(class_weight="balanced")

decision_tree.fit(x_train_sampled_feat_extracted, y_train_sampled)

print_score(decision_tree)

Score : 1.0


In [380]:
y_pred = decision_tree.predict(x_test_feat_extracted)


get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.84      0.58      0.69      2511
         1.0       0.26      0.47      0.34       682
         2.0       0.00      0.10      0.01        10

    accuracy                           0.55      3203
   macro avg       0.37      0.38      0.34      3203
weighted avg       0.72      0.55      0.61      3203

[[1449  893  169]
 [ 263  320   99]
 [   5    4    1]]


# Best classifier selection

In [381]:
from sklearn.ensemble import VotingClassifier

In [382]:
voting_clf = VotingClassifier(estimators=[('rf', random_forest), ('knn', knn), ('arbre decisionnel', decision_tree), ('logistic regression', logistic_regression)], voting='soft')
voting_clf.fit(x_train_sampled_feat_extracted, y_train_sampled)

print_score(voting_clf)

Score : 0.9940507902681585


In [383]:
y_pred = voting_clf.predict(x_test_feat_extracted)

get_confusion_matrix_and_results(y_pred)

              precision    recall  f1-score   support

         0.0       0.86      0.63      0.72      2511
         1.0       0.30      0.48      0.37       682
         2.0       0.00      0.10      0.01        10

    accuracy                           0.59      3203
   macro avg       0.39      0.40      0.37      3203
weighted avg       0.74      0.59      0.65      3203

[[1571  767  173]
 [ 251  328  103]
 [   6    3    1]]


# Boosting
We are going to create a strong model with the Boosting technique using our previous simple models.

In [384]:
from sklearn.ensemble import AdaBoostClassifier

In [385]:
base_models = [
    random_forest,
    logistic_regression,
    decision_tree
]

In [386]:
adaboost_classifiers = []
for base_model in base_models:
    adaboost_classifier = AdaBoostClassifier(base_model, n_estimators=50, learning_rate=1)
    adaboost_classifiers.append(adaboost_classifier)


In [387]:
# Train and evaluate each AdaBoost classifier
for i, adaboost_classifier in enumerate(adaboost_classifiers):
    print("Base model used :", base_models[i])

    # Train the AdaBoost classifier
    adaboost_classifier.fit(x_train_sampled_feat_extracted, y_train_sampled)

    print_score(adaboost_classifier)
    
    # Make predictions on the test data
    y_pred = adaboost_classifier.predict(x_test_feat_extracted)
    
    get_confusion_matrix_and_results(y_pred)

    print("")

Base model used : RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       n_estimators=20)




Score : 1.0
              precision    recall  f1-score   support

         0.0       0.86      0.62      0.72      2511
         1.0       0.30      0.50      0.38       682
         2.0       0.00      0.10      0.01        10

    accuracy                           0.59      3203
   macro avg       0.39      0.41      0.37      3203
weighted avg       0.74      0.59      0.64      3203

[[1546  779  186]
 [ 239  341  102]
 [   7    2    1]]

Base model used : LogisticRegression(class_weight='balanced')




Score : 0.5812466702184337
              precision    recall  f1-score   support

         0.0       0.88      0.67      0.76      2511
         1.0       0.33      0.36      0.34       682
         2.0       0.01      0.30      0.01        10

    accuracy                           0.60      3203
   macro avg       0.41      0.44      0.37      3203
weighted avg       0.76      0.60      0.67      3203

[[1672  486  353]
 [ 219  243  220]
 [   5    2    3]]

Base model used : DecisionTreeClassifier(class_weight='balanced')
Score : 1.0
              precision    recall  f1-score   support

         0.0       0.84      0.57      0.68      2511
         1.0       0.26      0.48      0.34       682
         2.0       0.00      0.10      0.01        10

    accuracy                           0.55      3203
   macro avg       0.37      0.38      0.34      3203
weighted avg       0.72      0.55      0.61      3203

[[1436  908  167]
 [ 263  325   94]
 [   5    4    1]]





## DESlib library
We are using the DESlib library to select the best classifier.

In [388]:
# from deslib.des.des_clustering import DESClustering

In [389]:
# des = DESClustering(pool_classifiers=[knn, random_forest, decision_tree])
# des.fit(x_train_sampled_feat_extracted, y_train_sampled.ravel())

# print_score(des)

In [390]:
# y_pred = des.predict(x_test_feat_extracted)


# get_confusion_matrix_and_results(y_pred)