In [1]:
import joblib

# Reload the artifacts produced by the vectorization step (part 2).
# NOTE: joblib.load unpickles its input, so only point these at trusted files.
FEATURES_PATH = "X_final_tfidf.pkl"
LABELS_PATH = "y_final.pkl"
VECTORIZER_PATH = "tfidf_vectorizer.pkl"

X_final = joblib.load(FEATURES_PATH)
y_final = joblib.load(LABELS_PATH)
vectorizer = joblib.load(VECTORIZER_PATH)



In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Candidate classifiers to benchmark, keyed by display name. The display name
# is printed during training and is also turned into the saved model's file
# name by the training loop.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=25),
    "Naive Bayes": MultinomialNB(),
    # Seeded like the other randomized estimators: LinearSVC's dual solver
    # shuffles the data, so without a seed reruns may yield a different model.
    "Linear SVM": LinearSVC(random_state=25),
    "Decision Tree": DecisionTreeClassifier(random_state=25),
}

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation. The split is stratified on the
# labels and uses a fixed seed so the partition is reproducible.
split_options = {"test_size": 0.2, "random_state": 25, "stratify": y_final}
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X_final, y_final, **split_options
)

In [5]:
import os

# Directory where the fitted models are written.
os.makedirs("Modèles", exist_ok=True)

# Fit, evaluate and persist each candidate model in turn.
for name, model in models.items():
    print(f"### {name} ###")

    # Train on the training split, then predict the held-out split.
    model.fit(X_train_final, y_train_final)
    y_pred = model.predict(X_test_final)

    # Report overall accuracy followed by the per-class metrics.
    accuracy = accuracy_score(y_test_final, y_pred)
    report = classification_report(y_test_final, y_pred)
    print("Accuracy :", accuracy)
    print(report)

    # The file name is the display name, lower-cased, with spaces replaced
    # by underscores.
    slug = name.lower().replace(" ", "_")
    filename = f"Modèles/{slug}.pkl"
    joblib.dump(model, filename)
    print(f"Modèle sauvegardé sous : {filename}")

    # Visual separator between two models' reports.
    separator = "-" * 50
    print(f"\n{separator}\n")

### Logistic Regression ###
Accuracy : 0.8662615740740741
              precision    recall  f1-score   support

        10.0       0.58      0.76      0.66       640
        40.0       0.88      0.81      0.84       640
        50.0       0.89      0.92      0.91       640
        60.0       0.98      0.97      0.98       640
      1140.0       0.84      0.88      0.86       640
      1160.0       0.93      0.88      0.91       640
      1180.0       0.94      0.96      0.95       640
      1280.0       0.69      0.47      0.56       640
      1281.0       0.81      0.77      0.79       640
      1300.0       0.83      0.87      0.85       640
      1301.0       0.99      0.99      0.99       640
      1302.0       0.88      0.89      0.89       640
      1320.0       0.87      0.85      0.86       640
      1560.0       0.82      0.80      0.81       640
      1920.0       0.91      0.90      0.91       640
      1940.0       0.98      0.99      0.98       640
      2060.0       0.82

In [6]:
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
# Second batch of candidates. This rebinds `models`, so the dict now holds
# only these two estimators.
ridge_classifier = RidgeClassifier()
knn_classifier = KNeighborsClassifier(n_neighbors=5)
models = {
    "Ridge Classifier": ridge_classifier,
    "K-Nearest Neighbors": knn_classifier,
}


from sklearn.model_selection import train_test_split
# Recreate the stratified 80/20 split. The arguments and seed match the
# earlier split, so this should reproduce the same partition.
split_options = {"test_size": 0.2, "random_state": 25, "stratify": y_final}
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X_final, y_final, **split_options
)

import os
# Make sure the output directory for the saved models exists.
os.makedirs("Modèles", exist_ok=True)

# Fit, evaluate and persist each model currently held in `models`.
for name, model in models.items():
    print(f"### {name} ###")

    # Train on the training split, then predict the held-out split.
    model.fit(X_train_final, y_train_final)
    y_pred = model.predict(X_test_final)

    # Report overall accuracy followed by the per-class metrics.
    accuracy = accuracy_score(y_test_final, y_pred)
    report = classification_report(y_test_final, y_pred)
    print("Accuracy :", accuracy)
    print(report)

    # The file name is the display name, lower-cased, with spaces replaced
    # by underscores.
    slug = name.lower().replace(" ", "_")
    filename = f"Modèles/{slug}.pkl"
    joblib.dump(model, filename)
    print(f"Modèle sauvegardé sous : {filename}")

    # Visual separator between two models' reports.
    separator = "-" * 50
    print(f"\n{separator}\n")

### Ridge Classifier ###
Accuracy : 0.8964120370370371
              precision    recall  f1-score   support

        10.0       0.76      0.76      0.76       640
        40.0       0.90      0.88      0.89       640
        50.0       0.91      0.96      0.93       640
        60.0       0.99      0.97      0.98       640
      1140.0       0.87      0.93      0.90       640
      1160.0       0.95      0.93      0.94       640
      1180.0       0.95      0.99      0.97       640
      1280.0       0.78      0.49      0.60       640
      1281.0       0.83      0.87      0.85       640
      1300.0       0.84      0.92      0.88       640
      1301.0       0.99      1.00      0.99       640
      1302.0       0.91      0.95      0.93       640
      1320.0       0.89      0.90      0.90       640
      1560.0       0.86      0.83      0.84       640
      1920.0       0.91      0.95      0.93       640
      1940.0       0.98      1.00      0.99       640
      2060.0       0.85   

In [7]:
from sklearn.metrics import f1_score
import pandas as pd

# Saved estimators to compare: display name -> pickle path on disk.
model_names = {
    "Logistic Regression": "Modèles/logistic_regression.pkl",
    "Random Forest": "Modèles/random_forest.pkl",
    "Naive Bayes": "Modèles/naive_bayes.pkl",
    "Linear SVM": "Modèles/linear_svm.pkl",
    "Decision Tree": "Modèles/decision_tree.pkl",
    "Ridge Classifier": "Modèles/ridge_classifier.pkl",
    "K-Nearest Neighbors": "Modèles/k-nearest_neighbors.pkl"
}

# Reload every fitted model from disk.
# NOTE: joblib.load unpickles its input, so only load files you trust.
models = {}
for name, path in model_names.items():
    models[name] = joblib.load(path)

# Results table: one row per model with its macro-averaged F1 on the test
# split, rounded to four decimals.
score_column = "F1-score (macro)"
results = []
for name, model in models.items():
    y_pred = model.predict(X_test_final)
    f1_macro = f1_score(y_test_final, y_pred, average="macro")
    row = {"Modèle": name, score_column: round(f1_macro, 4)}
    results.append(row)

# Best-scoring model first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=score_column, ascending=False)
print(results_df)


                Modèle  F1-score (macro)
3           Linear SVM            0.9071
5     Ridge Classifier            0.8939
1        Random Forest            0.8788
0  Logistic Regression            0.8661
2          Naive Bayes            0.8199
4        Decision Tree            0.7868
6  K-Nearest Neighbors            0.6449
