In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Charger les données
train_data = pd.read_csv('../data/train_data.csv', sep=';')
X_train = train_data['text']
y_train = train_data['label']

# Initialiser le TF-IDF Vectorizer et l'ajuster sur X_train
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

print(f"Dimensions des vecteurs TF-IDF : {X_train_tfidf.shape}")


Dimensions des vecteurs TF-IDF : (1256, 1128)


In [7]:
from sklearn.metrics import f1_score, confusion_matrix, make_scorer

def custom_score(y_true, y_pred, target_class="lost_luggage"):
    # F1-score global (macro)
    f1_macro = f1_score(y_true, y_pred, average="macro")
    
    # Calcul de la matrice de confusion
    labels = list(set(y_true))
    conf_matrix = confusion_matrix(y_true, y_pred, labels=labels)
    
    # Index de la classe cible
    target_index = labels.index(target_class)
    
    # Calcul de TP, FP, TN pour la classe cible
    tp = conf_matrix[target_index, target_index]
    fp = conf_matrix[:, target_index].sum() - tp
    tn = conf_matrix.sum() - (conf_matrix[target_index, :].sum() + conf_matrix[:, target_index].sum() - tp)
    # Précision pour `lost_luggage`
    precision_lost_luggage = tp / (tp + fp) if (tp + fp) > 0 else 0
    # Sensibilité pour `lost_luggage` (rappel)
    sensitivity_lost_luggage = tp / (tp + conf_matrix[target_index, :].sum() - tp) if (tp + conf_matrix[target_index, :].sum() - tp) > 0 else 0
    
    # Score pour le label cible
    label_score = (precision_lost_luggage + sensitivity_lost_luggage) / 2
    
    # Combiner le F1-score global et le score pour le label cible
    final_score = (f1_macro + label_score) / 2
    return final_score


custom_scorer = make_scorer(custom_score, greater_is_better=True)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline


In [None]:
from sklearn.model_selection import GridSearchCV

# Configurer la pipeline avec TF-IDF suivi de SVC
pipeline_svc = make_pipeline(TfidfVectorizer(), SVC())

# Configurer GridSearchCV avec le scorer personnalisé
param_grid_svc = {
    "svc__C": [0.1, 1, 10],               # Paramètres de régularisation pour SVC
    "svc__kernel": ["linear", "rbf"],     # Types de noyaux
    "svc__gamma": ["scale", "auto"]       # Paramètres gamma
}

grid_search_svc = GridSearchCV(
    pipeline_svc,
    param_grid_svc,
    cv=5,
    scoring=custom_scorer,
    verbose=2
)


In [23]:


# Entraîner avec la recherche de grille
grid_search_svc.fit(X_train, y_train)

# Afficher les meilleurs paramètres et le score associé
print(f"Meilleurs paramètres pour SVC avec TF-IDF : {grid_search_svc.best_params_}")
print(f"Meilleur score avec les paramètres optimisés : {grid_search_svc.best_score_:.3f}")



Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ...svc__C=0.1, svc__gamma=scale, svc__kernel=linear; total time=   0.2s
[CV] END ...svc__C=0.1, svc__gamma=scale, svc__kernel=linear; total time=   0.4s
[CV] END ...svc__C=0.1, svc__gamma=scale, svc__kernel=linear; total time=   0.1s
[CV] END ...svc__C=0.1, svc__gamma=scale, svc__kernel=linear; total time=   0.1s
[CV] END ...svc__C=0.1, svc__gamma=scale, svc__kernel=linear; total time=   0.1s
[CV] END ......svc__C=0.1, svc__gamma=scale, svc__kernel=rbf; total time=   0.2s
[CV] END ......svc__C=0.1, svc__gamma=scale, svc__kernel=rbf; total time=   0.2s
[CV] END ......svc__C=0.1, svc__gamma=scale, svc__kernel=rbf; total time=   0.2s
[CV] END ......svc__C=0.1, svc__gamma=scale, svc__kernel=rbf; total time=   0.2s
[CV] END ......svc__C=0.1, svc__gamma=scale, svc__kernel=rbf; total time=   0.3s
[CV] END ....svc__C=0.1, svc__gamma=auto, svc__kernel=linear; total time=   0.2s
[CV] END ....svc__C=0.1, svc__gamma=auto, svc__k

In [25]:
import joblib

# Enregistrer la pipeline optimisée
joblib.dump(grid_search_svc.best_estimator_, 'tfidf/best_svc_model.joblib')


['tfidf/best_svc_model.joblib']

In [27]:
# Configurer la pipeline avec TF-IDF suivi de la Régression Logistique
pipeline_lr = make_pipeline(TfidfVectorizer(), LogisticRegression())

# Définir la grille de paramètres pour la Régression Logistique
param_grid_lr = {
    "logisticregression__C": [0.1, 1, 10],
    "logisticregression__solver": ["liblinear", "lbfgs"]
}

# Configurer GridSearchCV avec le scorer personnalisé
grid_search_lr = GridSearchCV(
    pipeline_lr,
    param_grid_lr,
    cv=5,
    scoring=custom_scorer,
    verbose=2
)

# Entraîner la recherche de grille
grid_search_lr.fit(X_train, y_train)

# Enregistrer la pipeline optimisée
joblib.dump(grid_search_lr.best_estimator_, 'tfidf/best_logistic_regression_model.joblib')



Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END logisticregression__C=0.1, logisticregression__solver=liblinear; total time=   0.0s
[CV] END logisticregression__C=0.1, logisticregression__solver=liblinear; total time=   0.0s
[CV] END logisticregression__C=0.1, logisticregression__solver=liblinear; total time=   0.0s
[CV] END logisticregression__C=0.1, logisticregression__solver=liblinear; total time=   0.0s
[CV] END logisticregression__C=0.1, logisticregression__solver=liblinear; total time=   0.0s
[CV] END logisticregression__C=0.1, logisticregression__solver=lbfgs; total time=   0.0s
[CV] END logisticregression__C=0.1, logisticregression__solver=lbfgs; total time=   0.0s
[CV] END logisticregression__C=0.1, logisticregression__solver=lbfgs; total time=   0.0s
[CV] END logisticregression__C=0.1, logisticregression__solver=lbfgs; total time=   0.0s
[CV] END logisticregression__C=0.1, logisticregression__solver=lbfgs; total time=   0.0s
[CV] END logisticregression__C

['tfidf/best_logistic_regression_model.joblib']

In [28]:
# Configurer la pipeline avec TF-IDF suivi de Random Forest
pipeline_rf = make_pipeline(TfidfVectorizer(), RandomForestClassifier())

# Définir la grille de paramètres pour Random Forest
param_grid_rf = {
    "randomforestclassifier__n_estimators": [50, 100, 200],
    "randomforestclassifier__max_depth": [None, 10, 20]
}

# Configurer GridSearchCV avec le scorer personnalisé
grid_search_rf = GridSearchCV(
    pipeline_rf,
    param_grid_rf,
    cv=5,
    scoring=custom_scorer,
    verbose=2
)

# Entraîner la recherche de grille
grid_search_rf.fit(X_train, y_train)

# Enregistrer la pipeline optimisée
joblib.dump(grid_search_rf.best_estimator_, 'tfidf/best_random_forest_model.joblib')

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END randomforestclassifier__max_depth=None, randomforestclassifier__n_estimators=50; total time=   0.4s
[CV] END randomforestclassifier__max_depth=None, randomforestclassifier__n_estimators=50; total time=   0.2s
[CV] END randomforestclassifier__max_depth=None, randomforestclassifier__n_estimators=50; total time=   0.3s
[CV] END randomforestclassifier__max_depth=None, randomforestclassifier__n_estimators=50; total time=   0.3s
[CV] END randomforestclassifier__max_depth=None, randomforestclassifier__n_estimators=50; total time=   0.3s
[CV] END randomforestclassifier__max_depth=None, randomforestclassifier__n_estimators=100; total time=   0.5s
[CV] END randomforestclassifier__max_depth=None, randomforestclassifier__n_estimators=100; total time=   0.8s
[CV] END randomforestclassifier__max_depth=None, randomforestclassifier__n_estimators=100; total time=   0.8s
[CV] END randomforestclassifier__max_depth=None, randomforestclas

['tfidf/best_random_forest_model.joblib']

In [None]:

# Configurer la pipeline avec TF-IDF suivi de KNN
pipeline_knn = make_pipeline(TfidfVectorizer(), KNeighborsClassifier())

# Définir la grille de paramètres pour KNN
param_grid_knn = {
    "kneighborsclassifier__n_neighbors": [3, 5, 7],
    "kneighborsclassifier__weights": ["uniform", "distance"]
}

# Configurer GridSearchCV avec le scorer personnalisé
grid_search_knn = GridSearchCV(
    pipeline_knn,
    param_grid_knn,
    cv=5,
    scoring=custom_scorer,
    verbose=2
)

# Entraîner la recherche de grille
grid_search_knn.fit(X_train, y_train)

# Enregistrer la pipeline optimisée
joblib.dump(grid_search_knn.best_estimator_, 'tfidf/best_knn_model.joblib')

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END kneighborsclassifier__n_neighbors=3, kneighborsclassifier__weights=uniform; total time=   0.0s
[CV] END kneighborsclassifier__n_neighbors=3, kneighborsclassifier__weights=uniform; total time=   0.2s
[CV] END kneighborsclassifier__n_neighbors=3, kneighborsclassifier__weights=uniform; total time=   0.0s
[CV] END kneighborsclassifier__n_neighbors=3, kneighborsclassifier__weights=uniform; total time=   0.0s
[CV] END kneighborsclassifier__n_neighbors=3, kneighborsclassifier__weights=uniform; total time=   0.0s
[CV] END kneighborsclassifier__n_neighbors=3, kneighborsclassifier__weights=distance; total time=   0.0s
[CV] END kneighborsclassifier__n_neighbors=3, kneighborsclassifier__weights=distance; total time=   0.0s
[CV] END kneighborsclassifier__n_neighbors=3, kneighborsclassifier__weights=distance; total time=   0.0s
[CV] END kneighborsclassifier__n_neighbors=3, kneighborsclassifier__weights=distance; total time=   0.0s


['tfidf/best_knn_model.joblib']