In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to local helper modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [34]:
import os
import sys
import s3fs
import numpy as np
import fireducks.pandas as pd
import warnings
from dotenv import load_dotenv
from tqdm import tqdm
from pprint import pprint
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from gensim.models import Word2Vec, FastText

sys.path.append("../src")
from ml_utils import *

In [3]:
# Load variables from a .env file into the process environment
# (presumably where the S3 credentials read below are stored).
load_dotenv()

# Notebook-wide settings: never truncate DataFrame columns on display,
# and silence every warning.
pd.set_option("display.max_columns", None)
warnings.simplefilter("ignore")

# File-system handle on the S3-compatible endpoint. Each credential is read
# with os.environ[...], so a missing variable raises KeyError right here.
fs = s3fs.S3FileSystem(
    client_kwargs=dict(endpoint_url="https://minio.lab.sspcloud.fr"),
    key=os.environ["Accesskey"],
    secret=os.environ["Secretkey"],
    token=os.environ["Token"],
)

# Data Preprocessing

In [4]:
# Read the raw vote records from the object store.
with fs.open("elissamim/text_classification_men/data/stages-votes.json", "r") as file:
    df = pd.read_json(file)

# One row per distinct phrase, labelled with its most frequent "sol" value
# (on a tie, the first value returned by mode() is kept).
df = df.groupby("phrase_text", as_index=False)["sol"].apply(
    lambda votes: votes.mode().iloc[0]
)

# Binary target: 1 when the majority label is "ok", 0 for anything else.
df["sol"] = df["sol"].apply(lambda label: int(label == "ok"))

# Normalise the text with the project helper, then drop phrases that end up
# empty after cleaning.
df["clean_phrase_text"] = df["phrase_text"].apply(
    lambda phrase: nltk_text_preprocessing(phrase, True)
)
df = df.loc[df["clean_phrase_text"] != ""]
df.head()

Unnamed: 0,phrase_text,sol,clean_phrase_text
0,* Aider à la mise en place de l évènement Shar...,0,aider mise place évènemer shareplan envoi rapp...
1,* Comprendre le métier des achats * Comment or...,0,comprendre métier achat comment organiser appe...
2,* Fendre du bois en forêt au merlin manuelleme...,0,fendre boi forêt merlin manuellemer débarder b...
4,"2 jours au CDI , 1 jour en arts plastiques , 1...",0,2 jour cdi 1 jour art plastique 1 jour musiqu ...
5,4 jours au sein du Bureau des affaires institu...,1,4 jour sein bureau affaire institutionnel fina...


# Model selection: static embedding (sparse or dense) + classification algorithm, with grid search and cross-validation

In [39]:
# Features are the cleaned phrases; the target is the binary "sol" label.
X, y = df["clean_phrase_text"], df["sol"]

# Hold out 20% of the rows for the final evaluation; the fixed random_state
# keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [40]:
# Whitespace-tokenise the training phrases; the dense embeddings below are
# trained on the training split only.
tokenized_texts = [text.split() for text in X_train]

# Hyper-parameters shared by both dense embedding models.
# workers=1 is deliberate: gensim documents that `seed` only gives reproducible
# vectors when training runs on a single worker thread (thread scheduling
# otherwise reorders updates). Full determinism across interpreter launches
# additionally requires a fixed PYTHONHASHSEED.
dense_embedding_params = {
    "vector_size": 100,
    "window": 5,
    "min_count": 1,
    "workers": 1,
    "seed": 42,
}
word2vec_model = Word2Vec(sentences=tokenized_texts, **dense_embedding_params)
fasttext_model = FastText(sentences=tokenized_texts, **dense_embedding_params)

# Candidate text representations, keyed by the display name used in the results.
static_embedding_models = {
    # Sparse embeddings
    "Bag of Words": CountVectorizer(),
    "TF": TfidfVectorizer(use_idf=False, norm="l1"),
    "TF-IDF": TfidfVectorizer(),
    # Dense embeddings (project helper wrapping a trained gensim model)
    "Word2Vec": MeanEmbeddingVectorizer(model=word2vec_model),
    "FastText": MeanEmbeddingVectorizer(model=fasttext_model)
}

# Candidate classifiers, keyed by display name. Estimators with a stochastic
# component get an explicit random_state so that a Restart & Run All
# reproduces the same scores.
classification_models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Linear SVM": SVC(kernel="linear", probability=True, random_state=42),
    "Multinomial Naive Bayes": MultinomialNB(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
}



Static embeddings:   0%|          | 0/5 [00:00<?, ?it/s]
Classification algorithms:   0%|          | 0/5 [00:00<?, ?it/s][A
Classification algorithms:  20%|██        | 1/5 [00:01<00:06,  1.64s/it][A
Classification algorithms:  40%|████      | 2/5 [00:12<00:21,  7.29s/it][A
Classification algorithms:  60%|██████    | 3/5 [00:20<00:15,  7.57s/it][A
Classification algorithms:  80%|████████  | 4/5 [00:22<00:05,  5.10s/it][A
Classification algorithms: 100%|██████████| 5/5 [00:25<00:00,  5.12s/it][A
Static embeddings:  20%|██        | 1/5 [00:25<01:42, 25.60s/it]
Classification algorithms:   0%|          | 0/5 [00:00<?, ?it/s][A
Classification algorithms:  20%|██        | 1/5 [00:01<00:05,  1.41s/it][A
Classification algorithms:  40%|████      | 2/5 [00:12<00:21,  7.17s/it][A
Classification algorithms:  60%|██████    | 3/5 [00:20<00:15,  7.68s/it][A
Classification algorithms:  80%|████████  | 4/5 [00:22<00:05,  5.32s/it][A
Classification algorithms: 100%|██████████| 5/5 [00:35<00:

In [None]:
# Search space for a TF-IDF vectoriser registered in a pipeline under the step
# name "tfidf".
# NOTE(review): no visible cell references this grid — confirm it is still needed.
params_grid_tfidf = {
    # Fractions of documents. The upper bound must be the float 1.0: scikit-learn
    # reads an integer 1 as an absolute count ("appears in at most one document"),
    # which either keeps only single-document terms or makes the fit fail when
    # min_df resolves to more than one document.
    "tfidf__max_df": [.7, .8, .9, 1.0],
    "tfidf__min_df": [.001, .01, .1],
    "tfidf__norm": ["l1", "l2", None],
    "tfidf__sublinear_tf": [True, False],
    "tfidf__max_features": [10, 100, 1000, 10000],
    "tfidf__ngram_range": [(1, 1), (1, 2), (2, 2)]
}

In [None]:
# Cross-validated accuracy for every (embedding, classifier) pair, stored as a
# "mean ± std" string under dict_scores[embedding_name][classification_name].
dict_scores = {}

for embedding_name, embedding_model in tqdm(static_embedding_models.items(),
                                           desc="Static embeddings"):

    dict_scores[embedding_name] = {}
    is_dense_embedding = embedding_name in ["Word2Vec", "FastText"]

    for classification_name, classification_model in tqdm(classification_models.items(),
                                                         desc="Classification algorithms"):

        # Multinomial NB is not suited for dense vectors
        if is_dense_embedding and classification_name == "Multinomial Naive Bayes":
            continue

        steps = [("feature_extraction", embedding_model)]

        # For Logistic Regression and Linear SVM, and for dense embeddings, add standardisation
        if is_dense_embedding and classification_name in ["Logistic Regression", "Linear SVM"]:
            steps.append(("standardisation", StandardScaler()))

        steps.append(("classifier", classification_model))

        pipeline = Pipeline(steps)

        # Score the pipeline with k-fold cross-validation on the training split.
        # This call was missing, so `scores` was undefined on a fresh kernel.
        # NOTE(review): cv=5 and accuracy mirror the GridSearchCV settings used
        # later in this notebook — confirm they match the run that produced the
        # recorded output.
        scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="accuracy")

        dict_scores[embedding_name][classification_name] = f"{np.mean(scores):.3f} ± {np.std(scores):.3f}"

In [41]:
pprint(dict_scores)

{'Bag of Words': {'Linear SVM': '0.644 ± 0.035',
                  'Logistic Regression': '0.686 ± 0.040',
                  'Multinomial Naive Bayes': '0.674 ± 0.044',
                  'Random Forest': '0.686 ± 0.031',
                  'XGBoost': '0.680 ± 0.033'},
 'FastText': {'Linear SVM': '0.603 ± 0.003',
              'Logistic Regression': '0.597 ± 0.018',
              'Random Forest': '0.567 ± 0.028',
              'XGBoost': '0.551 ± 0.041'},
 'TF': {'Linear SVM': '0.602 ± 0.007',
        'Logistic Regression': '0.613 ± 0.020',
        'Multinomial Naive Bayes': '0.601 ± 0.004',
        'Random Forest': '0.673 ± 0.034',
        'XGBoost': '0.680 ± 0.030'},
 'TF-IDF': {'Linear SVM': '0.669 ± 0.035',
            'Logistic Regression': '0.691 ± 0.030',
            'Multinomial Naive Bayes': '0.672 ± 0.037',
            'Random Forest': '0.690 ± 0.037',
            'XGBoost': '0.677 ± 0.030'},
 'Word2Vec': {'Linear SVM': '0.589 ± 0.051',
              'Logistic Regression': '0.5

In [None]:
# Joint search space for a pipeline whose steps are named "tfidf" (vectoriser)
# and "logreg" (classifier); each key is "<step>__<parameter>".
params_grid = {
    # Fractions of documents. The upper bound must be the float 1.0: scikit-learn
    # reads an integer 1 as an absolute count ("appears in at most one document"),
    # which either keeps only single-document terms or makes the fit fail when
    # min_df resolves to more than one document.
    "tfidf__max_df": [.7, .8, .9, 1.0],
    "tfidf__min_df": [.001, .01, .1],
    "tfidf__norm": ["l1", "l2", None],
    "tfidf__sublinear_tf": [True, False],
    "tfidf__max_features": [10, 100, 1000, 10000],
    "tfidf__ngram_range": [(1, 1), (1, 2), (2, 2)],
    # Regularisation strength of the logistic regression (smaller C = stronger penalty).
    "logreg__C": [.001, .01, .1, 1, 10, 100],
    "logreg__penalty": ["l2"],
    "logreg__solver": ["lbfgs"]
}

# Model evaluation

In [42]:
# TF-IDF features feeding a logistic regression. The step names must match the
# "tfidf__" / "logreg__" prefixes used by the keys of params_grid.
pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("logreg", LogisticRegression(max_iter=1000)),
])

# Exhaustive search over params_grid: 5-fold cross-validation scored on
# accuracy, using every available core.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=params_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=0,
)
grid_search.fit(X_train, y_train)

# Winning hyper-parameters, then their mean cross-validated accuracy.
print(grid_search.best_params_, grid_search.best_score_, sep="\n")

{'logreg__C': 0.01, 'logreg__penalty': 'l2', 'logreg__solver': 'lbfgs', 'tfidf__max_df': 0.7, 'tfidf__max_features': 10000, 'tfidf__min_df': 0.001, 'tfidf__ngram_range': (1, 1), 'tfidf__norm': None, 'tfidf__sublinear_tf': False}
0.7028301886792453


['../models/tfidf_logreg_model.joblib']

In [45]:
# Predict the held-out test set with the best pipeline found by the grid search.
y_pred = grid_search.best_estimator_.predict(X_test)

# Overall accuracy followed by the per-class precision / recall / F1 report.
print(
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

# Left as the last expression so the notebook displays the confusion matrix.
confusion_matrix(y_test, y_pred)

0.6578947368421053
              precision    recall  f1-score   support

           0       0.69      0.83      0.75       166
           1       0.57      0.37      0.45       100

    accuracy                           0.66       266
   macro avg       0.63      0.60      0.60       266
weighted avg       0.64      0.66      0.64       266



array([[138,  28],
       [ 63,  37]])

In [46]:
# Serialise the best pipeline to disk; the cell output is the list of written
# file names returned by joblib.dump.
joblib.dump(
    value=grid_search.best_estimator_,
    filename="../models/tfidf_logreg_model.joblib",
)

['../models/tfidf_logreg_model.joblib']