In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import s3fs
import numpy as np
import fireducks.pandas as pd
import warnings
from dotenv import load_dotenv
from tqdm import tqdm
from pprint import pprint
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import (
                                    cross_val_score, 
                                    GridSearchCV, 
                                    train_test_split,
                                    StratifiedKFold
                                    )
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from gensim.models import Word2Vec, FastText

sys.path.append("../src")
from ml_utils import *

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/onyxia/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
load_dotenv()
pd.set_option("display.max_columns", None)
warnings.simplefilter("ignore")
fs = s3fs.S3FileSystem(
            client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
            key=os.environ["Accesskey"],
            secret=os.environ["Secretkey"],
            token=os.environ["Token"]
)

# Data Preprocessing

In [4]:
with fs.open("elissamim/text_classification_men/data/stages-votes.json", "r") as file:
    df = pd.read_json(file)

df = df.groupby("phrase_text", as_index = False)["sol"].apply(lambda x: x.mode().iloc[0])
df["sol"]=df["sol"].apply(lambda x: 1 if x == "ok" else 0)
df["clean_phrase_text"] = df["phrase_text"].apply(lambda x: nltk_text_preprocessing(x, True))
df = df[df["clean_phrase_text"] != ""]
df.head()

Unnamed: 0,phrase_text,sol,clean_phrase_text
0,* Aider à la mise en place de l évènement Shar...,0,aider mise place évènemer shareplan envoi rapp...
1,* Comprendre le métier des achats * Comment or...,0,comprendre métier achat comment organiser appe...
2,* Fendre du bois en forêt au merlin manuelleme...,0,fendre boi forêt merlin manuellemer débarder b...
4,"2 jours au CDI , 1 jour en arts plastiques , 1...",0,2 jour cdi 1 jour art plastique 1 jour musiqu ...
5,4 jours au sein du Bureau des affaires institu...,1,4 jour sein bureau affaire institutionnel fina...


# Model selection (static embedding (sparse or dense) + classification algorithm) with grid search and cross validation

In [5]:
X = df["clean_phrase_text"]
y = df["sol"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 42)

In [6]:
tokenized_texts = [text.split() for text in X_train]
word2vec_model = Word2Vec(sentences = tokenized_texts,
                         vector_size = 100,
                         window = 5,
                         min_count = 1,
                         workers = 4,
                         seed = 42)
fasttext_model = FastText(sentences = tokenized_texts,
                         vector_size = 100,
                         window = 5,
                         min_count = 1,
                         workers = 4,
                         seed = 42)

static_embedding_models = {
    # Sparse embeddings
    "Bag of Words":CountVectorizer(),
    "TF":TfidfVectorizer(use_idf=False),
    "TF-IDF":TfidfVectorizer(),
    # Dense embeddings
    "Word2Vec": MeanEmbeddingVectorizer(model=word2vec_model),
    "FastText": MeanEmbeddingVectorizer(model=fasttext_model)
}

classification_models = {
    "Logistic Regression":LogisticRegression(),
    "Random Forest":RandomForestClassifier(),
    "Linear SVM":SVC(kernel="linear", probability=True),
    "Multinomial Naive Bayes":MultinomialNB(),
    "XGBoost":XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

In [7]:
params_grid = {
    "Bag of Words" : {
        "feature_extraction__ngram_range":[(1,1), (1,2), (2,2)],
        "feature_extraction__max_df":[.7, .9, 1],
        "feature_extraction__min_df":[.001, .01, .1],
        "feature_extraction__max_features":[10, 100, 1000, 10000],
        "feature_extraction__binary":[True, False]
    },
    "TF" : {
        "feature_extraction__max_df":[.7, .8, .9, 1],
        "feature_extraction__min_df":[.001, .01, .1],
        "feature_extraction__norm":["l1", "l2", None],
        "feature_extraction__sublinear_tf":[True, False],
        "feature_extraction__max_features":[10, 100, 1000, 10000],
        "feature_extraction__ngram_range":[(1,1), (1,2), (2,2)]
    },
    "TF-IDF" : {
        "feature_extraction__max_df":[.7, .8, .9, 1],
        "feature_extraction__min_df":[.001, .01, .1],
        "feature_extraction__norm":["l1", "l2", None],
        "feature_extraction__sublinear_tf":[True, False],
        "feature_extraction__max_features":[10, 100, 1000, 10000],
        "feature_extraction__ngram_range":[(1,1), (1,2), (2,2)]
    },
    "Logistic Regression":{
        "classifier__C":[.001, .01, .1, 1, 10, 100],
        "classifier__penalty":["l2"],
        "classifier__solver":["lbfgs"]
    },
    "Random Forest":{
        "classifier__n_estimators":[100,300,500],
        "classifier__max_depth":[None, 10, 20, 50],
        "classifier__min_samples_split": [2, 5, 10],
        "classifier__min_samples_leaf":[1, 2, 5, 10],
        "classifier__max_features":["sqrt", "log2", None],
        "classifier__bootstrap":[True, False]
    },
    "Linear SVM":{
        "classifier__C":[.001, .01, .1, 1, 10, 100]
    },
    "Multinomial Naive Bayes":{
        "classifier__alpha":[.0001, .001, .01, .1, 1, 10],
        "classifier__fit_prior":[True, False]
    },
    "XGBoost":{
        "classifier__n_estimators":[100, 200, 300, 500],
        "classifier__max_depth":[3, 5, 7, 10],
        "classifier__learning_rate":[.01, .05, .1, .3],
        "classifier__subsample":[.5, .7, .8, 1],
        "classifier__colsample_bytree":[.5, .7, .8, 1],
        "classifier__gamma":[0, .1, .5, 1],
        "classifier__reg_alpha":[0, .01, .1, 1],
        "classifier__reg_lambda":[.1, 1, 10]
    }
}

In [None]:
dict_scores, dict_params = {}, {}

for embedding_name, embedding_model in tqdm(static_embedding_models.items(),
                                           desc="Static embeddings"):

    dict_scores[embedding_name], dict_params[embedding_name] = {}, {}
    
    for classification_name, classification_model in tqdm(classification_models.items(),
                                                         desc="Classification algorithms"):

        # Multinomial NB is not suited for dense vectors
        if embedding_name in ["Word2Vec", "FastText"] and classification_name == "Multinomial Naive Bayes":
            continue

        steps = [("feature_extraction", embedding_model)]

        # For Logistic Regression and Linear SVM, and for dense embeddings, add standardisation
        if embedding_name in ["Word2Vec", "FastText"] and classification_name in ["Logistic Regression", "Linear SVM"]:
            steps.append(("standardisation", StandardScaler()))

        steps.append(("classifier", classification_model))

        pipeline = Pipeline(steps)

        # We compute scores using Grid Search on the parameter grid to do model selection with best
        # hyperparameters
        cv = StratifiedKFold(n_splits=5, 
                                 shuffle = True, 
                                 random_state=42)
        
        if (embedding_name in params_grid) or (classification_name in params_grid):

            selected_params_grid = {
                **params_grid.get(embedding_name, {}),
                **params_grid.get(classification_name, {})
            }
            
            grid_search = GridSearchCV(
                pipeline,
                param_grid=selected_params_grid,
                cv=cv,
                scoring="accuracy",
                n_jobs=-1,
                verbose=0
            )
            
            grid_search.fit(X_train, y_train)
            
            score = grid_search.best_score_
            dict_params[embedding_name][classification_name] = grid_search.best_params_

        else:

            scores = cross_val_score(
                pipeline,
                X_train, 
                y_train,
                cv=cv,
                scoring="accuracy"
            )

            score = np.mean(scores)
            
        dict_scores[embedding_name][classification_name] = score

Static embeddings:   0%|          | 0/5 [00:00<?, ?it/s]
Classification algorithms:   0%|          | 0/5 [00:00<?, ?it/s][A

In [None]:
pprint(dict_scores)

# Model evaluation

In [42]:
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("logreg", LogisticRegression(max_iter=1000))
])



grid_search = GridSearchCV(
    pipeline,
    params_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=0
)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

{'logreg__C': 0.01, 'logreg__penalty': 'l2', 'logreg__solver': 'lbfgs', 'tfidf__max_df': 0.7, 'tfidf__max_features': 10000, 'tfidf__min_df': 0.001, 'tfidf__ngram_range': (1, 1), 'tfidf__norm': None, 'tfidf__sublinear_tf': False}
0.7028301886792453


['../models/tfidf_logreg_model.joblib']

In [45]:
y_pred = grid_search.best_estimator_.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

0.6578947368421053
              precision    recall  f1-score   support

           0       0.69      0.83      0.75       166
           1       0.57      0.37      0.45       100

    accuracy                           0.66       266
   macro avg       0.63      0.60      0.60       266
weighted avg       0.64      0.66      0.64       266



array([[138,  28],
       [ 63,  37]])

In [46]:
joblib.dump(grid_search.best_estimator_,
            "../models/tfidf_logreg_model.joblib")

['../models/tfidf_logreg_model.joblib']