In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import s3fs
import numpy as np
import fireducks.pandas as pd
import warnings
from dotenv import load_dotenv
from tqdm import tqdm
from pprint import pprint

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from gensim.models import Word2Vec, FastText

sys.path.append("../src")
from ml_utils import *

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/onyxia/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [9]:
load_dotenv()
pd.set_option("display.max_columns", None)
warnings.simplefilter("ignore")
fs = s3fs.S3FileSystem(
            client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
            key=os.environ["Accesskey"],
            secret=os.environ["Secretkey"],
            token=os.environ["Token"]
)

In [10]:
with fs.open("elissamim/text_classification_men/data/stages-votes.json", "r") as file:
    df = pd.read_json(file)

df = df.groupby("phrase_text", as_index = False)["sol"].apply(lambda x: x.mode().iloc[0])
df["sol"]=df["sol"].apply(lambda x: 1 if x == "ok" else 0)
df["clean_phrase_text"] = df["phrase_text"].apply(lambda x: nltk_text_preprocessing(x, True))
df = df[df["clean_phrase_text"] != ""]
df.head()

Unnamed: 0,phrase_text,sol,clean_phrase_text
0,* Aider à la mise en place de l évènement Shar...,0,aider mise place évènemer shareplan envoi rapp...
1,* Comprendre le métier des achats * Comment or...,0,comprendre métier achat comment organiser appe...
2,* Fendre du bois en forêt au merlin manuelleme...,0,fendre boi forêt merlin manuellemer débarder b...
4,"2 jours au CDI , 1 jour en arts plastiques , 1...",0,2 jour cdi 1 jour art plastique 1 jour musiqu ...
5,4 jours au sein du Bureau des affaires institu...,1,4 jour sein bureau affaire institutionnel fina...


In [25]:
X = df["clean_phrase_text"]
y = df["sol"]

static_embedding_models = {
    "Bag of Words":CountVectorizer(),
    "TF":TfidfVectorizer(use_idf=False, norm = "l1"),
    "TF-IDF":TfidfVectorizer(),
    "Word2Vec": ,
    "FastText": 
}

classification_models = {
    "Logistic Regression":LogisticRegression(),
    "Random Forest":RandomForestClassifier(),
    "Linear SVM":SVC(kernel="linear", probability=True),
    "Multinomial Naive Bayes":MultinomialNB(),
    "XGBoost":XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

dict_scores = {}

for embedding_name, embedding_model in tqdm(static_embedding_models.items(),
                                           desc="Static embeddings"):

    dict_scores[embedding_name] = {}
    
    for classification_name, classification_model in tqdm(classification_models.items(),
                                                         desc="Classification algorithms"):
        
        pipeline = Pipeline(
            [
                ("feature_extraction", embedding_model),
                ("classifier", classification_model)
            ]
        )

        scores = cross_val_score(pipeline, X, y, cv=10, scoring="accuracy")

        dict_scores[embedding_name][classification_name] = f"{np.mean(scores):.3f} ± {np.std(scores):.3f}"

Static embeddings:   0%|          | 0/3 [00:00<?, ?it/s]
Classification algorithms:   0%|          | 0/5 [00:00<?, ?it/s][A
Classification algorithms:  20%|██        | 1/5 [00:08<00:35,  8.85s/it][A
Classification algorithms:  40%|████      | 2/5 [00:44<01:13, 24.62s/it][A
Classification algorithms:  60%|██████    | 3/5 [01:11<00:51, 25.70s/it][A
Classification algorithms:  80%|████████  | 4/5 [01:16<00:17, 17.37s/it][A
Classification algorithms: 100%|██████████| 5/5 [02:15<00:00, 27.18s/it][A
Static embeddings:  33%|███▎      | 1/3 [02:15<04:31, 135.90s/it]
Classification algorithms:   0%|          | 0/5 [00:00<?, ?it/s][A
Classification algorithms:  20%|██        | 1/5 [00:05<00:23,  5.97s/it][A
Classification algorithms:  40%|████      | 2/5 [00:35<00:59, 19.95s/it][A
Classification algorithms:  60%|██████    | 3/5 [00:58<00:42, 21.11s/it][A
Classification algorithms:  80%|████████  | 4/5 [01:02<00:14, 14.47s/it][A
Classification algorithms: 100%|██████████| 5/5 [02:21<00

In [26]:
pprint(dict_scores)

{'Bag of Words': {'Linear SVM': '0.609 ± 0.023',
                  'Logistic Regression': '0.655 ± 0.029',
                  'Multinomial Naive Bayes': '0.614 ± 0.052',
                  'Random Forest': '0.633 ± 0.026',
                  'XGBoost': '0.635 ± 0.023'},
 'TF': {'Linear SVM': '0.605 ± 0.012',
        'Logistic Regression': '0.604 ± 0.019',
        'Multinomial Naive Bayes': '0.606 ± 0.004',
        'Random Forest': '0.638 ± 0.023',
        'XGBoost': '0.651 ± 0.040'},
 'TF-IDF': {'Linear SVM': '0.629 ± 0.033',
            'Logistic Regression': '0.659 ± 0.032',
            'Multinomial Naive Bayes': '0.621 ± 0.024',
            'Random Forest': '0.650 ± 0.027',
            'XGBoost': '0.623 ± 0.036'}}
