## Modélisation supervisée ##

Je passe maintenant à une approche supervisée, qui utilise directement les tags du dataset pour entraîner un modèle prédictif.  
Je choisis d'utiliser un simple réseau de neurones à une couche cachée.

### Importation et fonctions ###


#### Environnement de travail ####

In [76]:
# Générique
import random
from collections import Counter
import time
import joblib

# Manipulation de données
import pandas as pd
import numpy as np

# NLP
from sklearn.preprocessing import MultiLabelBinarizer
import torch

# MLOps
import mlflow

# Modèlisation
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import RMSprop
from sklearn.metrics import f1_score, precision_score, recall_score, hamming_loss, make_scorer
from sklearn.model_selection import cross_val_predict, KFold
import sklearn.base
from scipy.optimize import minimize_scalar
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV



# DataViz
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Project modules
from config.paths import DATA_DIR

In [77]:
# Initialisation de MLFlow

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("NLP StackOverflow Tagging")

<Experiment: artifact_location='mlflow-artifacts:/963439363743810932', creation_time=1727610300479, experiment_id='963439363743810932', last_update_time=1727610300479, lifecycle_stage='active', name='NLP StackOverflow Tagging', tags={}>

#### Importation des données ###

In [78]:
X_train = pd.read_csv(f'{DATA_DIR}/gold/x_train.csv').combined_text
X_test = pd.read_csv(f'{DATA_DIR}/gold/x_test.csv').combined_text
y_train = pd.read_csv(f'{DATA_DIR}/gold/y_train.csv').processed_tags
y_test = pd.read_csv(f'{DATA_DIR}/gold/y_test.csv').processed_tags
X = pd.concat([X_train, X_test], ignore_index=True)
y = pd.concat([y_train, y_test], ignore_index=True)


In [79]:
embeddings_bert_train = np.mean(np.load(f'{DATA_DIR}/gold/embeddings_bert_train.npy'), axis=1)
embeddings_bert_test = np.mean(np.load(f'{DATA_DIR}/gold/embeddings_bert_test.npy'), axis=1)
embeddings_stella_train = np.load(f'{DATA_DIR}/gold/embeddings_stella_train.npy')
embeddings_stella_test = np.load(f'{DATA_DIR}/gold/embeddings_stella_test.npy')
embeddings_use_train = np.load(f'{DATA_DIR}/gold/embeddings_use_train.npy')
embeddings_use_test = np.load(f'{DATA_DIR}/gold/embeddings_use_test.npy')
embeddings_word2vec_train = np.load(f'{DATA_DIR}/gold/embeddings_word2vec_train.npy')
embeddings_word2vec_test = np.load(f'{DATA_DIR}/gold/embeddings_word2vec_test.npy')


In [80]:
X.head()

0    python difference difference class child def _...
1    sql server issue query declare int insert cust...
2    interface know view uiviewcontroller ivar prop...
3    wpf allow user images method within richtextbo...
4    database user linq asp net mvc app must write ...
Name: combined_text, dtype: object

In [81]:
y.head()

0    python,oop,inheritance,multiple-inheritance,super
1     sql,sql-server,database,sql-server-2005,identity
2    iphone,cocoa-touch,uiview,uiviewcontroller,int...
3                     wpf,image,resize,richtextbox,rtf
4    c#,asp.net-mvc,linq,asp.net-membership,membership
Name: processed_tags, dtype: object

#### Définition des fonctions ####

In [82]:
def log_params_mlflow(model, model_name, embedder_name, epochs=None, batch_size=None):

    mlflow.log_param("model_name", model_name)
    mlflow.log_param("embedder_name", embedder_name)
    
    if isinstance(model, sklearn.base.BaseEstimator):
        params = model.get_params()
        for param_name, param_value in params.items():
            mlflow.log_param(param_name, param_value)
    
    elif isinstance(model, Sequential):
        optimizer = model.optimizer.__class__.__name__
        learning_rate = model.optimizer.learning_rate.numpy() if hasattr(model.optimizer.learning_rate, 'numpy') else model.optimizer.learning_rate
        mlflow.log_param("optimizer", optimizer)
        mlflow.log_param("learning_rate", learning_rate)

        for i, layer in enumerate(model.layers):
            if isinstance(layer, Dense):
                mlflow.log_param(f"layer_{i}_neurons", layer.units)
            if isinstance(layer, Dropout):
                mlflow.log_param(f"layer_{i}_dropout_rate", layer.rate)
        
        if epochs:
            mlflow.log_param("epochs", epochs)
        if batch_size:
            mlflow.log_param("batch_size", batch_size)
    
    else:
        print(f"Type de modèle non pris en charge pour le logging automatique : {type(model)}")


def log_metrics_mlflow(f1, precision, recall, hamming_loss, exact_match_ratio, exec_time):
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("hamming_loss", hamming_loss)
    mlflow.log_metric("exact_match_ratio", exact_match_ratio)
    mlflow.log_metric("execution_time", exec_time)

def log_model_mlflow(model, model_name):
    if isinstance(model, sklearn.base.BaseEstimator):
        mlflow.sklearn.log_model(model, artifact_path=f"{model_name}_model")
    elif isinstance(model, Sequential):
        mlflow.tensorflow.log_model(model, artifact_path=f"{model_name}_model")
    else:
        raise ValueError("Le modèle fourni n'est ni un modèle scikit-learn, ni un modèle TensorFlow.")




In [83]:
def multi_label_binarizer(corpus, sep=' '):
    corpus_list = corpus.apply(lambda x: x.split(sep))
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(corpus_list)
    tags_binarized_df = pd.DataFrame(y, columns=mlb.classes_)
    return tags_binarized_df

def define_neural_network(X, y):

    # Définition du réseau de neurones
    model = Sequential()
    model.add(Dense(256, input_shape=(X.shape[1],), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(y.shape[1], activation='sigmoid'))

    # Compilation du modèle
    optimizer = RMSprop(learning_rate=0.001)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    model.summary()

    return model

def create_model(neurons=64, dropout_rate=0.5, learning_rate=0.001, input_dim=None, output_dim=None):
    """Création du modèle avec des hyperparamètres variables."""
    model = Sequential()
    model.add(Dense(neurons, input_shape=(input_dim,), activation='relu'))  # Utiliser input_dim ici
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons // 2, activation='relu'))
    model.add(Dense(output_dim, activation='sigmoid'))  # Utiliser output_dim pour la sortie
    
    optimizer = RMSprop(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

def optimize_neural_net(X, y, param_dist=None, cv=3, epochs=20):
    
    input_dim = X.shape[1] 
    output_dim = y.shape[1]

    model = KerasClassifier(model=create_model, input_dim=input_dim, output_dim=output_dim, epochs=epochs, batch_size=32, verbose=0)
    
    if param_dist is None:
        param_dist = {
            'model__neurons': [64, 128, 256],
            'model__dropout_rate': [0.3, 0.5, 0.7],
            'model__learning_rate': [0.001, 0.0001, 0.00001],
            'batch_size': [32, 64, 128]
        }

    random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=cv, scoring='f1_micro')
    random_search.fit(X, y)
    
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_

    return best_model, best_params


def find_optimal_threshold(y_true, y_pred_prob):

    def f1_for_threshold(threshold):
        y_pred = (y_pred_prob >= threshold).astype(int)
        return -f1_score(y_true, y_pred, average='micro')
    result = minimize_scalar(f1_for_threshold, bounds=(0, 1), method='bounded')
    return result.x 

def cross_validate_model(model, X, y, cv=5, epochs=20, batch_size=512, model_name="model", embedder_name="unknown_embedder"):
    
    run_name = f"{model_name}_{embedder_name}_run"
    start_time = time.time()

    with mlflow.start_run(run_name=run_name):
    
        if isinstance(model, sklearn.base.BaseEstimator):

            log_params_mlflow(model, model_name, embedder_name)

            y_pred_prob = cross_val_predict(model, X, y, cv=cv, method='predict_proba')
            
            optimal_threshold = find_optimal_threshold(y, y_pred_prob)
            y_pred_binary = (y_pred_prob >= optimal_threshold).astype(int)
        
        elif isinstance(model, Sequential):

            log_params_mlflow(model, model_name, embedder_name, epochs, batch_size)

            if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
                y = y.reset_index(drop=True).to_numpy()

            kf = KFold(n_splits=cv, shuffle=True, random_state=42)
            y_pred_prob = np.zeros(y.shape)
            for train_index, test_index in kf.split(X):
                X_train_cv, X_test_cv = X[train_index], X[test_index]
                y_train_cv, y_test_cv = y[train_index], y[test_index]
                
                model.fit(X_train_cv, y_train_cv, epochs=epochs, batch_size=batch_size, verbose=0)
                y_pred_prob[test_index] = model.predict(X_test_cv)

                optimal_threshold = find_optimal_threshold(y, y_pred_prob)
                y_pred_binary = (y_pred_prob >= optimal_threshold).astype(int)

        else:
            raise ValueError("Le modèle fourni n'est ni un modèle scikit-learn, ni un modèle TensorFlow.")
        
        exec_time = time.time() - start_time
        f1 = f1_score(y, y_pred_binary, average='micro')
        precision = precision_score(y, y_pred_binary, average='micro')
        recall = recall_score(y, y_pred_binary, average='micro')
        h_loss = hamming_loss(y, y_pred_binary)
        exact_match_ratio = np.all(y == y_pred_binary, axis=1).mean()

        metrics = {
            "f1_score": f1,
            "precision": precision,
            "recall": recall,
            "hamming_loss": h_loss,
            "exact_match_ratio": exact_match_ratio,
            "execution_time": exec_time
        }

        log_metrics_mlflow(f1, precision, recall, h_loss, exact_match_ratio, exec_time)
        log_model_mlflow(model, model_name)

    return metrics


In [130]:
def evaluate_against_test_set(model, X_test, y_test, model_name, embedder_name, epochs=None, batch_size=None):
    run_name = f"{model_name}_{embedder_name}_test_evaluation"
    start_time = time.time()
    
    with mlflow.start_run(run_name=run_name):
        
        log_params_mlflow(model, model_name, embedder_name, epochs, batch_size)

        if isinstance(model, sklearn.base.BaseEstimator):
            y_pred_prob = model.predict_proba(X_test)
        elif isinstance(model, Sequential):
            y_pred_prob = model.predict(X_test)
        else:
            raise ValueError("Modèle non pris en charge pour l'évaluation")

        optimal_threshold = find_optimal_threshold(y_test, y_pred_prob)
        y_pred_binary = (y_pred_prob >= optimal_threshold).astype(int)

        f1 = f1_score(y_test, y_pred_binary, average='micro')
        precision = precision_score(y_test, y_pred_binary, average='micro')
        recall = recall_score(y_test, y_pred_binary, average='micro')
        h_loss = hamming_loss(y_test, y_pred_binary)
        exact_match_ratio = np.all(y_test == y_pred_binary, axis=1).mean()
        exec_time = time.time() - start_time

        log_metrics_mlflow(f1, precision, recall, h_loss, exact_match_ratio, exec_time)

        if isinstance(model, Sequential): 
            log_model_mlflow(model, model_name)
        elif isinstance(model, sklearn.base.BaseEstimator):
            joblib.dump(model, f"{model_name}.joblib")
            mlflow.log_artifact(f"{model_name}.joblib")
        
    # Affichage des résultats
    print(f"F1 Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"Exact Match Ratio: {exact_match_ratio}")
    print(f"Execution Time: {exec_time} seconds")


### Préparation des données ###

Pour ce modèle, je vais utiliser:
- Pour **X**, une matrice de features sous forme d'embeddings
- Pour **y**, une matrice de labels binarisée qui indique la présence ou non d'un tag dans le document

Je veux d'abord filtrer les données sur les tags les plus fréquents (top 100), pour éviter les outliers et l'overfitting.

In [85]:
y_train

0       python,oop,inheritance,multiple-inheritance,super
1        sql,sql-server,database,sql-server-2005,identity
2       iphone,cocoa-touch,uiview,uiviewcontroller,int...
3                        wpf,image,resize,richtextbox,rtf
4       c#,asp.net-mvc,linq,asp.net-membership,membership
                              ...                        
7995                       javascript,gwt,html,canvas,gxt
7996                  linux,gcc,linker,pthreads,libstdc++
7997                      asp.net,css,iis,xhtml,font-face
7998               iphone,objective-c,object,release,null
7999                             http,file,vba,forms,post
Name: processed_tags, Length: 8000, dtype: object

In [86]:
tags_list = y.str.split(',')
tag_counter = Counter(tag for tags in tags_list for tag in tags)
top_100_tags = set([tag for tag, _ in tag_counter.most_common(62)])
y = tags_list.apply(lambda tags: ','.join([tag for tag in tags if tag in top_100_tags]))

In [87]:
len(top_100_tags)

62

In [88]:
y_train

0       python,oop,inheritance,multiple-inheritance,super
1        sql,sql-server,database,sql-server-2005,identity
2       iphone,cocoa-touch,uiview,uiviewcontroller,int...
3                        wpf,image,resize,richtextbox,rtf
4       c#,asp.net-mvc,linq,asp.net-membership,membership
                              ...                        
7995                       javascript,gwt,html,canvas,gxt
7996                  linux,gcc,linker,pthreads,libstdc++
7997                      asp.net,css,iis,xhtml,font-face
7998               iphone,objective-c,object,release,null
7999                             http,file,vba,forms,post
Name: processed_tags, Length: 8000, dtype: object

Je supprime les observations qui n'ont plus de tags associés (car leurs tags n'étaient pas dans le top 100).

In [89]:
train_observations_number = len(embeddings_bert_train)

In [90]:
y_train = y[:train_observations_number][y != '']
y_test = y[train_observations_number:][y != '']

In [91]:
y_test

8000                          c#,wpf
8001                        java,c++
8002                     python,ruby
8003                              c#
8004    javascript,internet-explorer
                    ...             
9994                     c++,eclipse
9995                        php,http
9996                             wpf
9997                          python
9999                             wpf
Name: processed_tags, Length: 1792, dtype: object

In [92]:
X_train

0       python difference difference class child def _...
1       sql server issue query declare int insert cust...
2       interface know view uiviewcontroller ivar prop...
3       wpf allow user images method within richtextbo...
4       database user linq asp net mvc app must write ...
                              ...                        
7995    two trying two html5 canvas able effect absolu...
7996    create shared object linux gcc create shared o...
7997    font working client site css code font face fo...
7998    c iphone set object references nil developing ...
7999    draw send binary text file http post request s...
Name: combined_text, Length: 8000, dtype: object

In [93]:
X_test

0       wpf pattern started learning mvvm pattern wpf ...
1       create every object order implement c programm...
2       data simple way common unix scripting language...
3       c new win7 wmi please advice see active access...
4       access error details browsers way access webpa...
                              ...                        
1995    sending response framework sending files actio...
1996    completely code wpf try add flip animation use...
1997    problem building library python inherited pyth...
1998    data simple sql server services microsoft inst...
1999    xaml bind element element base bind element de...
Name: combined_text, Length: 2000, dtype: object

In [94]:
train_indices = list(y_train.index)
test_indices = [i - train_observations_number for i in y_test.index]

In [95]:
X_train = X_train[X_train.index.isin(list(y_train.index))]
X_test = X_test[X_test.index.isin(test_indices)]

Je supprime aussi les embeddings associés à ces observations.

In [96]:
X_train

0       python difference difference class child def _...
1       sql server issue query declare int insert cust...
2       interface know view uiviewcontroller ivar prop...
3       wpf allow user images method within richtextbo...
4       database user linq asp net mvc app must write ...
                              ...                        
7995    two trying two html5 canvas able effect absolu...
7996    create shared object linux gcc create shared o...
7997    font working client site css code font face fo...
7998    c iphone set object references nil developing ...
7999    draw send binary text file http post request s...
Name: combined_text, Length: 7144, dtype: object

In [97]:
len(embeddings_bert_train)

8000

In [98]:
embeddings_bert_train = embeddings_bert_train[train_indices]
embeddings_word2vec_train = embeddings_word2vec_train[train_indices]
embeddings_use_train = embeddings_use_train[train_indices]
embeddings_bert_test = embeddings_bert_test[test_indices]
embeddings_stella_train = embeddings_stella_train[train_indices]
embeddings_stella_test = embeddings_stella_test[test_indices]
embeddings_word2vec_test = embeddings_word2vec_test[test_indices]
embeddings_use_test = embeddings_use_test[test_indices]


In [99]:
len(embeddings_stella_train)

7144

Je peux maintenant générer les matrices multi-label binarisées.

In [100]:
# Liste de labels binarisés
y_mlb_train = multi_label_binarizer(y_train, sep=',')
y_mlb_test = multi_label_binarizer(y_test, sep=',')

In [101]:
joblib.dump(list(y_mlb_train.columns), f'{DATA_DIR}/gold/tags_list.pkl')

['/Users/gaspardhurez/csprojects/repos/openclassrooms/p5-stackoverflow-tagging-system/src/datasets/gold/tags_list.pkl']

In [102]:
y_mlb_train

Unnamed: 0,actionscript-3,ajax,algorithm,apache-flex,arrays,asp.net,asp.net-mvc,c,c#,c++,...,vb.net,visual-studio,visual-studio-2008,web-services,winapi,windows,winforms,wpf,xcode,xml
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7139,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7140,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7141,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7142,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Test de différents modèles ###

Les données sont prêtes à être modélisées; je vais maintenant entraîner plusieurs algorithmes, et les comparer.

In [103]:
all_metrics = {}

In [104]:
log_reg = OneVsRestClassifier(LogisticRegression(max_iter=1000))

In [105]:
log_reg_use_metrics = cross_validate_model(log_reg, embeddings_use_train, y_mlb_train, model_name='log_reg', embedder_name='use')
all_metrics['log_reg_use'] = log_reg_use_metrics

2024/10/04 10:31:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run log_reg_use_run at: http://localhost:5000/#/experiments/963439363743810932/runs/57cf9bcf632e4a5cbace13ed645d6aa6.
2024/10/04 10:31:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/963439363743810932.


In [106]:
log_reg_bert_metrics = cross_validate_model(log_reg, embeddings_bert_train, y_mlb_train, model_name='log_reg', embedder_name='bert')
all_metrics['log_reg_bert'] = log_reg_bert_metrics

2024/10/04 10:34:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run log_reg_bert_run at: http://localhost:5000/#/experiments/963439363743810932/runs/5914918d888f4621b6149784d9730f2f.
2024/10/04 10:34:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/963439363743810932.


In [107]:
# log_reg_stella_metrics = cross_validate_model(log_reg, embeddings_stella_train, y_mlb_train)
# all_metrics['log_reg_stella'] = log_reg_stella_metrics

In [108]:
log_reg_word2vec_metrics = cross_validate_model(log_reg, embeddings_word2vec_train, y_mlb_train, model_name='log_reg', embedder_name='word2vec')
all_metrics['log_reg_word2vec'] = log_reg_word2vec_metrics

2024/10/04 10:34:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run log_reg_word2vec_run at: http://localhost:5000/#/experiments/963439363743810932/runs/943a1c7b10f145e2bb32322ae6a94ca0.
2024/10/04 10:34:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/963439363743810932.


Je teste maintenant un réseau de neurones à plusieurs couches.

In [109]:
bert_neural_net = define_neural_network(embeddings_bert_train, y_mlb_train)
neural_net_bert_metrics = cross_validate_model(bert_neural_net, embeddings_bert_train, y_mlb_train, model_name='neural_net', embedder_name='bert')
all_metrics['neural_net_bert'] = neural_net_bert_metrics

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 528us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 465us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 535us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 468us/step


2024/10/04 10:34:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run neural_net_bert_run at: http://localhost:5000/#/experiments/963439363743810932/runs/693460b242164a808176d26d6f9f9a7f.
2024/10/04 10:34:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/963439363743810932.


In [110]:
use_neural_net = define_neural_network(embeddings_use_train, y_mlb_train)
neural_net_use_metrics = cross_validate_model(use_neural_net, embeddings_use_train, y_mlb_train, model_name='neural_net', embedder_name='use')
all_metrics['neural_net_use'] = neural_net_use_metrics

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 821us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 455us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 441us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 450us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


2024/10/04 10:34:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run neural_net_use_run at: http://localhost:5000/#/experiments/963439363743810932/runs/f9724d652b2d4611ad52d640de0e2455.
2024/10/04 10:34:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/963439363743810932.


In [111]:
stella_neural_net = define_neural_network(embeddings_stella_train, y_mlb_train)
neural_net_stella_metrics = cross_validate_model(stella_neural_net, embeddings_stella_train, y_mlb_train, model_name='neural_net', embedder_name='stella')
all_metrics['neural_net_stella'] = neural_net_stella_metrics

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 964us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 550us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 470us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 485us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 532us/step


2024/10/04 10:35:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run neural_net_stella_run at: http://localhost:5000/#/experiments/963439363743810932/runs/bd1bb2d3802245b8ad9e9911ac9a2692.
2024/10/04 10:35:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/963439363743810932.


In [112]:
word2vec_neural_net = define_neural_network(embeddings_word2vec_train, y_mlb_train)
neural_net_word2vec_metrics = cross_validate_model(word2vec_neural_net, embeddings_word2vec_train, y_mlb_train, model_name='neural_net', embedder_name='word2vec')
all_metrics['neural_net_word2vec'] = neural_net_word2vec_metrics

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 777us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 428us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 355us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 436us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 390us/step


2024/10/04 10:35:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run neural_net_word2vec_run at: http://localhost:5000/#/experiments/963439363743810932/runs/ab685f40d19f4a14a60be74d54b22f01.
2024/10/04 10:35:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/963439363743810932.


In [113]:
all_metrics

{'log_reg_use': {'f1_score': 0.6057345181893918,
  'precision': 0.5831049923143755,
  'recall': 0.6301914048392921,
  'hamming_loss': 0.025642993895170323,
  'exact_match_ratio': 0.20282754759238522,
  'execution_time': 27.9515118598938},
 'log_reg_bert': {'f1_score': 0.4361403882323716,
  'precision': 0.4287009906515976,
  'recall': 0.4438425424340917,
  'hamming_loss': 0.03587264747317848,
  'exact_match_ratio': 0.09420492721164614,
  'execution_time': 166.42875814437866},
 'log_reg_word2vec': {'f1_score': 0.4527804976871391,
  'precision': 0.42512046664975905,
  'recall': 0.48429035752979416,
  'hamming_loss': 0.03659059711736445,
  'exact_match_ratio': 0.0977043673012318,
  'execution_time': 14.391094207763672},
 'neural_net_bert': {'f1_score': 0.25692222835079975,
  'precision': 0.2078072719161276,
  'recall': 0.3364391477067533,
  'hamming_loss': 0.0608315572734169,
  'exact_match_ratio': 0.009098544232922732,
  'execution_time': 10.824025630950928},
 'neural_net_use': {'f1_score

Deux modèles ressortent: La régression logistique avec les embeddings de USE, et le réseau de neurones avec l'embedder Stella. Je vais donc chercher à optimiser les hyperparamètres des deux modèles.

### Evaluation sur le test set ###

In [114]:
best_nn_model, best_nn_params = optimize_neural_net(embeddings_stella_train, y_mlb_train)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **

In [115]:
best_nn_params

{'model__neurons': 256,
 'model__learning_rate': 0.001,
 'model__dropout_rate': 0.7,
 'batch_size': 128}

In [116]:
best_nn_model

In [117]:
evaluate_against_test_set(best_nn_model, embeddings_stella_test, y_mlb_test, model_name="neural_net", embedder_name="stella")

2024/10/04 10:36:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run neural_net_stella_test_evaluation at: http://localhost:5000/#/experiments/963439363743810932/runs/6212bd0faab147ec8374eac259e75ad6.
2024/10/04 10:36:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/963439363743810932.


F1 Score: 0.6416103603603603
Precision: 0.6225075116088501
Recall: 0.6619227417949463
Exact Match Ratio: 0.24386160714285715
Execution Time: 0.40903806686401367 seconds


In [122]:
best_nn_model.fit(embeddings_stella_train, y_mlb_train, epochs=100, batch_size=32)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [126]:
evaluate_against_test_set(best_nn_model, embeddings_stella_test, y_mlb_test, model_name="neural_net", embedder_name="stella")

2024/10/04 10:41:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run neural_net_stella_test_evaluation at: http://localhost:5000/#/experiments/963439363743810932/runs/a154abf50f4e409aafa55af64e4bf2a1.
2024/10/04 10:41:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/963439363743810932.


F1 Score: 0.6565770862800566
Precision: 0.6399228012131238
Recall: 0.6741214057507987
Exact Match Ratio: 0.26283482142857145
Execution Time: 0.4087178707122803 seconds


Grâce à l'optimisation des hyper-paramètres, le modèle atteint des scores satisfaisants, permettant de faire des suggestions informées.

Dans le cadre de ce projet, je vais déployer la régression logistique avec les embeddings USE, plus adaptée aux serveurs accessibles pour le déploiement (pas besoin de GPU), et pas très éloignée du réseau de neurones en terme de performances.

In [127]:
use_log_reg_model = log_reg.fit(embeddings_use_train, y_mlb_train)

In [131]:
evaluate_against_test_set(use_log_reg_model, embeddings_use_test, y_mlb_test, model_name="log_reg", embedder_name="use")

2024/10/04 10:47:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run log_reg_use_test_evaluation at: http://localhost:5000/#/experiments/963439363743810932/runs/b0367f9134d74f269aff70ab085f5cf1.
2024/10/04 10:47:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/963439363743810932.


F1 Score: 0.6092773011487568
Precision: 0.6100757134536983
Recall: 0.6084809758931164
Exact Match Ratio: 0.22154017857142858
Execution Time: 0.5319099426269531 seconds
