# Imports et Installs

In [None]:
# Notebook shell command: install the MLflow and DagsHub client packages imported in the next cell.
!pip install mlflow dagshub

In [None]:
import mlflow
import dagshub
import mlflow.keras
from mlflow.models.signature import infer_signature

import time
import os
from google.colab import userdata

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rcParams
import matplotlib.font_manager as fm

# Read the DagsHub token from the Colab secrets store (secret name: DAGSHUB_TOKEN)
dagshub_token = userdata.get('DAGSHUB_TOKEN')

# Register the token with the DagsHub client
dagshub.auth.add_app_token(dagshub_token)

# Connect MLflow to the DagsHub repository
dagshub.init(repo_owner='fabiencappelli', repo_name='Projet_07', mlflow=True)

# Point MLflow at the DagsHub-hosted tracking server
mlflow.set_tracking_uri('https://dagshub.com/fabiencappelli/Projet_07.mlflow')

font_path = os.path.expanduser("/content/drive/MyDrive/Colab Notebooks/fonts/Exo2-VariableFont_wght.ttf")  # Replace with the exact path
fm.fontManager.addfont(font_path)

# Set the global matplotlib font, referring to the font by its family name
rcParams["font.family"] = "Exo 2"
# Colours chosen to match the presentation: a light blue, its RGB complement
# (1 - each channel), and a dark blue given as a hex string
bleuclair = (0.15, 0.55, 0.82)
couleur_complementaire = (1 - bleuclair[0], 1 - bleuclair[1], 1 - bleuclair[2])
bleufonce = "#073642"

# Select the Keras backend; this is set before the `import keras` that follows
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
from keras import layers

import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, Callback
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [None]:
# Seed value. NOTE(review): the functions below use their own `random_state=34`
# default and do not read SEED in the visible cells — confirm whether it is still needed.
SEED = 34
# Input dataset (CSV on the mounted Google Drive)
csvPath = '/content/drive/MyDrive/Colab Notebooks/Projet_07/df_cleaned.csv'
# Directory where figures for the presentation are saved
imgPrezPath = '/content/drive/MyDrive/Colab Notebooks/Projet_07/presentationimg'
# Weights file written by the ModelCheckpoint callback (file name is literally ".weights.h5")
checkpoint_path = '/content/drive/MyDrive/Colab Notebooks/Projet_07/outputs/checkpoints_2/.weights.h5'

In [None]:
# Load the dataset; the file is decoded as latin-1
df_cleaned = pd.read_csv(csvPath, encoding='latin-1')

# Fonctions pour le modèle

In [None]:
def create_callbacks(
    checkpoint_path,
    patience_es=6,
    min_delta_es=0.01,
    monitor_es='val_loss',
    mode_es='min',
    monitor_mc='val_accuracy',
    mode_mc='max',
    factor_lr=0.1,
    cooldown_lr=5,
    patience_lr=5,
    min_lr=1e-5,
    monitor_lr='val_loss',
    mode_lr='min'
):
    """Build the Keras callbacks used for training.

    The parameter suffixes identify the callback they configure:
    ``_es`` -> EarlyStopping, ``_mc`` -> ModelCheckpoint,
    ``_lr`` -> ReduceLROnPlateau. All three callbacks are created with
    ``verbose=1``.

    Args:
        checkpoint_path: file the checkpoint callback writes weights to.
        patience_es, min_delta_es, monitor_es, mode_es: early-stopping settings.
        monitor_mc, mode_mc: quantity (and direction) deciding which weights
            count as "best" for the checkpoint; only the best weights are
            saved, and only weights (not the full model).
        factor_lr, cooldown_lr, patience_lr, min_lr, monitor_lr, mode_lr:
            learning-rate reduction settings.

    Returns:
        A list ``[EarlyStopping, ModelCheckpoint, ReduceLROnPlateau]``, in
        that order.
    """
    return [
        # Stop training once the monitored quantity stops improving.
        EarlyStopping(
            monitor=monitor_es,
            mode=mode_es,
            min_delta=min_delta_es,
            patience=patience_es,
            verbose=1,
        ),
        # Keep on disk only the weights of the best epoch seen so far.
        ModelCheckpoint(
            filepath=checkpoint_path,
            monitor=monitor_mc,
            mode=mode_mc,
            save_best_only=True,
            save_weights_only=True,
            verbose=1,
        ),
        # Lower the learning rate when the monitored quantity plateaus.
        ReduceLROnPlateau(
            monitor=monitor_lr,
            mode=mode_lr,
            factor=factor_lr,
            patience=patience_lr,
            cooldown=cooldown_lr,
            min_lr=min_lr,
            verbose=1,
        ),
    ]

In [None]:
def build_model(vocab_size, lstm_units=64, learning_rate=0.001, embedding_dim=256):
    """Build and compile a two-layer bidirectional LSTM binary classifier.

    Architecture adapted from
    https://keras.io/examples/nlp/bidirectional_lstm_imdb/ :
    Embedding -> BiLSTM (returns sequences) -> BiLSTM -> Dense(1, sigmoid).

    Every Keras object is taken from the single ``keras`` namespace so the
    model never mixes classes coming from ``keras`` and ``tensorflow.keras``.

    Args:
        vocab_size: number of rows of the embedding table (input dimension).
        lstm_units: units of each LSTM direction, for both recurrent layers.
        learning_rate: learning rate of the Adam optimizer.
        embedding_dim: size of the embedding vectors (256 by default, the
            value that was previously hard-coded).

    Returns:
        The compiled model (binary cross-entropy loss, accuracy metric).
    """
    # Variable-length sequences of integer token ids.
    inputs = keras.Input(shape=(None,), dtype="int64")
    x = keras.layers.Embedding(vocab_size, embedding_dim)(inputs)
    # Two stacked bidirectional LSTMs; the first one must return the full
    # sequence so the second one has a sequence to consume.
    x = keras.layers.Bidirectional(keras.layers.LSTM(lstm_units, return_sequences=True))(x)
    x = keras.layers.Bidirectional(keras.layers.LSTM(lstm_units))(x)
    predictions = keras.layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs, predictions)
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])
    return model

In [None]:
def get_inference_time(model, X, n_runs=10):
    """Measure the mean prediction time per sample, in milliseconds.

    ``model.predict(X)`` is timed ``n_runs`` times after one untimed warm-up
    call on the first two samples; the mean duration of a full pass is then
    divided by the number of samples.

    Args:
        model: object exposing ``predict(X, verbose=0)``.
        X: samples to predict on; anything sliceable that has either a
            ``shape`` attribute or a length.
        n_runs: number of timed passes over ``X`` (at least 1).

    Returns:
        Mean inference time in milliseconds per sample.

    Raises:
        ValueError: if ``n_runs`` is smaller than 1 or ``X`` is empty
            (the result would otherwise be NaN or a division by zero).
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")
    n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
    if n_samples == 0:
        raise ValueError("X must contain at least one sample")

    # Warm-up call so one-off initialisation costs are not part of the timing.
    model.predict(X[:2], verbose=0)

    durations = []
    for _ in range(n_runs):
        # perf_counter is the monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump with system clock changes.
        start = time.perf_counter()
        model.predict(X, verbose=0)
        durations.append(time.perf_counter() - start)

    mean_time = np.mean(durations)
    return (mean_time / n_samples) * 1000

In [None]:
def train_pipeline(data, labels, num_words=10000, param_grid=None, random_state=34, sample_frac=None):
    """Grid-search the BLSTM hyperparameters and evaluate the best model on a test split.

    Each hyperparameter combination is trained inside its own nested MLflow
    run. The best combination (highest validation F1), its test metrics, its
    inference time and the model itself are then logged through the
    module-level ``mlflow`` API, i.e. on whichever run is active around this
    call.

    Relies on the module-level ``checkpoint_path`` for the checkpoint callback.

    Args:
        data: texts, pd.Series or array-like.
        labels: binary targets aligned with ``data``, pd.Series or array-like.
        num_words: vocabulary cap given to the Keras ``Tokenizer``.
        param_grid: dict of lists for ``ParameterGrid``; every combination
            must provide ``lstm_units``, ``batch_size`` and ``learning_rate``.
            A default grid is used when None.
        random_state: seed used by every ``train_test_split`` call.
        sample_frac: float (e.g. 0.2 for 20%): stratified fraction of the
            dataset used for the grid search. When None the whole dataset is
            used.

    Returns:
        Tuple ``(best_model, best_params, best_val_f1, test_acc, test_f1,
        test_roc_auc, inf_time_ms)``.

    Raises:
        ValueError: if ``param_grid`` yields no combination to evaluate.
    """
    if sample_frac is not None:
        data_sample, _, labels_sample, _ = train_test_split(
            data, labels,
            train_size=sample_frac,
            random_state=random_state,
            stratify=labels,
        )
    else:
        data_sample, labels_sample = data, labels

    # Train/val/test split of the sample (used for the grid search).
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        data_sample, labels_sample, test_size=0.15, random_state=random_state, stratify=labels_sample)
    X_train, X_val, y_train, y_val = train_test_split(
        X_trainval, y_trainval, test_size=0.1765, random_state=random_state, stratify=y_trainval)
    # 0.85 * 0.1765 ~= 0.15, so validation and test are each ~15% of the sample.

    # Tokenizer fitted on the training texts only.
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(X_train)
    maxlen = 100

    def encode(X):
        return pad_sequences(tokenizer.texts_to_sequences(X), maxlen=maxlen, padding='post')

    X_train_enc = encode(X_train)
    X_val_enc = encode(X_val)
    X_test_enc = encode(X_test)
    # Embedding input size: fitted vocabulary (+1 for index 0), capped at num_words.
    vocab_size = min(len(tokenizer.word_index) + 1, num_words)
    y_train_arr = np.asarray(y_train).astype('float32')
    y_val_arr = np.asarray(y_val).astype('float32')
    y_test_arr = np.asarray(y_test).astype('float32')

    # Grid-search parameters.
    if param_grid is None:
        param_grid = {
            'lstm_units': [64, 128],
            'batch_size': [64, 128],
            'learning_rate': [0.001, 0.0005, 0.0001],
        }
    search = list(ParameterGrid(param_grid))

    # Start below any reachable F1 so the first configuration is always kept,
    # even when its F1 is exactly 0 (a 0 start left best_params as None then).
    best_val_f1 = float("-inf")
    best_params = None
    best_model = None

    for params in search:
        with mlflow.start_run(nested=True):
            mlflow.log_params(params)
            model = build_model(
                vocab_size,
                lstm_units=params['lstm_units'],
                learning_rate=params['learning_rate']
            )
            # NOTE(review): the checkpoint callback writes the best weights to
            # checkpoint_path (same file for every configuration), but nothing
            # here loads them back; the metrics below are computed on the model
            # as left by fit(). Confirm this is intended.
            callbacks = create_callbacks(checkpoint_path=checkpoint_path)
            model.fit(
                X_train_enc, y_train_arr,
                validation_data=(X_val_enc, y_val_arr),
                epochs=15,
                batch_size=params['batch_size'],
                callbacks=callbacks,
                verbose=0
            )
            val_pred_proba = model.predict(X_val_enc)
            val_pred = (val_pred_proba > 0.5).astype(int)
            val_f1 = f1_score(y_val_arr, val_pred)
            val_acc = accuracy_score(y_val_arr, val_pred)
            try:
                val_roc_auc = roc_auc_score(y_val_arr, val_pred_proba)
            except ValueError:
                # ROC AUC could not be computed for these inputs; record NaN
                # rather than losing the whole run.
                val_roc_auc = np.nan
            mlflow.log_metric("val_f1", val_f1)
            mlflow.log_metric("val_accuracy", val_acc)
            mlflow.log_metric("val_roc_auc", val_roc_auc)
            if val_f1 > best_val_f1:
                best_val_f1 = val_f1
                best_params = params
                best_model = model

    if best_model is None:
        raise ValueError("param_grid produced no hyperparameter combination to evaluate")

    mlflow.log_params({"best_"+k: v for k, v in best_params.items()})

    # Test-set metrics of the best configuration.
    test_pred_proba = best_model.predict(X_test_enc)
    test_pred = (test_pred_proba > 0.5).astype(int)
    test_acc = accuracy_score(y_test_arr, test_pred)
    test_f1 = f1_score(y_test_arr, test_pred)
    try:
        test_roc_auc = roc_auc_score(y_test_arr, test_pred_proba)
    except ValueError:
        test_roc_auc = np.nan

    mlflow.log_metric("test_accuracy", test_acc)
    mlflow.log_metric("test_f1", test_f1)
    mlflow.log_metric("test_roc_auc", test_roc_auc)

    inf_time_ms = get_inference_time(best_model, X_test_enc, n_runs=10)
    mlflow.log_metric("test_inference_time_ms_per_sample", inf_time_ms)

    mlflow.keras.log_model(best_model, "model")

    print(f"Best val_f1: {best_val_f1:.3f} | Test acc: {test_acc:.3f} | Test f1: {test_f1:.3f} | Test ROC AUC: {test_roc_auc:.3f} | Inf time (ms/sample): {inf_time_ms:.3f}")
    return best_model, best_params, best_val_f1, test_acc, test_f1, test_roc_auc, inf_time_ms

In [None]:
def refit_best_model(data, labels, best_params, num_words=10000, random_state=34):
    """Retrain a model with the selected hyperparameters on the full dataset's train split.

    The whole dataset is split into train/validation/test (about 70/15/15,
    stratified), a model is trained on the train part with ``best_params``,
    and its per-epoch history, test metrics, inference time and the model
    itself are logged through the module-level ``mlflow`` API.

    The checkpoint callback writes to the relative path
    ``final_model.weights.h5``.

    Args:
        data: texts, pd.Series or array-like.
        labels: binary targets aligned with ``data``.
        best_params: dict providing ``lstm_units``, ``learning_rate`` and
            ``batch_size``.
        num_words: vocabulary cap given to the Keras ``Tokenizer``.
        random_state: seed used by both ``train_test_split`` calls.

    Returns:
        Tuple ``(model, history, test_acc, test_f1, test_roc_auc, inf_time_ms)``.
    """
    # Full split (train/val/test over the whole dataset).
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        data, labels, test_size=0.15, random_state=random_state, stratify=labels)
    # 0.85 * 0.1765 ~= 0.15, so validation is ~15% of the dataset.
    X_train, X_val, y_train, y_val = train_test_split(
        X_trainval, y_trainval, test_size=0.1765, random_state=random_state, stratify=y_trainval)

    # Tokenizer fitted on the training texts only.
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(X_train)
    maxlen = 100

    def encode(X):
        return pad_sequences(tokenizer.texts_to_sequences(X), maxlen=maxlen, padding='post')

    X_train_enc = encode(X_train)
    X_val_enc = encode(X_val)
    X_test_enc = encode(X_test)
    # Embedding input size: fitted vocabulary (+1 for index 0), capped at num_words.
    vocab_size = min(len(tokenizer.word_index) + 1, num_words)
    y_train_arr = np.asarray(y_train).astype('float32')
    y_val_arr = np.asarray(y_val).astype('float32')
    y_test_arr = np.asarray(y_test).astype('float32')

    # Train the model with best_params.
    model = build_model(
        vocab_size,
        lstm_units=best_params['lstm_units'],
        learning_rate=best_params['learning_rate']
    )
    callbacks = create_callbacks(checkpoint_path="final_model.weights.h5")
    history = model.fit(
        X_train_enc, y_train_arr,
        validation_data=(X_val_enc, y_val_arr),
        epochs=15,
        batch_size=best_params['batch_size'],
        callbacks=callbacks,
        verbose=0
    )

    # Log every per-epoch metric, except the learning-rate entry added by the
    # LR scheduler callback: it is stored under "lr" by older Keras versions
    # and under "learning_rate" by Keras 3.
    for metric_name, values in history.history.items():
        if metric_name in ("lr", "learning_rate"):
            continue
        for epoch, value in enumerate(values, start=1):
            mlflow.log_metric(metric_name, value, step=epoch)

    # Evaluation on the test split.
    test_pred_proba = model.predict(X_test_enc)
    test_pred = (test_pred_proba > 0.5).astype(int)
    test_acc = accuracy_score(y_test_arr, test_pred)
    test_f1 = f1_score(y_test_arr, test_pred)
    try:
        test_roc_auc = roc_auc_score(y_test_arr, test_pred_proba)
    except ValueError:
        # ROC AUC could not be computed for these inputs; record NaN instead.
        test_roc_auc = np.nan

    mlflow.log_metric("test_accuracy", test_acc)
    mlflow.log_metric("test_f1", test_f1)
    mlflow.log_metric("test_roc_auc", test_roc_auc)

    # Timed once only: the benchmark runs 10 full passes over the test set.
    inf_time_ms = get_inference_time(model, X_test_enc, n_runs=10)
    mlflow.log_metric("test_inference_time_ms_per_sample", inf_time_ms)

    mlflow.keras.log_model(model, "model")

    print(f"Final test acc: {test_acc:.3f}, test f1: {test_f1:.3f}, test ROC AUC: {test_roc_auc:.3f}, inf. time (ms/sample): {inf_time_ms:.3f}")
    return model, history, test_acc, test_f1, test_roc_auc, inf_time_ms


In [None]:
def graphhistory(history, test_acc, filename):
    """Plot the training curves, save the figure as SVG and display it.

    Draws training/validation loss and training/validation accuracy against
    the epoch number, plus a horizontal line at the test accuracy. The figure
    is written to ``<imgPrezPath>/<filename>.svg`` and then shown.

    Args:
        history: object whose ``history`` attribute is a dict holding the
            ``loss``, ``val_loss``, ``accuracy`` and ``val_accuracy`` series.
        test_acc: test accuracy, drawn as a dotted horizontal line.
        filename: output file name without extension.
    """
    plt.figure(figsize=(10, 6))
    curves = history.history
    epoch_axis = range(1, len(curves['loss']) + 1)

    # (history key, legend label, line style, colour), drawn in this order:
    # solid lines for training series, dashed lines for validation series.
    series = (
        ('loss', 'Training loss', '-', bleufonce),
        ('val_loss', 'Validation loss', '--', bleufonce),
        ('accuracy', 'Training accuracy', '-', bleuclair),
        ('val_accuracy', 'Validation accuracy', '--', couleur_complementaire),
    )
    for key, legend, style, colour in series:
        plt.plot(
            epoch_axis,
            curves[key],
            label=legend,
            linestyle=style,
            linewidth=2,
            color=colour,
        )

    # Test accuracy as a horizontal reference line.
    plt.axhline(test_acc, linestyle=':', linewidth=2, color='darkgreen', label='Test accuracy')

    # Titles, axis labels, legend and layout.
    plt.title("Évolution de la perte et de l'exactitude", fontsize=14)
    plt.xlabel("Époque", fontsize=12)
    plt.ylabel("Valeur", fontsize=12)
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()

    output_file = os.path.join(imgPrezPath, filename + ".svg")
    plt.savefig(output_file, format="svg", bbox_inches="tight", pad_inches=0.1)
    plt.show()

https://realpython.com/python-keras-text-classification/#choosing-a-data-set

# Données brutes

On repart sur les données brutes vu qu'elles nous ont donné le meilleur résultat précédemment

In [None]:
# Select the model input (the 'text' column) and the label (the 'target' column).
# No split happens here: the train/val/test splits are done inside
# train_pipeline and refit_best_model.
X = df_cleaned['text']
y = df_cleaned['target']

In [None]:
# Record the runs started below under the "BLSTM" MLflow experiment.
mlflow.set_experiment("BLSTM")

In [None]:
# NOTE(review): this module-level list is not passed to train_pipeline or
# refit_best_model, which each build their own callbacks; it looks unused in
# the visible cells — confirm before removing.
callbacks = create_callbacks(checkpoint_path)

In [None]:
# Parent run for the grid search, performed on a stratified 20% sample of the
# dataset; each hyperparameter combination is logged as a nested run.
with mlflow.start_run(run_name="GridSearch_sample_20pct"):
    best_model, best_params, best_val_f1, test_acc, test_f1, test_roc_auc, inf_time_ms = train_pipeline(X, y, num_words=10000, sample_frac=0.2)

In [None]:
# Retrain with the best hyperparameters found above, this time on the whole
# dataset. This overwrites test_acc, test_f1, test_roc_auc and inf_time_ms
# from the grid-search cell.
with mlflow.start_run(run_name="Final_refit_full_data"):
    model, history, test_acc, test_f1, test_roc_auc, inf_time_ms = refit_best_model(X, y, best_params=best_params)

In [None]:
# Plot the refit training curves and save them as BLSTMRefitHistory.svg in imgPrezPath.
graphhistory(history, test_acc, "BLSTMRefitHistory")