# <a id='toc1_'></a>[Modèle avancé BERT](#toc0_)

**Table of contents**<a id='toc0_'></a>    
- [Modèle avancé BERT](#toc1_)    
- [Téléchargements & imports des données](#toc2_)    
  - [Téléchargement des libs](#toc2_1_)    
  - [Import des données](#toc2_2_)    
  - [Téléchargement du dataset](#toc2_3_)    
- [Preprocessing des données](#toc3_)    
  - [Préprocessing simple](#toc3_1_)    
  - [Tokenize and split data](#toc3_2_)    
- [Modélisation](#toc4_)    
  - [Train the model](#toc4_1_)    
  - [Evaluate the model](#toc4_2_)    
  - [Log the model with Mlflow](#toc4_3_)    
  - [Save the model](#toc4_4_)    
  - [Load and use the model](#toc4_5_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc2_'></a>[Téléchargements & imports des données](#toc0_)

## <a id='toc2_1_'></a>[Téléchargement des libs](#toc0_)

In [None]:
# Optional one-off environment setup; uncomment the lines below to run it.
# NOTE(review): the import cell of this notebook also uses torch, transformers and
# datasets, which are not in the list below — confirm they are installed separately.
# !pip install uv
# !uv pip install pandas numpy matplotlib scikit-learn wordcloud tqdm sentence_transformers ipykernel tensorflow spacy mlflow
# !python -m spacy download en_core_web_sm

## <a id='toc2_2_'></a>[Import des données](#toc0_)

In [None]:
import os
import re
import string

import mlflow
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from mlflow.models.signature import infer_signature
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)

# Environment switches read by the libraries used below.
os.environ.update({"TF_USE_LEGACY_KERAS": "1", "WANDB_DISABLED": "true"})


# LOCAL selects which checkpoint location and which output directory are used:
#   True  -> checkpoint folder under ../models, outputs next to the notebook
#   False -> bare model identifier, outputs under /content
LOCAL = True

MODEL_NAME, OUTPUT_DIR = (
    ("../models/bert-base-uncased", "./bert-base-uncased-trained")
    if LOCAL
    else ("bert-base-uncased", "/content/bert-base-uncased-trained")
)


# Tokenizer that matches the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Pretrained BERT with a 2-label sequence-classification head (binary sentiment).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../models/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## <a id='toc2_3_'></a>[Téléchargement du dataset](#toc0_)

In [None]:
# Download the Sentiment140 archive (no output option is given to wget, so it is
# saved under its remote file name in the current working directory).
!wget https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/AI+Engineer/Project+7%C2%A0-+D%C3%A9tectez+les+Bad+Buzz+gr%C3%A2ce+au+Deep+Learning/sentiment140.zip

In [None]:
# Extract the downloaded archive.
# NOTE(review): ZIP_PATH is a hard-coded absolute "/content/" path that does not depend
# on the LOCAL flag, and the CSV is later read from "../data/" — confirm the archive
# is extracted where DATASET_PATH expects it when running outside Colab.
ZIP_PATH = "/content/sentiment140.zip"
!unzip $ZIP_PATH

# <a id='toc3_'></a>[Preprocessing des données](#toc0_)

## <a id='toc3_1_'></a>[Préprocessing simple](#toc0_)

In [None]:
# Load the raw CSV. The file has no header row (header=None), so the column
# names are supplied directly instead of renaming positional columns afterwards.
DATASET_PATH = "../data/training.1600000.processed.noemoticon.csv"
COLUMN_NAMES = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(
    DATASET_PATH,
    sep=",",
    encoding="ISO-8859-1",
    header=None,
    names=COLUMN_NAMES,
)

# Keep only the label and the tweet text. The explicit .copy() makes this an
# independent frame, so the assignment below does not write into a slice of `df`
# (which is what triggers pandas' SettingWithCopyWarning).
complete_df = df[["target", "text"]].copy()

# Convert the original labels to binary: 0 stays 0, 4 becomes 1.
complete_df["target"] = complete_df["target"].replace({0: 0, 4: 1})
assert set(complete_df["target"].unique()) <= {0, 1}, "unexpected label values"

# Reproducible 16k-row sample used for fine-tuning. Sampling after the label
# conversion selects the same rows as sampling before it (same index, same seed).
sample_df = complete_df.sample(16_000, random_state=42).copy()


In [None]:
def tweet_cleaning(tweet):
    """
    Clean a raw tweet before tokenization.

    Steps, applied in this order:
        1. remove URLs, @mentions and #hashtags
        2. drop every non-ASCII character (emojis, accented letters, ...)
        3. remove ASCII punctuation
        4. remove digits
        5. collapse whitespace runs into one space and strip both ends

    Letter case is left untouched.

    Params:
        tweet (str): The raw tweet.

    Return:
        str: The cleaned tweet.
    """
    # 1. URLs, then mentions, then hashtags — each match is simply deleted.
    for pattern in (r"https?://\S+|www\.\S+", r"@\w+", r"#\w+"):
        tweet = re.sub(pattern, "", tweet)

    # 2. Keep ASCII only.
    ascii_only = tweet.encode("ascii", "ignore").decode("utf-8")
    ascii_only = re.sub(r"[^\x00-\x7F]+", "", ascii_only)

    # 3. Strip punctuation through a translation table.
    punctuation_table = str.maketrans("", "", string.punctuation)
    no_punctuation = ascii_only.translate(punctuation_table)

    # 4. Strip digits.
    no_digits = re.sub(r"\d+", "", no_punctuation)

    # 5. Normalise whitespace.
    return re.sub(r"\s+", " ", no_digits).strip()

In [None]:
# Clean every tweet and store the result back in the "text" column.
# apply() returns a new object: without the assignment the cleaned text would be
# thrown away and the model would be trained on the raw tweets.
sample_df["text"] = sample_df["text"].apply(tweet_cleaning)

# Quick visual check of the cleaned tweets.
sample_df["text"].head()

## <a id='toc3_2_'></a>[Tokenize and split data](#toc0_)

In [None]:
def preprocess_function(data):
    """Tokenize ``data["text"]`` (one string or a list of strings) with the BERT
    tokenizer, padding/truncating every sequence to 128 tokens."""
    return tokenizer(
        data["text"], padding="max_length", truncation=True, max_length=128
    )


# Tokenize all tweets in a single batched call instead of one Python-level
# tokenizer call per DataFrame row.
texts = sample_df["text"].tolist()
tokenized_data = preprocess_function({"text": texts})

# Assemble the model inputs. The raw text is kept as well because the MLflow
# logging step reads test_dataset["text"].
tokenized_df = pd.DataFrame(
    {
        "text": texts,
        "input_ids": tokenized_data["input_ids"],
        "attention_mask": tokenized_data["attention_mask"],
        "labels": sample_df["target"].tolist(),
    }
)

# Build the Hugging Face dataset
dataset = Dataset.from_pandas(tokenized_df)


# Split into training and test sets; the fixed seed makes the split reproducible
# across kernel restarts.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Get the training and test datasets
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

assert len(train_dataset) + len(test_dataset) == len(dataset)


# <a id='toc4_'></a>[Modélisation](#toc0_)

## <a id='toc4_1_'></a>[Train the model](#toc0_)

In [None]:
# Fine-tuning hyper-parameters, gathered in one mapping.
training_kwargs = {
    "report_to": "none",
    "output_dir": OUTPUT_DIR,
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "num_train_epochs": 3,
    "weight_decay": 0.01,  # weight-decay coefficient
    "save_total_limit": 2,  # keep at most 2 checkpoints on disk
}
training_args = TrainingArguments(**training_kwargs)

# The Trainer receives the model together with both dataset splits.
trainer = Trainer(
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=training_args,
    model=model,
)

# Run fine-tuning.
trainer.train()


## <a id='toc4_2_'></a>[Evaluate the model](#toc0_)

In [None]:
# Run the fine-tuned model on the held-out split.
predictions = trainer.predict(test_dataset)

# Predicted class = index of the largest score along axis 1.
preds = predictions.predictions.argmax(axis=1)

# Per-class precision / recall / F1 against the test labels.
report = classification_report(test_dataset["labels"], preds)
print(report)

## <a id='toc4_3_'></a>[Log the model with Mlflow](#toc0_)

In [None]:
def _predict_proba(model, tokenizer, texts, batch_size=32, max_length=128):
    """Return softmax class probabilities for ``texts`` as a NumPy array with one row per text."""
    # Run on whatever device the model currently lives on: after training it may
    # sit on a GPU while freshly tokenized tensors are created on the CPU.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cpu")

    # Inference must run in eval mode (no dropout); the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    batches = []
    try:
        with torch.no_grad():
            # Mini-batches keep memory bounded instead of pushing the whole
            # test set through the model in a single forward pass.
            for start in range(0, len(texts), batch_size):
                inputs = tokenizer(
                    texts[start : start + batch_size],
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                    max_length=max_length,
                )
                inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
                logits = model(**inputs).logits
                # .cpu() is required before .numpy() when the model is on a GPU.
                batches.append(torch.softmax(logits, dim=1).cpu().numpy())
    finally:
        model.train(was_training)

    return np.concatenate(batches, axis=0)


def log_huggingface_model_with_mlflow(
    model,
    tokenizer,
    X_test,
    y_test,
    tags,
    model_name,
    model_version=None,
    experiment_name=" Model Bert",
    hyperparams=None,
    batch_size=32,
):
    """
    Evaluate a Hugging Face classifier and log it to MLflow.

    Inside a single MLflow run this logs the hyper-parameters, the evaluation
    metrics (accuracy, precision, recall, f1, roc_auc, pr_auc), the model with
    its tokenizer and an inferred signature, and the given tags.

    Args:
        model: Hugging Face sequence-classification model to log.
        tokenizer: Tokenizer associated with the model.
        X_test: Iterable of raw test texts (str).
        y_test: Iterable of test labels, aligned with ``X_test``.
        tags (dict | None): Extra tags to set on the run.
        model_name (str): Used as run name and as run note.
        model_version (str, optional): Stored as the ``model_version`` tag.
        experiment_name (str, optional): MLflow experiment name.
        hyperparams (dict, optional): Hyper-parameters to log. When None, a few
            architecture fields are read from ``model.config`` instead.
        batch_size (int, optional): Number of texts per forward pass during evaluation.

    Raises:
        ValueError: If ``X_test`` is empty, if ``X_test`` and ``y_test`` differ
            in length, or if ``batch_size`` is smaller than 1.
    """
    # Materialise the inputs so that slicing and len() work for any iterable.
    texts = list(X_test)
    labels = list(y_test)
    if not texts:
        raise ValueError("X_test must contain at least one text")
    if len(texts) != len(labels):
        raise ValueError(
            f"X_test and y_test must have the same length ({len(texts)} != {len(labels)})"
        )
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # Disable MLflow autologging for transformers; everything is logged explicitly below.
    mlflow.transformers.autolog(disable=True)

    mlflow.set_experiment(experiment_name)

    with mlflow.start_run(run_name=model_name):
        # Hyper-parameters: fall back to architecture fields from the model config.
        if hyperparams is None:
            hyperparams = {}
            try:
                hyperparams["model_type"] = model.config.model_type
                hyperparams["hidden_size"] = model.config.hidden_size
                hyperparams["num_hidden_layers"] = model.config.num_hidden_layers
                hyperparams["num_attention_heads"] = model.config.num_attention_heads
            except Exception as e:
                # Best effort only: the run continues with whatever was collected.
                print(
                    f"Impossible de récupérer les hyperparamètres automatiquement : {e}"
                )

        for key, value in hyperparams.items():
            mlflow.log_param(key, str(value))

        # Model evaluation
        probabilities = _predict_proba(model, tokenizer, texts, batch_size)
        y_pred_proba = probabilities[:, 1]  # probability of the positive class
        y_pred_class = probabilities.argmax(axis=1)

        # Metrics
        accuracy = accuracy_score(labels, y_pred_class)
        precision = precision_score(labels, y_pred_class)
        recall = recall_score(labels, y_pred_class)
        f1 = f1_score(labels, y_pred_class)
        roc_auc = roc_auc_score(labels, y_pred_proba)
        pr_auc = average_precision_score(labels, y_pred_proba)

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1", f1)
        mlflow.log_metric("roc_auc", roc_auc)
        mlflow.log_metric("pr_auc", pr_auc)

        # Model information
        mlflow.set_tag("mlflow.note.content", model_name)
        if model_version:
            mlflow.set_tag("model_version", model_version)

        # One test text serves as the input example; the signature is inferred
        # from it and from the probabilities the model returns for it.
        example_input = texts[:1]
        signature = infer_signature(
            example_input,
            _predict_proba(model, tokenizer, example_input, batch_size),
        )

        # Log model + tokenizer
        mlflow.transformers.log_model(
            transformers_model={"model": model, "tokenizer": tokenizer},
            artifact_path="model",
            signature=signature,
            input_example=example_input,
        )

        # Extra tags (None is treated as "no tags")
        for key, val in (tags or {}).items():
            mlflow.set_tag(key, val)


In [None]:
# Log the fine-tuned model, its metrics and the run metadata to MLflow.
# NOTE(review): X_test is read from test_dataset["text"], so the dataset built
# earlier must expose the raw tweets in a "text" column — confirm it does.
run_tags = {
    "dataset_used": "sentiment140",
    "embedding_method": "Bert embedding",
    "preprocessing": "tweet_cleaning_function",
    "sample_size": str(sample_df.shape[0]),
    "sample_seed": "42",
}
run_hyperparams = {"learning_rate": 2e-5, "batch_size": 16, "epochs": 3}

log_huggingface_model_with_mlflow(
    model=model,
    tokenizer=tokenizer,
    X_test=test_dataset["text"],  # raw test texts
    y_test=test_dataset["labels"],  # matching labels
    tags=run_tags,
    model_name="bert-base-uncased",
    model_version="1.0",
    hyperparams=run_hyperparams,
)

## <a id='toc4_4_'></a>[Save the model](#toc0_)

In [None]:
# Save the fine-tuned model in Hugging Face format.
MODEL_SAVE_PATH = "./bert"
model.save_pretrained(MODEL_SAVE_PATH)

# Save the tokenizer that was actually used for training, next to the model.
# Reloading one from a hard-coded local path would ignore MODEL_NAME / LOCAL
# and silently replace the notebook-wide `tokenizer` variable.
tokenizer.save_pretrained(MODEL_SAVE_PATH)

# Also save the raw weights as a PyTorch state dict.
torch.save(model.state_dict(), "bert_model.pth")

print(f"Modèle sauvegardé dans le dossier: {MODEL_SAVE_PATH}")
print("Modèle sauvegardé au format PyTorch: bert_model.pth")

## <a id='toc4_5_'></a>[Load and use the model](#toc0_)

In [None]:
# Reload the fine-tuned model from the saved folder, using the Auto* classes
# already imported in the import cell (same loading path as at the top of the notebook).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_SAVE_PATH)

# Load the separately saved state dict on top. map_location="cpu" lets weights
# that were saved from a GPU be loaded on a machine without one.
model.load_state_dict(torch.load("bert_model.pth", map_location="cpu"))

# Reload the tokenizer saved next to the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_SAVE_PATH)