# Modèle avancé BERT

# Téléchargements & imports des données

In [None]:
# Install the `uv` package manager, then use it to install the notebook's dependencies.
# NOTE(review): `transformers` is imported further down but is not listed explicitly here;
# presumably it is pulled in transitively by `sentence_transformers` — confirm.
!pip install uv
!uv pip install pandas numpy matplotlib scikit-learn wordcloud tqdm sentence_transformers ipykernel tensorflow spacy mlflow
# Download the small English spaCy pipeline.
!python -m spacy download en_core_web_sm

In [None]:
import logging
import os
import re
import string

# This flag must be set BEFORE TensorFlow / tf.keras / transformers are imported:
# it selects the legacy Keras 2 implementation (needed by the TF* classes of
# `transformers`) and is only honoured when Keras is first loaded. Setting it
# after the imports, as was done previously, has no effect.
os.environ["TF_USE_LEGACY_KERAS"] = "1"

import mlflow
import mlflow.tensorflow
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, TFBertForSequenceClassification

In [None]:
# Download the Sentiment140 archive (sentiment140.zip) with wget
!wget https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/AI+Engineer/Project+7%C2%A0-+D%C3%A9tectez+les+Bad+Buzz+gr%C3%A2ce+au+Deep+Learning/sentiment140.zip

In [None]:
# Extraction des données
ZIP_PATH = '/content/sentiment140.zip'

!unzip $ZIP_PATH

In [None]:
# Load the raw CSV; the file has no header row, so columns are numbered 0..n-1
DATASET_PATH = '/content/training.1600000.processed.noemoticon.csv'
df = pd.read_csv(
    DATASET_PATH,
    header=None,
    sep=',',
    encoding="ISO-8859-1",
)
df.head()

In [None]:
# Give the first six positional columns the names used in the dataset card
df = df.rename(
    columns={
        df.columns[position]: label
        for position, label in enumerate(
            ('target', 'ids', 'date', 'flag', 'user', 'text')
        )
    }
)

In [None]:
# Build the working datasets: the full corpus and a 16k-row sample.
# Explicit copies make the label assignments below act on independent frames
# (no SettingWithCopyWarning), and a fixed seed makes the sample reproducible.
SAMPLE_SEED = 42

complete_df = df[['target', 'text']].copy()
sample_df = df[['target', 'text']].sample(16_000, random_state=SAMPLE_SEED).copy()

# Show the distribution of the original labels (0 / 4)
print(sample_df['target'].value_counts())

# Convert labels to binary: 0 stays 0, 4 becomes 1
sample_df['target'] = sample_df['target'].replace({0: 0, 4: 1})
complete_df['target'] = complete_df['target'].replace({0: 0, 4: 1})


In [None]:

def tweet_cleaning(tweet):
    """
    Clean and normalise a raw tweet.

    Steps, in order:
        - lowercase the text
        - remove URLs, @mentions and #hashtags (the whole tag is dropped)
        - drop every non-ASCII character (emojis, accented letters, ...)
        - remove punctuation and digits
        - collapse runs of whitespace and trim both ends

    Params:
        tweet (str): The raw tweet to clean.

    Return:
        str: The cleaned tweet (may be an empty string).
    """
    # Lowercase first, as the documented contract requires; this also lets the
    # URL pattern below match upper-case schemes such as "HTTP://".
    tweet = tweet.lower()

    # Remove URLs
    tweet = re.sub(r'https?://\S+|www\.\S+', '', tweet)

    # Remove mentions (@user)
    tweet = re.sub(r'@\w+', '', tweet)

    # Remove hashtags (#hashtag)
    tweet = re.sub(r'#\w+', '', tweet)

    # Drop all non-ASCII characters; encoding with errors='ignore' already
    # removes them, so no extra regex pass is needed afterwards.
    tweet = tweet.encode('ascii', 'ignore').decode('ascii')

    # Remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    # Remove digits
    tweet = re.sub(r'\d+', '', tweet)

    # Collapse repeated whitespace and strip leading/trailing spaces
    tweet = re.sub(r'\s+', ' ', tweet).strip()

    return tweet



In [None]:
# Preview the cleaning on a handful of tweets only; the full cleaned column is
# computed once in the next cell, so there is no need to clean every row here.
sample_df['text'].head(10).apply(tweet_cleaning)

In [None]:
# Labels: binary sentiment target; features: cleaned tweet text
y = sample_df['target']
X = sample_df['text'].map(tweet_cleaning)

In [None]:
# First split: 80% training, 20% hold-out
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Second split of the hold-out: 80% test, 20% validation
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.2, random_state=42
)

## Paramètres BERT

In [None]:
# Checkpoint used by the tokenizer. The notebook relies on the BERT classes
# (BertTokenizer / TFBertForSequenceClassification) and loads a
# "bert-base-uncased" model, so the checkpoint must be a BERT one rather than
# DistilBERT.
MODEL_NAME = "bert-base-uncased"
MAX_LENGTH = 16  # tokens per tweet after padding / truncation
BATCH_SIZE = 16
EPOCHS = 6
# NOTE(review): the optimizer in the model cell hard-codes learning_rate=2e-5,
# so this constant is currently not what training uses — confirm which value is intended.
LEARNING_RATE = 0.0001

### Création du Tokenizer

In [None]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)


def encode_texts(texts):
    """Tokenize one text or a collection of texts for the model.

    Params:
        texts (str | iterable of str): A single text, or any iterable of
            texts (list, tuple, pandas Series, ...).

    Return:
        The tokenizer output holding `input_ids` and `attention_mask` as
        TensorFlow tensors, each sequence padded / truncated to MAX_LENGTH.
    """
    # The tokenizer only accepts a str or a list of str; other containers such
    # as a pandas Series are rejected, so normalise the input to a plain list.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    return tokenizer(
        texts,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors="tf",
    )


def _as_batched_dataset(encodings, labels, shuffle_buffer=None):
    """Wrap encodings and labels in a batched, prefetching tf.data.Dataset.

    When `shuffle_buffer` is given, examples are shuffled with that buffer
    size before batching.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    if shuffle_buffer is not None:
        dataset = dataset.shuffle(shuffle_buffer)
    return dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


# Tokenize each split
train_encodings = encode_texts(X_train)
val_encodings = encode_texts(X_val)
test_encodings = encode_texts(X_test)

# Convert to tf.data.Dataset; only the training split is shuffled
train_dataset = _as_batched_dataset(
    train_encodings, y_train, shuffle_buffer=len(X_train)
)
val_dataset = _as_batched_dataset(val_encodings, y_val)
test_dataset = _as_batched_dataset(test_encodings, y_test)

print("\nExemple d'encodage (première phrase d'entraînement):")
for field_name, tensor in train_encodings.items():
    # Show only the first 10 values of the first training example
    preview = tensor[0].numpy().tolist()[:10]
    print(f"{field_name}: {preview}...")

### Création du modèle BERT

In [None]:
# Load pre-trained BERT with a 2-label sequence-classification head
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
)

# Compile the model; the loss works on raw logits (from_logits=True) with
# integer class labels.
optimizer = Adam(learning_rate=2e-5)
loss = SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

# train_dataset / val_dataset / test_dataset are already built in the tokenizer
# cell (full-length shuffle buffer, BATCH_SIZE batches, prefetch). They are
# deliberately not rebuilt here: the previous duplicate silently replaced them
# with a 1000-element shuffle buffer, a hard-coded batch size and no prefetch.

## Entraînement

In [None]:
# Fine-tune on the training split, monitoring the validation split each epoch
history = model.fit(train_dataset, epochs=EPOCHS, validation_data=val_dataset)

# test_dataset is already batched, so `batch_size` must not be passed here:
# Keras expects it to be left unset for tf.data.Dataset inputs.
results = model.evaluate(test_dataset, return_dict=True)


print(f"Résultats du test: {results}")
# Record the held-out test metrics in MLflow
mlflow.log_metrics(
    {"test_loss": results["loss"], "test_accuracy": results["accuracy"]}
)

## Évaluation du modèle

In [None]:
# Tokenize the test tweets (passed as a plain list, which the tokenizer accepts)
sample_encodings = encode_texts(list(X_test))

# Predictions
predictions = model.predict(dict(sample_encodings))
logits = predictions.logits

# Convert logits to probabilities, then to predicted classes
probabilities = tf.nn.softmax(logits, axis=-1).numpy()
predicted_classes = np.argmax(probabilities, axis=1)

# A zip object cannot be sliced, so materialise it before taking the first 50 rows
preview_rows = list(zip(X_test, y_test, probabilities, predicted_classes))[:50]

for tweet, true, prob, pred_class in preview_rows:
    sentiment = "Non-Négatif/Positif" if pred_class == 1 else "Négatif"
    print(f"\nTweet: {tweet}")
    print(f"  Probabilités (Négatif, Non-Négatif/Positif): {prob}")
    print(f"  Sentiment Prédit: {sentiment} | 'Vrai' Sentiment : {true}")