# Modèle simple

# Telechargements & imports des données

In [49]:
# !pip install uv
# !uv pip install pandas gensim numpy matplotlib scikit-learn wordcloud tqdm sentence_transformers ipykernel tensorflow spacy
# !python -m spacy download en_core_web_sm

In [50]:
import os
import re
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from wordcloud import WordCloud

tqdm.pandas()

import string
import warnings

warnings.filterwarnings("ignore")
import tensorflow as tf
from sentence_transformers import SentenceTransformer

from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [51]:
# Telecharger les données
!wget https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/AI+Engineer/Project+7%C2%A0-+D%C3%A9tectez+les+Bad+Buzz+gr%C3%A2ce+au+Deep+Learning/sentiment140.zip

--2025-05-22 11:38:37--  https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/AI+Engineer/Project+7%C2%A0-+D%C3%A9tectez+les+Bad+Buzz+gr%C3%A2ce+au+Deep+Learning/sentiment140.zip
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.0.219, 52.218.57.19, 52.92.33.56, ...
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.0.219|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84855679 (81M) [application/zip]
Saving to: ‘sentiment140.zip.1’


2025-05-22 11:38:41 (28.6 MB/s) - ‘sentiment140.zip.1’ saved [84855679/84855679]



In [52]:
# Extraction des données
ZIP_PATH = '/content/sentiment140.zip'

!unzip $ZIP_PATH

Archive:  /content/sentiment140.zip
replace training.1600000.processed.noemoticon.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [53]:
# Lecture du Dataframe
DATASET_PATH = '/content/training.1600000.processed.noemoticon.csv'
df = pd.read_csv(DATASET_PATH, sep=',', encoding = "ISO-8859-1", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


# Preprocessing des données

## Renommer les colonnes

In [54]:
# Renommer les colonnes en ce basant sur les cards du dataset
df = df.rename(columns={
    df.columns[0]: 'target',
    df.columns[1]: 'ids',
    df.columns[2]: 'date',
    df.columns[3]: 'flag',
    df.columns[4]: 'user',
    df.columns[5]: 'text',

})

In [55]:
# Definir les jeux de données

complete_df = df[['target', 'text']]
sample_df = df[['target', 'text']].sample(20_000)

# Afficher la valeurs des labels initiaux
print(sample_df['target'].value_counts())

# Conversion en binaire 0,1
sample_df['target'] = sample_df['target'].replace({0: 0, 4: 1})
complete_df['target'] = complete_df['target'].replace({0: 0, 4: 1})


target
4    10093
0     9907
Name: count, dtype: int64


## Text cleaning

In [56]:

def tweet_cleaning(tweet):
    """
    Nettoie et prétraite un tweet

    Cette fonction effectue plusieurs étapes de nettoyage :
        - Suppression des URLs, mentions et hashtags
        - Suppression des emojis et caractères spéciaux
        - Suppression de la ponctuation et des chiffres
        - Normalisation du texte (minuscules, espaces multiples)

    Params :
        tweet (str) : Le tweet brut à nettoyer.

    Return :
        str : Le tweet nettoyé et prétraité, prêt pour l'analyse de sentiment.

    """
    # Supprimer les URLs
    tweet = re.sub(r'https?://\S+|www\.\S+', '', tweet)

    # Supprimer les mentions (@user)
    tweet = re.sub(r'@\w+', '', tweet)

    # Supprimer les hashtags (#hashtag)
    tweet = re.sub(r'#\w+', '', tweet)

    # Normaliser & supprimer les caractères
    tweet = tweet.encode('ascii', 'ignore').decode('utf-8')
    tweet = re.sub(r'[^\x00-\x7F]+', '', tweet)

    # Supprimer la ponctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    # Supprimer les chiffres
    tweet = re.sub(r'\d+', '', tweet)

    # Supprimer les espaces multiples et les espaces au début/fin
    tweet = re.sub(r'\s+', ' ', tweet).strip()

    return tweet



In [57]:
sample_df.apply(lambda x: tweet_cleaning(x['text']), axis=1)

Unnamed: 0,0
55439,is selfish
1153829,Better breakfast for you then good sir
597636,I have kinda stopped working on the concept as...
1252630,LOL yeabye infinite xeezzzz and ooooz
1378087,working HARD on my jonas room almost finishedk...
...,...
971891,how is going the trip Demi hope you have a gre...
139922,I dont want to sub at the high school today
1123705,Did you own anything else maybe
702826,crappy layout


## Tokenisation, Lematisation

In [58]:
import spacy
import nltk
from nltk.stem.snowball import PorterStemmer
nltk.download('punkt')
nltk.download('punkt_tab')


# Charger le modèle anglais
nlp = spacy.load("en_core_web_sm")

# Charger le stemmer anglais
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [59]:
def lemmatize_text(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]

In [60]:
def stem_text_french(text):
    tokens = nltk.word_tokenize(text, language='french')
    return [stemmer.stem(token) for token in tokens]

In [61]:
def tokenize_text(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [62]:
text = sample_df['text'].sample(1).values[0]
print(text)

print(lemmatize_text(text=text))
print(stem_text_french(text=text))
print(tokenize_text(text=text))

Testing out this Snap2Twitter app for my new BlackBerry Curve.. this is only a test..  http://sml.vg/srWxVz
['test', 'out', 'this', 'Snap2Twitter', 'app', 'for', 'my', 'new', 'BlackBerry', 'Curve', '..', 'this', 'be', 'only', 'a', 'test', '..', ' ', 'http://sml.vg/srwxvz']
['test', 'out', 'thi', 'snap2twitt', 'app', 'for', 'my', 'new', 'blackberri', 'curv', '..', 'thi', 'is', 'onli', 'a', 'test', '..', 'http', ':', '//sml.vg/srwxvz']
['Testing', 'out', 'this', 'Snap2Twitter', 'app', 'for', 'my', 'new', 'BlackBerry', 'Curve', '..', 'this', 'is', 'only', 'a', 'test', '..', ' ', 'http://sml.vg/srWxVz']


In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [64]:
X = sample_df['text'].apply(tweet_cleaning)
y = sample_df['target']

In [65]:
# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization
tokenizer = Tokenizer(num_words=10_000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding
MAX_LEN = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

## Embedding et vectorisation

In [66]:
!wget -O glove6b100dtxt.zip https://www.kaggle.com/api/v1/datasets/download/danielwillgeorge/glove6b100dtxt
!unzip glove6b100dtxt.zip

--2025-05-22 11:40:36--  https://www.kaggle.com/api/v1/datasets/download/danielwillgeorge/glove6b100dtxt
Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98
Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://storage.googleapis.com:443/kaggle-data-sets/715814/1246668/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20250522%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250522T114036Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=284b46412bfae9b8d901d3be7ed2e0088436ba1e6888a8e3219d74163521ff474cbc0a32dc418bce44e4a7062d87fa220714a6389d286e7e2dd1f1c01387037545e8ed2e72721c14a970067117d52cc3c7184a369559f67cf6336bbcc366bea6fac0fa81eebc3b52a5056461bd10f2313cd5f6c37069bc092e250d21deb39d37dfdd82053e9e511d336ec04ec930e6311b3a8fc74553ded05b27abbefca3d818c920046a79d8adcfb36dec365c2530d316f9bc658bac12

In [67]:
# Charger les embeddings GloVe
def load_glove_embeddings(glove_file):
    embeddings_index = {}
    with open(glove_file, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_file = 'glove.6B.100d.txt'
embeddings_index = load_glove_embeddings(glove_file)

# Créer une matrice d'embeddings
embedding_matrix = np.zeros((10000, 100))
for word, i in tokenizer.word_index.items():
    if i < 10000:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Modelisation

## Creations des modèles

In [68]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, Dropout, Bidirectional

### RNN

In [69]:
# Modèle RNN avec Keras Embedding
rnn_model_keras = Sequential()
rnn_model_keras.add(Embedding(input_dim=10000, output_dim=128, input_length=MAX_LEN))
rnn_model_keras.add(SimpleRNN(128))
rnn_model_keras.add(Dense(1, activation='sigmoid'))

rnn_model_keras.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [70]:
# Modèle RNN avec GloVe Embedding
rnn_model_glove = Sequential()
rnn_model_glove.add(Embedding(input_dim=10000, output_dim=100, input_length=MAX_LEN, weights=[embedding_matrix], trainable=False))
rnn_model_glove.add(SimpleRNN(128))
rnn_model_glove.add(Dense(1, activation='sigmoid'))

rnn_model_glove.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

### LSTM

In [71]:
# Modèle LSTM avec Keras Embedding
lstm_model_keras = Sequential()
lstm_model_keras.add(Embedding(input_dim=10000, output_dim=128, input_length=MAX_LEN))
lstm_model_keras.add(LSTM(128))
lstm_model_keras.add(Dense(1, activation='sigmoid'))

lstm_model_keras.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [72]:
# Modèle LSTM avec GloVe Embedding
lstm_model_glove = Sequential()
lstm_model_glove.add(Embedding(input_dim=10000, output_dim=100, input_length=MAX_LEN, weights=[embedding_matrix], trainable=False))
lstm_model_glove.add(LSTM(128))
lstm_model_glove.add(Dense(1, activation='sigmoid'))

lstm_model_glove.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

### LSTM BIDIRECTIONEL

In [73]:
# Modèle LSTM Bidirectionnel à plusieurs couches avec Keras Embedding
lstm_model_bidirectional_keras = Sequential()
lstm_model_bidirectional_keras.add(Embedding(input_dim=10000, output_dim=128, input_length=MAX_LEN))
lstm_model_bidirectional_keras.add(Bidirectional(LSTM(128, return_sequences=True)))
lstm_model_bidirectional_keras.add(Dropout(0.5))
lstm_model_bidirectional_keras.add(Bidirectional(LSTM(64)))
lstm_model_bidirectional_keras.add(Dropout(0.5))
lstm_model_bidirectional_keras.add(Dense(1, activation='sigmoid'))

lstm_model_bidirectional_keras.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [74]:
# Modèle LSTM Bidirectionnel à plusieurs couches avec GloVe Embedding
lstm_model_bidirectional_glove = Sequential()
lstm_model_bidirectional_glove.add(Embedding(input_dim=10000, output_dim=100, input_length=MAX_LEN, weights=[embedding_matrix], trainable=False))
lstm_model_bidirectional_glove.add(Bidirectional(LSTM(128, return_sequences=True)))
lstm_model_bidirectional_glove.add(Dropout(0.5))
lstm_model_bidirectional_glove.add(Bidirectional(LSTM(64)))
lstm_model_bidirectional_glove.add(Dropout(0.5))
lstm_model_bidirectional_glove.add(Dense(1, activation='sigmoid'))

lstm_model_bidirectional_glove.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


## Entrainement et evaluation des modèles

### Tokenisation avec Keras

In [75]:
# Entraîner les modèles
print('Entrainement du RNN ...')
rnn_model_keras.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))
rnn_loss_keras, rnn_accuracy_keras = rnn_model_keras.evaluate(X_test_pad, y_test)
print(f'RNN Model with Keras Embedding - Loss: {rnn_loss_keras}, Accuracy: {rnn_accuracy_keras}')

print('\n\nEntrainement du LSTM ...')
lstm_model_keras.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))
lstm_loss_keras, lstm_accuracy_keras = lstm_model_keras.evaluate(X_test_pad, y_test)
print(f'LSTM Model with Keras Embedding - Loss: {lstm_loss_keras}, Accuracy: {lstm_accuracy_keras}')

print('\n\nEntrainement du LSTM Bidirectionel ...')
lstm_model_bidirectional_keras.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))
lstm_loss_bidirectional_keras, lstm_accuracy_bidirectional_keras = lstm_model_bidirectional_keras.evaluate(X_test_pad, y_test)
print(f'LSTM Bidirectional Model with Keras Embedding - Loss: {lstm_loss_bidirectional_keras}, Accuracy: {lstm_accuracy_bidirectional_keras}')



Entrainement du RNN ...
Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 63ms/step - accuracy: 0.5478 - loss: 0.6815 - val_accuracy: 0.7017 - val_loss: 0.5736
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 64ms/step - accuracy: 0.7792 - loss: 0.4692 - val_accuracy: 0.7088 - val_loss: 0.5855
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 61ms/step - accuracy: 0.8381 - loss: 0.3734 - val_accuracy: 0.7097 - val_loss: 0.6271
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 62ms/step - accuracy: 0.8892 - loss: 0.2836 - val_accuracy: 0.7072 - val_loss: 0.6938
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 63ms/step - accuracy: 0.9204 - loss: 0.2091 - val_accuracy: 0.6870 - val_loss: 0.7415
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.6933 - loss: 0.7411
RNN Model with Keras Embedding - Loss: 0.7414832115

In [76]:
print('Entrainement du RNN ...')
rnn_model_glove.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))
rnn_loss_glove, rnn_accuracy_glove = rnn_model_glove.evaluate(X_test_pad, y_test)
print(f'RNN Model with GloVe Embedding - Loss: {rnn_loss_glove}, Accuracy: {rnn_accuracy_glove}')

print('\n\nEntrainement du LSTM ...')
lstm_model_glove.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))
lstm_loss_glove, lstm_accuracy_glove = lstm_model_glove.evaluate(X_test_pad, y_test)
print(f'LSTM Model with GloVe Embedding - Loss: {lstm_loss_glove}, Accuracy: {lstm_accuracy_glove}')


print('\n\nEntrainement du LSTM Bidirectionel ...')
lstm_model_bidirectional_glove.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))
lstm_loss_bidirectional_glove, lstm_accuracy_bidirectional_glove = lstm_model_bidirectional_glove.evaluate(X_test_pad, y_test)
print(f'LSTM Bidirectional Model with GloVe Embedding - Loss: {lstm_loss_bidirectional_glove}, Accuracy: {lstm_accuracy_bidirectional_glove}')

Entrainement du RNN ...
Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 41ms/step - accuracy: 0.5742 - loss: 0.6843 - val_accuracy: 0.6572 - val_loss: 0.6169
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 40ms/step - accuracy: 0.6689 - loss: 0.6080 - val_accuracy: 0.6543 - val_loss: 0.6164
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 40ms/step - accuracy: 0.6670 - loss: 0.6086 - val_accuracy: 0.6720 - val_loss: 0.6002
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 40ms/step - accuracy: 0.6824 - loss: 0.5947 - val_accuracy: 0.6835 - val_loss: 0.5928
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 40ms/step - accuracy: 0.7006 - loss: 0.5750 - val_accuracy: 0.6837 - val_loss: 0.5904
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.6852 - loss: 0.5886
RNN Model with GloVe Embedding - Loss: 0.5903609991

## Tokenisation Glove

In [77]:
results = {
    'RNN with Keras Embedding': rnn_accuracy_keras,
    'LSTM with Keras Embedding': lstm_accuracy_keras,
    'Bidirectional LSTM with Keras Embedding': lstm_accuracy_bidirectional_keras,
    'RNN with GloVe Embedding': rnn_accuracy_glove,
    'Bidirectional LSTM with GloVe Embedding': lstm_accuracy_bidirectional_glove
}

best_model = max(results, key=results.get)
print(f'Le meilleur modèle est: {best_model} avec une précision de {results[best_model]}')


Le meilleur modèle est: Bidirectional LSTM with GloVe Embedding avec une précision de 0.7547500133514404
