In [None]:
# 📦 Installer les bibliothèques nécessaires (si pas déjà présentes)
!pip install nltk



In [None]:
# 📚 Imports
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from google.colab import files

In [None]:
# 🔁 Téléchargement des ressources NLTK
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:

# 📤 Étape 1 : Upload du fichier CSV local
print("⬆️ Upload ton fichier CSV maintenant")
uploaded = files.upload()



⬆️ Upload ton fichier CSV maintenant


Saving archive (3).zip to archive (3).zip


In [None]:
# 📥 Step 2: load the uploaded file (its name is taken from the upload result)
if not uploaded:
    # Nothing came back from the upload cell, so there is no file name to read;
    # fail with a clear message instead of an IndexError on an empty list.
    raise ValueError("No file was uploaded; re-run the upload cell and choose a file.")
filename = next(iter(uploaded))  # first (normally only) uploaded file name
df = pd.read_csv(filename)
print("✅ Données chargées avec succès !")
print("Colonnes disponibles :", df.columns)




✅ Données chargées avec succès !
Colonnes disponibles : Index(['title', 'score', 'id', 'url', 'comms_num', 'created', 'body',
       'timestamp'],
      dtype='object')


In [None]:
# 🧽 Step 3: drop missing comments and the "[deleted]" / "[removed]" placeholders
if 'body' not in df.columns:
    raise Exception("⚠️ La colonne 'body' n'existe pas dans ce fichier. Vérifie le nom exact.")

has_text = df['body'].notnull()
is_placeholder = df['body'].isin(['[deleted]', '[removed]'])
df = df[has_text & ~is_placeholder]



In [None]:
# 🧹 Étape 4 : Nettoyage de texte
def clean_text(text):
    """Lower-case a comment and strip URLs, mentions, hashtags, punctuation and digits.

    The substitutions are applied in the order listed below. Underscores are
    kept, because the punctuation pattern treats them as word characters.
    Returns the result with whitespace runs collapsed and both ends trimmed.
    """
    substitutions = (
        (r'http\S+', ''),    # URLs
        (r'@\w+', ''),       # @mentions
        (r'#\w+', ''),       # #hashtags
        (r'[^\w\s]', ''),    # neither a word character nor whitespace
        (r'\d+', ''),        # digits
        (r'\s+', ' '),       # collapse whitespace runs
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

df['clean_comment'] = df['body'].apply(clean_text)



In [None]:
# ✂️ Étape 5 : Suppression des stopwords
# English stop words from NLTK, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))
df['clean_comment'] = df['clean_comment'].map(
    lambda comment: ' '.join(token for token in comment.split() if token not in stop_words)
)



In [None]:
# 🔁 Étape 6 : Lemmatisation
# Replace every whitespace-separated token with its WordNet lemma
lemmatizer = WordNetLemmatizer()
df['clean_comment'] = df['clean_comment'].map(
    lambda comment: ' '.join(map(lemmatizer.lemmatize, comment.split()))
)




In [None]:
# 💾 Step 7: export the cleaned data
cleaned_filename = "reddit_euro_2020_cleaneddd.csv"
df.to_csv(cleaned_filename, index=False)  # index=False: the row index is not written to the file
files.download(cleaned_filename)  # google.colab helper imported at the top of the notebook

print("✅ Nettoyage terminé et fichier téléchargé ! 🎉")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Nettoyage terminé et fichier téléchargé ! 🎉


In [None]:
# Affiche quelques commentaires bruts et nettoyés pour comparaison
df[['body', 'clean_comment']].sample(10)



Unnamed: 0,body,clean_comment
6948,"Lookout, imma gonna put in a petition to repla...",lookout imma gonna put petition replay wc fina...
122,You cant play like that though no matter if yo...,cant play like though matter touch ball people...
8835,Can I please get the CAD file? 🙌🏻,please get cad file
3981,Yeah in the same match Palhinha was asking for...,yeah match palhinha asking yellow card way act...
7793,FIFA posts highlights of each of their tournam...,fifa post highlight tournament conmebol copa a...
3229,Italy played way better than France,italy played way better france
7579,\n\nI’m really interested in the phenomenon (...,im really interested phenomenon unwritten rule...
1032,Personally I feel that the ranking system is a...,personally feel ranking system fairly objectiv...
6628,If you do ever live here you'll find the nobod...,ever live youll find nobody hate political par...
523,I thought the sarcasm was obvious,thought sarcasm obvious


In [None]:
def clean_soft(text):
    """Light cleaning: drop URLs, @mentions and #hashtags, then tidy whitespace.

    Case, punctuation and digits are left untouched.
    """
    for noise_pattern in (r'http\S+', r'@\w+', r'#\w+'):
        text = re.sub(noise_pattern, '', text)
    # Collapse whitespace runs and trim both ends
    return re.sub(r'\s+', ' ', text).strip()


In [None]:
# A notebook cell only renders its last expression, so each check is displayed
# explicitly; otherwise the URL and mention results would be silently discarded.

# Check for leftover URLs
display(df[df['clean_comment'].str.contains("http")][['clean_comment']].head())

# Check for leftover @mentions
display(df[df['clean_comment'].str.contains("@")][['clean_comment']].head())

# Check for leftover #hashtags
display(df[df['clean_comment'].str.contains("#")][['clean_comment']].head())



Unnamed: 0,clean_comment


In [None]:
def clean_strong(text):
    """Aggressively normalise a comment.

    ``text`` is converted with ``str()`` first, so non-string values do not
    raise. URLs, @mentions, #hashtags, punctuation (underscore included) and
    digits are removed, whitespace is collapsed and the result is lower-cased.

    Returns the cleaned string (possibly empty).
    """
    text = re.sub(r"http\S+", "", str(text))      # remove URLs
    text = re.sub(r"@\w+", "", text)              # remove @mentions
    text = re.sub(r"#\w+", "", text)              # remove #hashtags
    # `\w` counts "_" as a word character, so `[^\w\s]` alone leaves underscores
    # behind (e.g. "uhs_killer"); remove them explicitly.
    text = re.sub(r"[^\w\s]|_", "", text)         # remove all punctuation
    text = re.sub(r"\d+", "", text)               # remove digits
    text = re.sub(r"\s+", " ", text).strip()      # tidy whitespace
    text = text.lower()                           # lower-case everything
    return text


In [None]:
df['clean_comment'] = df['body'].fillna("").apply(clean_strong)


In [None]:
# Empty comments: show how many there are. The shape is displayed explicitly
# because a cell only renders its last expression.
display(df[df['clean_comment'].str.strip() == ''][['clean_comment']].shape)

# Very short comments (< 5 characters)
df[df['clean_comment'].str.len() < 5][['clean_comment']].head()


Unnamed: 0,clean_comment
22,
55,fair
160,
190,dm
202,


In [None]:
import string
df[df['clean_comment'].str.contains(f"[{re.escape(string.punctuation)}]")][['clean_comment']].head()


Unnamed: 0,clean_comment
51,watch out for uhs_killer trying to scam sharin...
950,beware of uhs_killer scammer
1037,he contacted me and changed his name to hs_kil...
1405,be careful with nekisdoppiameme_ he is a scammer
4725,england football and rugby supporter here just...


In [None]:
def clean_ultra(text):
    """Strip URLs, mentions, hashtags, punctuation, underscores and digits.

    ``text`` is converted with ``str()`` first. The result has its whitespace
    collapsed and is returned lower-cased.
    """
    text = re.sub(r"http\S+", "", str(text))
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    # Remove everything except letters, digits and whitespace. "_" is listed
    # explicitly because `\w` matches it, so `[^\w\s]` alone would keep it.
    text = re.sub(r"[^\w\s]|_", "", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()


In [None]:
df['clean_comment'] = df['clean_comment'].apply(clean_ultra)


In [None]:
df[df['clean_comment'].str.contains(f"[{re.escape(string.punctuation)}]")].head()


Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,clean_comment
51,Comment,10,h2v1xff,,0,1624549000.0,Watch out for u/hs_killer15 trying to scam. Sh...,2021-06-24 18:37:31,watch out for uhs_killer trying to scam sharin...
950,Comment,8,h2vda0n,,0,1624560000.0,beware of u/hs_killer15 - scammer.,2021-06-24 21:39:38,beware of uhs_killer scammer
1037,Comment,2,h2v6ecn,,0,1624553000.0,He contacted me and changed his name to hs_kil...,2021-06-24 19:50:02,he contacted me and changed his name to hs_kil...
1405,Comment,4,h34jpvn,,0,1624758000.0,BE CAREFUL WITH NEKISDOPPIAMEME_ . HE IS A SCA...,2021-06-27 04:34:55,be careful with nekisdoppiameme_ he is a scammer
4725,Comment,1,h425nbo,,0,1625459000.0,England football and rugby supporter here. Jus...,2021-07-05 07:28:31,england football and rugby supporter here just...


In [None]:
def clean_final(text):
    """Reduce a comment to lower-case letters separated by single spaces.

    ``text`` is converted with ``str()`` first. URLs, @mentions and #hashtags
    are removed, then every character that is neither a letter nor whitespace
    is dropped (digits, punctuation, underscores, emoji).

    Returns the cleaned string (possibly empty).
    """
    text = re.sub(r"http\S+", "", str(text))       # URLs
    text = re.sub(r"@\w+", "", text)               # mentions
    text = re.sub(r"#\w+", "", text)               # hashtags
    # ❗ Keep letters and whitespace only. str.isalpha() is Unicode-aware, so
    # accented letters ("é", "ü", "ć") survive; an ASCII-only class such as
    # [a-zA-Z] would cut them out of the middle of words.
    text = "".join(ch for ch in text if ch.isalpha() or ch.isspace())
    text = re.sub(r"\s+", " ", text).strip()       # tidy whitespace
    return text.lower()                            # lower-case


In [None]:
df['clean_comment'] = df['body'].fillna("").apply(clean_final)


In [None]:
import string
df[df['clean_comment'].str.contains(f"[{re.escape(string.punctuation)}]")][['clean_comment']].head()


Unnamed: 0,clean_comment


In [None]:
# Save only the cleaned text column to a CSV file (row index not written)
df[["clean_comment"]].to_csv("euro2020_cleaned_preview.csv", index=False)

# Download the file
from google.colab import files
files.download("euro2020_cleaned_preview.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Look at the 10 longest cleaned comments (the author expects these to be the noisiest).
# Note: this adds a 'length' column to df that stays there for later cells.
df['length'] = df['clean_comment'].str.len()
df.sort_values(by='length', ascending=False)[['clean_comment']].head(10)


Unnamed: 0,clean_comment
7314,the opinions on yesterdays match pretty much a...
3106,montreux switzerland ap players at the clubs s...
3102,matches in munich confirmed with a minimum of ...
7058,this will take about minutes and seconds to re...
7461,for those that dont know steve cangialosi is a...
1970,london january uefa will decide in march on th...
2718,power most often refers to power physics meani...
7469,euro championship ukraine match threadclick he...
8734,well penalties are practised in the training s...
3116,bilbao amsterdam and bucharest and glasgow hav...


In [None]:
!pip install vaderSentiment

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Sentiment analysis with VADER.
# The raw comment (`body`) is scored rather than `clean_comment`: VADER's rules
# use capitalisation, punctuation such as "!!!", emoticons and emoji, and the
# cleaning step has already removed all of those from `clean_comment`.
df['sentiment'] = df['body'].fillna("").apply(lambda x: analyzer.polarity_scores(str(x))['compound'])

# Turn the compound score into a label:
# above 0.05 -> Positive, below -0.05 -> Negative, anything in between -> Neutral
df['sentiment_label'] = df['sentiment'].apply(lambda x: "Positive" if x > 0.05 else ("Negative" if x < -0.05 else "Neutral"))


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
df[['clean_comment', 'sentiment_label']].to_csv("euro2020_final_sentiment.csv", index=False)


In [None]:
!pip install transformers datasets accelerate -q


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
pip install --upgrade fsspec datasets transformers


Collecting fsspec
  Downloading fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)


In [None]:
import pandas as pd

# Reload the labelled comments written by the earlier export cell
df = pd.read_csv("euro2020_final_sentiment.csv")

# Map the text labels to numeric class ids
label2id = {"Negative": 0, "Neutral": 1, "Positive": 2}
df['label'] = df['sentiment_label'].map(label2id)

# Final clean-up: keep the two columns used for training and drop rows with a missing value
df = df[['clean_comment', 'label']].dropna()


In [None]:
from datasets import Dataset

# Build a Hugging Face dataset and hold out 20% of the rows for evaluation.
# The seed pins the shuffle so the same rows land in train/test on every run,
# which keeps the reported scores reproducible.
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2, seed=42)


In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    """Tokenise the ``clean_comment`` field with the module-level ``tokenizer``.

    Used as the callback of ``dataset.map(..., batched=True)`` below.
    Truncation is enabled; no padding is requested at this stage.
    """
    return tokenizer(example["clean_comment"], truncation=True)

# Apply the tokenizer to both parts of the train/test split
tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/6436 [00:00<?, ? examples/s]

Map:   0%|          | 0/1610 [00:00<?, ? examples/s]

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label={0: "Negative", 1: "Neutral", 2: "Positive"},
    label2id={"Negative": 0, "Neutral": 1, "Positive": 2}
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install --upgrade transformers




In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Training configuration: 2 epochs, batches of 8 for both training and evaluation
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    report_to="none",  # do not report to any experiment tracker
)

# Collator built from the tokenizer; the tokenisation step did not pad, so padding happens here per batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer wired to the train/test parts of the tokenised split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(


In [None]:
!pip uninstall -y wandb


Found existing installation: wandb 0.19.9
Uninstalling wandb-0.19.9:
  Successfully uninstalled wandb-0.19.9


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [None]:
trainer.train()


Step,Training Loss


In [None]:
import torch
from transformers import TextClassificationPipeline

# Création d’un pipeline de classification avec ton modèle BERT
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=0 if torch.cuda.is_available() else -1
)

# Fonction personnalisée
def predict_sentiment(text):
    """Print ``text`` and the highest-scoring label that ``pipe`` returns for it."""
    label_scores = pipe(text)[0]
    best = max(label_scores, key=lambda entry: entry['score'])
    print(f"💬 Commentaire : {text}")
    print(f"📊 Sentiment prédit : {best['label']} ({best['score']:.2f})")


In [None]:
predict_sentiment("That was an amazing match, I loved it!")
predict_sentiment("This team sucks, I can't believe it.")
predict_sentiment("I'm not sure how I feel about this one.")


In [None]:
# Save the model in PyTorch format.
# Note: only the weights are written below; the tokenizer and the model config are not saved.
model_save_path = "BERT_sentiment_model.pt"

# Save the weights only (state_dict)
torch.save(model.state_dict(), model_save_path)

print(f"✅ Modèle sauvegardé sous : {model_save_path}")


In [None]:
from transformers import BertForSequenceClassification

# Rebuild the BERT architecture with the same label mapping that was used when
# the model was first created, so predictions keep the Negative/Neutral/Positive names.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label={0: "Negative", 1: "Neutral", 2: "Positive"},
    label2id={"Negative": 0, "Neutral": 1, "Positive": 2}
)

# Load the saved weights. map_location="cpu" lets a checkpoint that was saved
# from a GPU load on a runtime without one.
model.load_state_dict(torch.load("BERT_sentiment_model.pt", map_location="cpu"))
model.eval()


In [None]:
from transformers import BertForSequenceClassification, Trainer, DataCollatorWithPadding, AutoTokenizer
import torch
from sklearn.metrics import classification_report

# 1. Load the tokenizer (same checkpoint as the one used during training)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# 2. Rebuild the model and load the fine-tuned weights
bert_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label={0: "Negative", 1: "Neutral", 2: "Positive"},
    label2id={"Negative": 0, "Neutral": 1, "Positive": 2}
)

# map_location="cpu": the checkpoint loads even when no GPU is attached
bert_model.load_state_dict(torch.load("BERT_sentiment_model.pt", map_location="cpu"))
bert_model.eval()

# 3. Re-tokenise the EXISTING train/test split.
# `dataset` is the split the BERT model was trained on (created in the earlier
# split cell). It must not be rebuilt with a fresh train_test_split here: a new
# random split would move most training rows into the new "test" part, and the
# report below would be inflated by rows the model has already seen.
from datasets import Dataset

def tokenize_function(example):
    """Tokenise the ``clean_comment`` field with truncation (padding is left to the collator)."""
    return tokenizer(example["clean_comment"], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# 4. Create the data collator and a Trainer used only for prediction
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

bert_trainer = Trainer(
    model=bert_model,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# 5. Predict on the held-out part of the split
bert_preds = bert_trainer.predict(tokenized_dataset["test"])
y_pred_bert = bert_preds.predictions.argmax(axis=-1)
y_true = bert_preds.label_ids

# 6. Print the metrics
print("📊 Évaluation du modèle BERT :")
print(classification_report(y_true, y_pred_bert, target_names=["Negative", "Neutral", "Positive"]))


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased"

# 1. Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2. Modèle avec classification à 3 classes
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label={0: "Negative", 1: "Neutral", 2: "Positive"},
    label2id={"Negative": 0, "Neutral": 1, "Positive": 2}
)


In [None]:
from datasets import Dataset

# Dataset built from the labelled DataFrame
dataset = Dataset.from_pandas(df[['clean_comment', 'label']])

# Train/test split. The seed pins the shuffle so every run evaluates on the same rows.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Apply the tokenizer
def tokenize_function(example):
    """Tokenise the ``clean_comment`` field with truncation (padding is left to the collator)."""
    return tokenizer(example["clean_comment"], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# DistilBERT training configuration: 2 epochs, batches of 8, weight decay 0.01
training_args = TrainingArguments(
    output_dir="./results/distilbert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    # Same as the BERT run: do not report to any experiment tracker, so the
    # behaviour does not depend on the WANDB_* environment variables being set.
    report_to="none",
)

# Pads each batch when it is assembled (the tokenisation step does not pad)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [None]:
trainer.train()


In [None]:
from sklearn.metrics import classification_report

# Prédictions
preds = trainer.predict(tokenized_dataset["test"])
y_pred = preds.predictions.argmax(axis=-1)
y_true = preds.label_ids

# Rapport complet
print(classification_report(y_true, y_pred, target_names=["Negative", "Neutral", "Positive"]))


In [None]:
# Sauvegarde des poids
torch.save(model.state_dict(), "distilbert_sentiment_model.pt")
print("✅ Modèle DistilBERT sauvegardé avec succès !")


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label={0: "Negative", 1: "Neutral", 2: "Positive"},
    label2id={"Negative": 0, "Neutral": 1, "Positive": 2}
)


In [None]:
from datasets import Dataset

# Same preparation as for the other models. The seed pins the shuffle so every
# run evaluates on the same rows.
dataset = Dataset.from_pandas(df[['clean_comment', 'label']])
dataset = dataset.train_test_split(test_size=0.2, seed=42)

def tokenize_function(example):
    """Tokenise the ``clean_comment`` field with truncation (padding is left to the collator)."""
    return tokenizer(example["clean_comment"], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# RoBERTa training configuration: 2 epochs, batches of 8, weight decay 0.01
training_args = TrainingArguments(
    output_dir="./results/roberta",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    # Same as the BERT run: do not report to any experiment tracker, so the
    # behaviour does not depend on the WANDB_* environment variables being set.
    report_to="none",
)

# Pads each batch when it is assembled (the tokenisation step does not pad)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [None]:
trainer.train()


In [None]:
from sklearn.metrics import classification_report

preds_roberta = trainer.predict(tokenized_dataset["test"])
y_pred_roberta = preds_roberta.predictions.argmax(axis=-1)
y_true = preds_roberta.label_ids

print("📊 Évaluation du modèle RoBERTa :")
print(classification_report(y_true, y_pred_roberta, target_names=["Negative", "Neutral", "Positive"]))


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Scores per model.
# NOTE(review): these values are hard-coded, presumably transcribed from the
# classification reports of earlier runs — update them if the models are retrained.
data = {
    "Model": ["BERT", "DistilBERT", "RoBERTa"],
    "Accuracy": [0.84, 0.87, 0.74],
    "F1-score": [0.83, 0.86, 0.72]
}

# Kept in its own variable: assigning this table to `df` would overwrite the
# comment dataset that the training cells read from.
metrics_df = pd.DataFrame(data)

# Build the chart on an explicit Axes object
fig, ax = plt.subplots(figsize=(8, 5))

bar_width = 0.35
index = range(len(metrics_df))

# Accuracy and F1 bars, side by side for each model
ax.bar(index, metrics_df["Accuracy"], bar_width, label="Accuracy")
ax.bar([i + bar_width for i in index], metrics_df["F1-score"], bar_width, label="F1-score")

# Titles, ticks and legend
ax.set_xlabel("Model")
ax.set_ylabel("Score")
ax.set_title("📊 Comparaison des modèles NLP")
ax.set_xticks([i + bar_width / 2 for i in index])
ax.set_xticklabels(metrics_df["Model"])
ax.set_ylim(0.6, 1.0)  # zoomed y-axis: bars do not start at zero
ax.legend()
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Scores and training time per model.
# NOTE(review): these values are hard-coded, presumably transcribed from earlier
# runs — update them if the models are retrained.
data = {
    "Model": ["BERT", "DistilBERT", "RoBERTa"],
    "Accuracy": [0.84, 0.87, 0.74],
    "F1-score": [0.83, 0.86, 0.72],
    "Training Time (s)": [421, 221, 465]
}

# Kept in its own variable: assigning this table to `df` would overwrite the
# comment dataset that the training cells read from.
metrics_df = pd.DataFrame(data)
index = range(len(metrics_df))

fig, ax1 = plt.subplots(figsize=(9, 5))

# Accuracy and F1-score bars, side by side for each model
bar_width = 0.35
ax1.bar(index, metrics_df["Accuracy"], bar_width, label="Accuracy", color="skyblue")
ax1.bar([i + bar_width for i in index], metrics_df["F1-score"], bar_width, label="F1-score", color="salmon")

ax1.set_xlabel("Model")
ax1.set_ylabel("Score")
ax1.set_title("📊 Performance des modèles NLP + Temps d'entraînement")
ax1.set_xticks([i + bar_width / 2 for i in index])
ax1.set_xticklabels(metrics_df["Model"])
ax1.set_ylim(0.6, 1.0)  # zoomed y-axis: bars do not start at zero
ax1.legend(loc="upper left")
ax1.grid(axis='y', linestyle='--', alpha=0.5)

# Second y-axis (sharing the x-axis) for the training time
ax2 = ax1.twinx()
ax2.plot([i + bar_width / 2 for i in index], metrics_df["Training Time (s)"], color="green", marker='o', label="Training Time (s)")
ax2.set_ylabel("Temps d'entraînement (s)")
ax2.legend(loc="upper right")

plt.tight_layout()
plt.show()


In [None]:
# In notebook order, `model` is the RoBERTa classifier at this point (the last
# model created and trained), so it is saved under the RoBERTa file name only.
# Writing this same state_dict to BERT_sentiment_model.pt and
# distilbert_sentiment_model.pt would overwrite those checkpoints with weights
# that do not belong to their architectures.
torch.save(model.state_dict(), "roberta_sentiment_model.pt")


In [None]:
import os

# Print the name of every .pt file found by os.listdir() (current working directory)
for file in os.listdir():
    if not file.endswith(".pt"):
        continue
    print("✅", file)


In [None]:
import os
# Same listing as the previous cell, preceded by a header line
print("📁 Fichiers .pt disponibles :")
for file in os.listdir():
    # Only the saved weight files are of interest here
    if file.endswith(".pt"):
        print("🟢", file)


In [None]:
import pandas as pd

# Cleaned comments exported earlier in the notebook (change the path if needed)
df = pd.read_csv("euro2020_cleaned_preview.csv")

# 🔍 Show which columns are available
print("🧩 Colonnes disponibles :", list(df.columns))




In [None]:
import numpy as np

# Use the VADER labels exported to euro2020_final_sentiment.csv instead of
# random ones: a classifier trained on random labels cannot learn anything and
# its evaluation report is meaningless.
# Both CSV files were written from the same DataFrame with index=False, so their
# rows line up one-to-one; the assert guards that assumption.
sentiment_df = pd.read_csv("euro2020_final_sentiment.csv")
assert len(sentiment_df) == len(df), "label file and comment file have different row counts"

# 0 = negative, 1 = neutral, 2 = positive
df["label"] = sentiment_df["sentiment_label"].map({"Negative": 0, "Neutral": 1, "Positive": 2})

# Quick check
print(df.head())


In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# ✅ Rename the text column to the name the tokenisation function reads
df = df.rename(columns={"clean_comment": "text"})

# 🔍 Inspect the value types and the number of missing values in 'text'
print(df["text"].apply(type).value_counts())
print(df["text"].isnull().sum())

# 🧽 Keep only rows that have text and make sure every value is a str.
# .copy() detaches the filtered frame so the column assignment below does not
# trigger pandas' SettingWithCopyWarning.
df = df[df["text"].notnull()].copy()
df["text"] = df["text"].astype(str)

# 🔁 Convert to a Dataset once, after the cleaning above (building and splitting
# it before the null filter would only be thrown away). The seed pins the shuffle.
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# 🧠 DistilBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# 🔁 Tokenisation
def tokenize_function(batch):
    """Tokenise the ``text`` field of a batch with padding and truncation enabled."""
    return tokenizer(batch["text"], padding=True, truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
# NOTE(review): this cell repeats the tokenisation done at the end of the previous cell.
def tokenize_function(batch):
    """Tokenise the ``text`` field of a batch with padding and truncation enabled."""
    return tokenizer(batch["text"], padding=True, truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

import os
os.environ["WANDB_DISABLED"] = "true"  # Désactiver wandb si activé par défaut

# 🧠 Modèle DistilBERT avec 3 classes
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# 🧩 Data collator pour padding dynamique
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ⚙️ Paramètres d’entraînement
training_args = TrainingArguments(
    output_dir="./distilbert_results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs"
)


# 🚂 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# ▶️ Entraînement
trainer.train()


In [None]:
print(model.__class__.__name__)


In [None]:
print(model.state_dict().keys())


In [None]:
import torch

# 💾 Sauvegarder uniquement les poids
torch.save(model.state_dict(), "distilbert_sentiment_model.pt")
print("✅ Modèle DistilBERT sauvegardé dans distilbert_sentiment_model.pt")


In [None]:
from sklearn.metrics import classification_report

# 🧪 Prédictions
preds = trainer.predict(tokenized_dataset["test"])
y_pred = preds.predictions.argmax(axis=-1)
y_true = preds.label_ids

# 📊 Rapport complet
print("\n📊 Évaluation du modèle DistilBERT :")
print(classification_report(y_true, y_pred, target_names=["Negative", "Neutral", "Positive"]))


In [None]:
import numpy as np

# Use the VADER labels exported to euro2020_final_sentiment.csv instead of
# random ones: a classifier trained on random labels cannot learn anything and
# its evaluation report is meaningless.
# [0: negative, 1: neutral, 2: positive]
sentiment_labels = pd.read_csv("euro2020_final_sentiment.csv")["sentiment_label"].map(
    {"Negative": 0, "Neutral": 1, "Positive": 2}
)

# Assigning a Series aligns on the index. This assumes df still carries the row
# numbers it got when the cleaned CSV was loaded (filtering rows keeps them), and
# that both CSV files list the comments in the same order — the assert below
# fails loudly if a row could not be matched.
df["label"] = sentiment_labels
assert df["label"].notnull().all(), "some comments have no matching sentiment label"
df["label"] = df["label"].astype(int)

# Check that it worked
print(df.head())


In [None]:
# 🔁 Applies once clean_comment has been renamed to text.
# .copy() detaches the filtered frame so the assignment below does not trigger
# pandas' SettingWithCopyWarning.
df = df[df["text"].notnull()].copy()
df["text"] = df["text"].astype(str)


In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import os

# 🚫 Désactiver wandb si nécessaire
os.environ["WANDB_DISABLED"] = "true"

# 🧠 Charger le modèle DistilBERT pour classification à 3 classes
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# 📦 Collator pour padding dynamique
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ⚙️ Arguments d’entraînement
training_args = TrainingArguments(
    output_dir="./distilbert_results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="no",      # pas de checkpoints inutiles
    report_to="none"
)

# 🚂 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# ▶️ Lancer l'entraînement
trainer.train()


In [None]:
import torch

# 💾 Sauvegarde des poids uniquement
torch.save(model.state_dict(), "distilbert_sentiment_model.pt")

print("✅ Modèle sauvegardé dans : distilbert_sentiment_model.pt")


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 📥 Chemin du modèle RoBERTa
MODEL_PATH = "roberta_sentiment_model.pt"

# 🧠 Charger tokenizer et modèle
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

# 💾 Charger les poids sauvegardés
model.load_state_dict(torch.load(MODEL_PATH, map_location="cpu"))

# Mode évaluation
model.eval()

print("✅ Modèle RoBERTa chargé avec succès.")


In [None]:
list(torch.load("roberta_sentiment_model.pt", map_location="cpu").keys())[:5]


In [None]:
# ✅ Suppose que tu as déjà ce DataFrame
df = df[["text", "label"]].dropna()
df["text"] = df["text"].astype(str)

from datasets import Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2, seed=42)


In [None]:
# ✅ Suppose que tu as déjà ce DataFrame
df = df[["text", "label"]].dropna()
df["text"] = df["text"].astype(str)

from datasets import Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2, seed=42)


In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

tokenized_dataset = dataset.map(tokenize, batched=True)


In [None]:
from transformers import BertTokenizer

# 🔁 Charger le tokenizer BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


In [None]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
