In [7]:
import os
import sys
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# ✅ Utilise le GPU si disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("📌 Using device:", device)


📌 Using device: cpu


In [8]:
# 📥 Charger les données nettoyées
df = pd.read_csv("../data/train_clean.csv").sample(n=1000, random_state=42)
df = df.dropna()

texts = df["text_clean"].tolist()
labels = df["toxic"].tolist()


In [9]:
# 🔠 Tokenizer BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class CommentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding["input_ids"].squeeze(),
            'attention_mask': encoding["attention_mask"].squeeze(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [10]:
# 🧪 Diviser les données
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_dataset = CommentDataset(X_train, y_train, tokenizer)
test_dataset = CommentDataset(X_test, y_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


In [11]:
# 🧠 Modèle BERT + CNN simple
class BERT_CNN(nn.Module):
    def __init__(self):
        super(BERT_CNN, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.conv1 = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(256, 2)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():  # Garde le modèle BERT gelé pour rapidité
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state.permute(0, 2, 1)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x).squeeze(-1)
        x = self.fc(x)
        return x

model = BERT_CNN().to(device)


In [12]:
# 🎯 Entraînement
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
epochs = 3

model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"✅ Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


Epoch 1:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch 1: 100%|██████████| 25/25 [02:15<00:00,  5.42s/it]


✅ Epoch 1, Loss: 0.5045


Epoch 2: 100%|██████████| 25/25 [02:15<00:00,  5.42s/it]


✅ Epoch 2, Loss: 0.2974


Epoch 3: 100%|██████████| 25/25 [02:13<00:00,  5.33s/it]

✅ Epoch 3, Loss: 0.2321





In [13]:
# 💾 Sauvegarde
model_path = "../models/model_HP.pt"
torch.save(model.state_dict(), model_path)
print(f"✅ Modèle HP sauvegardé : {model_path}")


✅ Modèle HP sauvegardé : ../models/model_HP.pt
