In [90]:
import sys
sys.path.append('..')
from tqdm import tqdm
import numpy as np
import pandas as pd
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import classification_report

from constants import CATEGORIES

In [75]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
print(torch.cuda.get_device_name(0))


Using device: cuda
NVIDIA GeForce RTX 3070


In [76]:
df_train = pd.read_parquet('data/df_train_preprocessed.parquet')
df_val = pd.read_parquet('data/df_val_preprocessed.parquet')
df_test = pd.read_parquet('data/df_test_preprocessed.parquet')

In [77]:
df_train.head()

Unnamed: 0,id,comment_text_baseline,toxic,severe_toxic,obscene,threat,insult,identity_hate,overall_toxic,comment_text_word_tokenize_no_normalization,comment_text_gpt_tokenize_no_normalization,comment_text_word_tokenize_normalization,comment_text_gpt_tokenize_normalization,comment_text_word_tokenize_full_normalization,comment_text_gpt_tokenize_full_normalization,comment_text_word_tokenize_simple_normalization,comment_text_gpt_tokenize_simple_normalization
140030,ed56f082116dcbd0,Grandma Terri Should Burn in Trash \nGrandma T...,1,0,0,0,0,0,1,Grandma Terri Should Burn in Trash Grandma Ter...,41251 1764 10335 462 12540 18530 304 71723 720...,grandma terri burn trash grandma terri trash ....,53766 1764 2024 462 8395 23701 83777 2024 462 ...,grandma terri burn trash grandma terri trash h...,53766 1764 2024 462 8395 23701 83777 2024 462 ...,grandma terri should burn in trash grandma ter...,53766 1764 2024 462 1288 8395 304 23701 720 53...
159124,f8e3cd98b63bf401,", 9 May 2009 (UTC)\nIt would be easiest if you...",0,0,0,0,0,0,0,", 9 May 2009 ( UTC ) It would be easiest if yo...",11 220 24 3297 220 1049 24 320 21872 340 2181 ...,", may ( utc ) would easy admit member involved...",11 1253 320 70696 883 1053 4228 17113 4562 653...,may utc would easy admit member involved portu...,18864 70696 1053 4228 17113 4562 6532 2700 773...,", may ( utc ) it would be easiest if you were ...",11 220 1253 220 320 29455 340 275 1053 387 306...
60006,a09e1bcf10631f9a,"""\n\nThe Objectivity of this Discussion is dou...",0,0,0,0,0,0,0,`` The Objectivity of this Discussion is doubt...,1875 791 3075 1968 315 420 36613 374 75699 320...,`` objectivity discussion doubtful ( non-exist...,14196 1665 1968 10430 75699 320 2536 60928 883...,objectivity discussion doubtful nonexistent in...,1735 1968 10430 75699 88034 13519 6931 3857 36...,`` the objectivity of this discussion is doubt...,1875 1820 1665 1968 315 420 10430 374 75699 32...
65432,af0ee0066c607eb8,Shelly Shock\nShelly Shock is. . .( ),0,0,0,0,0,0,0,Shelly Shock Shelly Shock is . . . ( ),2059 12160 48083 198 2059 12160 48083 374 13 6...,shelly shock shelly shock . . . ( ),939 12160 10988 559 12160 10988 662 662 662 32...,shelly shock shelly shock,939 12160 10988 559 12160 10988,shelly shock shelly shock is . . . ( ),939 12160 10988 198 939 12160 10988 374 13 662...
154979,b734772b1a807e09,I do not care. Refer to Ong Teng Cheong talk p...,0,0,0,0,0,0,0,I do not care . Refer to Ong Teng Cheong talk ...,40 656 539 2512 13 29734 311 507 983 350 833 8...,care . refer ong teng cheong talk page . la go...,10727 662 8464 389 70 42249 3091 647 3137 2199...,care refer ong teng cheong talk page la goutte...,10727 8464 389 70 42249 3091 647 3137 2199 120...,i do not care . refer to ong teng cheong talk ...,72 656 539 2512 13 8464 311 389 70 42249 3091 ...


In [78]:
print(os.listdir('gloves'))
normalization_type = 'comment_text_word_tokenize_full_normalization'
glove_path = 'gloves/glove.6B.100d.txt'
embedding_dim = 100

['glove.6B.200d.txt', 'glove.6B.300d.txt', 'glove.6B.100d.txt', 'glove.6B.50d.txt']


In [79]:
# Fonction de préparation des données
def prepare_data(df : pd.DataFrame, normalization_type : str, vocab : dict = None):
    
    texts = df[normalization_type].values
    labels = df[CATEGORIES].values
    
    tokenizer = get_tokenizer("basic_english")
    
    # Tokenisation des textes
    tokenized_texts = [tokenizer(text) for text in texts]
    
    # Création du vocabulaire si non fourni
    if vocab is None:
        vocab = build_vocab_from_iterator(tokenized_texts, specials=["<pad>"])
        vocab.set_default_index(vocab["<pad>"])
    
    # Conversion des textes tokenisés en séquences d'index
    sequences = [torch.tensor(vocab(tokenized_text)) for tokenized_text in tokenized_texts]
    
    # Padding des séquences
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=vocab["<pad>"])
    
    # Conversion des labels en tenseurs PyTorch
    labels_tensor = torch.tensor(labels, dtype=torch.float)
    
    return padded_sequences, labels_tensor, vocab

In [80]:
X_train, y_train, vocab = prepare_data(df_train, normalization_type)

X_val, y_val, _ = prepare_data(df_val, normalization_type, vocab)

X_test, y_test, _ = prepare_data(df_test, normalization_type, vocab)

vocab_size = len(vocab)

In [81]:
# Fonction pour charger les embeddings GloVe
def load_glove_embeddings(glove_file : str) -> dict:
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Fonction pour créer la matrice d'embeddings
def create_embedding_matrix(vocab: dict, embeddings_index: dict, embedding_dim: int) -> torch.Tensor:
    vocab_size = len(vocab)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in vocab.get_stoi().items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))
    embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float)
    return embedding_matrix

In [82]:
embeddings_index = load_glove_embeddings(glove_path)
embedding_matrix = create_embedding_matrix(vocab, embeddings_index, embedding_dim).to(device)

In [83]:
embedding_layer = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)

class FeedforwardNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(FeedforwardNN, self).__init__()
        self.embedding = embedding_layer
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x).mean(dim=1)
        out = self.fc1(embedded)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [84]:
hidden_dim = 128
output_dim = 6  

model = FeedforwardNN(vocab_size=vocab_size, embedding_dim=embedding_dim, hidden_dim=hidden_dim, output_dim=output_dim)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss() 

print(model)

FeedforwardNN(
  (embedding): Embedding(179418, 100)
  (fc1): Linear(in_features=100, out_features=128, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=128, out_features=6, bias=True)
  (sigmoid): Sigmoid()
)


In [91]:
batch_size = 32

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [97]:
def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=10):
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in tqdm(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        
        epoch_loss = running_loss / len(train_loader.dataset)
        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}')
        
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)
        
        val_loss = val_loss / len(val_loader.dataset)
        print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss:.4f}')


In [98]:
train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=50)

100%|██████████| 3990/3990 [00:23<00:00, 170.81it/s]


Epoch 1/50, Training Loss: 0.0462
Epoch 1/50, Validation Loss: 0.0722


100%|██████████| 3990/3990 [00:23<00:00, 172.77it/s]


Epoch 2/50, Training Loss: 0.0447
Epoch 2/50, Validation Loss: 0.0659


100%|██████████| 3990/3990 [00:22<00:00, 173.84it/s]


Epoch 3/50, Training Loss: 0.0440
Epoch 3/50, Validation Loss: 0.0748


100%|██████████| 3990/3990 [00:23<00:00, 173.05it/s]


Epoch 4/50, Training Loss: 0.0421
Epoch 4/50, Validation Loss: 0.0688


100%|██████████| 3990/3990 [00:23<00:00, 172.39it/s]


Epoch 5/50, Training Loss: 0.0415
Epoch 5/50, Validation Loss: 0.0706


100%|██████████| 3990/3990 [00:22<00:00, 173.88it/s]


Epoch 6/50, Training Loss: 0.0397
Epoch 6/50, Validation Loss: 0.0700


100%|██████████| 3990/3990 [00:23<00:00, 172.99it/s]


Epoch 7/50, Training Loss: 0.0384
Epoch 7/50, Validation Loss: 0.0729


100%|██████████| 3990/3990 [00:23<00:00, 172.92it/s]


Epoch 8/50, Training Loss: 0.0371
Epoch 8/50, Validation Loss: 0.0750


100%|██████████| 3990/3990 [00:23<00:00, 173.02it/s]


Epoch 9/50, Training Loss: 0.0354
Epoch 9/50, Validation Loss: 0.0783


100%|██████████| 3990/3990 [00:22<00:00, 173.85it/s]


Epoch 10/50, Training Loss: 0.0349
Epoch 10/50, Validation Loss: 0.0784


100%|██████████| 3990/3990 [00:23<00:00, 172.51it/s]


Epoch 11/50, Training Loss: 0.0335
Epoch 11/50, Validation Loss: 0.0932


100%|██████████| 3990/3990 [00:22<00:00, 173.91it/s]


Epoch 12/50, Training Loss: 0.0319
Epoch 12/50, Validation Loss: 0.0858


100%|██████████| 3990/3990 [00:22<00:00, 174.00it/s]


Epoch 13/50, Training Loss: 0.0308
Epoch 13/50, Validation Loss: 0.0921


100%|██████████| 3990/3990 [00:23<00:00, 173.34it/s]


Epoch 14/50, Training Loss: 0.0298
Epoch 14/50, Validation Loss: 0.0883


100%|██████████| 3990/3990 [00:23<00:00, 172.45it/s]


Epoch 15/50, Training Loss: 0.0287
Epoch 15/50, Validation Loss: 0.0876


100%|██████████| 3990/3990 [00:23<00:00, 172.82it/s]


Epoch 16/50, Training Loss: 0.0270
Epoch 16/50, Validation Loss: 0.1015


100%|██████████| 3990/3990 [00:23<00:00, 173.32it/s]


Epoch 17/50, Training Loss: 0.0266
Epoch 17/50, Validation Loss: 0.0963


100%|██████████| 3990/3990 [00:23<00:00, 173.34it/s]


Epoch 18/50, Training Loss: 0.0257
Epoch 18/50, Validation Loss: 0.0996


100%|██████████| 3990/3990 [00:23<00:00, 172.67it/s]


Epoch 19/50, Training Loss: 0.0250
Epoch 19/50, Validation Loss: 0.1059


100%|██████████| 3990/3990 [00:23<00:00, 173.46it/s]


Epoch 20/50, Training Loss: 0.0241
Epoch 20/50, Validation Loss: 0.1075


100%|██████████| 3990/3990 [00:22<00:00, 173.99it/s]


Epoch 21/50, Training Loss: 0.0236
Epoch 21/50, Validation Loss: 0.1102


100%|██████████| 3990/3990 [00:23<00:00, 172.92it/s]


Epoch 22/50, Training Loss: 0.0229
Epoch 22/50, Validation Loss: 0.1061


100%|██████████| 3990/3990 [00:23<00:00, 173.32it/s]


Epoch 23/50, Training Loss: 0.0226
Epoch 23/50, Validation Loss: 0.1129


100%|██████████| 3990/3990 [00:22<00:00, 174.31it/s]


Epoch 24/50, Training Loss: 0.0215
Epoch 24/50, Validation Loss: 0.1121


100%|██████████| 3990/3990 [00:23<00:00, 172.17it/s]


Epoch 25/50, Training Loss: 0.0208
Epoch 25/50, Validation Loss: 0.1091


100%|██████████| 3990/3990 [00:23<00:00, 172.50it/s]


Epoch 26/50, Training Loss: 0.0205
Epoch 26/50, Validation Loss: 0.1239


100%|██████████| 3990/3990 [00:22<00:00, 174.19it/s]


Epoch 27/50, Training Loss: 0.0198
Epoch 27/50, Validation Loss: 0.1235


100%|██████████| 3990/3990 [00:23<00:00, 167.73it/s]


Epoch 28/50, Training Loss: 0.0200
Epoch 28/50, Validation Loss: 0.1222


100%|██████████| 3990/3990 [00:24<00:00, 163.52it/s]


Epoch 29/50, Training Loss: 0.0191
Epoch 29/50, Validation Loss: 0.1312


100%|██████████| 3990/3990 [00:24<00:00, 163.13it/s]


Epoch 30/50, Training Loss: 0.0190
Epoch 30/50, Validation Loss: 0.1305


100%|██████████| 3990/3990 [00:25<00:00, 159.37it/s]


Epoch 31/50, Training Loss: 0.0183
Epoch 31/50, Validation Loss: 0.1244


100%|██████████| 3990/3990 [00:23<00:00, 172.49it/s]


Epoch 32/50, Training Loss: 0.0178
Epoch 32/50, Validation Loss: 0.1315


100%|██████████| 3990/3990 [00:22<00:00, 174.02it/s]


Epoch 33/50, Training Loss: 0.0177
Epoch 33/50, Validation Loss: 0.1326


100%|██████████| 3990/3990 [00:23<00:00, 173.19it/s]


Epoch 34/50, Training Loss: 0.0175
Epoch 34/50, Validation Loss: 0.1390


100%|██████████| 3990/3990 [00:23<00:00, 167.81it/s]


Epoch 35/50, Training Loss: 0.0171
Epoch 35/50, Validation Loss: 0.1296


100%|██████████| 3990/3990 [00:22<00:00, 173.66it/s]


Epoch 36/50, Training Loss: 0.0160
Epoch 36/50, Validation Loss: 0.1390


100%|██████████| 3990/3990 [00:23<00:00, 168.28it/s]


Epoch 37/50, Training Loss: 0.0157
Epoch 37/50, Validation Loss: 0.1480


100%|██████████| 3990/3990 [00:23<00:00, 171.82it/s]


Epoch 38/50, Training Loss: 0.0159
Epoch 38/50, Validation Loss: 0.1459


100%|██████████| 3990/3990 [00:23<00:00, 172.73it/s]


Epoch 39/50, Training Loss: 0.0158
Epoch 39/50, Validation Loss: 0.1370


100%|██████████| 3990/3990 [00:23<00:00, 172.83it/s]


Epoch 40/50, Training Loss: 0.0153
Epoch 40/50, Validation Loss: 0.1469


100%|██████████| 3990/3990 [00:23<00:00, 169.33it/s]


Epoch 41/50, Training Loss: 0.0151
Epoch 41/50, Validation Loss: 0.1451


100%|██████████| 3990/3990 [00:23<00:00, 171.66it/s]


Epoch 42/50, Training Loss: 0.0150
Epoch 42/50, Validation Loss: 0.1448


100%|██████████| 3990/3990 [00:23<00:00, 170.31it/s]


Epoch 43/50, Training Loss: 0.0145
Epoch 43/50, Validation Loss: 0.1519


100%|██████████| 3990/3990 [00:24<00:00, 163.12it/s]


Epoch 44/50, Training Loss: 0.0149
Epoch 44/50, Validation Loss: 0.1498


100%|██████████| 3990/3990 [00:25<00:00, 157.63it/s]


Epoch 45/50, Training Loss: 0.0138
Epoch 45/50, Validation Loss: 0.1420


100%|██████████| 3990/3990 [00:25<00:00, 158.87it/s]


Epoch 46/50, Training Loss: 0.0143
Epoch 46/50, Validation Loss: 0.1802


100%|██████████| 3990/3990 [00:23<00:00, 171.19it/s]


Epoch 47/50, Training Loss: 0.0141
Epoch 47/50, Validation Loss: 0.1651


100%|██████████| 3990/3990 [00:23<00:00, 171.33it/s]


Epoch 48/50, Training Loss: 0.0136
Epoch 48/50, Validation Loss: 0.1601


100%|██████████| 3990/3990 [00:22<00:00, 176.33it/s]


Epoch 49/50, Training Loss: 0.0139
Epoch 49/50, Validation Loss: 0.1602


100%|██████████| 3990/3990 [00:22<00:00, 178.66it/s]


Epoch 50/50, Training Loss: 0.0133
Epoch 50/50, Validation Loss: 0.1640


In [113]:
model.eval()

y_pred = []
y_true = []

with torch.no_grad():
    for inputs, targets in val_loader:
        inputs, targets = inputs.to(device), targets.to(device)  # Move to GPU
        outputs = model(inputs)
        y_pred.extend(outputs.cpu().numpy())  # Move to CPU before converting to numpy
        y_true.extend(targets.cpu().numpy())  # Move to CPU before converting to numpy

y_true = torch.tensor(y_true)
y_pred = torch.tensor(y_pred) > 0.5 # Convert probabilities to binary labels

y_true = y_true.numpy()
y_pred = y_pred.numpy()

print(classification_report(y_true, y_pred, target_names=CATEGORIES))


               precision    recall  f1-score   support

        toxic       0.85      0.63      0.73      3056
 severe_toxic       0.54      0.26      0.35       321
      obscene       0.85      0.69      0.76      1715
       threat       0.50      0.27      0.35        74
       insult       0.70      0.64      0.67      1614
identity_hate       0.46      0.45      0.46       294

    micro avg       0.78      0.62      0.69      7074
    macro avg       0.65      0.49      0.55      7074
 weighted avg       0.78      0.62      0.69      7074
  samples avg       0.06      0.05      0.05      7074



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [125]:
model.eval()

for i, (inputs, targets) in enumerate(test_loader):
    if i == 20:
        break
    inputs, targets = inputs.to(device), targets.to(device)
    outputs = model(inputs)
    outputs = torch.where(outputs > 0.5, 1, 0)
    print(f'Output: {outputs[0]}')
    print(f'Target: {targets[0]}')
    print(outputs[0] == targets[0])
    print('')
    

Output: tensor([0, 0, 0, 0, 0, 0], device='cuda:0')
Target: tensor([0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([True, True, True, True, True, True], device='cuda:0')

Output: tensor([0, 0, 0, 0, 0, 0], device='cuda:0')
Target: tensor([0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([True, True, True, True, True, True], device='cuda:0')

Output: tensor([0, 0, 0, 0, 0, 0], device='cuda:0')
Target: tensor([0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([True, True, True, True, True, True], device='cuda:0')

Output: tensor([0, 0, 0, 0, 0, 0], device='cuda:0')
Target: tensor([0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([True, True, True, True, True, True], device='cuda:0')

Output: tensor([0, 0, 0, 0, 0, 0], device='cuda:0')
Target: tensor([0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([True, True, True, True, True, True], device='cuda:0')

Output: tensor([0, 0, 0, 0, 0, 0], device='cuda:0')
Target: tensor([0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([True, True, True,