In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('..')
from tqdm import tqdm
import numpy as np
import pandas as pd
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import DataLoader

from torchviz import make_dot

from sklearn.metrics import classification_report , precision_score, recall_score, f1_score

from constants import CATEGORIES



In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
print(torch.cuda.get_device_name(0))

Using device: cuda
NVIDIA GeForce RTX 3070


In [4]:
df_train = pd.read_parquet('data/df_train_preprocessed.parquet')
df_val = pd.read_parquet('data/df_val_preprocessed.parquet')
df_test = pd.read_parquet('data/df_test_preprocessed.parquet')

In [5]:
df_train.head()

Unnamed: 0,id,comment_text_baseline,toxic,severe_toxic,obscene,threat,insult,identity_hate,overall_toxic,comment_text_word_tokenize_no_normalization,comment_text_gpt_tokenize_no_normalization,comment_text_word_tokenize_normalization,comment_text_gpt_tokenize_normalization,comment_text_word_tokenize_full_normalization,comment_text_gpt_tokenize_full_normalization,comment_text_word_tokenize_simple_normalization,comment_text_gpt_tokenize_simple_normalization
140030,ed56f082116dcbd0,Grandma Terri Should Burn in Trash \nGrandma T...,1,0,0,0,0,0,1,Grandma Terri Should Burn in Trash Grandma Ter...,41251 1764 10335 462 12540 18530 304 71723 720...,grandma terri burn trash grandma terri trash ....,53766 1764 2024 462 8395 23701 83777 2024 462 ...,grandma terri burn trash grandma terri trash h...,53766 1764 2024 462 8395 23701 83777 2024 462 ...,grandma terri should burn in trash grandma ter...,53766 1764 2024 462 1288 8395 304 23701 720 53...
159124,f8e3cd98b63bf401,", 9 May 2009 (UTC)\nIt would be easiest if you...",0,0,0,0,0,0,0,", 9 May 2009 ( UTC ) It would be easiest if yo...",11 220 24 3297 220 1049 24 320 21872 340 2181 ...,", may ( utc ) would easy admit member involved...",11 1253 320 70696 883 1053 4228 17113 4562 653...,may utc would easy admit member involved portu...,18864 70696 1053 4228 17113 4562 6532 2700 773...,", may ( utc ) it would be easiest if you were ...",11 220 1253 220 320 29455 340 275 1053 387 306...
60006,a09e1bcf10631f9a,"""\n\nThe Objectivity of this Discussion is dou...",0,0,0,0,0,0,0,`` The Objectivity of this Discussion is doubt...,1875 791 3075 1968 315 420 36613 374 75699 320...,`` objectivity discussion doubtful ( non-exist...,14196 1665 1968 10430 75699 320 2536 60928 883...,objectivity discussion doubtful nonexistent in...,1735 1968 10430 75699 88034 13519 6931 3857 36...,`` the objectivity of this discussion is doubt...,1875 1820 1665 1968 315 420 10430 374 75699 32...
65432,af0ee0066c607eb8,Shelly Shock\nShelly Shock is. . .( ),0,0,0,0,0,0,0,Shelly Shock Shelly Shock is . . . ( ),2059 12160 48083 198 2059 12160 48083 374 13 6...,shelly shock shelly shock . . . ( ),939 12160 10988 559 12160 10988 662 662 662 32...,shelly shock shelly shock,939 12160 10988 559 12160 10988,shelly shock shelly shock is . . . ( ),939 12160 10988 198 939 12160 10988 374 13 662...
154979,b734772b1a807e09,I do not care. Refer to Ong Teng Cheong talk p...,0,0,0,0,0,0,0,I do not care . Refer to Ong Teng Cheong talk ...,40 656 539 2512 13 29734 311 507 983 350 833 8...,care . refer ong teng cheong talk page . la go...,10727 662 8464 389 70 42249 3091 647 3137 2199...,care refer ong teng cheong talk page la goutte...,10727 8464 389 70 42249 3091 647 3137 2199 120...,i do not care . refer to ong teng cheong talk ...,72 656 539 2512 13 8464 311 389 70 42249 3091 ...


In [6]:
print(os.listdir('gloves'))
normalization_type = 'comment_text_word_tokenize_full_normalization'
glove_path = 'gloves/glove.twitter.27B.100d.txt'
embedding_dim = 100

['glove.twitter.27B.200d.txt', 'glove.twitter.27B.100d.txt', 'glove.twitter.27B.50d.txt', 'glove.twitter.27B.25d.txt']


In [7]:
# Fonction de préparation des données
def prepare_data(df: pd.DataFrame, normalization_type: str, vocab: dict = None, max_length: int = None):
    
    texts = df[normalization_type].values
    labels = df[CATEGORIES].values
    
    tokenizer = get_tokenizer("basic_english")
    
    # Tokenization of texts
    tokenized_texts = [tokenizer(text) for text in texts]
    
    # Creating the vocabulary if not provided
    if vocab is None:
        vocab = build_vocab_from_iterator(tokenized_texts, specials=["<pad>"])
        vocab.set_default_index(vocab["<pad>"])
    
    # Conversion of tokenized texts to sequences of indices
    sequences = [torch.tensor(vocab(tokenized_text)) for tokenized_text in tokenized_texts]
    
    # Determine max length if not provided
    if max_length is None:
        max_length = max(len(seq) for seq in sequences)
    
    # Padding sequences
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=vocab["<pad>"])
    if padded_sequences.shape[1] < max_length:
        padding = torch.full((padded_sequences.shape[0], max_length - padded_sequences.shape[1]), vocab["<pad>"])
        padded_sequences = torch.cat([padded_sequences, padding], dim=1)
    elif padded_sequences.shape[1] > max_length:
        padded_sequences = padded_sequences[:, :max_length]
    
    # Conversion of labels to PyTorch tensors
    labels_tensor = torch.tensor(labels, dtype=torch.float)
    
    return padded_sequences, labels_tensor, vocab


In [8]:
max_length_train = max(len(text) for text in df_train[normalization_type].values)
max_length_val = max(len(text) for text in df_val[normalization_type].values)
max_length_test = max(len(text) for text in df_test[normalization_type].values)
max_length = max(max_length_train, max_length_val, max_length_test)

In [9]:
X_train, y_train, vocab = prepare_data(df_train, normalization_type, max_length=max_length)
X_val, y_val, _ = prepare_data(df_val, normalization_type, vocab, max_length=max_length)
X_test, y_test, _ = prepare_data(df_test, normalization_type, vocab, max_length=max_length)

vocab_size = len(vocab)

In [10]:
print(f'{vocab_size=}')
print(f'{X_train.shape=}')
print(f'{X_val.shape=}')
print(f'{X_test.shape=}')
print(f'{y_train.shape=}')
print(f'{y_val.shape=}')
print(f'{y_test.shape=}')

vocab_size=179418
X_train.shape=torch.Size([127656, 5117])
X_val.shape=torch.Size([31915, 5117])
X_test.shape=torch.Size([63978, 5117])
y_train.shape=torch.Size([127656, 6])
y_val.shape=torch.Size([31915, 6])
y_test.shape=torch.Size([63978, 6])


In [11]:
# Fonction pour charger les embeddings GloVe
def load_glove_embeddings(glove_path: str, embedding_dim: int):
    embeddings_index = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print(f'Loaded {len(embeddings_index)} word vectors from GloVe.')
    return embeddings_index

# Fonction pour créer la matrice d'embeddings
def create_embedding_matrix(vocab, embeddings_index, embedding_dim: int):
    vocab_size = len(vocab)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in vocab.get_stoi().items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[idx] = embedding_vector
        else:
            # If the word is not found in the GloVe embeddings, use random normal distribution for initialization
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return torch.tensor(embedding_matrix, dtype=torch.float)

# Charger les embeddings GloVe
embeddings_index = load_glove_embeddings(glove_path, embedding_dim)

# Créer la matrice d'embeddings
embedding_matrix = create_embedding_matrix(vocab, embeddings_index, embedding_dim)

Loaded 1193514 word vectors from GloVe.


In [19]:
class MultiLabelNN(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, num_labels, dropout_rate=0.5):
        super(MultiLabelNN, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        embedding_dim = embedding_matrix.size(1)
        
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout1 = nn.Dropout(dropout_rate)
        
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.bn2 = nn.BatchNorm1d(hidden_dim // 2)
        self.dropout2 = nn.Dropout(dropout_rate)
        
        self.fc_out = nn.Linear(hidden_dim // 2, num_labels)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = torch.max(x, dim=1)
        
        x = self.fc1(x)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.dropout1(x)
        
        x = self.fc2(x)
        x = self.bn2(x)
        x = torch.relu(x)
        x = self.dropout2(x)
        
        x = self.fc_out(x)
        return x

In [20]:
batch_size = 32

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [21]:
def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=10):
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        all_labels = []
        all_preds = []
        for inputs, labels in tqdm(train_loader, desc=f'Training Epoch {epoch+1}/{num_epochs}'):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
            
            # Stocker les prédictions et les labels pour calculer les métriques
            preds = torch.sigmoid(outputs).data > 0.5
            all_labels.append(labels.cpu().numpy())
            all_preds.append(preds.cpu().numpy())
        
        epoch_loss = running_loss / len(train_loader.dataset)
        all_labels = np.concatenate(all_labels, axis=0)
        all_preds = np.concatenate(all_preds, axis=0)
        precision = precision_score(all_labels, all_preds, average='micro')
        recall = recall_score(all_labels, all_preds, average='micro')
        f1 = f1_score(all_labels, all_preds, average='micro')
        
        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')
        
        model.eval()
        val_loss = 0.0
        all_labels = []
        all_preds = []
        with torch.no_grad():
            for inputs, labels in tqdm(val_loader, desc=f'Validation Epoch {epoch+1}/{num_epochs}'):
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)
                
                # Stocker les prédictions et les labels pour calculer les métriques
                preds = torch.sigmoid(outputs).data > 0.5
                all_labels.append(labels.cpu().numpy())
                all_preds.append(preds.cpu().numpy())
        
        val_loss = val_loss / len(val_loader.dataset)
        all_labels = np.concatenate(all_labels, axis=0)
        all_preds = np.concatenate(all_preds, axis=0)
        precision = precision_score(all_labels, all_preds, average='micro')
        recall = recall_score(all_labels, all_preds, average='micro')
        f1 = f1_score(all_labels, all_preds, average='micro')
        
        print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')


In [22]:
hidden_dim = 128
output_dim = len(CATEGORIES)

model = MultiLabelNN(embedding_matrix, hidden_dim, output_dim).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [31]:
train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=5)

Training Epoch 1/5:   4%|▍         | 173/3990 [01:36<35:20,  1.80it/s]


KeyboardInterrupt: 

In [None]:
model.eval()

y_pred = []
y_true = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)  # Move to GPU
        outputs = model(inputs)
        y_pred.extend(outputs.cpu().numpy())  # Move to CPU before converting to numpy
        y_true.extend(targets.cpu().numpy())  # Move to CPU before converting to numpy

y_true = torch.tensor(y_true)
y_pred = torch.tensor(y_pred) > 0.5 # Convert probabilities to binary labels

y_true = y_true.numpy()
y_pred = y_pred.numpy()

print(classification_report(y_true, y_pred, target_names=CATEGORIES))


               precision    recall  f1-score   support

        toxic       0.45      0.88      0.59      6090
 severe_toxic       0.35      0.23      0.28       367
      obscene       0.52      0.75      0.61      3691
       threat       0.42      0.02      0.04       211
       insult       0.51      0.67      0.58      3427
identity_hate       0.38      0.10      0.16       712

    micro avg       0.47      0.73      0.57     14498
    macro avg       0.44      0.44      0.38     14498
 weighted avg       0.47      0.73      0.56     14498
  samples avg       0.07      0.07      0.07     14498



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KeyboardInterrupt: 

In [None]:
removes_index = []
for i in range(len(y_true)):
    if (y_true[i] == [0,0,0,0,0,0,0]).all():
        removes_index.append(i)
y_true = np.delete(y_true, removes_index, axis=0)
y_pred = np.delete(y_pred, removes_index, axis=0)

accuracy = (y_true == y_pred).mean()
print(f'Accuracy: {accuracy:.4f}')

(63978, 6)
(63978, 5)


ValueError: operands could not be broadcast together with shapes (63978,5) (63978,6) 

In [None]:
def display_model(model: nn.Module):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    x = torch.randint(0, 1000, (32, 100)).to(device)
    output = model(x)
    dot = make_dot(output, params=dict(model.named_parameters()))
    dot.render('model', format='png', cleanup=True)
    print(model)
    
display_model(model)