In [13]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import sys
sys.path.append('..')
from tqdm import tqdm
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import DataLoader

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

from constants import CATEGORIES

#### GPU Usage

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
print(torch.cuda.get_device_name(0))

Using device: cuda
NVIDIA GeForce RTX 3070


### Import Data

In [16]:
df_train = pd.read_parquet('data/df_train_preprocessed.parquet')
df_val = pd.read_parquet('data/df_val_preprocessed.parquet')
df_test = pd.read_parquet('data/df_test_preprocessed.parquet')

In [17]:
df_train.head()

Unnamed: 0,id,comment_text_baseline,toxic,severe_toxic,obscene,threat,insult,identity_hate,overall_toxic,comment_text_word_tokenize_no_normalization,comment_text_gpt_tokenize_no_normalization,comment_text_word_tokenize_normalization,comment_text_gpt_tokenize_normalization,comment_text_word_tokenize_full_normalization,comment_text_gpt_tokenize_full_normalization,comment_text_word_tokenize_simple_normalization,comment_text_gpt_tokenize_simple_normalization
140030,ed56f082116dcbd0,Grandma Terri Should Burn in Trash \nGrandma T...,1,0,0,0,0,0,1,Grandma Terri Should Burn in Trash Grandma Ter...,41251 1764 10335 462 12540 18530 304 71723 720...,grandma terri burn trash grandma terri trash ....,53766 1764 2024 462 8395 23701 83777 2024 462 ...,grandma terri burn trash grandma terri trash h...,53766 1764 2024 462 8395 23701 83777 2024 462 ...,grandma terri should burn in trash grandma ter...,53766 1764 2024 462 1288 8395 304 23701 720 53...
159124,f8e3cd98b63bf401,", 9 May 2009 (UTC)\nIt would be easiest if you...",0,0,0,0,0,0,0,", 9 May 2009 ( UTC ) It would be easiest if yo...",11 220 24 3297 220 1049 24 320 21872 340 2181 ...,", may ( utc ) would easy admit member involved...",11 1253 320 70696 883 1053 4228 17113 4562 653...,may utc would easy admit member involved portu...,18864 70696 1053 4228 17113 4562 6532 2700 773...,", may ( utc ) it would be easiest if you were ...",11 220 1253 220 320 29455 340 275 1053 387 306...
60006,a09e1bcf10631f9a,"""\n\nThe Objectivity of this Discussion is dou...",0,0,0,0,0,0,0,`` The Objectivity of this Discussion is doubt...,1875 791 3075 1968 315 420 36613 374 75699 320...,`` objectivity discussion doubtful ( non-exist...,14196 1665 1968 10430 75699 320 2536 60928 883...,objectivity discussion doubtful nonexistent in...,1735 1968 10430 75699 88034 13519 6931 3857 36...,`` the objectivity of this discussion is doubt...,1875 1820 1665 1968 315 420 10430 374 75699 32...
65432,af0ee0066c607eb8,Shelly Shock\nShelly Shock is. . .( ),0,0,0,0,0,0,0,Shelly Shock Shelly Shock is . . . ( ),2059 12160 48083 198 2059 12160 48083 374 13 6...,shelly shock shelly shock . . . ( ),939 12160 10988 559 12160 10988 662 662 662 32...,shelly shock shelly shock,939 12160 10988 559 12160 10988,shelly shock shelly shock is . . . ( ),939 12160 10988 198 939 12160 10988 374 13 662...
154979,b734772b1a807e09,I do not care. Refer to Ong Teng Cheong talk p...,0,0,0,0,0,0,0,I do not care . Refer to Ong Teng Cheong talk ...,40 656 539 2512 13 29734 311 507 983 350 833 8...,care . refer ong teng cheong talk page . la go...,10727 662 8464 389 70 42249 3091 647 3137 2199...,care refer ong teng cheong talk page la goutte...,10727 8464 389 70 42249 3091 647 3137 2199 120...,i do not care . refer to ong teng cheong talk ...,72 656 539 2512 13 8464 311 389 70 42249 3091 ...


### TFIDF

In [None]:
normalization_type = 'comment_text_word_tokenize_simple_normalization'

In [11]:
def prepare_data(df, normalization_type, batch_size = 32):

    vectorizer = TfidfVectorizer(tokenizer=str.split)
    X = vectorizer.fit_transform(df[normalization_type]).toarray()
    y = df[CATEGORIES].values

    X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
    y_tensor = torch.tensor(y, dtype=torch.float32).to(device)

    dataset = TensorDataset(X_tensor, y_tensor)

    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    input_dim = X.shape[1]

    return loader, input_dim

In [12]:
train_loader, input_dim = prepare_data(df_train, normalization_type)
val_loader, _ = prepare_data(df_val, normalization_type)
test_loader, _ = prepare_data(df_test, normalization_type)



MemoryError: Unable to allocate 183. GiB for an array with shape (127656, 192145) and data type float64

In [20]:
class TfidfNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_labels):
        super(TfidfNN, self).__init__()
        
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, num_labels)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.fc_out(x)
        return x

#### Initialisation du modèle

In [None]:
hidden_dim = 64
num_labels = len(CATEGORIES)

model = TfidfNN(input_dim, hidden_dim, num_labels)
model = model.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

#### Entrainement

In [21]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for inputs, labels in tqdm(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * inputs.size(0)
        
        train_loss = train_loss / len(train_loader.dataset)
        
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in tqdm(val_loader):
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)
        
        val_loss = val_loss / len(val_loader.dataset)
        
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 7979/7979 [00:11<00:00, 722.11it/s]
100%|██████████| 1995/1995 [00:00<00:00, 2353.56it/s]


Epoch 1/10, Train Loss: 0.0670, Val Loss: 0.1894


100%|██████████| 7979/7979 [00:10<00:00, 783.57it/s]
100%|██████████| 1995/1995 [00:00<00:00, 2420.91it/s]


Epoch 2/10, Train Loss: 0.0494, Val Loss: 0.2412


100%|██████████| 7979/7979 [00:10<00:00, 741.83it/s]
100%|██████████| 1995/1995 [00:00<00:00, 2407.19it/s]


Epoch 3/10, Train Loss: 0.0420, Val Loss: 0.2630


100%|██████████| 7979/7979 [00:10<00:00, 741.61it/s]
100%|██████████| 1995/1995 [00:00<00:00, 2166.85it/s]


Epoch 4/10, Train Loss: 0.0359, Val Loss: 0.3095


100%|██████████| 7979/7979 [00:11<00:00, 708.17it/s]
100%|██████████| 1995/1995 [00:00<00:00, 2371.53it/s]


Epoch 5/10, Train Loss: 0.0312, Val Loss: 0.3371


100%|██████████| 7979/7979 [00:11<00:00, 707.02it/s]
100%|██████████| 1995/1995 [00:00<00:00, 2516.45it/s]


Epoch 6/10, Train Loss: 0.0278, Val Loss: 0.3120


100%|██████████| 7979/7979 [00:10<00:00, 756.33it/s]
100%|██████████| 1995/1995 [00:00<00:00, 2406.67it/s]


Epoch 7/10, Train Loss: 0.0251, Val Loss: 0.3033


100%|██████████| 7979/7979 [00:10<00:00, 727.77it/s]
100%|██████████| 1995/1995 [00:00<00:00, 2469.27it/s]


Epoch 8/10, Train Loss: 0.0232, Val Loss: 0.3055


100%|██████████| 7979/7979 [00:10<00:00, 726.82it/s]
100%|██████████| 1995/1995 [00:00<00:00, 2282.23it/s]


Epoch 9/10, Train Loss: 0.0216, Val Loss: 0.2875


100%|██████████| 7979/7979 [00:10<00:00, 749.57it/s]
100%|██████████| 1995/1995 [00:00<00:00, 2290.39it/s]

Epoch 10/10, Train Loss: 0.0205, Val Loss: 0.3042





#### Test du modele

In [22]:
model.eval()

y_pred = []
y_true = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        y_pred.extend(outputs.cpu().numpy())
        y_true.extend(targets.cpu().numpy())
          
y_true = torch.tensor(y_true)
y_pred = torch.tensor(y_pred) > 0.5 

y_true = y_true.numpy()
y_pred = y_pred.numpy()

print(classification_report(y_true, y_pred, target_names=CATEGORIES))

               precision    recall  f1-score   support

        toxic       0.10      0.05      0.06      6090
 severe_toxic       0.00      0.00      0.00       367
      obscene       0.08      0.01      0.03      3691
       threat       0.00      0.00      0.00       211
       insult       0.05      0.00      0.01      3427
identity_hate       0.02      0.00      0.00       712

    micro avg       0.09      0.02      0.04     14498
    macro avg       0.04      0.01      0.02     14498
 weighted avg       0.07      0.02      0.04     14498
  samples avg       0.00      0.00      0.00     14498



  y_true = torch.tensor(y_true)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
