In [49]:
import sys
sys.path.append('..')
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

from tqdm import tqdm

from constants import CATEGORIES

In [45]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')


Using device: cuda


In [30]:
df_train = pd.read_parquet('data/df_train_preprocessed.parquet')
df_val = pd.read_parquet('data/df_val_preprocessed.parquet')
df_test = pd.read_parquet('data/df_test_preprocessed.parquet')

In [31]:
df_train.columns

Index(['id', 'comment_text_baseline', 'toxic', 'severe_toxic', 'obscene',
       'threat', 'insult', 'identity_hate', 'overall_toxic',
       'comment_text_word_tokenize_no_normalization',
       'comment_text_gpt_tokenize_no_normalization',
       'comment_text_word_tokenize_normalization',
       'comment_text_gpt_tokenize_normalization',
       'comment_text_word_tokenize_full_normalization',
       'comment_text_gpt_tokenize_full_normalization',
       'comment_text_word_tokenize_simple_normalization',
       'comment_text_gpt_tokenize_simple_normalization'],
      dtype='object')

In [32]:
def create_vocab(df: pd.DataFrame, normalization_type: str):
    vectorizer = CountVectorizer()
    vectorizer.fit(df[normalization_type])
    return vectorizer

def transform_sentences(df: pd.DataFrame, vectorizer: CountVectorizer,normalization_type: str):
    return vectorizer.transform(df[normalization_type])

vectorizer = create_vocab(df_train, 'comment_text_gpt_tokenize_simple_normalization')
len(vectorizer.vocabulary_)

38697

In [33]:
X_train = transform_sentences(df_train, vectorizer, 'comment_text_gpt_tokenize_simple_normalization')
X_val = transform_sentences(df_val, vectorizer, 'comment_text_gpt_tokenize_simple_normalization')
X_test = transform_sentences(df_test, vectorizer, 'comment_text_gpt_tokenize_simple_normalization')

y_train = df_train[CATEGORIES].values
y_val = df_val[CATEGORIES].values
y_test = df_test[CATEGORIES].values

In [40]:
class ToxicCommentDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx].indices, dtype=torch.long), torch.tensor(self.y[idx], dtype=torch.float)

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_padded = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
    targets = torch.stack(targets, dim=0)
    return inputs_padded, targets


train_dataset = ToxicCommentDataset(X_train, y_train)
val_dataset = ToxicCommentDataset(X_val, y_val)
test_dataset = ToxicCommentDataset(X_test, y_test)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [41]:
class ToxicCommentFFN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(ToxicCommentFFN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.embedding(x)
        x = torch.mean(x, dim=1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return self.sigmoid(x)


In [46]:
vocab_size = len(vectorizer.vocabulary_)
embed_dim = 128
hidden_dim = 64
output_dim = 6
model = ToxicCommentFFN(vocab_size, embed_dim, hidden_dim, output_dim).to(device)

In [47]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [50]:
# Training loop
epochs = 10
model.train()
for epoch in tqdm(range(epochs)):
    total_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)  # Move to GPU
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader)}')

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)  # Move to GPU
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
    val_loss /= len(val_loader)
    print(f'Validation Loss: {val_loss}')
    model.train()  # Set back to training mode after validation


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10, Loss: 0.07259028111170174


 10%|█         | 1/10 [00:26<03:57, 26.33s/it]

Validation Loss: 0.07801217139469467
Epoch 2/10, Loss: 0.05955374908649869


 20%|██        | 2/10 [00:53<03:35, 26.95s/it]

Validation Loss: 0.06097023181439021
Epoch 3/10, Loss: 0.052890747998493026


 30%|███       | 3/10 [01:21<03:12, 27.51s/it]

Validation Loss: 0.0771486540194048
Epoch 4/10, Loss: 0.049968585353250584


 40%|████      | 4/10 [01:49<02:44, 27.42s/it]

Validation Loss: 0.0597462997025715
Epoch 5/10, Loss: 0.048073280080334506


 50%|█████     | 5/10 [02:16<02:16, 27.33s/it]

Validation Loss: 0.05982333763370959
Epoch 6/10, Loss: 0.046261869523685044


 60%|██████    | 6/10 [02:43<01:49, 27.31s/it]

Validation Loss: 0.06590504064146506
Epoch 7/10, Loss: 0.044866238706987374


 70%|███████   | 7/10 [03:10<01:21, 27.33s/it]

Validation Loss: 0.06198587646969885
Epoch 8/10, Loss: 0.04360788590611014


 80%|████████  | 8/10 [03:38<00:54, 27.46s/it]

Validation Loss: 0.061709379044525656
Epoch 9/10, Loss: 0.04259040273495668


 90%|█████████ | 9/10 [04:06<00:27, 27.43s/it]

Validation Loss: 0.0649175028506114
Epoch 10/10, Loss: 0.04180713957485251


100%|██████████| 10/10 [04:33<00:00, 27.36s/it]

Validation Loss: 0.0636984214041091





In [53]:
model.eval()

y_pred = []
y_true = []

with torch.no_grad():
    for inputs, targets in val_loader:
        inputs, targets = inputs.to(device), targets.to(device)  # Move to GPU
        outputs = model(inputs)
        y_pred.extend(outputs.cpu().numpy())  # Move to CPU before converting to numpy
        y_true.extend(targets.cpu().numpy())  # Move to CPU before converting to numpy

y_true = torch.tensor(y_true)
y_pred = torch.tensor(y_pred) > 0.5  # Convert probabilities to binary labels

y_true = y_true.numpy()
y_pred = y_pred.numpy()

print(classification_report(y_true, y_pred, target_names=CATEGORIES))


               precision    recall  f1-score   support

        toxic       0.76      0.75      0.76      3056
 severe_toxic       0.59      0.09      0.16       321
      obscene       0.84      0.72      0.77      1715
       threat       0.00      0.00      0.00        74
       insult       0.73      0.61      0.66      1614
identity_hate       0.75      0.01      0.02       294

    micro avg       0.77      0.64      0.70      7074
    macro avg       0.61      0.36      0.40      7074
 weighted avg       0.76      0.64      0.67      7074
  samples avg       0.07      0.06      0.06      7074



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
