In [12]:
import sys
sys.path.append('..')
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

from constants import CATEGORIES

In [13]:
# Read the preprocessed train / validation / test splits. The paths are
# relative, so they resolve against the notebook's working directory.
df_train, df_val, df_test = map(
    pd.read_parquet,
    (
        'data/df_train_preprocessed.parquet',
        'data/df_val_preprocessed.parquet',
        'data/df_test_preprocessed.parquet',
    ),
)

In [14]:
# Display the training frame's column index to see which label columns and
# which tokenization / normalization text variants are available.
df_train.columns

Index(['id', 'comment_text_baseline', 'toxic', 'severe_toxic', 'obscene',
       'threat', 'insult', 'identity_hate', 'overall_toxic',
       'comment_text_word_tokenize_no_normalization',
       'comment_text_gpt_tokenize_no_normalization',
       'comment_text_word_tokenize_normalization',
       'comment_text_gpt_tokenize_normalization'],
      dtype='object')

In [15]:
def create_vocab(df: pd.DataFrame, normalization_type: str):
    """Fit a default-configured ``CountVectorizer`` on one text column.

    Args:
        df: Frame holding the text column to learn the vocabulary from.
        normalization_type: Name of the column in ``df`` whose values are
            passed to ``CountVectorizer.fit``.

    Returns:
        The fitted ``CountVectorizer`` instance.
    """
    corpus = df[normalization_type]
    # ``fit`` returns the estimator itself, so build and fit in one expression.
    return CountVectorizer().fit(corpus)

def transform_sentences(df: pd.DataFrame, vectorizer: CountVectorizer, normalization_type: str, dense: bool = True):
    """Vectorize one text column of ``df`` with an already-fitted vectorizer.

    Args:
        df: Frame holding the text column to transform.
        vectorizer: Fitted vectorizer; only its ``transform`` method is used.
        normalization_type: Name of the column in ``df`` to vectorize.
        dense: When True (the default, matching the previous behaviour) the
            result of ``transform`` is expanded with ``.toarray()``. Pass
            False to receive the untouched ``transform`` result, which avoids
            allocating a full rows x vocabulary array up front.

    Returns:
        ``vectorizer.transform(...)`` expanded with ``.toarray()`` when
        ``dense`` is True, otherwise the raw ``transform`` result.
    """
    counts = vectorizer.transform(df[normalization_type])
    return counts.toarray() if dense else counts

# Learn the vocabulary from the training frame, then show how many distinct
# terms it contains.
vectorizer = create_vocab(
    df=df_train,
    normalization_type='comment_text_gpt_tokenize_normalization',
)
len(vectorizer.vocabulary_)

153881

In [16]:
# Vectorize every split with the vocabulary fitted on the training frame.
# NOTE(review): each result is a dense array with one column per vocabulary
# term, so memory grows as rows x vocabulary size -- confirm this fits in RAM.
X_train, X_val, X_test = (
    transform_sentences(split, vectorizer, 'comment_text_gpt_tokenize_normalization')
    for split in (df_train, df_val, df_test)
)

# Label matrices: one column per entry of CATEGORIES, in the same split order.
y_train, y_val, y_test = (
    split[CATEGORIES].values
    for split in (df_train, df_val, df_test)
)

: 

In [None]:
class ToxicCommentDataset(Dataset):
    """Map-style dataset pairing one feature row with its label row.

    ``X`` may be a dense 2-D array or a SciPy sparse matrix (for example the
    direct output of ``CountVectorizer.transform``). Sparse input is expanded
    one sample at a time inside ``__getitem__``, so the whole matrix never has
    to be materialised as a dense array.
    """

    def __init__(self, X, y):
        """Store features and labels; row ``i`` of ``X`` pairs with row ``i`` of ``y``."""
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples (rows of ``X``)."""
        # len() raises TypeError on SciPy sparse matrices, so prefer shape[0]
        # whenever the container exposes a non-empty shape.
        shape = getattr(self.X, 'shape', None)
        return shape[0] if shape else len(self.X)

    def __getitem__(self, idx):
        """Return ``(features, labels)`` for sample ``idx``.

        Features are returned as a ``torch.long`` tensor and labels as a
        ``torch.float`` tensor.
        """
        row = self.X[idx]
        if hasattr(row, 'toarray'):
            # Sparse row: densify just this sample and flatten (1, n) -> (n,).
            row = row.toarray().ravel()
        return torch.tensor(row, dtype=torch.long), torch.tensor(self.y[idx], dtype=torch.float)

batch_size = 32

# Wrap each (features, labels) pair in a dataset, keeping the split order.
train_dataset, val_dataset, test_dataset = (
    ToxicCommentDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Only the training loader shuffles; validation and test keep their row order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=False)
    for split_dataset in (val_dataset, test_dataset)
)

In [11]:
class ToxicCommentFFN(nn.Module):
    """Feed-forward multi-label classifier over bag-of-words count vectors.

    The input is a ``(batch, vocab_size)`` tensor of per-term counts. Each
    document is represented by the count-weighted mean of the embedding rows
    of its terms, then passed through ``Linear -> ReLU -> Linear -> Sigmoid``.

    Args:
        vocab_size: Number of vocabulary terms (width of the count vectors).
        embed_dim: Size of each term embedding.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of independent labels predicted per document.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-label probabilities of shape ``(batch, output_dim)``.

        Args:
            x: Term-count tensor whose last dimension equals ``vocab_size``.

        Raises:
            ValueError: If the last dimension of ``x`` is not ``vocab_size``.
        """
        table = self.embedding.weight  # (vocab_size, embed_dim)
        if x.shape[-1] != table.shape[0]:
            raise ValueError(
                f'expected count vectors of width {table.shape[0]}, got {x.shape[-1]}'
            )

        # The counts are weights, not token ids: passing them to the embedding
        # lookup would fetch rows by count value (almost always row 0), build a
        # (batch, vocab, embed) tensor, and fail once a count >= vocab_size.
        # A matmul with the embedding table gives the count-weighted sum instead.
        counts = x.to(table.dtype)
        # Clamp so an all-zero (empty) document pools to the zero vector
        # instead of dividing by zero.
        n_terms = counts.sum(dim=-1, keepdim=True).clamp(min=1)
        x = (counts @ table) / n_terms

        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return self.sigmoid(x)

In [None]:
# Model hyper-parameters.
vocab_size = len(vectorizer.vocabulary_)
embed_dim = 128
hidden_dim = 64
# One output per label: derived from CATEGORIES so the prediction width always
# matches the targets built from df[CATEGORIES] (BCELoss requires equal shapes).
output_dim = len(CATEGORIES)

# Fix the seed before building the model so weight initialisation is repeatable.
torch.manual_seed(42)
model = ToxicCommentFFN(vocab_size, embed_dim, hidden_dim, output_dim)

In [None]:
# Adam over every model parameter with a learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Binary cross-entropy on probabilities, averaged over all label entries.
criterion = nn.BCELoss()

In [None]:
epochs = 10
for epoch in range(epochs):
    # Switch to training mode at the start of every epoch: the validation
    # pass below puts the model in eval mode, and a single train() call before
    # the loop would leave every epoch after the first running in eval mode.
    model.train()
    total_loss = 0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch training losses for this epoch.
    print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader)}')

    # Validation: no gradient tracking, model in eval mode.
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
    val_loss /= len(val_loader)
    print(f'Validation Loss: {val_loss}')

In [None]:
model.eval()

# Collect per-batch tensors and concatenate once at the end. Building a tensor
# from a Python list of NumPy arrays is the slow path in PyTorch and the
# tensor -> numpy -> tensor -> numpy round trip adds nothing.
y_pred = []
y_true = []

with torch.no_grad():
    for inputs, targets in test_loader:
        y_pred.append(model(inputs))
        y_true.append(targets)

# Final arrays: float labels and boolean predictions thresholded at 0.5.
y_true = torch.cat(y_true).numpy()
y_pred = (torch.cat(y_pred) > 0.5).numpy()

print(classification_report(y_true, y_pred, target_names=CATEGORIES))