In [50]:
import warnings
warnings.filterwarnings('always')

import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\denis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [51]:
DATA_PATH = os.path.join('data', 'train.csv')

df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \r\n\r\nThat...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \r\n\r\nUmm, theres no actual article ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [52]:
df = df.iloc[:, 1:]
df

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...
159566,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,You should be ashamed of yourself \r\n\r\nThat...,0,0,0,0,0,0
159568,"Spitzer \r\n\r\nUmm, theres no actual article ...",0,0,0,0,0,0
159569,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [53]:
train_data, valid_data = train_test_split(df, test_size=0.2, random_state=42)

train_data.shape, valid_data.shape

((127656, 7), (31915, 7))

In [54]:
from collections import Counter

In [55]:
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for col in label_columns:
    train_data[col] = pd.to_numeric(train_data[col], errors='coerce').fillna(0)
    valid_data[col] = pd.to_numeric(valid_data[col], errors='coerce').fillna(0)

In [56]:
train_data['tokenized'] = train_data['comment_text'].apply(word_tokenize)
valid_data['tokenized'] = valid_data['comment_text'].apply(word_tokenize)

VOCAB_SIZE = 5000

all_tokens = [token for tokens in train_data['tokenized'] for token in tokens]
vocab = Counter(all_tokens).most_common(VOCAB_SIZE - 2)

In [57]:
word_to_idx = {word: idx+2 for idx, (word, _) in enumerate(vocab)}  # +2 to reserve 0 for padding and 1 for unknown
word_to_idx['<pad>'] = 0
word_to_idx['<unk>'] = 1

In [58]:
def tokens_to_indices(tokens):
    return [word_to_idx.get(token, word_to_idx['<unk>']) for token in tokens]

In [59]:
train_data['indices'] = train_data['tokenized'].apply(tokens_to_indices)
valid_data['indices'] = valid_data['tokenized'].apply(tokens_to_indices)

In [60]:
class ToxicCommentDataset(Dataset):
    def __init__(self, data, label_columns):
        self.data = data
        self.label_columns = label_columns

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['indices']
        labels = self.data.iloc[idx][self.label_columns].values.astype(float)
        labels = np.array(labels, dtype=np.float32)  # Ensure labels are float32
        return torch.tensor(text, dtype=torch.long), torch.from_numpy(labels)

In [61]:
def collate_fn(batch):
    texts, labels = zip(*batch)
    lengths = [len(text) for text in texts]
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=word_to_idx['<pad>'])
    return padded_texts, torch.tensor(lengths), torch.stack(labels)

In [62]:
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train_dataset = ToxicCommentDataset(train_data, label_columns = label_columns)
valid_dataset = ToxicCommentDataset(valid_data, label_columns = label_columns)

BATCH_SIZE = 64

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [63]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super(LSTMClassifier, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, 
                            dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        lstm_out, (hidden, cell) = self.lstm(embedded)
        hidden = self.dropout(hidden[-1,:,:])  # Taking the last layer's hidden state
        output = self.fc(hidden)
        return output


In [64]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [65]:
vocab_size = 5000  
embedding_dim = 128
hidden_dim = 100
output_dim = 6  

n_layers = 2
dropout = 0.2

model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout).to(device)

print(model)

LSTMClassifier(
  (embedding): Embedding(5000, 128)
  (lstm): LSTM(128, 100, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=100, out_features=6, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [66]:
next(model.parameters()).is_cuda

True

In [67]:
from tqdm.auto import tqdm

In [68]:
labels = train_data[label_columns]

label_counts = np.sum(labels, axis=0)
total_counts = np.sum(label_counts)
class_weights = total_counts / (len(label_counts) * label_counts)

class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)

class_weights_tensor

  class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)


tensor([ 0.3817,  3.6661,  0.6936, 11.5611,  0.7458,  4.2040], device='cuda:0')

In [69]:
import torch.optim as optim
from sklearn.metrics import precision_score, recall_score, f1_score


criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights_tensor)  
optimizer = optim.Adam(model.parameters(), lr=0.001)


def multi_label_metrics(predictions, labels, threshold=0.5):
    preds = (torch.sigmoid(predictions) > threshold).cpu().numpy()
    labels = labels.cpu().numpy()
    precision = precision_score(labels, preds, average='macro', labels=np.unique(preds), zero_division=0.0)
    recall = recall_score(labels, preds, average='macro', labels=np.unique(preds), zero_division=0.0)
    f1 = f1_score(labels, preds, average='macro', labels=np.unique(preds), zero_division=0.0)
    return precision, recall, f1

In [70]:
def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        epoch_precision = 0.0
        epoch_recall = 0.0
        epoch_f1 = 0.0
        for texts, _, labels in tqdm(train_loader):
            texts, labels = texts.to(device), labels.to(device)
            optimizer.zero_grad()
            predictions = model(texts)
            loss = criterion(predictions.to(device), labels.to(device))
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            precision, recall, f1 = multi_label_metrics(predictions, labels)
            epoch_precision += precision
            epoch_recall += recall
            epoch_f1 += f1
        
        epoch_loss /= len(train_loader)
        epoch_precision /= len(train_loader)
        epoch_recall /= len(train_loader)
        epoch_f1 /= len(train_loader)
        print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Precision: {epoch_precision:.4f}, Recall: {epoch_recall:.4f}, F1: {epoch_f1:.4f}')
        
        # Validation
        model.eval()
        val_loss = 0.0
        val_precision = 0.0
        val_recall = 0.0
        val_f1 = 0.0
        with torch.no_grad():
            for texts, _, labels in tqdm(val_loader):
                texts, labels = texts.to(device), labels.to(device)
                predictions = model(texts)
                loss = criterion(predictions.to(device), labels.to(device))
                val_loss += loss.item()
                precision, recall, f1 = multi_label_metrics(predictions, labels)
                val_precision += precision
                val_recall += recall
                val_f1 += f1
        
        val_loss /= len(val_loader)
        val_precision /= len(val_loader)
        val_recall /= len(val_loader)
        val_f1 /= len(val_loader)
        print(f'Validation Loss: {val_loss:.4f}, Validation Precision: {val_precision:.4f}, Validation Recall: {val_recall:.4f}, Validation F1: {val_f1:.4f}')
        model.train()


In [48]:
import warnings
warnings.filterwarnings('always')

In [None]:
num_epochs = 3
train_model(model, train_loader, valid_loader, criterion, optimizer, device, num_epochs)