In [1]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

# Fetch the NLTK 'punkt' package (presumably the tokenizer data that
# word_tokenize relies on -- confirm against the installed NLTK version).
# The call is the last expression of the cell, so its return value is displayed.
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/denisshpilka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Relative location of the training CSV (resolved against the current working directory).
DATA_PATH = os.path.join('data', 'train.csv')

# Read the whole file into a DataFrame; the bare `df` below renders it as the cell output.
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [3]:
# Drop the `id` column by name instead of by position. `df.iloc[:, 1:]` removes
# whichever column happens to be first, so re-running the cell would silently
# strip `comment_text` as well. `errors='ignore'` makes a re-run a no-op.
df = df.drop(columns=['id'], errors='ignore')
df

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...
159566,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [4]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
train_data, valid_data = train_test_split(df, test_size=0.2, random_state=42)

# Display the (rows, columns) shape of each split.
train_data.shape, valid_data.shape

((127656, 7), (31915, 7))

In [5]:
from collections import Counter

In [6]:
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Force every label column to a numeric dtype in both splits; values that
# cannot be parsed become NaN and are then replaced by 0.
for frame in (train_data, valid_data):
    for col in label_columns:
        coerced = pd.to_numeric(frame[col], errors='coerce')
        frame[col] = coerced.fillna(0)

In [7]:
# Total vocabulary size, including the two slots set aside below (hence the `- 2`).
VOCAB_SIZE = 5000

# Split every comment into tokens, for both splits.
train_data['tokenized'] = train_data['comment_text'].apply(word_tokenize)
valid_data['tokenized'] = valid_data['comment_text'].apply(word_tokenize)

# Flatten the training tokens only (validation data is not used to build the vocabulary).
all_tokens = [tok for comment_tokens in train_data['tokenized'] for tok in comment_tokens]

# Keep the most frequent tokens as (token, count) pairs.
token_counts = Counter(all_tokens)
vocab = token_counts.most_common(VOCAB_SIZE - 2)

In [8]:
# Vocabulary words are numbered from 2 upward, in the order they appear in `vocab`;
# indices 0 and 1 are reserved for the padding and unknown-token entries.
word_to_idx = {word: idx for idx, (word, _count) in enumerate(vocab, start=2)}
word_to_idx.update({'<pad>': 0, '<unk>': 1})

In [9]:
def tokens_to_indices(tokens):
    """Translate a sequence of tokens into their `word_to_idx` indices.

    Tokens missing from `word_to_idx` are mapped to the index stored under '<unk>'.
    Returns a new list with one index per input token, in the same order.
    """
    indices = []
    for tok in tokens:
        indices.append(word_to_idx.get(tok, word_to_idx['<unk>']))
    return indices

In [10]:
# Encode the token lists of both splits as lists of vocabulary indices.
for frame in (train_data, valid_data):
    frame['indices'] = frame['tokenized'].apply(tokens_to_indices)

In [11]:
class ToxicCommentDataset(Dataset):
    """Dataset view over a DataFrame of encoded comments and their labels.

    Every item is a ``(token_ids, labels)`` pair: a ``torch.long`` tensor built
    from the row's ``'indices'`` entry, and a ``float32`` tensor holding the
    row's values for ``label_columns`` (in that column order).
    """

    def __init__(self, data, label_columns):
        self.label_columns = label_columns
        self.data = data

    def __len__(self):
        # One sample per DataFrame row.
        return len(self.data)

    def __getitem__(self, idx):
        # Positional lookup, so the DataFrame's own index labels are irrelevant.
        row = self.data.iloc[idx]
        token_ids = torch.tensor(row['indices'], dtype=torch.long)
        # Convert the selected label values to float first, then narrow to float32.
        label_values = row[self.label_columns].values.astype(float).astype(np.float32)
        return token_ids, torch.from_numpy(label_values)

In [12]:
def collate_fn(batch):
    """Assemble a list of ``(token_ids, labels)`` samples into batch tensors.

    Returns a 3-tuple:
        * the token sequences right-padded with the '<pad>' index into one
          batch-first tensor,
        * a tensor with each sequence's original (unpadded) length,
        * the label tensors stacked along a new first dimension.
    """
    sequences, targets = zip(*batch)
    pad_idx = word_to_idx['<pad>']
    seq_lengths = torch.tensor([len(seq) for seq in sequences])
    padded = pad_sequence(sequences, batch_first=True, padding_value=pad_idx)
    return padded, seq_lengths, torch.stack(targets)

In [13]:
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
BATCH_SIZE = 64

# Wrap each split in a Dataset that yields (token_ids, labels) pairs.
train_dataset = ToxicCommentDataset(train_data, label_columns=label_columns)
valid_dataset = ToxicCommentDataset(valid_data, label_columns=label_columns)

# Only the training loader shuffles; validation batches keep the row order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

In [14]:
class LSTMClassifier(nn.Module):
    """Embedding -> stacked LSTM -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of values produced per input sequence.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings, between LSTM
            layers (only when ``n_layers > 1``) and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super(LSTMClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM applies dropout only *between* layers. With a single layer a
        # non-zero value does nothing except emit a UserWarning, so pass 0 there.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            dropout=dropout if n_layers > 1 else 0.0, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths=None):
        """Run a batch of token-index sequences through the network.

        Args:
            x: Integer tensor of token indices, batch-first: ``(batch, seq_len)``.
            lengths: Optional true (unpadded) length of every sequence in ``x``
                (tensor or list of ints). When given, the batch is packed so the
                returned hidden state is the one after each sequence's last real
                token instead of after its trailing padding. When omitted, every
                position (padding included) is fed to the LSTM, as before.

        Returns:
            Tensor of shape ``(batch, output_dim)``: the output of the final
            linear layer (no activation applied).
        """
        embedded = self.dropout(self.embedding(x))

        if lengths is None:
            _, (hidden, _) = self.lstm(embedded)
        else:
            # pack_padded_sequence needs CPU lengths and rejects zero-length
            # sequences; an empty sequence is treated as a single pad token.
            lengths = torch.as_tensor(lengths, dtype=torch.long).cpu().clamp(min=1)
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
            # With enforce_sorted=False the hidden state comes back in batch order.
            _, (hidden, _) = self.lstm(packed)

        # hidden is (n_layers, batch, hidden_dim); keep the last layer's state.
        hidden = self.dropout(hidden[-1, :, :])
        return self.fc(hidden)
