In [9]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [3]:
train = pd.read_csv('CS 583 Project/jigsaw-toxic-comment-train.csv')
validation = pd.read_csv('CS 583 Project/validation.csv')
test = pd.read_csv('CS 583 Project/test.csv')

In [4]:
train.drop(['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)
train = train.loc[:12000, :]

In [5]:
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train['comment_text'], train['toxic'], stratify=train['toxic'], random_state=42, test_size=0.2, shuffle=True
)

In [7]:
def load_glove_vectors(file_path):
    word_vectors = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            word_vectors[word] = vector
    return word_vectors

glove_vectors = load_glove_vectors('CS 583 Project/glove.6B.300d.txt')

In [10]:
def build_vocab(texts, max_vocab_size=20000):
    word_counter = Counter()
    for text in texts:
        word_counter.update(text.split())
    most_common = word_counter.most_common(max_vocab_size)
    vocab = {word: idx + 2 for idx, (word, _) in enumerate(most_common)}
    vocab["<PAD>"] = 0
    vocab["<UNK>"] = 1
    return vocab

vocab = build_vocab(xtrain)

In [11]:
embedding_dim = 300
embedding_matrix = np.zeros((len(vocab), embedding_dim))
for word, idx in vocab.items():
    if word in glove_vectors:
        embedding_matrix[idx] = glove_vectors[word]
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

# Tokenize data

In [12]:
def text_to_sequence(text, vocab, max_len=150):
    words = text.split()
    sequence = [vocab.get(word, vocab["<UNK>"]) for word in words[:max_len]]
    sequence += [vocab["<PAD>"]] * (max_len - len(sequence))
    return sequence

In [13]:
xtrain_seq = xtrain.apply(lambda x: text_to_sequence(x, vocab))
xvalid_seq = xvalid.apply(lambda x: text_to_sequence(x, vocab))

In [14]:
class ToxicCommentsDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.sequences.iloc[idx], dtype=torch.long),
            "target": torch.tensor(self.targets.iloc[idx], dtype=torch.float),
        }

In [15]:
train_dataset = ToxicCommentsDataset(xtrain_seq, ytrain)
valid_dataset = ToxicCommentsDataset(xvalid_seq, yvalid)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

In [16]:
class ToxicCommentRNNClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_size=128, num_layers=1):
        super(ToxicCommentRNNClassifier, self).__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False  # Freeze embeddings
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        _, hidden = self.rnn(embedded)
        output = self.fc(hidden[-1])
        return self.sigmoid(output)


In [17]:
model = ToxicCommentRNNClassifier(embedding_matrix)
model = model.to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [20]:
def train_model(model, train_loader, valid_loader, criterion, optimizer, epochs=3):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            targets = batch["target"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids).squeeze()
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {total_loss / len(train_loader)}")

        # Validation
        model.eval()
        valid_preds = []
        valid_targets = []
        with torch.no_grad():
            for batch in valid_loader:
                input_ids = batch["input_ids"].to(device)
                targets = batch["target"].to(device)

                outputs = model(input_ids).squeeze()
                valid_preds.extend(outputs.cpu().numpy().flatten().tolist())  # Fix: Flatten and convert to list
                valid_targets.extend(targets.cpu().numpy().tolist())

        # Threshold for binary classification
        valid_preds_binary = np.array(valid_preds) > 0.5

        # Metrics
        auc = roc_auc_score(valid_targets, valid_preds)
        acc = accuracy_score(valid_targets, valid_preds_binary)
        precision = precision_score(valid_targets, valid_preds_binary)
        recall = recall_score(valid_targets, valid_preds_binary)
        f1 = f1_score(valid_targets, valid_preds_binary)
        cm = confusion_matrix(valid_targets, valid_preds_binary)

        print(f"Validation AUC: {auc}")
        print(f"Validation Accuracy: {acc}")
        print(f"Validation Precision: {precision}")
        print(f"Validation Recall: {recall}")
        print(f"Validation F1-Score: {f1}")
        print(f"Confusion Matrix:\n{cm}")



train_model(model, train_loader, valid_loader, criterion, optimizer)

Epoch 1/3, Training Loss: 0.3053610878934463
Validation AUC: 0.534108547552371
Validation Accuracy: 0.9058725531028738
Validation Precision: 1.0
Validation Recall: 0.004405286343612335
Validation F1-Score: 0.008771929824561403
Confusion Matrix:
[[2174    0]
 [ 226    1]]
Epoch 2/3, Training Loss: 0.3034588119635979
Validation AUC: 0.51777717437558
Validation Accuracy: 0.905039566847147
Validation Precision: 0.3333333333333333
Validation Recall: 0.004405286343612335
Validation F1-Score: 0.008695652173913044
Confusion Matrix:
[[2172    2]
 [ 226    1]]
Epoch 3/3, Training Loss: 0.30067873276770113
Validation AUC: 0.5180872060271774
Validation Accuracy: 0.9042065805914202
Validation Precision: 0.0
Validation Recall: 0.0
Validation F1-Score: 0.0
Confusion Matrix:
[[2171    3]
 [ 227    0]]
