Data source: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

Task: Binary classification of toxic comments

In [1]:
import torch
import pandas as pd
from torch import nn
from functools import partial
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler

In [2]:
def collate_batch(batch, vocab, tokenizer):
    text_pipeline = lambda x: vocab(tokenizer(str(x).lower()))
    label_pipeline = lambda x: x
    label_list, text_list, offsets = [], [], [0]
    for (_text,_label) in batch:
         label_list.append(label_pipeline(_label))
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return text_list, label_list, offsets

def _yield_tokens(iterator, tokenizer):
    for text,_ in iterator:
        yield tokenizer(str(text))

def get_vocab(dataset, tokenizer):
    train_iterator = iter(dataset)
    vocab = build_vocab_from_iterator(_yield_tokens(train_iterator, tokenizer), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    return vocab

### 1. Dataset

In [5]:
class CustomDataset(Dataset):
    def __init__(self, X, y, tokenizer):
        self.x_train = X
        self.y_train = y
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self, idx):
        return self.x_train[idx], self.y_train[idx]

    def get_sampler(self):
        class_weights = [0.1, 0.9] # Allows balanced batches
        sample_weights = [0] * len(self)
        for idx, (text, label) in enumerate(self):
            sample_weights[idx] = class_weights[label]
        sampler = WeightedRandomSampler(
            sample_weights, num_samples=len(sample_weights), replacement=True
        )
        return sampler

In [6]:
def load_data(data_dir):

    # Load dataframes
    df_train = pd.read_csv(f"{data_dir}/train.csv")
    df_val = pd.read_csv(f"{data_dir}/val.csv")
    df_test = pd.read_csv(f"{data_dir}/test.csv")

    # Load tokenizer
    tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

    # Create datasets
    train_dataset = CustomDataset(
    df_train["text"].values.tolist(), df_train["toxic"].values.tolist(), tokenizer
    )
    val_dataset = CustomDataset(
        df_val["text"].values.tolist(), df_val["toxic"].values.tolist(), tokenizer
    )
    test_dataset = CustomDataset(
        df_test["text"].values.tolist(), df_test["toxic"].values.tolist(), tokenizer
    )

    return train_dataset, val_dataset, test_dataset

In [7]:
train_dataset, val_dataset, _ = load_data("./data/")

In [13]:
train_dataset[5]

('could you please leave at least some articles alone is there any way to exclude a page from you pointless crusade you and the two trolls who took up your cause become more and more aggressive by the month i find your arguments totally unconvincing so please please please get a broad consensus or leave gervase of melkley as it is thank you',
 0)

### 2. Tokenizer

In [14]:
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
vocab = get_vocab(train_dataset, tokenizer)
vocab_size = len(vocab)

In [15]:
train_dataset[0]

('union avenue historic commercial district i have done some significant improvements to the article and references i not the greatest copy editor and have a slight coi since this is my hometown can you take a look and help make sure i progressing to a featured article sts i know it not ready yet but maybe a few pointers about things that oculd be expanded or reworded',
 0)

In [16]:
tokens = tokenizer("hello there how are you?")
tokens

['hello', 'there', 'how', 'are', 'you', '?']

In [17]:
indices = vocab.lookup_indices(tokens)
indices

[301, 38, 72, 18, 7, 0]

In [18]:
vocab.lookup_tokens(indices)

['hello', 'there', 'how', 'are', 'you', '<unk>']

### 3. DataLoader

In [19]:
batch_size = 64

In [24]:
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=partial(collate_batch, vocab=vocab, tokenizer=tokenizer),
    sampler=train_dataset.get_sampler(),
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=partial(collate_batch, vocab=vocab, tokenizer=tokenizer),
    sampler=val_dataset.get_sampler(),
)

### 4. Module

In [25]:
class Net(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_dim,
    ):
        super(Net, self).__init__()

        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.hidden_layers = nn.Sequential(
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
        )

        self.output_layer = nn.Sequential(nn.Linear(16, 2), nn.Softmax(dim=1))

    def forward(self, x, offsets):
        x = self.embed(x, offsets)
        x = self.hidden_layers(x)
        x = self.output_layer(x)
        return x

    def embed(self, x, offsets):
        return self.embedding(x, offsets)

In [26]:
model = Net(vocab_size, 32)

### 5. Training

In [27]:
def _validate(model, dataloader, criterion):

    total_loss, total_acc = 0, 0
    with torch.no_grad():
        for batch, (text, label, offsets) in enumerate(dataloader):
            y_pred = model(text, offsets)
            loss = criterion(y_pred, label)
            total_acc += (y_pred.argmax(1) == label).sum().item()
            total_loss += loss.item()

    return total_acc / (len(dataloader) * dataloader.batch_size)

def train_and_validate(model, train_dataloader, val_dataloader, epochs=10, step_size=20, gamma=0.1):

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
    validation_losses = []

    for epoch in range(epochs):

        model.train()

        for batch, (text, label, offsets) in enumerate(train_dataloader):
            
            y_pred = model(text, offsets)
            loss = criterion(y_pred, label)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        
        model.eval()
        validation_loss = _validate(model, val_dataloader, criterion)
        validation_losses.append(validation_loss)
        print(f"Epoch {epoch}, validation loss: {validation_loss}")

        scheduler.step()

In [28]:
train_and_validate(model, train_dataloader, val_dataloader)

Epoch 0, validation loss: 0.7691071428571429
Epoch 1, validation loss: 0.8130803571428571
Epoch 2, validation loss: 0.835625
Epoch 3, validation loss: 0.8420982142857143


KeyboardInterrupt: 