In [133]:
import torch
import pandas as pd
import time
from torch import nn
from torchmetrics import Precision, Recall, F1Score, Accuracy
from torch.utils.data import DataLoader,Dataset, WeightedRandomSampler
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
#device = torch.device("mps")

### 1. Data

In [134]:
df_train = pd.read_csv("../data/processed/train.csv")
df_val = pd.read_csv("../data/processed/val.csv")
df_test = pd.read_csv("../data/processed/test.csv")

In [135]:
df_train.shape, df_val.shape, df_test.shape

((178839, 2), (22355, 2), (22355, 2))

### 2. Dataset

In [192]:
class CustomDataset(Dataset):
    def __init__(self, X, y, tokenizer):
        self.x_train = X
        self.y_train = y
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self, idx):
        return self.x_train[idx], self.y_train[idx]

    def get_sampler(self):
        class_weights = [0.1, 0.9]
        sample_weights = [0] * len(self)
        for idx, (text, label) in enumerate(self):
            sample_weights[idx] = class_weights[label]
        sampler = WeightedRandomSampler(
            sample_weights, num_samples=len(sample_weights), replacement=True
        )
        return sampler

In [193]:
tokenizer = get_tokenizer('spacy')



In [194]:
train_dataset = CustomDataset(
    df_train["text"].values.tolist(), df_train["toxic"].values.tolist(), tokenizer
)
val_dataset = CustomDataset(
    df_val["text"].values.tolist(), df_val["toxic"].values.tolist(), tokenizer
)
test_dataset = CustomDataset(
    df_test["text"].values.tolist(), df_test["toxic"].values.tolist(), tokenizer
)

### 3. Vocabulary

In [140]:
def yield_tokens(iterator, tokenizer):
    for text,_ in iterator:
        yield tokenizer(str(text))

In [141]:
train_iterator = iter(train_dataset)
vocab = build_vocab_from_iterator(yield_tokens(train_iterator, tokenizer), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [142]:
len(vocab)

196674

### 4. ``collate_fn``

In [143]:
def collate_batch(batch):
    text_pipeline = lambda x: vocab(tokenizer(str(x).lower()))
    label_pipeline = lambda x: x
    label_list, text_list, offsets = [], [], [0]
    for (_text,_label) in batch:
         label_list.append(label_pipeline(_label))
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return text_list, label_list, offsets

### 5. Model

In [371]:
class Fish(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(Fish, self).__init__()

        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)

        self.linear_relu_stack = nn.Sequential(
            # Fish encoder
            nn.Linear(embed_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            # Fish decoder
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, num_class),
        )

        self.softmax = nn.Softmax(dim=1)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        for layer in self.linear_relu_stack:
            if isinstance(layer, nn.Linear):
                layer.weight.data.uniform_(-initrange, initrange)
                layer.bias.data.zero_()

    def forward(self, text, offsets):
        embedding_output = self.embedding(text, offsets)
        return self.softmax(self.linear_relu_stack(embedding_output))

### 6. Training

In [372]:
def train(model, dataloader, optimizer, criterion, epoch):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 1000
    start_time = time.time()

    for idx, (text, label, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()
    return model


def evaluate(model, dataloader, criterion):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (text, label, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count

In [373]:
num_class = 2
vocab_size = len(vocab)
emsize = 128
model = Fish(vocab_size, emsize, num_class)

train_sampler = train_dataset.get_sampler()
train_dataloader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_batch, sampler=train_sampler)

val_sampler = val_dataset.get_sampler()
val_dataloader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_batch, sampler=val_sampler)

test_dataloader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_batch)

EPOCHS = 10
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=5.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    model = train(model, train_dataloader, optimizer, criterion, epoch)
    accu_val = evaluate(model, val_dataloader, criterion)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |  1000/ 5589 batches | accuracy    0.793
| epoch   1 |  2000/ 5589 batches | accuracy    0.842
| epoch   1 |  3000/ 5589 batches | accuracy    0.860
| epoch   1 |  4000/ 5589 batches | accuracy    0.872
| epoch   1 |  5000/ 5589 batches | accuracy    0.879
-----------------------------------------------------------
| end of epoch   1 | time: 17.90s | valid accuracy    0.863 
-----------------------------------------------------------
| epoch   2 |  1000/ 5589 batches | accuracy    0.880
| epoch   2 |  2000/ 5589 batches | accuracy    0.885
| epoch   2 |  3000/ 5589 batches | accuracy    0.892
| epoch   2 |  4000/ 5589 batches | accuracy    0.894
| epoch   2 |  5000/ 5589 batches | accuracy    0.900
-----------------------------------------------------------
| end of epoch   2 | time: 16.86s | valid accuracy    0.886 
-----------------------------------------------------------
| epoch   3 |  1000/ 5589 batches | accuracy    0.900
| epoch   3 |  2000/ 5589 batches | accuracy

### 7. Evaluation

In [374]:
print('Checking the results of test dataset.')
accu_test = evaluate(model, test_dataloader, criterion)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.925


In [375]:
#model(torch.tensor(pipeline(df_test.iloc[2]["text"])), torch.tensor([0])).argmax(1).item()

In [376]:
pipeline = lambda x: vocab(tokenizer(str(x).lower()))
predicted_labels = []
for index, row in df_test.iterrows():
    text = torch.tensor(pipeline(row["text"]), dtype=torch.int64)
    offsets = torch.tensor([0])
    predicted_label = model(text, offsets)
    predicted_labels.append(predicted_label.argmax(1).item())
df_test["predicted_label"] = predicted_labels
df_test["error"] = df_test["toxic"] != df_test["predicted_label"]

In [377]:
df_test[df_test["error"] == True]

Unnamed: 0,text,toxic,predicted_label,error
3,introduction background an aortic ii signs sym...,0,1,True
12,so i see now that calling you exactly as you s...,0,1,True
17,exactly what i was going to say ridiculous sexism,0,1,True
21,to marissa behning and leanna feeney saying yo...,0,1,True
36,google name origin goo ogle stare at the unusa...,1,0,True
...,...,...,...,...
22329,possible racism of user rodhullandemu as this ...,1,0,True
22335,the article needs to say what he said on stage...,0,1,True
22344,it is sooooo gay,0,1,True
22345,look user theresa knott hates,0,1,True


In [378]:
df_test.iloc[32]["text"]

'about sitush arrogant believes his arguments and references are best he ca nt digest logical still trying to get the biology definition in math book eg has written about khatris origin from dashrath sharma on rajputs book ca nt understand references and read them racist you come under this definition still if you have some shame left in you quit wikipedia or start listening discussing'