In [2]:
import random
import re

import numpy as np
import pandas as pd
import spacy

## Importing Datasets

In [3]:
# import torch/torchtext and define the text field used for the comment column
import torch
from torchtext import data

# Seed Python's `random`, NumPy and PyTorch before anything stochastic runs
# (batch shuffling, weight initialisation, training), so a fresh
# Restart & Run All gives repeatable numbers. GPU kernels may still introduce
# small run-to-run differences.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Text field for the comment column; raw strings are tokenized with spaCy.
comment = data.Field(tokenize='spacy')

In [4]:
#read train csv
# Column -> field mapping, listed in CSV column order.
# A field of None means the column is skipped when examples are built.
train_valid_fields = [
    ('id', None),
    ('comment_text', comment),
    # use_vocab=False / sequential=False: the label is taken as a single value
    # without tokenization or vocabulary lookup, and is stored as a float.
    ('toxic', data.Field(sequential=False, use_vocab=False, dtype=torch.float)),
]

# Both files live under `cache/`; the first row of each CSV is a header.
train_data, valid_data = data.TabularDataset.splits(
    path='cache/',
    train='dataset_train.csv',
    validation='dataset_val.csv',
    format='csv',
    skip_header=True,
    fields=train_valid_fields,
)

In [5]:
#read test csv
# Same column layout as the train/validation files: `id` is skipped, the comment
# text shares the `comment` field (and therefore its vocabulary), and the label
# is read as a raw float.
test_fields = [
    ('id', None),
    ('comment_text', comment),
    ('toxic', data.Field(sequential=False, use_vocab=False, dtype=torch.float)),
]

test_data = data.TabularDataset(
    path='cache/dataset_test_merged.csv',
    format='csv',
    skip_header=True,
    fields=test_fields,
)

In [6]:
#build vocab
# Upper bound on the vocabulary size passed to build_vocab.
# NOTE(review): special tokens (unknown/padding) are presumably added on top of
# this cap — confirm via len(comment.vocab).
MAX_VOCAB_SIZE = 25000

# The vocabulary is built from the training split only.
comment.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)

In [7]:
BATCH_SIZE = 32

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# BucketIterator can only group comments of similar length when it is given a
# sort_key AND sort_within_batch is enabled; without them it behaves like a
# plain iterator and every batch is padded to its longest comment.
# `sort=False` is kept so the datasets are not globally sorted.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    repeat=False,
    sort=False,
    sort_key=lambda example: len(example.comment_text),
    sort_within_batch=True,
    device=device)

## Model Implementation

In [8]:
import torch.nn as nn

class RNN(nn.Module):
    """Embedding -> single-layer ``nn.RNN`` -> linear head.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: width of each token embedding.
        hidden_dim: size of the recurrent hidden state.
        output_dim: number of values produced per example by the linear head.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.lin = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return one row of ``output_dim`` raw scores per sequence in the batch.

        ``x`` is expected to hold token indices in nn.RNN's default time-major
        layout, i.e. ``[seq_len, batch]``.

        NOTE(review): the prediction is read from the hidden state after the
        last time step; if batches are padded, that step may be a pad token.
        """
        embedded = self.embedding(x)

        # Only the final hidden state is needed. The former debug check
        # `assert torch.equal(output[-1], hidden.squeeze(0))` was dropped: it
        # compared full tensors on every batch and raised a misleading
        # AssertionError whenever activations contained NaN (NaN != NaN).
        _, hidden = self.rnn(embedded)

        # hidden has a leading (num_layers * num_directions) axis of size 1.
        return self.lin(hidden.squeeze(0))

In [9]:
# Architecture hyper-parameters for the RNN classifier.
EMBEDDING_DIM = 100              # width of each token embedding
HIDDEN_DIM = 256                 # size of the recurrent hidden state
OUTPUT_DIM = 1                   # one raw score per comment
INPUT_DIM = len(comment.vocab)   # one embedding row per vocabulary entry

model = RNN(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
)

## Train Model

In [10]:
import torch.optim as optim

# Step size for plain stochastic gradient descent over all model parameters.
LEARNING_RATE = 1e-3

optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

In [11]:
# Binary cross-entropy that applies the sigmoid internally, so the model's
# outputs are passed to it as raw (un-squashed) scores.
criterion = nn.BCEWithLogitsLoss()

In [12]:
# Put the network and the loss module on the device selected earlier.
model, criterion = model.to(device), criterion.to(device)

In [13]:
def binary_accuracy(preds, y):
    """
    Fraction of predictions in the batch that match the labels.

    `preds` are raw scores: they are passed through a sigmoid and rounded to
    0/1 before being compared with `y`. The result is a tensor holding a ratio
    (e.g. 0.8 for 8 correct out of 10), not a count.
    """
    # squash to (0, 1), then snap to the nearest integer
    hard_preds = torch.sigmoid(preds).round()
    # 1.0 where prediction and label agree, 0.0 elsewhere
    hits = hard_preds.eq(y).float()
    return hits.sum() / len(hits)

In [14]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    For every batch the gradients are reset, the model scores
    `batch.comment_text`, the loss against `batch.toxic` is back-propagated
    and the optimizer takes a step.

    Returns:
        (loss, accuracy): the per-batch loss and accuracy values summed over
        the pass and divided by `len(iterator)`.
    """
    model.train()  # switch the module to training mode

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        # start every batch from zeroed gradients
        optimizer.zero_grad()

        # drop the trailing size-1 axis so scores line up with the labels
        scores = model(batch.comment_text).squeeze(1)

        batch_loss = criterion(scores, batch.toxic)
        batch_acc = binary_accuracy(scores, batch.toxic)

        batch_loss.backward()
        optimizer.step()

        # .item() turns the 0-dim tensors into plain Python floats
        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [15]:
def evaluate(model, iterator, criterion):
    """Score `iterator` without updating the model.

    The model is put in evaluation mode and the whole pass runs under
    `torch.no_grad()`, so no gradients are tracked.

    Returns:
        (loss, accuracy): the per-batch loss and accuracy values summed over
        the pass and divided by `len(iterator)`.
    """
    model.eval()  # switch the module to evaluation mode

    loss_total = 0
    acc_total = 0

    with torch.no_grad():
        for batch in iterator:
            # drop the trailing size-1 axis so scores line up with the labels
            scores = model(batch.comment_text).squeeze(1)

            batch_loss = criterion(scores, batch.toxic)
            batch_acc = binary_accuracy(scores, batch.toxic)

            loss_total += batch_loss.item()
            acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [16]:
# Number of full passes over the training iterator.
N_EPOCHS = 2

for epoch in range(N_EPOCHS):

    # One optimisation pass over the training split, then a gradient-free
    # pass over the validation split with the same loss and accuracy metric.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # `epoch` is 0-based, hence the +1; accuracies are ratios, shown as percentages.
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.326 | Train Acc: 90.14% | Val. Loss: 0.328 | Val. Acc: 89.91% |
| Epoch: 02 | Train Loss: 0.317 | Train Acc: 90.43% | Val. Loss: 0.327 | Val. Acc: 89.95% |


## Results

In [17]:
# Final evaluation on the test split, using the same loss and accuracy metric
# as the validation pass.
# NOTE(review): an accuracy around 90% may simply reflect class imbalance in
# the `toxic` label — confirm against the label distribution.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.317 | Test Acc: 90.39% |


## Exporting Model and Vocabulary

In [19]:
# Only the parameters (state_dict) are written, not the RNN class itself, so
# the model has to be re-created with the same constructor arguments before
# these weights can be loaded back.
PATH = "model_naive.pt"
torch.save(model.state_dict(), PATH)

In [20]:
import dill

# Destination file for the pickled text field.
PATH1 = "vocab_naive.pt"
# Serialises the whole `comment` Field object (not just its vocabulary), using
# dill as the pickle module.
# NOTE(review): loading this file presumably requires dill as well, and
# unpickling can execute arbitrary code — only load files you trust.
torch.save(comment, PATH1, pickle_module=dill)