In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project's own modules
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import pandas as pd
from src.models.utils import *
from src.models.fish import Fish
from src.data.dataset import CustomDataset
from src.data.utils import collate_batch, get_vocab
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer

### 1. Data

In [3]:
# Read the three pre-processed splits; order here fixes which frame is which.
split_paths = (
    "../data/processed/train.csv",
    "../data/processed/val.csv",
    "../data/processed/test.csv",
)
df_train, df_val, df_test = map(pd.read_csv, split_paths)

### 2. Dataset

In [5]:
# Tokenizer shared by the datasets and the collate function below.
# NOTE(review): the 'spacy' backend presumably needs the en_core_web_sm model
# installed in the environment -- confirm in the project setup instructions.
tokenizer = get_tokenizer(
    'spacy',
    language='en_core_web_sm',
)

In [6]:
def _build_dataset(frame):
    """Wrap one split's "text" and "toxic" columns in a CustomDataset.

    Both columns are converted to plain Python lists and handed to
    CustomDataset together with the notebook-level tokenizer.
    """
    texts = frame["text"].values.tolist()
    toxic_values = frame["toxic"].values.tolist()
    return CustomDataset(texts, toxic_values, tokenizer)


train_dataset = _build_dataset(df_train)
val_dataset = _build_dataset(df_val)
test_dataset = _build_dataset(df_test)

### 3. Vocabulary

In [7]:
# Load the cached vocabulary if it exists; otherwise build it from the
# training split and cache it, so the notebook also runs from a checkout
# that does not yet contain the vocab file.
#
# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code. Only load a vocab.pt that was produced locally / by a trusted source.
VOCAB_PATH = "../data/vocab.pt"
try:
    vocab = torch.load(VOCAB_PATH)
except FileNotFoundError:
    vocab = get_vocab(train_dataset, tokenizer)
    # NOTE(review): assumes the cached file is simply torch.save(vocab) of
    # get_vocab's result -- confirm against how vocab.pt was first created.
    torch.save(vocab, VOCAB_PATH)

### 4. Training

In [8]:
# --- Configuration ----------------------------------------------------------
SEED = 42         # fixed seed so repeated runs of this cell are comparable
BATCH_SIZE = 32   # shared by the train / val / test loaders
num_class = 2     # presumably toxic vs. non-toxic -- confirm against the data
vocab_size = len(vocab)
embed_dim = 128

# Seed torch's global RNG before the model and samplers are created, so any
# randomness drawn from it (e.g. parameter initialisation, random sampling)
# repeats from run to run on the same setup.
torch.manual_seed(SEED)

model = Fish(vocab_size, embed_dim, num_class)


def _collate(batch):
    """Collate one raw batch using the notebook-level vocab and tokenizer."""
    return collate_batch(batch=batch, vocab=vocab, tokenizer=tokenizer)


# --- Data loaders -----------------------------------------------------------
train_sampler = train_dataset.get_sampler()
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=_collate,
    sampler=train_sampler,
)

# NOTE(review): validation is also drawn through get_sampler(). If that
# sampler re-weights or resamples examples, the reported validation accuracy
# is measured on the resampled distribution -- confirm this is intended.
val_sampler = val_dataset.get_sampler()
val_dataloader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=_collate,
    sampler=val_sampler,
)

# The test loader uses no sampler, so it iterates the dataset as-is.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=_collate,
)

# --- Training ---------------------------------------------------------------
train_classifier(model, train_dataloader, val_dataloader)

| epoch   1 |  1000/ 5589 batches | accuracy    0.776
| epoch   1 |  2000/ 5589 batches | accuracy    0.826
| epoch   1 |  3000/ 5589 batches | accuracy    0.838
| epoch   1 |  4000/ 5589 batches | accuracy    0.847
| epoch   1 |  5000/ 5589 batches | accuracy    0.853
-----------------------------------------------------------
| end of epoch   1 | time: 21.43s | valid accuracy    0.869 
-----------------------------------------------------------
| epoch   2 |  1000/ 5589 batches | accuracy    0.870
| epoch   2 |  2000/ 5589 batches | accuracy    0.868
| epoch   2 |  3000/ 5589 batches | accuracy    0.868
| epoch   2 |  4000/ 5589 batches | accuracy    0.869
| epoch   2 |  5000/ 5589 batches | accuracy    0.875
-----------------------------------------------------------
| end of epoch   2 | time: 18.30s | valid accuracy    0.867 
-----------------------------------------------------------
| epoch   3 |  1000/ 5589 batches | accuracy    0.883
| epoch   3 |  2000/ 5589 batches | accuracy

### 5. Evaluation

In [9]:
# Score the trained model on the held-out test loader and report accuracy.
test_criterion = CrossEntropyLoss()

print('Checking the results of test dataset.')
accu_test = evaluate_classifier(
    model,
    test_dataloader,
    test_criterion,
)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.903
