In [26]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
import torch
import pandas as pd
from src.models.ann import Ann
from src.models.autoencoder import Autoencoder
from src.models.utils import *
from src.data.dataset import CustomDataset
from src.data.utils import collate_batch, get_vocab
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer

### 1. Data

In [28]:
# Read the three pre-processed splits (train / validation / test) in one pass.
train_csv = "../data/processed/train.csv"
val_csv = "../data/processed/val.csv"
test_csv = "../data/processed/test.csv"

df_train, df_val, df_test = map(pd.read_csv, (train_csv, val_csv, test_csv))

In [29]:
# Sizes must match the saved checkpoint; the restored module prints as
# EmbeddingBag(196674, 128), i.e. (vocab_size, embed_dim).
vocab_size = 196674
embed_dim = 128

# Rebuild the architecture, then restore the trained weights into it.
autoencoder = Autoencoder(vocab_size, embed_dim)
# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on
# a machine without CUDA; the freshly built model lives on the CPU anyway.
autoencoder_state = torch.load("../data/models/autoencoder.pt", map_location="cpu")
autoencoder.load_state_dict(autoencoder_state)

# Switch to inference mode; as the last expression this also displays the module.
autoencoder.eval()

Autoencoder(
  (embedding): EmbeddingBag(196674, 128, mode=mean)
  (encoder): Sequential(
    (0): Linear(in_features=128, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=16, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=128, bias=True)
    (3): ReLU()
  )
)

### 2. Dataset

In [30]:
# spaCy-backed tokenizer (en_core_web_sm model); the same object is handed to
# every CustomDataset and to collate_batch further down.
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [31]:
def _dataset_from_frame(frame):
    """Wrap the "text" and "toxic" columns of ``frame`` in a CustomDataset."""
    texts = frame["text"].values.tolist()
    labels = frame["toxic"].values.tolist()
    return CustomDataset(texts, labels, tokenizer)


# One dataset per split, all sharing the same tokenizer.
train_dataset = _dataset_from_frame(df_train)
val_dataset = _dataset_from_frame(df_val)
test_dataset = _dataset_from_frame(df_test)

### 3. Vocabulary

In [32]:
# Load the vocabulary object previously saved to ../data/vocab.pt.
# Presumably it was produced with get_vocab(train_dataset, tokenizer) and
# cached to avoid rebuilding it on every run -- TODO confirm.
# NOTE(review): torch.load unpickles the file, so only load vocab files from a
# trusted source.
vocab = torch.load("../data/vocab.pt")

### 4. Training

In [33]:
# Seed torch's global RNG before anything stochastic happens in this cell so
# that torch-driven randomness (e.g. parameter initialisation, random
# samplers) is repeatable across runs.
# NOTE(review): this does not cover Python's `random` or NumPy, should the
# project code use them.
SEED = 42
torch.manual_seed(SEED)

BATCH_SIZE = 32  # shared by the train / validation / test loaders
num_class = 2
vocab_size = len(vocab)

# The classifier receives the encoder half of the restored autoencoder.
model = Ann(vocab_size, embed_dim, num_class, autoencoder.encoder)


def collate(samples):
    """Collate a list of dataset samples using the shared vocab and tokenizer."""
    return collate_batch(batch=samples, vocab=vocab, tokenizer=tokenizer)


train_sampler = train_dataset.get_sampler()
train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, collate_fn=collate, sampler=train_sampler
)

# NOTE(review): validation also draws through a sampler while the test loader
# below does not; if get_sampler() rebalances classes, validation and test
# accuracy are measured on different distributions -- confirm this is intended.
val_sampler = val_dataset.get_sampler()
val_dataloader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, collate_fn=collate, sampler=val_sampler
)

test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate)

train_classifier(model, train_dataloader, val_dataloader)

| epoch   1 |  1000/ 5589 batches | accuracy    0.754
| epoch   1 |  2000/ 5589 batches | accuracy    0.853
| epoch   1 |  3000/ 5589 batches | accuracy    0.876
| epoch   1 |  4000/ 5589 batches | accuracy    0.886
| epoch   1 |  5000/ 5589 batches | accuracy    0.896
-----------------------------------------------------------
| end of epoch   1 | time: 20.10s | valid accuracy    0.886 
-----------------------------------------------------------
| epoch   2 |  1000/ 5589 batches | accuracy    0.902
| epoch   2 |  2000/ 5589 batches | accuracy    0.906
| epoch   2 |  3000/ 5589 batches | accuracy    0.908
| epoch   2 |  4000/ 5589 batches | accuracy    0.907
| epoch   2 |  5000/ 5589 batches | accuracy    0.911
-----------------------------------------------------------
| end of epoch   2 | time: 19.06s | valid accuracy    0.857 
-----------------------------------------------------------
| epoch   3 |  1000/ 5589 batches | accuracy    0.933
| epoch   3 |  2000/ 5589 batches | accuracy

### 5. Evaluation

In [17]:
# Score the trained classifier on the held-out test loader and report the
# accuracy with three decimals.
print('Checking the results of test dataset.')
loss_fn = CrossEntropyLoss()
accu_test = evaluate_classifier(model, test_dataloader, loss_fn)
test_report = 'test accuracy {:8.3f}'.format(accu_test)
print(test_report)

Checking the results of test dataset.
test accuracy    0.916
