In [1]:
# Development convenience: enable IPython's autoreload extension so that edits
# to the project's imported .py modules are picked up live, without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import pandas as pd
from functools import partial
from src.models.ann import Ann
from src.models.autoencoder import Autoencoder
from src.models.utils import *
from src.data.utils import collate_batch
from src.data.load import load_data
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer

### 1. Data

In [None]:
# Hyperparameters
batch_size = 64  # mini-batch size handed to every DataLoader below
EPOCHS = 10      # number of training epochs passed to train_classifier

# Reproducibility: seed PyTorch's global RNG before any model is built or any
# batch is drawn, so a fresh "Restart & Run All" repeats the same run.
# The trailing semicolon hides the Generator repr that manual_seed returns.
SEED = 42
torch.manual_seed(SEED);

In [4]:
# Load the train / validation / test datasets, the tokenizer and the saved vocabulary.
train_dataset, val_dataset, test_dataset = load_data("../data/processed/")
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
# NOTE: torch.load unpickles the file, which can execute arbitrary code;
# only load artifacts produced by this project / a trusted source.
vocab = torch.load("../data/vocab.pt")
vocab_size = len(vocab)
embed_dim = 128 # NOT AN OPTIMIZATION PARAMETER

# Build the collate function once and share it: all three loaders turn raw
# samples into batches with the same vocabulary and tokenizer.
collate_fn = partial(collate_batch, vocab=vocab, tokenizer=tokenizer)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    sampler=train_dataset.get_sampler(),
)

# NOTE(review): validation also draws through the dataset's sampler. If that
# sampler re-weights or resamples, the validation accuracy is not measured on
# the natural data distribution (the test loader below uses no sampler) —
# confirm this is intended.
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    sampler=val_dataset.get_sampler(),
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
)

# Restore the pretrained autoencoder. map_location="cpu" makes the checkpoint
# loadable on machines that lack the device it was saved from (e.g. a GPU
# checkpoint on a CPU-only box); load_state_dict then copies the tensors into
# the freshly constructed module.
autoencoder = Autoencoder(vocab_size, embed_dim)
autoencoder.load_state_dict(
    torch.load("../data/models/autoencoder.pt", map_location="cpu")
)
# Switch to inference mode; as the cell's last expression this also displays
# the module architecture.
autoencoder.eval()

Autoencoder(
  (embedding): EmbeddingBag(196674, 128, mode=mean)
  (encoder): Sequential(
    (0): Linear(in_features=128, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=16, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=128, bias=True)
    (3): ReLU()
  )
)

### 2. Training

In [5]:
# Build the classifier around the encoder half of the loaded autoencoder,
# then train it on the training loader while validating on the validation loader.
# NOTE(review): the third positional argument of Ann is presumably the number
# of output classes — confirm against Ann's signature.
num_classes = 2
pretrained_encoder = autoencoder.encoder
model = Ann(vocab_size, embed_dim, num_classes, pretrained_encoder)

train_classifier(
    model,
    train_dataloader,
    val_dataloader,
    epochs=EPOCHS,
)

| epoch   1 |  1000/ 2795 batches | accuracy    0.814
| epoch   1 |  2000/ 2795 batches | accuracy    0.881
-----------------------------------------------------------
| end of epoch   1 | time:  5.62s | valid accuracy    0.878 
-----------------------------------------------------------
| epoch   2 |  1000/ 2795 batches | accuracy    0.907
| epoch   2 |  2000/ 2795 batches | accuracy    0.915
-----------------------------------------------------------
| end of epoch   2 | time:  5.48s | valid accuracy    0.894 
-----------------------------------------------------------
| epoch   3 |  1000/ 2795 batches | accuracy    0.923
| epoch   3 |  2000/ 2795 batches | accuracy    0.927
-----------------------------------------------------------
| end of epoch   3 | time:  5.54s | valid accuracy    0.884 
-----------------------------------------------------------
| epoch   4 |  1000/ 2795 batches | accuracy    0.952
| epoch   4 |  2000/ 2795 batches | accuracy    0.957
-------------------------

### 3. Evaluation

In [10]:
# Score the trained classifier on the held-out test loader and report the result.
print('Checking the results of test dataset.')

test_criterion = CrossEntropyLoss()
accu_test = evaluate_classifier(model, test_dataloader, test_criterion)

# Width 8 with three decimals, matching the layout of the training log.
report_template = 'test accuracy {:8.3f}'
print(report_template.format(accu_test))

Checking the results of test dataset.
test accuracy    0.909
