In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# (except any excluded via %aimport) before each cell runs, so edits to the
# project's src/ package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import logging
import pandas as pd
from functools import partial
from src.models.ann import Ann
from src.models.autoencoder import Autoencoder
from src.models.utils import *
from src.data.utils import collate_batch
from src.data.load import load_data
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer

In [3]:
# Route log records (DEBUG and above) to ann.log.
# basicConfig() silently does nothing when the root logger already has handlers,
# which happens in a notebook whenever this cell is re-run on a live kernel (or
# when the environment pre-installs a handler). force=True removes and closes any
# existing root handlers first, so the file handler is always installed.
logging.basicConfig(filename="ann.log", encoding="utf-8", level=logging.DEBUG, force=True)

In [4]:
# Seed PyTorch's global random number generator so runs start from the same state.
torch.manual_seed(seed=21)

<torch._C.Generator at 0x11708d750>

### 1. Data

In [5]:
# Mini-batch size handed to the train, validation and test DataLoaders.
batch_size = 64

In [6]:
# Load the three dataset splits, the spaCy tokenizer and the saved vocabulary.
train_dataset, val_dataset, test_dataset = load_data("../data/processed/")
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
# NOTE(review): torch.load unpickles the file, so vocab.pt must come from a trusted source.
vocab = torch.load("../data/vocab.pt")
vocab_size = len(vocab)

# Arguments common to all three loaders: the batch size and a collate function
# with this vocab/tokenizer bound in.
_loader_kwargs = dict(
    batch_size=batch_size,
    collate_fn=partial(collate_batch, vocab=vocab, tokenizer=tokenizer),
)

# Train and validation batches are drawn through each dataset's own sampler.
train_dataloader = DataLoader(train_dataset, sampler=train_dataset.get_sampler(), **_loader_kwargs)
val_dataloader = DataLoader(val_dataset, sampler=val_dataset.get_sampler(), **_loader_kwargs)

# No sampler is passed for the test split, so DataLoader's default ordering applies.
test_dataloader = DataLoader(test_dataset, **_loader_kwargs)

### 2. Training

In [7]:
# Number of epochs passed to train_classifier for every experiment.
EPOCHS = 60

# Hyper-parameter grid: for each embedding size, the `n4` values to try.
# Expanded below into one {"embed_dim": ..., "n4": ...} dict per experiment,
# in the order listed here.
_n4_by_embed_dim = {
    128: (48, 64, 88),
    256: (64, 128, 192),
    512: (64, 128, 256),
}

configs = [
    {"embed_dim": dim, "n4": width}
    for dim, widths in _n4_by_embed_dim.items()
    for width in widths
]

# Added to the loop index when numbering experiments in the log.
offset = 0

In [8]:
# Run one experiment per entry in `configs`: load the matching autoencoder,
# train a classifier around its encoder, score it on the test split, save weights.
for i, config in enumerate(configs):
    embed_dim = config["embed_dim"]
    n4 = config["n4"]

    logging.info(f"| Experiment {i + offset} | embedding dim: {embed_dim} | n4: {n4}")

    # Restore the autoencoder saved for this embedding size and put it in eval mode.
    autoencoder_path = f"../data/models/autoencoder-{embed_dim}.pt"
    autoencoder = Autoencoder(vocab_size, embed_dim)
    autoencoder.load_state_dict(torch.load(autoencoder_path))
    autoencoder.eval()

    # Train: the classifier receives the loaded autoencoder's encoder.
    model = Ann(vocab_size, embed_dim, autoencoder.encoder, n4=n4)
    train_classifier(
        model,
        train_dataloader,
        val_dataloader,
        epochs=EPOCHS,
        log_file="ann.log",
    )

    # Evaluate on the held-out test loader; report to both the log and the cell output.
    print('Checking the results of test dataset.')
    accu_test = evaluate_classifier(model, test_dataloader, CrossEntropyLoss())
    logging.info(f"| test accuracy {accu_test:8.6f} ")
    print(f'test accuracy {accu_test:8.6f}')

    # Save the trained weights under a name that encodes both hyper-parameters.
    torch.save(model.state_dict(), f"../data/models/ann-{embed_dim}-{n4}.pt")

| epoch   1 |  1000/ 2795 batches | accuracy 0.523477
| epoch   1 |  2000/ 2795 batches | accuracy 0.776047
----------------------------------------------------------------------------------------------------
| end of epoch   1 | time: 11.45s | train accuracy 0.649699 | validation accuracy 0.837397 
----------------------------------------------------------------------------------------------------
| epoch   2 |  1000/ 2795 batches | accuracy 0.878465
| epoch   2 |  2000/ 2795 batches | accuracy 0.899359
----------------------------------------------------------------------------------------------------
| end of epoch   2 | time: 10.82s | train accuracy 0.888907 | validation accuracy 0.873272 
----------------------------------------------------------------------------------------------------
| epoch   3 |  1000/ 2795 batches | accuracy 0.918394
| epoch   3 |  2000/ 2795 batches | accuracy 0.926828
----------------------------------------------------------------------------------------