In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules (e.g. `src.*`) are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import logging
import pandas as pd
from torch.nn import MSELoss
from functools import partial
from src.models.autoencoder import Autoencoder
from src.data.utils import collate_batch
from src.data.load import load_data
from src.models.utils import *
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer

In [3]:
# Send every record at DEBUG level and above to a UTF-8 encoded log file.
logging.basicConfig(
    level=logging.DEBUG,
    filename="autoencoder.log",
    encoding="utf-8",
)

In [4]:
# Fix the seed of PyTorch's default random number generator for reproducibility.
SEED = 21
torch.manual_seed(SEED)

<torch._C.Generator at 0x105231750>

### 1. Data

In [5]:
# Mini-batch size shared by the train, validation and test DataLoaders.
batch_size = 64

In [6]:
# Load the three dataset splits, the tokenizer and the saved vocabulary.
train_dataset, val_dataset, test_dataset = load_data("../data/processed/")
tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
# NOTE(review): torch.load unpickles the file, so only load a vocab file
# that comes from a trusted source.
vocab = torch.load("../data/vocab.pt")

# All loaders share the same collate function and batch size.
collate_with_vocab = partial(collate_batch, vocab=vocab, tokenizer=tokenizer)
shared_loader_kwargs = {"batch_size": batch_size, "collate_fn": collate_with_vocab}

# Train and validation loaders draw samples through the dataset's own sampler.
train_dataloader = DataLoader(
    train_dataset, sampler=train_dataset.get_sampler(), **shared_loader_kwargs
)
val_dataloader = DataLoader(
    val_dataset, sampler=val_dataset.get_sampler(), **shared_loader_kwargs
)

# The test loader uses DataLoader's default sampling (no custom sampler).
test_dataloader = DataLoader(test_dataset, **shared_loader_kwargs)

### 2. Experiment

In [7]:
# Number of training epochs passed to `train_autoencoder` for every experiment.
EPOCHS = 60

# One experiment per embedding dimension.
configs = [{"embed_dim": dim} for dim in (128, 256, 512)]

# Added to the loop index when numbering experiments in the log.
offset = 0

In [8]:
# Run one train / evaluate / save cycle per configuration in `configs`.
for i, config in enumerate(configs):
    embed_dim = config["embed_dim"]

    # Experiment ids are shifted by `offset` in the log.
    logging.info("| Experiment {} | embed dim: {}".format(i + offset, embed_dim))

    # Train a fresh model for this embedding dimension.
    model = Autoencoder(len(vocab), embed_dim)
    train_autoencoder(model, train_dataloader, val_dataloader, epochs=EPOCHS, log_file="autoencoder.log")

    # Evaluate on the held-out test split.
    print('Checking the results of test dataset.')
    loss_test = evaluate_autoencoder(model, test_dataloader, MSELoss())
    # The metric is evaluated with MSELoss() and printed as "test loss", so it
    # is logged under that label too (it was previously logged as "test accuracy").
    logging.info("| test loss {:8.6f} ".format(loss_test))
    print('test loss {:8.6f}'.format(loss_test))

    # Save the trained weights, one file per embedding dimension.
    torch.save(model.state_dict(), f"../data/models/autoencoder-{embed_dim}.pt")

| epoch   1 |  1000/ 2795 batches | loss 0.000970
| epoch   1 |  2000/ 2795 batches | loss 0.000166
-----------------------------------------------------------
| end of epoch   1 | time:  7.18s | train loss 0.000568 | validation loss 0.000120 
-----------------------------------------------------------
| epoch   2 |  1000/ 2795 batches | loss 0.000105
| epoch   2 |  2000/ 2795 batches | loss 0.000095
-----------------------------------------------------------
| end of epoch   2 | time:  6.79s | train loss 0.000100 | validation loss 0.000094 
-----------------------------------------------------------
| epoch   3 |  1000/ 2795 batches | loss 0.000085
| epoch   3 |  2000/ 2795 batches | loss 0.000082
-----------------------------------------------------------
| end of epoch   3 | time:  6.71s | train loss 0.000084 | validation loss 0.000084 
-----------------------------------------------------------
| epoch   4 |  1000/ 2795 batches | loss 0.000077
| epoch   4 |  2000/ 2795 batches | lo