In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%reload_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

import numpy as np
import random
import time
import os

import sys
sys.path.append("..")
from data import get_dataset # custom helper function to get dataset
from models.Seq2Seq import Seq2Seq, Encoder, Decoder

In [3]:
# Number of sequences per training batch; the validation loader uses twice this.
BATCH_SIZE = 256

# Seed every RNG the notebook relies on so that weight initialisation, dropout
# and shuffling start from the same random state on every "Restart & Run All".
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [4]:
class WrappedDataLoader:
    """Iterable adapter that post-processes every batch of an underlying loader.

    Each batch produced by ``dataloader`` is unpacked into positional
    arguments and handed to ``func``; whatever ``func`` returns is yielded in
    its place. The reported length is that of the wrapped loader.
    """

    def __init__(self, dataloader, func):
        self.func = func
        self.dataloader = dataloader

    def __len__(self):
        """Return the number of batches reported by the wrapped loader."""
        return len(self.dataloader)

    def __iter__(self):
        """Yield ``func(*batch)`` for each batch, re-iterating the wrapped loader on every call."""
        for fields in self.dataloader:
            yield self.func(*fields)
            
def preprocess(x, y):
    """Swap the first two dimensions of both the input and the target batch.

    The loader delivers batches as [batch size, seq len, feature size], while
    the LSTM's default layout is sequence-first, so both tensors are returned
    as [seq len, batch size, feature size].
    """
    x_seq_first = x.transpose(0, 1)
    y_seq_first = y.transpose(0, 1)
    return x_seq_first, y_seq_first

In [5]:
# Fetch all three splits through the project's dataset helper; only the
# train and validation splits are batched in this cell.
train_data, val_data, test_data = get_dataset(["train", "val", "test"])

# Each DataLoader is wrapped so that batches come out sequence-first.
# Training batches are shuffled; validation batches are twice as large and
# keep their order.
train_loader = WrappedDataLoader(
    DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=6),
    preprocess,
)
val_loader = WrappedDataLoader(
    DataLoader(val_data, batch_size=BATCH_SIZE * 2, shuffle=False, num_workers=6),
    preprocess,
)

# Number of batches per pass over each split.
print(len(train_loader))
print(len(val_loader))

805
78


In [6]:
# Use the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
dev = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
# Peek at a single training batch to confirm the sequence-first layout
# produced by preprocess, then stop iterating.
for x, y in train_loader:
    print(x.shape, y.shape, sep="\n")
    break

torch.Size([20, 256, 2])
torch.Size([30, 256, 2])


In [8]:
# Feature width of the input and target sequences; matches the trailing
# dimension of the batches printed above.
INPUT_DIM = OUTPUT_DIM = 2
# Encoder and decoder use the same embedding width and dropout rate.
ENC_EMB_DIM = DEC_EMB_DIM = 128
ENC_DROPOUT = DEC_DROPOUT = 0.5
# Hidden size and layer count are shared by both networks.
HID_DIM = 256
N_LAYERS = 4

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# Assemble the full model and move it to the selected device.
model = Seq2Seq(enc, dec, dev).to(dev)

In [9]:
# Display the module tree to check that the layer sizes match the hyperparameters.
model

Seq2Seq(
  (encoder): Encoder(
    (linear): Linear(in_features=2, out_features=128, bias=True)
    (rnn): LSTM(128, 256, num_layers=4, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Linear(in_features=2, out_features=128, bias=True)
    (rnn): LSTM(128, 256, num_layers=4, dropout=0.5)
    (linear): Linear(in_features=256, out_features=2, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [10]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Only parameters whose ``requires_grad`` flag is set contribute; each one
    adds its ``numel()`` to the total.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,949,826 trainable parameters


In [11]:
# Adam over every model parameter, using the optimizer's default hyperparameters.
optimizer = optim.Adam(model.parameters())
# Mean squared error between the predicted and the target sequences.
criterion = nn.MSELoss()

In [12]:
def train(model, dataloader, optimizer, criterion):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    The model is switched to training mode. For every batch, both tensors are
    moved to the notebook-level device ``dev``, gradients are cleared, the
    model is called with the inputs and the targets, and the optimizer takes
    one step on the resulting loss.

    Returns:
        The sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []
    for inputs, targets in dataloader:
        # Move the batch onto the same device as the model.
        inputs = inputs.to(dev)
        targets = targets.to(dev)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # The targets are passed to the model as well as to the criterion.
        batch_loss = criterion(model(inputs, targets), targets)
        batch_loss.backward()
        optimizer.step()

        # .item() extracts the Python number from the single-element loss tensor.
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(dataloader)

In [13]:
def evaluate(model, dataloader, criterion):
    """Compute the mean batch loss over ``dataloader`` without updating the model.

    The model is switched to evaluation mode and every forward pass runs with
    gradient tracking disabled. Batches are moved to the notebook-level device
    ``dev`` and the model is called with ``teacher_forcing_ratio=0``.

    Returns:
        The sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(dev)
            targets = targets.to(dev)

            # Teacher forcing is switched off for evaluation.
            predictions = model(inputs, targets, teacher_forcing_ratio=0)

            batch_losses.append(criterion(predictions, targets).item())
    return sum(batch_losses) / len(dataloader)

In [14]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into minutes and seconds.

    Returns:
        A ``(minutes, seconds)`` tuple of ints, each truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # What is left once the whole minutes are removed, fraction discarded.
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
# Train for N_EPOCHES epochs, keeping on disk only the weights that achieve
# the lowest validation loss seen so far.
N_EPOCHES = 100
best_val_loss = float('inf')

# load previous best model params if exists
model_dir = "saved_models/Seq2Seq"
saved_model_path = model_dir + "/best_seq2seq.pt"
if os.path.isfile(saved_model_path):
    # map_location lets a checkpoint saved on a GPU be restored on whatever
    # device this session is using (including CPU-only machines).
    model.load_state_dict(torch.load(saved_model_path, map_location=dev))
    print("successfully load previous best model parameters")
    # Score the restored weights so that a worse epoch cannot overwrite them.
    best_val_loss = evaluate(model, val_loader, criterion)

for epoch in range(N_EPOCHES):
    start_time = time.time()

    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss = evaluate(model, val_loader, criterion)

    end_time = time.time()

    mins, secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {mins}m {secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {val_loss:.3f}')

    if val_loss < best_val_loss:
        # Record the new best; without this update every epoch would pass the
        # check and the "best" checkpoint would simply be the latest one.
        best_val_loss = val_loss
        os.makedirs(model_dir, exist_ok=True)
        torch.save(model.state_dict(), saved_model_path)
        

successfully load previous best model parameters
Epoch: 01 | Time: 2m 58s
	Train Loss: 36629.582
	 Val. Loss: 127474.822
Epoch: 02 | Time: 2m 59s
	Train Loss: 32897.483
	 Val. Loss: 117809.916
Epoch: 03 | Time: 2m 59s
	Train Loss: 28319.573
	 Val. Loss: 119895.755
Epoch: 04 | Time: 2m 58s
	Train Loss: 24012.915
	 Val. Loss: 122357.672
Epoch: 05 | Time: 2m 58s
	Train Loss: 21408.488
	 Val. Loss: 122434.448
Epoch: 06 | Time: 2m 58s
	Train Loss: 19969.975
	 Val. Loss: 119592.575
Epoch: 07 | Time: 2m 58s
	Train Loss: 18388.910
	 Val. Loss: 128100.033
