In [None]:
import torch
from torch import nn, optim
from tqdm import tqdm
from torch.utils.data import DataLoader
from dataset import SequenceDataset
from model import MultiLayerGRU
from utils import sample

In [None]:
# --- Model hyperparameters (passed to SequenceDataset / MultiLayerGRU in the setup cell) ---
SEQ_LENGTH = 100
HIDDEN_SIZE = 512
NUM_LAYERS = 3
DROPOUT = 0.5

# --- Optimisation settings ---
LR = 0.001
BATCH_SIZE = 128
EPOCHS = 1000

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of failing at the first `.to(DEV)`.
DEV = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Data: sequence dataset built from the sonnet text file, served in shuffled batches.
dataset = SequenceDataset("shakespeare-sonnet.txt", seq_length=SEQ_LENGTH)
loader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)

# Model: the first argument is the vocabulary size taken from the dataset.
# NOTE(review): the remaining positional arguments are presumably
# (hidden size, layer count, dropout) — confirm against MultiLayerGRU's signature.
model = MultiLayerGRU(
    len(dataset.vocab),
    HIDDEN_SIZE,
    NUM_LAYERS,
    DROPOUT,
)
model = model.to(DEV)

# Loss and optimiser (the optimiser is created after the model has been moved to DEV).
crit = nn.CrossEntropyLoss()
opt = optim.Adam(model.parameters(), lr=LR)

In [None]:
# Train for EPOCHS passes over `loader`; every 10th epoch print a 'thou'-seeded
# sample generated by `sample(...)`.
for e in range(1, EPOCHS + 1):
    # The sampling step at the end of an epoch switches the model to eval mode.
    # Re-enable training mode here, otherwise every epoch after the first
    # sample would train in eval mode (dropout disabled; a cuDNN-backed RNN
    # may additionally refuse to run backward in eval mode).
    model.train()

    loop = tqdm(loader, total=len(loader), leave=True, position=0)
    loop.set_description(f"Epoch : [{e}/{EPOCHS}] | ")
    total_loss = 0  # running sum of per-batch losses for this epoch
    total_len = 0   # number of batches processed so far in this epoch
    for x, y in loop:
        opt.zero_grad()
        # Fresh zero hidden state for every batch, sized from the actual batch
        # (x.shape[0]) rather than BATCH_SIZE so a smaller final batch works.
        # Allocated directly on DEV to avoid a CPU allocation plus copy.
        h = torch.zeros((NUM_LAYERS, x.shape[0], HIDDEN_SIZE), device=DEV)
        # Call the module itself (not .forward) so registered hooks are honoured.
        yhat, h = model(x.to(DEV), h)
        # Flatten all leading dimensions so each row is one prediction.
        # NOTE(review): the target keeps its last dimension, so y presumably
        # holds per-class values (e.g. one-hot) rather than class indices —
        # confirm against SequenceDataset.
        loss = crit(yhat.view(-1, yhat.shape[-1]), y.view(-1, y.shape[-1]).to(DEV))
        loss.backward()
        # Clip the global gradient norm to 5 before the optimiser step.
        nn.utils.clip_grad_norm_(model.parameters(), 5)
        opt.step()

        total_loss += loss.item()
        total_len += 1
        loop.set_postfix(average_loss = total_loss / total_len)

    if e % 10 == 0:
        # Generate a sample in eval mode without tracking gradients; training
        # mode is restored by model.train() at the top of the next epoch.
        model.eval()
        with torch.no_grad():
            print(f"\n{'=' * 50}\nSample output: \n{sample(model, dataset, 'thou', HIDDEN_SIZE, 400, DEV, NUM_LAYERS)}\n{'=' * 50}\n")

In [None]:
# Persist only the model's state dict (not the whole module object) to disk.
torch.save(obj=model.state_dict(), f="gru-weights-final.pth")