In [None]:
import time
import math
import torch
import torch.nn as nn
from model import WD_LSTM
from data import Corpus
from helpers import Config, repackage_hidden, batchify, get_batch

In [None]:
# Run configuration: the values a reader is most likely to tune.
SEED = 42                            # seed applied to torch's random number generators below
DATA = '/floyd/input/ptb/'           # path handed to Corpus
CUDA = True                          # request the GPU (used only if one is actually available)
LOG_INTERVAL = 200                   # number of batches between training progress lines
LR_ANNEALING_RATE = 0.25             # factor applied to lr when validation loss does not improve
CONFIG_NAME = 'language_model_base'  # name passed to Config

# SEED was previously defined but never applied; seed torch so that weight
# initialisation and dropout masks are repeatable across Restart & Run All.
torch.manual_seed(SEED)

# Select CUDA only when it is really present, so the notebook still runs on a
# CPU-only machine instead of failing when tensors are first moved to the device.
device = torch.device("cuda" if CUDA and torch.cuda.is_available() else "cpu")
args = Config(CONFIG_NAME)

In [None]:
# Build the corpus from the DATA path. Later cells read corpus.dictionary,
# corpus.train, corpus.valid and corpus.test from this object.
corpus = Corpus(DATA)

In [None]:
# Number of entries in the corpus dictionary; passed to the model constructor.
ntokens = len(corpus.dictionary)

# Every split goes through batchify with the same batch size and device.
train_data, valid_data, test_data = (
    batchify(split, args.batch_size, device)
    for split in (corpus.train, corpus.valid, corpus.test)
)

In [None]:
# Instantiate the network from the loaded configuration, then move it to the
# selected device as a separate step.
model = WD_LSTM(
    ntokens, args.emsize, args.nhid, args.nlayers, args.dropout,
    weight_drop=args.weight_drop,
    weight_tying=args.weight_tying,
)
model = model.to(device)

In [None]:
# Loss function and optimiser. `lr` stays a module-level name because the
# training cells read and update it.
criterion = nn.CrossEntropyLoss()
lr = args.lr
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

In [None]:
def evaluate(data_source):
    """Return the average loss of the global ``model`` over ``data_source``.

    The data is walked in steps of ``args.bptt``. Each chunk's loss from
    ``criterion`` is weighted by ``len(data)`` for that chunk, and the weighted
    sum is divided by ``len(data_source) - 1``.

    Relies on the notebook globals ``model``, ``corpus``, ``args`` and
    ``criterion``. Leaves ``model`` in evaluation mode.

    Args:
        data_source: a batchified split, as produced by ``batchify``
            (presumably laid out as (sequence, batch) -- confirm in helpers).

    Returns:
        The length-weighted mean loss as a Python float.
    """
    model.eval()  # disable dropout
    vocab_size = len(corpus.dictionary)
    state = model.init_hidden(args.batch_size)
    loss_sum = 0.
    with torch.no_grad():
        for start in range(0, data_source.size(0) - 1, args.bptt):
            inputs, targets = get_batch(data_source, start, args)
            logits, state = model(inputs, state)
            # Carry the hidden state over to the next chunk via the project helper.
            state = repackage_hidden(state)
            chunk_loss = criterion(logits.view(-1, vocab_size), targets).item()
            loss_sum += len(inputs) * chunk_loss
    return loss_sum / (len(data_source) - 1)

In [None]:
def train():
    """Run one pass over ``train_data``, updating the global ``model``.

    For every ``args.bptt``-sized chunk this runs a forward pass, backpropagates
    the ``criterion`` loss, clips the gradient norm to ``args.clip`` and takes
    one ``optimizer`` step. A progress line is printed every ``LOG_INTERVAL``
    batches.

    Relies on the notebook globals ``model``, ``optimizer``, ``criterion``,
    ``corpus``, ``args``, ``train_data``, ``LOG_INTERVAL`` and ``epoch``.
    ``epoch`` is set by the training loop, so this function must be called from
    inside that loop (or after ``epoch`` has been defined).

    Returns:
        None. The model parameters are modified in place.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    # Count the batches actually accumulated since the last progress line. The
    # first window covers batches 0..LOG_INTERVAL (one more than LOG_INTERVAL),
    # so dividing by LOG_INTERVAL would overstate the first reported loss.
    batches_since_log = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i, args)
        # Presumably detaches the hidden state from the previous batch's graph
        # so backprop stops at the chunk boundary -- confirm in helpers.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        total_loss += loss.item()
        batches_since_log += 1
        if batch % LOG_INTERVAL == 0 and batch > 0:
            cur_loss = total_loss / batches_since_log
            elapsed = time.time() - start_time
            # Report the rate the optimizer is really using; the module-level
            # `lr` float can drift away from it.
            current_lr = optimizer.param_groups[0]['lr']
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:3.2E} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, current_lr,
                elapsed * 1000 / batches_since_log, cur_loss, math.exp(cur_loss)))
            total_loss = 0.
            batches_since_log = 0
            start_time = time.time()

In [None]:
# Train for args.epochs epochs. After each epoch the model is scored on the
# validation split; it is checkpointed to args.save when the validation loss
# improves, otherwise the learning rate is annealed by LR_ANNEALING_RATE.
# Interrupting the kernel (KeyboardInterrupt) stops training early.
best_val_loss = float('inf')
try:
    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(valid_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss)))
        print('-' * 89)
        if val_loss < best_val_loss:
            # Save the whole model object; the load cell reads it back with torch.load.
            with open(args.save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            lr *= LR_ANNEALING_RATE
            # Changing the Python float alone has no effect: the optimizer keeps
            # its own copy of the rate in each param group, so write it there too.
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

In [None]:
# Restore the checkpoint stored at args.save, which the training cell rewrites
# each time the validation loss improves, and map it onto the active device.
with open(args.save, 'rb') as checkpoint_file:
    model = torch.load(checkpoint_file, map_location=device)

In [None]:
# Score the current model on the test split and print loss and perplexity
# between two ruler lines.
test_loss = evaluate(test_data)
print(
    '=' * 89,
    '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(test_loss, math.exp(test_loss)),
    '=' * 89,
    sep='\n',
)