In [1]:
import time
import math
import torch
import torch.nn as nn
from model import WD_LSTM
from data import Corpus
from randomize_bptt import get_bptt_sequence_lengths
from helpers import Config, repackage_hidden, batchify, get_batch

In [2]:
SEED = 42
DATA = '/floyd/input/ptb/'
CUDA = True
LOG_INTERVAL = 50
LR_ANNEALING_RATE = 0.25
CONFIG_NAME = 'language_model_base'
device = torch.device("cuda" if CUDA else "cpu")
args = Config(CONFIG_NAME)

In [3]:
corpus = Corpus(DATA)

In [4]:
ntokens = len(corpus.dictionary)
train_data = batchify(corpus.train, args.batch_size, device)
valid_data = batchify(corpus.valid, args.batch_size, device)
test_data = batchify(corpus.test, args.batch_size, device)

In [5]:
model = WD_LSTM(
    ntokens, 
    args.emsize,
    args.nhid, 
    args.nlayers, 
    args.dropout, 
    weight_drop=args.weight_drop, 
    weight_tying=args.weight_tying
).to(device)
model

WD_LSTM(
  (drop): Dropout(p=0.2)
  (encoder): Embedding(10000, 400)
  (rnns): ModuleList(
    (0): WeightDrop(
      (module): LSTM(400, 800)
    )
    (1): WeightDrop(
      (module): LSTM(800, 800)
    )
    (2): WeightDrop(
      (module): LSTM(800, 400)
    )
  )
  (decoder): Linear(in_features=800, out_features=10000, bias=True)
)

In [6]:
lr = args.lr
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=args.weight_decay)
criterion = nn.CrossEntropyLoss()

In [7]:
def evaluate(data_source):
    model.eval()  # disable dropout
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt_seq_len):
            data, targets = get_batch(data_source, i, args.bptt_seq_len)
            output, hidden = model(data, hidden)
            hidden = repackage_hidden(hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [8]:
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, (i, seq_len, lr_scale) in enumerate(get_bptt_sequence_lengths(
        train_data.size(0), 
        args.bptt_seq_len, 
        args.bptt_random_scaling, 
        args.bptt_p, 
        args.bptt_s, 
        args.bptt_min_len, 
        args.bptt_max_len
    )):
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr * lr_scale
        data, targets = get_batch(train_data, i, seq_len)
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        total_loss += loss.item()
        if batch % LOG_INTERVAL == 0 and batch > 0:
            cur_loss = total_loss / LOG_INTERVAL
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:3.2E} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt_seq_len, lr,
                elapsed * 1000 / LOG_INTERVAL, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [9]:
best_val_loss = 1e20
try:
    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(valid_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss)))
        print('-' * 89)
        if val_loss < best_val_loss:
            with open(args.save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            lr *= LR_ANNEALING_RATE
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

  result = self.forward(*input, **kwargs)


| epoch   1 |    50/  663 batches | lr 2.00E-03 | ms/batch 172.08 | loss  7.05 | ppl  1149.87
| epoch   1 |   100/  663 batches | lr 2.00E-03 | ms/batch 171.04 | loss  6.64 | ppl   762.05
| epoch   1 |   150/  663 batches | lr 2.00E-03 | ms/batch 164.13 | loss  6.62 | ppl   748.85
| epoch   1 |   200/  663 batches | lr 2.00E-03 | ms/batch 170.39 | loss  6.36 | ppl   577.08
| epoch   1 |   250/  663 batches | lr 2.00E-03 | ms/batch 165.59 | loss  6.25 | ppl   520.09
| epoch   1 |   300/  663 batches | lr 2.00E-03 | ms/batch 164.86 | loss  6.17 | ppl   480.10
| epoch   1 |   350/  663 batches | lr 2.00E-03 | ms/batch 165.73 | loss  6.12 | ppl   454.14
| epoch   1 |   400/  663 batches | lr 2.00E-03 | ms/batch 168.38 | loss  5.95 | ppl   383.49
| epoch   1 |   450/  663 batches | lr 2.00E-03 | ms/batch 166.22 | loss  5.92 | ppl   374.02
| epoch   1 |   500/  663 batches | lr 2.00E-03 | ms/batch 169.25 | loss  5.93 | ppl   376.42
| epoch   1 |   550/  663 batches | lr 2.00E-03 | ms/batch 1

In [10]:
# Load the best saved model.
with open(args.save, 'rb') as f:
    model = torch.load(f, map_location=device)

In [11]:
# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(test_loss, math.exp(test_loss)))
print('=' * 89)

| End of training | test loss  4.49 | test ppl    88.94
