In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [58]:
# import argparse
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx

from utils import data
from model.RNNModel import RNNModel

In [4]:
device = torch.device("cpu")

In [26]:
corpus = data.Corpus('./data/wikitext')

In [32]:
print(corpus.valid.shape)
print(corpus.valid.size())

torch.Size([217646])
torch.Size([217646])


In [33]:
def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

In [36]:
train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [37]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [39]:
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

In [40]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / (len(data_source) - 1)

In [63]:
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
    start_time = time.time()

In [68]:
batch_size = 64
eval_batch_size = 10
bptt = 30
lr = 0.1
clip = 0.25
log_interval = 10
epochs = 1

In [71]:
ntokens = len(corpus.dictionary)
model = RNNModel(ntokens).to(device)

criterion = nn.CrossEntropyLoss()

best_val_loss = None

In [70]:
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(args.save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

| epoch   1 |    10/ 1087 batches | lr 0.10 | ms/batch 2297.50 | loss 12.48 | ppl 262476.24
| epoch   1 |    20/ 1087 batches | lr 0.10 | ms/batch 4424.05 | loss 11.33 | ppl 83098.57
| epoch   1 |    30/ 1087 batches | lr 0.10 | ms/batch 6633.24 | loss 11.31 | ppl 81952.56
| epoch   1 |    40/ 1087 batches | lr 0.10 | ms/batch 8739.09 | loss 11.30 | ppl 80786.68
| epoch   1 |    50/ 1087 batches | lr 0.10 | ms/batch 11029.83 | loss 11.28 | ppl 79533.09
| epoch   1 |    60/ 1087 batches | lr 0.10 | ms/batch 13139.33 | loss 11.27 | ppl 78558.06
| epoch   1 |    70/ 1087 batches | lr 0.10 | ms/batch 15221.36 | loss 11.26 | ppl 77353.32
| epoch   1 |    80/ 1087 batches | lr 0.10 | ms/batch 17315.48 | loss 11.24 | ppl 76134.95
| epoch   1 |    90/ 1087 batches | lr 0.10 | ms/batch 19420.89 | loss 11.22 | ppl 74956.49
| epoch   1 |   100/ 1087 batches | lr 0.10 | ms/batch 21538.03 | loss 11.21 | ppl 74060.50
| epoch   1 |   110/ 1087 batches | lr 0.10 | ms/batch 23622.86 | loss 11.19 | ppl 

| epoch   1 |   900/ 1087 batches | lr 0.10 | ms/batch 188396.80 | loss  9.13 | ppl  9246.30
| epoch   1 |   910/ 1087 batches | lr 0.10 | ms/batch 190552.93 | loss  9.15 | ppl  9389.24
| epoch   1 |   920/ 1087 batches | lr 0.10 | ms/batch 192676.07 | loss  9.20 | ppl  9930.50
| epoch   1 |   930/ 1087 batches | lr 0.10 | ms/batch 194875.73 | loss  9.16 | ppl  9493.79
| epoch   1 |   940/ 1087 batches | lr 0.10 | ms/batch 197022.25 | loss  9.12 | ppl  9104.87
| epoch   1 |   950/ 1087 batches | lr 0.10 | ms/batch 199170.11 | loss  9.07 | ppl  8712.15
| epoch   1 |   960/ 1087 batches | lr 0.10 | ms/batch 201270.12 | loss  9.09 | ppl  8900.62
| epoch   1 |   970/ 1087 batches | lr 0.10 | ms/batch 203544.54 | loss  9.13 | ppl  9237.01
| epoch   1 |   980/ 1087 batches | lr 0.10 | ms/batch 205710.41 | loss  9.15 | ppl  9458.77
| epoch   1 |   990/ 1087 batches | lr 0.10 | ms/batch 207829.86 | loss  9.08 | ppl  8794.52
| epoch   1 |  1000/ 1087 batches | lr 0.10 | ms/batch 210005.32 | los

NameError: name 'best_val_loss' is not defined

In [41]:
for batch, i in enumerate(range(0, 100 - 1, bptt)):
    print(batch,i)

(0, 0)
(1, 30)
(2, 60)
(3, 90)
