[Data Set link](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/)

In [1]:
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
!unzip ./wikitext-103-v1.zip

--2019-08-21 05:48:13--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.139.125
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.139.125|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 190229076 (181M) [application/zip]
Saving to: ‘wikitext-103-v1.zip’


2019-08-21 05:48:18 (42.2 MB/s) - ‘wikitext-103-v1.zip’ saved [190229076/190229076]

Archive:  ./wikitext-103-v1.zip
   creating: wikitext-103/
  inflating: wikitext-103/wiki.test.tokens  
  inflating: wikitext-103/wiki.valid.tokens  
  inflating: wikitext-103/wiki.train.tokens  


In [2]:
# !wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
# !unzip ./wikitext-2-v1.zip

In [3]:
import sys

# Ignore every warning for the rest of the session, unless warning options were
# supplied to the interpreter (in which case sys.warnoptions is non-empty and
# the user's own choice is left untouched).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [4]:
import os
import math
import time
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from io import open

class Dictionary(object):
    """Two-way vocabulary mapping words to integer ids and back.

    Ids are assigned consecutively, starting at 0, in the order words are
    first added. ``word2idx`` maps word -> id, ``idx2word`` maps id -> word.
    """

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        """Return the id of ``word``, registering it first if it is new."""
        try:
            return self.word2idx[word]
        except KeyError:
            # Unseen word: its id is the next free position in idx2word.
            new_id = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = new_id
            return new_id

    def __len__(self):
        """Return the number of distinct words registered so far."""
        return len(self.idx2word)


class Corpus(object):
    """Train/valid/test token-id tensors that share one vocabulary.

    ``path`` is a directory containing ``wiki.train.tokens``,
    ``wiki.valid.tokens`` and ``wiki.test.tokens``. The files are tokenized in
    that order, so ids are assigned in first-seen order across all three.
    """

    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'wiki.train.tokens'))
        self.valid = self.tokenize(os.path.join(path, 'wiki.valid.tokens'))
        self.test = self.tokenize(os.path.join(path, 'wiki.test.tokens'))

    def tokenize(self, path):
        """Tokenizes a text file.

        Every line is split on whitespace and terminated with an ``'<eos>'``
        token. Words are added to ``self.dictionary`` as they are encountered.

        Args:
            path: path of the UTF-8 text file to read.

        Returns:
            A 1-D ``torch.int64`` tensor with the ids of all tokens in file
            order; an empty int64 tensor when the file has no lines.
        """
        assert os.path.exists(path), path

        # Single pass: add_word returns the id of the (possibly new) word, so
        # the file does not have to be read once to build the vocabulary and a
        # second time to look the ids up. The resulting ids are the same.
        per_line_ids = []
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                words = line.split() + ['<eos>']
                per_line_ids.append(torch.tensor(
                    [self.dictionary.add_word(word) for word in words],
                    dtype=torch.int64))

        # torch.cat rejects an empty list, which is what an empty file yields.
        if not per_line_ids:
            return torch.empty(0, dtype=torch.int64)
        return torch.cat(per_line_ids)


In [5]:
class RNNModel(nn.Module):
    """Language model made of an embedding, a recurrent stack and a linear decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super().__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)

        # Gated variants are looked up on `nn` by name; the plain RNN variants
        # are an nn.RNN with the matching nonlinearity.
        if rnn_type == 'LSTM' or rnn_type == 'GRU':
            recurrent_cls = getattr(nn, rnn_type)
            self.rnn = recurrent_cls(ninp, nhid, nlayers, dropout=dropout)
        else:
            plain_rnn_activations = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}
            try:
                nonlinearity = plain_rnn_activations[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Weight tying (optional): the decoder reuses the embedding matrix, see
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        """Draw encoder/decoder weights from U(-0.1, 0.1) and zero the decoder bias."""
        bound = 0.1
        self.encoder.weight.data.uniform_(-bound, bound)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-bound, bound)

    def forward(self, input, hidden):
        """Run one chunk through the model.

        Returns a tuple ``(decoded, hidden)`` where ``decoded`` keeps the first
        two dimensions of the recurrent output and has the decoder's output
        size as its last dimension.
        """
        embedded = self.drop(self.encoder(input))
        rnn_out, hidden = self.rnn(embedded, hidden)
        rnn_out = self.drop(rnn_out)
        dim0, dim1, feat = rnn_out.size(0), rnn_out.size(1), rnn_out.size(2)
        # Flatten the first two dimensions so the linear layer sees a 2-D input,
        # then restore them on the way out.
        flat_decoded = self.decoder(rnn_out.view(dim0 * dim1, feat))
        return flat_decoded.view(dim0, dim1, flat_decoded.size(1)), hidden

    def init_hidden(self, bsz):
        """Return an all-zero initial state of shape (nlayers, bsz, nhid).

        An LSTM gets a pair of such tensors, every other type a single tensor.
        The tensors are created via ``new_zeros`` on the model's first
        parameter, so they inherit its dtype and device.
        """
        reference = next(self.parameters())
        state_shape = (self.nlayers, bsz, self.nhid)
        if self.rnn_type != 'LSTM':
            return reference.new_zeros(state_shape)
        return (reference.new_zeros(state_shape),
                reference.new_zeros(state_shape))

In [6]:
# --- Data -------------------------------------------------------------------
argsdata = './wikitext-103/'    # directory with the wiki.{train,valid,test}.tokens files (or './wikitext-103')
argsbatch_size = 30             # batch size used when batchifying the training split
argsbptt = 35                   # upper bound on the sequence length returned by get_batch
argsseed = 42                   # passed to torch.manual_seed

# --- Model ------------------------------------------------------------------
argsmodel = 'LSTM'              # (RNN_TANH, RNN_RELU, LSTM, GRU)
argsemsize = 200                # embedding size (RNNModel's ninp)
argsnhid = 200                  # hidden size (RNNModel's nhid)
argsnlayers = 2                 # number of recurrent layers
argsdropout = 0.5               # dropout probability handed to RNNModel
argstied = True                 # tie decoder weights to the embedding
argsnhead = 2                   # not referenced by the cells visible here

# --- Optimisation -----------------------------------------------------------
argslr = 10                     # initial learning rate
argsclip = 0.5                  # max gradient norm passed to clip_grad_norm_
argsepochs = 3                  # number of passes over the training data
argscuda = True                 # request the CUDA device

# --- Logging / checkpoints --------------------------------------------------
argslog_interval = 200          # batches between training-loss log lines
argseval_interval = 2000        # batches between mid-epoch validation runs
argssave = './model.pt'         # where the best model is written during training
argscheckpoint = './model.pt'   # model file read by the generation cell

# --- Text generation --------------------------------------------------------
argstemperature = 1.0           # divisor applied to the logits before sampling
argswords = 100                 # number of words to sample
argsoutf = 'generated.txt'      # file the generated text is written to

In [7]:
%%time
# Set the random seed manually for reproducibility.
torch.manual_seed(argsseed)

# Use the GPU only when it was requested AND is actually present. Selecting
# "cuda" on a machine without CUDA would otherwise fail later, on the first
# `.to(device)` call.
cuda_available = torch.cuda.is_available()
if cuda_available and not argscuda:
    print("WARNING: You have a CUDA device, so you should probably run with --cuda")
if argscuda and not cuda_available:
    print("WARNING: CUDA was requested but is not available, falling back to CPU")

device = torch.device("cuda" if argscuda and cuda_available else "cpu")
###############################################################################
# Load data
###############################################################################

# Tokenizing the corpus is slow, so the result is cached in a local pickle file.
# NOTE: pickle.load can execute arbitrary code; only load a './corpus' file that
# this notebook wrote itself.
if(os.path.exists('./corpus')):
    with open('corpus', 'rb') as data_file:
        corpus = pickle.load(data_file)
else:
    corpus = Corpus(argsdata)
    with open('corpus', 'wb') as data_file:
        pickle.dump(corpus, data_file)

ntokens = len(corpus.dictionary)
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.

def batchify(data, bsz):
    """Arrange a flat token tensor into ``bsz`` columns and move it to ``device``.

    The tensor is cut into ``bsz`` equally long consecutive pieces (any
    remainder at the end is dropped) and each piece becomes one column of the
    result, so dimension 0 runs along the sequence and dimension 1 over the
    columns.
    """
    # Length of each column; tokens beyond rows_per_column * bsz do not fit.
    rows_per_column = data.size(0) // bsz
    usable = data.narrow(0, 0, rows_per_column * bsz)
    # (bsz, rows) -> transpose -> (rows, bsz), stored contiguously.
    columns = usable.view(bsz, -1).t().contiguous()
    return columns.to(device)

# The validation and test splits share one batch size; training uses its own.
eval_batch_size = 100
train_data = batchify(corpus.train, argsbatch_size)
val_data, test_data = (
    batchify(split, eval_batch_size) for split in (corpus.valid, corpus.test)
)

CPU times: user 1min 49s, sys: 3.64 s, total: 1min 53s
Wall time: 2min


In [8]:
def print_gentext():
    """Generate some example text from the model and print it.

    Starts from one random token and samples ``argswords`` words, dividing the
    model output by ``argstemperature`` before exponentiating it into sampling
    weights. A line break is printed after every 20th word.

    The model is put into evaluation mode while sampling (so dropout is off)
    and its previous training/eval mode is restored afterwards. Without this,
    calls made during training would sample with dropout active.
    """
    was_training = model.training
    model.eval()
    try:
        input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
        hidden = model.init_hidden(1)
        with torch.no_grad():  # no tracking history
            for i in range(argswords):
                output, hidden = model(input, hidden)
                word_weights = output.squeeze().div(argstemperature).exp().cpu()
                word_idx = torch.multinomial(word_weights, 1)[0]
                # Feed the sampled word back in as the next input.
                input.fill_(word_idx)
                word = corpus.dictionary.idx2word[word_idx.item()]
                print(word + ('\n' if i % 20 == 19 else ' '),end='')
    finally:
        model.train(was_training)
            
def evaluate1():
    """Run a validation pass, report it, and checkpoint or anneal.

    Prints the validation loss/perplexity and a sample of generated text. If
    the loss is the best seen so far, the model is saved to ``argssave`` and
    the global ``best_val_loss`` is updated; otherwise the global ``lr`` is
    divided by 4. Reads the globals ``epoch`` and ``epoch_start_time``.
    """
    global best_val_loss
    global lr
    val_loss = evaluate(val_data)
    print('-' * 89)
    print('| epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                       val_loss, math.exp(val_loss)))
    print('-' * 89)
    print('Generated Text:')
    print_gentext()
    # Save the model if the validation loss is the best we've seen so far.
    # `is None` (rather than a truthiness test) so that a best loss of exactly
    # 0.0 is not mistaken for "no result yet".
    if best_val_loss is None or val_loss < best_val_loss:
        with open(argssave, 'wb') as f:
            torch.save(model, f)
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        lr /= 4.0

In [9]:

###############################################################################
# Build the model
###############################################################################

# Vocabulary size fixes both the embedding input and the decoder output width.
ntokens = len(corpus.dictionary)

model = RNNModel(
    rnn_type=argsmodel,
    ntoken=ntokens,
    ninp=argsemsize,
    nhid=argsnhid,
    nlayers=argsnlayers,
    dropout=argsdropout,
    tie_weights=argstied,
).to(device)

criterion = nn.CrossEntropyLoss()
###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Return a copy of the hidden state that is cut off from its autograd history.

    A tensor is detached directly; anything else is treated as an iterable of
    hidden states and converted, recursively, into a tuple of detached states.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()


# get_batch subdivides the source data into chunks of length argsbptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivision of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.

def get_batch(source, i):
    """Return the input chunk starting at row ``i`` and its flattened targets.

    The chunk has at most ``argsbptt`` rows and is shortened near the end of
    ``source`` so that the targets (the same rows shifted down by one) still
    fit. The targets are returned as a 1-D tensor.
    """
    rows_left = len(source) - 1 - i
    seq_len = argsbptt if argsbptt <= rows_left else rows_left
    stop = i + seq_len
    data = source[i:stop]
    target = source[i + 1:stop + 1].view(-1)
    return data, target


def evaluate(data_source):
    """Return the average per-row loss of ``model`` over ``data_source``.

    Args:
        data_source: a batchified tensor (rows along dimension 0, batch
            columns along dimension 1), as produced by ``batchify``.

    Returns:
        The sum of each chunk's criterion value weighted by the chunk length,
        divided by ``len(data_source) - 1``.

    The model is evaluated with dropout disabled and gradient tracking off.
    Its previous training/eval mode is restored afterwards, rather than being
    forced to training mode.
    """
    was_training = model.training
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    # Size the hidden state from the data itself, so any batchified tensor can
    # be evaluated and not only one built with the global eval_batch_size.
    hidden = model.init_hidden(data_source.size(1))
    try:
        with torch.no_grad():
            for i in range(0, data_source.size(0) - 1, argsbptt):
                data, targets = get_batch(data_source, i)
                output, hidden = model(data, hidden)
                hidden = repackage_hidden(hidden)
                output_flat = output.view(-1, ntokens)
                # The criterion averages over the chunk; weight by chunk length
                # so a shorter final chunk is not over-represented.
                total_loss += len(data) * criterion(output_flat, targets).item()
    finally:
        model.train(was_training)
    return total_loss / (len(data_source) - 1)


def train():
    """Train ``model`` for one pass over ``train_data`` with plain SGD.

    For every chunk: detach the carried hidden state, run forward/backward,
    clip the gradient norm to ``argsclip`` and update each parameter by
    ``-lr * grad``. Every ``argseval_interval`` batches ``evaluate1()`` is
    called, and every ``argslog_interval`` batches the running mean loss is
    printed. Reads the globals ``lr`` and ``epoch``.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(argsbatch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, argsbptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), argsclip)
        for p in model.parameters():
            # A parameter that took no part in the graph has no gradient.
            if p.grad is None:
                continue
            # Keyword form of add_; the positional (scalar, tensor) overload is deprecated.
            p.data.add_(p.grad.data, alpha=-lr)

        total_loss += loss.item()
        if batch% argseval_interval == 0 and batch > 0:
            evaluate1()
            
        if batch % argslog_interval == 0 and batch > 0:
            cur_loss = total_loss / argslog_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // argsbptt, lr,
                elapsed * 1000 / argslog_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

# Loop over epochs.
lr = argslr
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, argsepochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        print('Generated Text:')
        print_gentext()
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # `is None` (rather than a truthiness test) so that a best loss of
        # exactly 0.0 is not mistaken for "no result yet".
        if best_val_loss is None or val_loss < best_val_loss:
            with open(argssave, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model. If training was interrupted before the first
# checkpoint was written there is nothing to load, so keep the in-memory model.
if os.path.exists(argssave):
    with open(argssave, 'rb') as f:
        model = torch.load(f)
        model.rnn.flatten_parameters()
else:
    print('No checkpoint found at {}, evaluating the in-memory model'.format(argssave))

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| epoch   1 |   200/98311 batches | lr 10.00 | ms/batch 102.41 | loss  9.12 | ppl  9160.28
| epoch   1 |   400/98311 batches | lr 10.00 | ms/batch 101.53 | loss  8.03 | ppl  3086.11
| epoch   1 |   600/98311 batches | lr 10.00 | ms/batch 101.24 | loss  7.61 | ppl  2014.13
| epoch   1 |   800/98311 batches | lr 10.00 | ms/batch 101.21 | loss  7.30 | ppl  1486.48
| epoch   1 |  1000/98311 batches | lr 10.00 | ms/batch 101.22 | loss  7.11 | ppl  1222.51
| epoch   1 |  1200/98311 batches | lr 10.00 | ms/batch 101.35 | loss  7.00 | ppl  1098.00
| epoch   1 |  1400/98311 batches | lr 10.00 | ms/batch 101.15 | loss  6.93 | ppl  1017.79
| epoch   1 |  1600/98311 batches | lr 10.00 | ms/batch 101.15 | loss  6.79 | ppl   886.20
| epoch   1 |  1800/98311 batches | lr 10.00 | ms/batch 101.24 | loss  6.78 | ppl   882.59
-----------------------------------------------------------------------------------------
| epoch   1 | time: 208.57s | valid loss  6.52 | valid ppl   677.47
-----------------------

In [10]:
# Write the current `model` object to argssave (torch.save opens the path itself).
torch.save(model, argssave)

In [11]:
###############################################################################
# Language Modeling on Wikitext-103
#
# This generates new sentences sampled from the language model
#
###############################################################################

argswords = 1000
# Set the random seed manually for reproducibility.
torch.manual_seed(argsseed)

# Use the GPU only when it was requested AND is actually present.
cuda_available = torch.cuda.is_available()
if cuda_available and not argscuda:
    print("WARNING: You have a CUDA device, so you should probably run with --cuda")
if argscuda and not cuda_available:
    print("WARNING: CUDA was requested but is not available, falling back to CPU")

device = torch.device("cuda" if argscuda and cuda_available else "cpu")

# There is no argument parser in this notebook, so report a bad setting by
# raising directly instead of calling the undefined `parser.error`.
if argstemperature < 1e-3:
    raise ValueError("--temperature has to be greater or equal 1e-3")

# map_location lets a checkpoint saved on a GPU be loaded on the chosen device.
with open(argscheckpoint, 'rb') as f:
    model = torch.load(f, map_location=device).to(device)
model.eval()

# NOTE: pickle.load can execute arbitrary code; only load a './corpus' file that
# this notebook wrote itself.
if(os.path.exists('./corpus')):
    with open('corpus', 'rb') as data_file:
        corpus = pickle.load(data_file)
else:
    corpus = Corpus(argsdata)
    with open('corpus', 'wb') as data_file:
        pickle.dump(corpus, data_file)

ntokens = len(corpus.dictionary)
input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
hidden = model.init_hidden(1)

# The corpus is read as UTF-8 and contains non-ASCII tokens, so write the
# output with the same encoding instead of the platform default.
with open(argsoutf, 'w', encoding="utf8") as outf:
    with torch.no_grad():  # no tracking history
        for i in range(argswords):
            output, hidden = model(input, hidden)
            word_weights = output.squeeze().div(argstemperature).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            # Feed the sampled word back in as the next input.
            input.fill_(word_idx)
            word = corpus.dictionary.idx2word[word_idx.item()]
            outf.write(word + ('\n' if i % 20 == 19 else ' '))
            print(word + ('\n' if i % 20 == 19 else ' '),end='')
            
            #if i % argslog_interval == 0:
            #    print('| Generated {}/{} words'.format(i, argswords))

College on layered battery , mainly based on their gameplay and stability . In the season , he was difficult
for her position in command of the BBC Echo , and said the East Side 's memo included her trial
. The first feat of ornamenting had joined his two colliculus . The publisher found he was willing to cope
with to say he was not executed by the field 's script 's ending into the role , but meant
he had been interpreted in Russell CBT and he was a whole . The Australian finalists were also engaged as
an whole amateur name . Having another documented reformer as Tyler , Barry worked to Junchen and of マーヴル .
<eos> On an six @-@ year @-@ old demo season festival , Kyle Wevertown about the Mizukage , Petersen (
née <unk> ) , was material to discuss her vocal help , with the men back in the forbidding aforementioned
@-@ time friendship , the actor Bob <unk> culminates over the Christmas therapy , Lary , and he ran home
with the Dale ERCC1 , " king . " The early spirit of their one book C