[Data Set link](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/)

In [1]:
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
!unzip ./wikitext-103-v1.zip

--2019-08-30 15:46:18--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.134.85
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.134.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 190229076 (181M) [application/zip]
Saving to: ‘wikitext-103-v1.zip’


2019-08-30 15:46:23 (41.5 MB/s) - ‘wikitext-103-v1.zip’ saved [190229076/190229076]

Archive:  ./wikitext-103-v1.zip
   creating: wikitext-103/
  inflating: wikitext-103/wiki.test.tokens  
  inflating: wikitext-103/wiki.valid.tokens  
  inflating: wikitext-103/wiki.train.tokens  


In [2]:
# !wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
# !unzip ./wikitext-2-v1.zip

In [3]:
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [4]:
# Run configuration. The `args` prefix and the "--cuda" / "--temperature"
# wording used further down suggest these were command-line options of a
# script this notebook was adapted from -- TODO confirm.
argsdata = './wikitext-103' # dataset directory; presumably './wikitext-2' when the archive from the commented-out cell is used -- confirm
argsbatch_size = 30  # number of columns the training stream is split into by batchify
argsemsize=200  # embedding size (passed to TransformerModel as ninp)
argsnhead=2  # number of attention heads (nhead)
argsnhid=200  # passed as the third argument of TransformerEncoderLayer (nhid)
argsnlayers=2  # number of stacked encoder layers
argsdropout=0.4  # dropout probability used by the model
argslog_interval=200  # batches between training-loss log lines
argseval_interval=5000  # batches between mid-epoch validation runs (evaluate1)
argsclip=0.25  # maximum gradient norm given to clip_grad_norm_
argsseed=42  # seed for torch.manual_seed
argsbptt=35  # maximum number of time steps per chunk returned by get_batch
argscuda=True  # request the "cuda" device
argslr=5  # initial learning rate; divided by 4 whenever validation loss does not improve
argsepochs=2  # number of passes over the training data
argstemperature = 1.0  # sampling temperature: log-probabilities are divided by it before exp()
argssave='./model.pt'  # where the best model is written during training
argscheckpoint = './model.pt'  # model file read by the generation cell
argsoutf='generated.txt'  # file that receives the generated text
argswords=300  # number of words to generate

In [5]:
import os
import math
import time
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from io import open

class Dictionary(object):
    """Two-way vocabulary mapping between words and integer ids.

    Ids are handed out consecutively, starting at 0, in the order in which
    words are first added.
    """

    def __init__(self):
        self.word2idx = {}  # word -> id
        self.idx2word = []  # id -> word (the list position is the id)

    def add_word(self, word):
        """Register ``word`` if it has not been seen yet and return its id."""
        idx = self.word2idx.get(word)
        if idx is None:
            idx = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = idx
        return idx

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)


class Corpus(object):
    """Tokenized train / validation / test splits sharing one vocabulary.

    Args:
        path: directory containing ``wiki.train.tokens``, ``wiki.valid.tokens``
            and ``wiki.test.tokens``.

    Attributes:
        dictionary: the shared ``Dictionary``; it is filled in the order
            train, valid, test.
        train, valid, test: 1-D ``torch.int64`` tensors of word ids.
    """

    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'wiki.train.tokens'))
        self.valid = self.tokenize(os.path.join(path, 'wiki.valid.tokens'))
        self.test = self.tokenize(os.path.join(path, 'wiki.test.tokens'))

    def tokenize(self, path):
        """Tokenizes a text file.

        Every line is split on whitespace and terminated with an ``<eos>``
        token. Unknown words are added to ``self.dictionary`` as they are
        encountered.

        Args:
            path: path of a UTF-8 text file.

        Returns:
            A 1-D ``torch.int64`` tensor with the ids of all tokens in file
            order; an empty tensor when the file has no lines.
        """
        assert os.path.exists(path), 'corpus file not found: {}'.format(path)

        # Single pass over the file: add_word returns the id of the word
        # (registering it first when new), so the ids are the same as with a
        # separate vocabulary pass, at half the I/O.
        add_word = self.dictionary.add_word
        line_ids = []
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                words = line.split() + ['<eos>']
                line_ids.append(
                    torch.tensor([add_word(word) for word in words], dtype=torch.int64))

        # torch.cat rejects an empty list, so handle an empty file explicitly.
        if not line_ids:
            return torch.empty(0, dtype=torch.int64)
        return torch.cat(line_ids)


In [6]:
# Temporarily leave PositionalEncoding module here. Will be moved somewhere else.
class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens
        in the sequence. The positional encodings have the same dimension as
        the embeddings, so that the two can be summed. Here, we use sine and cosine
        functions of different frequencies.
    .. math::
        \text{PosEncoder}(pos, 2i) = \sin(pos / 10000^{2i / d_{model}})
        \text{PosEncoder}(pos, 2i+1) = \cos(pos / 10000^{2i / d_{model}})
    where pos is the word position and i is the embed idx.
    Args:
        d_model: the embed dim (required). Both even and odd values are supported.
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # angles has ceil(d_model / 2) columns: one per even embedding index.
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # There are only floor(d_model / 2) odd indices, so drop the surplus
        # column when d_model is odd (a no-op for even d_model).
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # [max_len, d_model] -> [max_len, 1, d_model] so it broadcasts over the batch.
        pe = pe.unsqueeze(0).transpose(0, 1)
        # A buffer follows the module across devices and is saved with it,
        # but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """

        # Add the encodings of the first x.size(0) positions, then apply dropout.
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Container module with an embedding encoder, a transformer encoder stack, and a linear decoder.

    Args:
        ntoken: vocabulary size (embedding rows and decoder outputs).
        ninp: embedding dimension.
        nhead: number of attention heads per encoder layer.
        nhid: third argument of ``TransformerEncoderLayer``.
        nlayers: number of stacked encoder layers.
        dropout: dropout probability for the positional encoder and the
            encoder layers (default=0.5).
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except ImportError as err:
            # Only translate a genuine import failure; anything else propagates.
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') from err
        self.model_type = 'Transformer'
        # Cached attention mask; rebuilt lazily in forward().
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)
        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return a float [sz, sz] mask: 0.0 on and below the diagonal, -inf above it."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def init_weights(self):
        """Uniformly initialise embedding and decoder weights in [-0.1, 0.1]; zero the decoder bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, has_mask=True):
        """Compute log-probabilities over the vocabulary for every position.

        Args:
            src: tensor of token ids; its first dimension is the sequence length.
            has_mask: when True, the square subsequent mask (position i may
                only attend to positions <= i) is passed to the encoder; when
                False, no mask is used and the cached mask is cleared.

        Returns:
            ``log_softmax`` of the decoder output over the last dimension.
        """
        if has_mask:
            seq_len = len(src)
            # Rebuild the cached mask when it is missing, has the wrong size,
            # or lives on a different device than the input.
            stale = (self.src_mask is None
                     or self.src_mask.size(0) != seq_len
                     or self.src_mask.device != src.device)
            if stale:
                self.src_mask = self._generate_square_subsequent_mask(seq_len).to(src.device)
        else:
            self.src_mask = None

        # Scale the embeddings by sqrt(ninp) before adding positional encodings.
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return F.log_softmax(output, dim=-1)

In [7]:
%%time
# Set the random seed manually for reproducibility.
torch.manual_seed(argsseed)

# Use the GPU only when it is both requested and actually present, so the
# notebook still runs (slowly) on a CPU-only machine instead of failing on the
# first `.to(device)`.
use_cuda = argscuda and torch.cuda.is_available()
if torch.cuda.is_available() and not argscuda:
    print("WARNING: You have a CUDA device, so you should probably set argscuda = True")
elif argscuda and not use_cuda:
    print("WARNING: argscuda is set but no CUDA device is available; falling back to CPU")

device = torch.device("cuda" if use_cuda else "cpu")
###############################################################################
# Load data
###############################################################################

# Tokenizing the corpus is slow, so the Corpus object is cached as a pickle in
# the working directory.
# NOTE: the cache file name does not depend on argsdata -- delete './corpus'
# when switching datasets. pickle.load executes arbitrary code, so only load a
# cache file you created yourself.
if os.path.exists('./corpus'):
    with open('corpus', 'rb') as data_file:
        corpus = pickle.load(data_file)
else:
    corpus = Corpus(argsdata)
    with open('corpus', 'wb') as data_file:
        pickle.dump(corpus, data_file)

ntokens = len(corpus.dictionary)
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.

def batchify(data, bsz):
    """Reshape a token stream into ``bsz`` parallel columns on ``device``.

    Trailing elements that do not fill a complete row across all columns are
    discarded. The stream is cut into ``bsz`` consecutive pieces and each
    piece becomes one column, giving a contiguous tensor whose first
    dimension is ``data.size(0) // bsz``.
    """
    # Largest multiple of bsz that fits into the stream.
    usable = (data.size(0) // bsz) * bsz
    # One row per piece, then transpose so that every piece is a column.
    pieces = data[:usable].view(bsz, -1)
    return pieces.t().contiguous().to(device)

# Training uses the configured batch size; validation and test share a fixed,
# larger one.
eval_batch_size = 50
train_data = batchify(corpus.train, argsbatch_size)
val_data, test_data = (
    batchify(split, eval_batch_size) for split in (corpus.valid, corpus.test)
)

CPU times: user 1min 52s, sys: 4.6 s, total: 1min 56s
Wall time: 2min 4s


In [8]:
def print_gentext():
    """Sample ``argswords`` words from the global ``model`` and print them.

    Generation starts from one uniformly random vocabulary id. At every step
    the whole sequence generated so far is fed to the model, the next word is
    drawn from the temperature-scaled distribution at the last position, and
    the word is appended to the sequence. A line break is printed after every
    20th word.

    Reads the globals ``model``, ``corpus``, ``ntokens``, ``device``,
    ``argswords`` and ``argstemperature``. The model's train/eval mode is not
    changed here.
    """
    # Start sequence of shape (1, 1): a single random token.
    tokens = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
    with torch.no_grad():  # no tracking history
        for i in range(argswords):
            # NOTE(review): has_mask=False runs the encoder without the
            # subsequent-position mask that training uses -- confirm this is
            # intended.
            output = model(tokens, False)
            # Log-probabilities of the last position -> unnormalised sampling weights.
            word_weights = output[-1].squeeze().div(argstemperature).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            # Keep the sampled id an integer tensor; no detour through a float tensor.
            tokens = torch.cat([tokens, word_idx.reshape(1, 1).to(device)], 0)
            word = corpus.dictionary.idx2word[word_idx.item()]
            print(word + ('\n' if i % 20 == 19 else ' '),end='')

def evaluate1():
    """Validate, print a text sample and checkpoint / anneal; used mid-epoch.

    Evaluates on ``val_data``, prints the validation loss and perplexity
    together with a generated sample, then either saves the model to
    ``argssave`` (new best validation loss) or divides the global ``lr`` by 4.

    Reads the globals ``epoch`` and ``epoch_start_time`` set by the training
    loop and updates the globals ``best_val_loss`` and ``lr``. The model's
    train/eval mode is restored before returning, because ``evaluate`` puts
    the model in eval mode and the caller may be in the middle of training.
    """
    global best_val_loss
    global lr
    was_training = model.training
    val_loss = evaluate(val_data)
    print('-' * 89)
    print('| epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                       val_loss, math.exp(val_loss)))
    print('-' * 89)
    print('Generated Text:')
    print_gentext()
    # Save the model if the validation loss is the best we've seen so far.
    # Compare against None explicitly so a stored loss of 0.0 is not mistaken
    # for "no loss recorded yet".
    if best_val_loss is None or val_loss < best_val_loss:
        with open(argssave, 'wb') as f:
            torch.save(model, f)
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        lr /= 4.0
    # Put the model back into the mode it was in on entry (re-enables dropout
    # when called from within training).
    model.train(was_training)

In [9]:

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)

model = TransformerModel(ntokens, argsemsize, argsnhead, argsnhid, argsnlayers, argsdropout).to(device)

# TransformerModel.forward already returns log-probabilities (log_softmax), so
# the matching criterion is NLLLoss. CrossEntropyLoss would apply log_softmax a
# second time: the value is the same (log_softmax of log-probabilities is the
# identity up to rounding), but it repeats a vocabulary-sized computation on
# every step.
criterion = nn.NLLLoss()
###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Detach hidden state(s) from the graph that produced them.

    A tensor is returned detached; any other iterable is processed
    recursively and returned as a tuple of detached results.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()


# get_batch subdivides the source data into chunks of length argsbptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two tensors for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivision of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension of the model input.

def get_batch(source, i):
    """Return the input chunk starting at row ``i`` and its next-token targets.

    The chunk covers at most ``argsbptt`` rows and is shortened near the end
    of ``source`` so that every input row still has a following row. The
    targets are the same rows shifted down by one, flattened to 1-D.
    """
    stop = i + min(argsbptt, len(source) - 1 - i)
    return source[i:stop], source[i + 1:stop + 1].view(-1)


def evaluate(data_source):
    """Return the average loss of the global ``model`` over ``data_source``.

    ``data_source`` is walked in chunks of at most ``argsbptt`` rows. Each
    chunk's criterion value is weighted by the number of rows in the chunk,
    and the weighted sum is divided by the number of rows that have a target
    (``len(data_source) - 1``). The model is left in evaluation mode.
    """
    # Evaluation mode disables dropout.
    model.eval()
    vocab_size = len(corpus.dictionary)
    last_start = data_source.size(0) - 1

    def weighted_chunk_loss(start):
        inputs, expected = get_batch(data_source, start)
        flat_output = model(inputs).view(-1, vocab_size)
        return len(inputs) * criterion(flat_output, expected).item()

    summed = 0.
    with torch.no_grad():
        for start in range(0, last_start, argsbptt):
            summed += weighted_chunk_loss(start)
    return summed / (len(data_source) - 1)


def train():
    """Train the global ``model`` for one pass over ``train_data``.

    Each chunk of at most ``argsbptt`` rows gets one forward/backward pass,
    gradient-norm clipping at ``argsclip`` and a plain SGD update with the
    global learning rate ``lr``. Every ``argseval_interval`` batches
    ``evaluate1()`` runs a mid-epoch validation, and every ``argslog_interval``
    batches the mean training loss since the last report is printed.

    Reads the globals ``epoch`` and ``lr``; ``lr`` may be lowered by
    ``evaluate1()`` while this function is running.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, argsbptt)):
        data, targets = get_batch(train_data, i)
        # Clear the gradients accumulated by the previous batch.
        model.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # Clip the total gradient norm to keep individual updates bounded.
        torch.nn.utils.clip_grad_norm_(model.parameters(), argsclip)
        # Manual SGD step: p <- p - lr * grad. The keyword form replaces the
        # deprecated add_(Number, Tensor) overload; parameters that received
        # no gradient are skipped.
        for p in model.parameters():
            if p.grad is not None:
                p.data.add_(p.grad.data, alpha=-lr)

        total_loss += loss.item()
        if batch % argseval_interval == 0 and batch > 0:
            evaluate1()
            # Evaluation switches the model to eval mode; switch back so
            # dropout stays active for the rest of the epoch.
            model.train()

        if batch % argslog_interval == 0 and batch > 0:
            cur_loss = total_loss / argslog_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // argsbptt, lr,
                elapsed * 1000 / argslog_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

# Loop over epochs.
lr = argslr
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, argsepochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        print('Generated Text:')
        print_gentext()
        # Save the model if the validation loss is the best we've seen so far.
        # Compare against None explicitly so a stored loss of 0.0 is not
        # mistaken for "no loss recorded yet".
        if best_val_loss is None or val_loss < best_val_loss:
            with open(argssave, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model. best_val_loss is only set when a checkpoint was
# written during this run; if training was interrupted before that, keep the
# in-memory model instead of failing on a missing file or silently loading a
# checkpoint left over from an earlier run.
# NOTE(review): the checkpoint is a pickled module; newer PyTorch releases may
# need an explicit weights_only=False to load it -- confirm for your version.
if best_val_loss is not None:
    with open(argssave, 'rb') as f:
        model = torch.load(f)
else:
    print('No checkpoint was written in this run; evaluating the in-memory model')

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| epoch   1 |   200/98311 batches | lr 5.00 | ms/batch 134.72 | loss  9.00 | ppl  8095.04
| epoch   1 |   400/98311 batches | lr 5.00 | ms/batch 130.10 | loss  7.94 | ppl  2816.82
| epoch   1 |   600/98311 batches | lr 5.00 | ms/batch 130.19 | loss  7.58 | ppl  1955.19
| epoch   1 |   800/98311 batches | lr 5.00 | ms/batch 130.11 | loss  7.31 | ppl  1496.95
| epoch   1 |  1000/98311 batches | lr 5.00 | ms/batch 130.22 | loss  7.16 | ppl  1283.49
| epoch   1 |  1200/98311 batches | lr 5.00 | ms/batch 130.03 | loss  7.07 | ppl  1179.68
| epoch   1 |  1400/98311 batches | lr 5.00 | ms/batch 129.98 | loss  7.01 | ppl  1103.22
| epoch   1 |  1600/98311 batches | lr 5.00 | ms/batch 130.01 | loss  6.87 | ppl   965.58
| epoch   1 |  1800/98311 batches | lr 5.00 | ms/batch 129.97 | loss  6.89 | ppl   978.35
| epoch   1 |  2000/98311 batches | lr 5.00 | ms/batch 130.01 | loss  6.81 | ppl   909.36
| epoch   1 |  2200/98311 batches | lr 5.00 | ms/batch 130.09 | loss  6.74 | ppl   844.66
| epoch   

In [10]:
# Shell out to the NVIDIA tool to report the current GPU status.
!nvidia-smi

Fri Aug 30 23:03:15 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  On   | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    51W / 250W |  13755MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [11]:
# Write the complete model object currently in memory to the checkpoint path.
torch.save(model, argssave)

In [12]:
###############################################################################
# Language Modeling on Wikitext-103
#
# This generates new sentences sampled from the language model
#
###############################################################################

# Set the random seed manually for reproducibility.
torch.manual_seed(argsseed)

# Use the GPU only when it is both requested and actually present.
use_cuda = argscuda and torch.cuda.is_available()
if torch.cuda.is_available() and not argscuda:
    print("WARNING: You have a CUDA device, so you should probably set argscuda = True")
elif argscuda and not use_cuda:
    print("WARNING: argscuda is set but no CUDA device is available; falling back to CPU")

device = torch.device("cuda" if use_cuda else "cpu")

# There is no argument parser in this notebook, so report an invalid
# temperature with an exception (the old `parser.error` call would itself
# fail with a NameError).
if argstemperature < 1e-3:
    raise ValueError("argstemperature has to be greater or equal 1e-3")

# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
with open(argscheckpoint, 'rb') as f:
    model = torch.load(f, map_location=device).to(device)
model.eval()

# Reuse the pickled corpus when present; otherwise tokenize and cache it.
# NOTE: pickle.load executes arbitrary code -- only load a cache file you
# created yourself.
if os.path.exists('./corpus'):
    with open('corpus', 'rb') as data_file:
        corpus = pickle.load(data_file)
else:
    corpus = Corpus(argsdata)
    with open('corpus', 'wb') as data_file:
        pickle.dump(corpus, data_file)

ntokens = len(corpus.dictionary)

# Start sequence of shape (1, 1): a single random token. Named `tokens` so the
# builtin `input` is not shadowed for the rest of the notebook.
tokens = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

# The corpus is read as UTF-8, so write the generated text as UTF-8 too.
with open(argsoutf, 'w', encoding="utf8") as outf:
    with torch.no_grad():  # no tracking history
        for i in range(argswords):

            output = model(tokens, False)
            # Log-probabilities of the last position -> unnormalised sampling weights.
            word_weights = output[-1].squeeze().div(argstemperature).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            # Keep the sampled id an integer tensor; no detour through a float tensor.
            tokens = torch.cat([tokens, word_idx.reshape(1, 1).to(device)], 0)

            word = corpus.dictionary.idx2word[word_idx.item()]

            # A line break after every 20th word, a space otherwise.
            piece = word + ('\n' if i % 20 == 19 else ' ')
            outf.write(piece)
            print(piece,end='')

            #if i % argslog_interval == 0:
            #    print('| Generated {}/{} words'.format(i, argswords))

Leela Schwarzkopf noted " the charming girl " and the bad paternal love " , which she said is capable
of keeping it involved " a little realization featuring a [ classic ' s ] or heartfelt miracle against those
previous friends , conceptual opening decorate season though she is COFS , " praising her persona of storyteller " .
" . In 1855 second " myth and the destroyed " , Dwight and subtle settings ( the Western likes
— they were arranged poem beat Hindsight ' White American " that of a sophisticated Welsh album , deprecating theme
and familiar image of much extraordinary scholar Britney bluesier " according to date in the crescendos " ( Mii girl
, fancy and encounters with the director of animation drama wordplay , penetrating parclose distorted , Ragam , intensely ,
given . Ryan 's popularity and that Rodman ] exciting and not so large numbers around her story arc —
and an attractive , the classic hip , Captive emotional and horror commentator , Nirguna the song @-@ organic vein
wou