In [1]:
import os, time, math, argparse
import torch, torchtext

In [2]:
# test CUDA: create a small random tensor on the CPU and move it to the GPU.
# The model and the iterators below are placed on 'cuda', so this cell is a
# cheap early check that a CUDA device is actually usable.
torch.randn(4).cuda()

tensor([ 0.0702,  0.5333, -1.9662, -2.0433], device='cuda:0')

In [3]:
# Hyperparameters shared by the data iterators, the model and the training loop.
batch_size = 20   # batch size given to BPTTIterator and to init_hidden()
bptt_len = 30   # length of sequences for backpropagation through time
clip = 0.25   # max gradient norm passed to clip_grad_norm_ in train()
lr = 20   # step size of the manual SGD update in train(); divided by 4 when validation loss does not improve
log_interval = 100   # train() prints a progress line every `log_interval` batches
embedding_dim = 200   # embedding width; RNNModel also uses it as the LSTM hidden size
num_layers = 2   # number of stacked LSTM layers
dropout = 0.2   # dropout probability for both Dropout layers and the LSTM

# Data

In [4]:
# Lower-case every token. `batch_first` is deliberately left at its default
# (False): the model, `init_hidden` and the `[:, 0]` column indexing below all
# assume batches shaped (sequence_length, batch_size). torchtext releases whose
# BPTTIterator honours `batch_first=True` would yield (batch_size, sequence_length)
# instead and break those cells.
TEXT = torchtext.data.Field(lower = True)

In [5]:
# make splits for data: load WikiText-2 under ./data, tokenised through TEXT.
# Each split is indexed as split[0].text below to reach its token list.
# NOTE(review): the name `train` is rebound to the training function further
# down (`def train(...)`), so cells that use the `train` dataset must run
# before that definition.
train, valid, test = torchtext.datasets.WikiText2.splits(TEXT, root = 'data')

In [6]:
# Peek at the training split: total number of tokens, the first ten tokens as
# a list, and the first hundred tokens joined into readable text.
print(
    len(train[0].text),
    train[0].text[:10],
    " ".join(train[0].text[:100]),
    sep = "\n",
)

2088628
['<eos>', '=', 'valkyria', 'chronicles', 'iii', '=', '<eos>', '<eos>', 'senjō', 'no']
<eos> = valkyria chronicles iii = <eos> <eos> senjō no valkyria 3 : <unk> chronicles ( japanese : 戦場のヴァルキュリア3 , lit . valkyria of the battlefield 3 ) , commonly referred to as valkyria chronicles iii outside japan , is a tactical role @-@ playing video game developed by sega and media.vision for the playstation portable . released in january 2011 in japan , it is the third game in the valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the


In [7]:
# Trim the trailing words that would not fill a complete group of `batch_size`,
# printing the token counts of all three splits before and after.
print(f"train: {len(train[0].text)}, validation: {len(valid[0].text)}, test: {len(test[0].text)}")
for _split in (train, valid, test):
    # Largest multiple of batch_size that fits into this split.
    _keep = len(_split[0].text) // batch_size * batch_size
    _split[0].text = _split[0].text[:_keep]
print(f"train: {len(train[0].text)}, validation: {len(valid[0].text)}, test: {len(test[0].text)}")

train: 2088628, validation: 217646, test: 245569
train: 2088620, validation: 217640, test: 245560


In [8]:
# Build the vocabulary from the training split only, then check that it holds
# exactly the distinct training tokens plus the special "<pad>" entry.
TEXT.build_vocab(train)
print('len(TEXT.vocab):', len(TEXT.vocab))
print(
    'The vocabulary contains all unique words and "<pad>":',
    set(TEXT.vocab.itos) == (set(train[0].text) | {'<pad>'}),
)

len(TEXT.vocab): 28913
The vocabulary contains all unique words and "<pad>": True


In [9]:
# One BPTT iterator per split, all sharing the same batch geometry and device.
train_iter, valid_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, valid, test),
    batch_size = batch_size,
    bptt_len = bptt_len,
    device = 'cuda',
    repeat = False,
)

In [10]:
def _first_column_words(token_ids):
    """Decode column 0 of an id tensor into a space-joined string of vocabulary words."""
    return ' '.join([ TEXT.vocab.itos[i] for i in token_ids[:, 0] ])

# Take the first two validation batches and compare them with the raw text.
I = iter(valid_iter)
batch1, batch2 = next(I), next(I)
print("In LSTM, input's shape is (sequence_length, batch_size):", batch1.text.shape, '\n')
print(' '.join(valid[0].text[:100]))

print("\nEach column is a sequence:")
print(_first_column_words(batch1.text))
print(_first_column_words(batch2.text))

print("\nThe target contains next words:")
print(_first_column_words(batch1.target))
print(_first_column_words(batch2.target))

In LSTM, input's shape is (sequence_length, batch_size): torch.Size([30, 20]) 

<eos> = homarus gammarus = <eos> <eos> homarus gammarus , known as the european lobster or common lobster , is a species of <unk> lobster from the eastern atlantic ocean , mediterranean sea and parts of the black sea . it is closely related to the american lobster , h. americanus . it may grow to a length of 60 cm ( 24 in ) and a mass of 6 kilograms ( 13 lb ) , and bears a conspicuous pair of claws . in life , the lobsters are blue , only becoming " lobster red " on cooking

Each column is a sequence:
<eos> = homarus gammarus = <eos> <eos> homarus gammarus , known as the european lobster or common lobster , is a species of <unk> lobster from the eastern atlantic ocean
, mediterranean sea and parts of the black sea . it is closely related to the american lobster , h. americanus . it may grow to a length of 60

The target contains next words:
= homarus gammarus = <eos> <eos> homarus gammarus , known as the eu

# Model

In [11]:
# Shape walkthrough on the CPU with throwaway layers (width 100, 3 LSTM layers):
# push one validation batch through embedding -> LSTM -> linear decoder and
# print the tensor shape after every stage. `x` is rebound at each step.
x = batch1.text.cpu()
print("input:     ", x.shape)
encoder = torch.nn.Embedding(len(TEXT.vocab), 100)   # token id -> 100-dim vector
x = encoder(x)
print("embedding: ", x.shape)
x, h = torch.nn.LSTM(100, 100, 3)(x)   # no initial state is passed; h has two entries (see len(h) below)
print("lstm:      ", x.shape, f", hidden: {len(h)}*{h[0].shape}")
decoder = torch.nn.Linear(100, len(TEXT.vocab))     # one score per vocabulary entry
x = decoder(x)
print("decode:    ", x.shape)

input:      torch.Size([30, 20])
embedding:  torch.Size([30, 20, 100])
lstm:       torch.Size([30, 20, 100]) , hidden: 2*torch.Size([3, 20, 100])
decode:     torch.Size([30, 20, 28913])


In [12]:
class RNNModel(torch.nn.Module):
    """LSTM language model whose input embedding and output projection share one weight.

    Pipeline: embedding -> dropout -> multi-layer LSTM -> dropout -> linear decoder.

    Args:
        num_embeddings: vocabulary size (rows of the embedding, columns of the logits).
        embedding_dim: embedding width; also used as the LSTM hidden size so that
            the decoder can reuse the embedding matrix.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability for both Dropout layers and between LSTM layers.
    """

    def __init__(self, num_embeddings, embedding_dim, num_layers, dropout):
        super().__init__()
        hidden_size = embedding_dim  # because encoder & decoder share same weight
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.drop1 = torch.nn.Dropout(dropout)
        self.encoder = torch.nn.Embedding(num_embeddings, embedding_dim)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_size, num_layers, dropout = dropout)
        self.drop2 = torch.nn.Dropout(dropout)
        self.decoder = torch.nn.Linear(hidden_size, num_embeddings)
        self.decoder.weight = self.encoder.weight # tie weights
        self.init_weights()

    def init_weights(self):
        """Draw the shared embedding/decoder weight from U(-0.1, 0.1) and zero the decoder bias."""
        initrange = 0.1
        # encoder.weight and decoder.weight are the same Parameter after tying,
        # so a single call initialises both.
        torch.nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        torch.nn.init.constant_(self.decoder.bias, 0)

    def forward(self, input, hidden = None):
        """Compute next-token logits for a batch of token ids.

        Args:
            input: integer tensor of token ids, shape (sequence_length, batch_size).
            hidden: (h, c) LSTM state pair as produced by `init_hidden` or by a
                previous call; when omitted, a zero state sized for `input` is used.

        Returns:
            (logits, hidden): logits has shape
            (sequence_length, batch_size, num_embeddings); hidden is the LSTM
            state after the last time step.
        """
        if hidden is None:
            hidden = self.init_hidden(input.shape[1])
        embedding = self.drop1(self.encoder(input))
        output, hidden = self.rnn(embedding, hidden)
        return self.decoder(self.drop2(output)), hidden

    def init_hidden(self, batch_size):
        """Return a zero (h, c) pair, each of shape (num_layers, batch_size, hidden_size)."""
        hidden_shape = (self.num_layers, batch_size, self.hidden_size)
        weight1 = next(self.parameters())   # ensure same dtype and device
        return (weight1.new_zeros(hidden_shape), weight1.new_zeros(hidden_shape))

In [13]:
# Instantiate the language model on the GPU, sized from the vocabulary and the
# hyperparameters above, together with the cross-entropy loss used for both
# training and evaluation.
lstm = RNNModel(
    num_embeddings = len(TEXT.vocab),
    embedding_dim = embedding_dim,
    num_layers = num_layers,
    dropout = dropout,
).cuda()
criterion = torch.nn.CrossEntropyLoss()
print(lstm)

RNNModel(
  (drop1): Dropout(p=0.2)
  (encoder): Embedding(28913, 200)
  (rnn): LSTM(200, 200, num_layers=2, dropout=0.2)
  (drop2): Dropout(p=0.2)
  (decoder): Linear(in_features=200, out_features=28913, bias=True)
)


# Training

In [14]:
def train(model, data_loader, epoch):
    """Run one epoch of truncated-BPTT training with a manual SGD step.

    Reads the notebook-level `batch_size`, `criterion`, `clip`, `lr` and
    `log_interval`.

    Args:
        model: network exposing `init_hidden(batch_size)` and
            `model(input, hidden) -> (logits, hidden)`.
        data_loader: sized iterable of batches with `.text` and `.target` tensors.
        epoch: epoch number, used only in the progress messages.
    """
    # Turn on training mode which enables dropout.
    model.train()
    hidden = model.init_hidden(batch_size)
    num_batches = len(data_loader)
    for i, batch in enumerate(data_loader):
        tokens, target = batch.text, batch.target
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = tuple(h.detach() for h in hidden)
        model.zero_grad()
        output, hidden = model(tokens, hidden)
        # Flatten to (tokens, vocab); the vocabulary size is taken from the logits
        # themselves rather than from a notebook-level vocabulary object.
        loss = criterion(output.reshape(-1, output.size(-1)), target.reshape(-1))
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # Plain SGD: p <- p - lr * grad, performed outside autograd tracking.
        with torch.no_grad():
            for p in model.parameters():
                if p.grad is not None:
                    p.add_(-lr * p.grad)
        if (i + 1) % log_interval == 0:
            # Report the number of batches completed (i + 1), not the zero-based index.
            print(f"epoch {epoch:03d}, {i + 1:4d}/{num_batches:4d} batches, lr: {lr:02.2f}, loss: {loss.item():5.3f}")

In [15]:
def evaluate(model, data_loader):
    """Return the average per-token loss of `model` over `data_loader`.

    Reads the notebook-level `batch_size` and `criterion`. Each batch's loss is
    weighted by its number of target tokens, so a shorter final BPTT batch does
    not count as much as a full one.
    NOTE(review): the weighting assumes `criterion` returns a mean over the
    tokens of the batch (the default reduction of CrossEntropyLoss).

    Args:
        model: network exposing `init_hidden(batch_size)` and
            `model(input, hidden) -> (logits, hidden)`.
        data_loader: iterable of batches with `.text` and `.target` tensors.

    Returns:
        The token-weighted mean loss as a Python float.

    Raises:
        ValueError: if `data_loader` yields no target tokens.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    hidden = model.init_hidden(batch_size)
    # No graph is recorded under no_grad, so the carried hidden state needs no detach.
    with torch.no_grad():
        for batch in data_loader:
            tokens, target = batch.text, batch.target
            output, hidden = model(tokens, hidden)
            batch_loss = criterion(output.reshape(-1, output.size(-1)), target.reshape(-1))
            n_tokens = target.numel()
            total_loss += batch_loss.item() * n_tokens
            total_tokens += n_tokens
    if total_tokens == 0:
        raise ValueError("evaluate() needs a data_loader that yields at least one non-empty batch")
    return total_loss / total_tokens

In [16]:
# Train for two epochs, validating after each one and finishing with the test loss.
best_val_loss = None
for epoch in range(2):
    train(lstm, train_iter, epoch + 1)
    val_loss = evaluate(lstm, valid_iter)
    print(f'validation loss {val_loss:5.2f}\n' + '-' * 89)
    # Compare against None explicitly: a best loss of 0.0 is falsy but still a valid value.
    if best_val_loss is None or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        lr /= 4.0

print(f'\n\nTest loss {evaluate(lstm, test_iter):5.2f}')

epoch 001,   99/3481 batches, lr: 20.00, loss: 7.177
epoch 001,  199/3481 batches, lr: 20.00, loss: 6.662
epoch 001,  299/3481 batches, lr: 20.00, loss: 6.512
epoch 001,  399/3481 batches, lr: 20.00, loss: 6.184
epoch 001,  499/3481 batches, lr: 20.00, loss: 6.153
epoch 001,  599/3481 batches, lr: 20.00, loss: 6.177
epoch 001,  699/3481 batches, lr: 20.00, loss: 6.039
epoch 001,  799/3481 batches, lr: 20.00, loss: 6.132
epoch 001,  899/3481 batches, lr: 20.00, loss: 6.154
epoch 001,  999/3481 batches, lr: 20.00, loss: 5.780
epoch 001, 1099/3481 batches, lr: 20.00, loss: 5.939
epoch 001, 1199/3481 batches, lr: 20.00, loss: 6.003
epoch 001, 1299/3481 batches, lr: 20.00, loss: 5.698
epoch 001, 1399/3481 batches, lr: 20.00, loss: 5.548
epoch 001, 1499/3481 batches, lr: 20.00, loss: 5.694
epoch 001, 1599/3481 batches, lr: 20.00, loss: 5.468
epoch 001, 1699/3481 batches, lr: 20.00, loss: 5.915
epoch 001, 1799/3481 batches, lr: 20.00, loss: 6.010
epoch 001, 1899/3481 batches, lr: 20.00, loss:

In [17]:
# Greedy next-word predictions for the first test batch: at every position take
# the highest-scoring vocabulary entry given the true preceding words.
lstm.eval()
hidden = lstm.init_hidden(batch_size)
input = next(iter(test_iter)).text
with torch.no_grad():
    output = torch.argmax(lstm(input, hidden)[0], dim = 2)

# One independent word list per batch column (no shared-list aliasing).
sentence = [[TEXT.vocab.itos[i] for i in output[:, j]] for j in range(batch_size)]
for s in sentence:
    print(' '.join(s) + '\n')

<eos> = <unk> ( <eos> <eos> the <unk> ( a american son in a , <unk> , , the was a <unk> appearance time <unk> in the <unk> series ,

@-@ century . , and the <unk> were also with the . years . which the longer was <unk> by <eos> <unk> <unk> of was <unk> also by the ,

. <eos> , the was a first in in of and , , to <unk> . who the of a <unk> state of <eos> was first was @-@ also @.@

than , % year ( , , the <unk> the <unk> of the up game to , the <unk> were <unk> <unk> was up and to the <unk> <unk> .

, <eos> <unk> of the was <unk> the the <unk> , <unk> , , was also of the <unk> <unk> . . by be a , the . the the

<unk> @-@ . the . states <eos> <eos> = = = <unk> = the = = = = <eos> <eos> the the december 2006 , was a @-@ , ,

<unk> . . the <eos> first of the <unk> of the the of the of starlings <unk> schools . and the to the <unk> . and <unk> <unk> , and

, <unk> @-@ . the so . the was the the one on first year , the example first time , the was announced by the <unk> <eos> game was

the i