In [1]:
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
from data_utils import Dictionary, Corpus

In [2]:
# Hyper parameters
# Model sizes are forwarded to RNNLM; the rest drive data batching and the training loop.
embed_size = 128  # width of each word embedding (second argument of nn.Embedding)
hidden_size = 1024  # LSTM hidden size; also the last dim of the initial states
num_layers = 1  # number of stacked LSTM layers
num_epochs = 5  # outer training-loop iterations
num_samples = 1000  # not read by the training cells; presumably the number of words to sample -- confirm against the sampling cell
batch_size = 20  # passed to corpus.get_data; the printed `ids` tensor has this many rows
seq_length = 30  # columns of `ids` consumed per training step
learning_rate = 0.002  # lr handed to torch.optim.Adam

In [3]:
# Load Penn Treebank dataset
# Both paths are relative to the notebook's working directory.
train_path = './data/train.txt'
# Opened for writing by the sampling cell.
sample_path = './sample.txt'
corpus = Corpus()
# Word ids as a 2-D LongTensor with batch_size rows (printed in the next cell).
# NOTE(review): how Corpus tokenizes and truncates lives in data_utils -- not visible here.
ids = corpus.get_data(train_path, batch_size)
vocab_size = len(corpus.dictionary)
# Whole seq_length-wide windows per row; floor division drops any remainder columns.
num_batches = ids.size(1) // seq_length

In [4]:
# Inspect the id tensor returned by corpus.get_data (shape is shown in the output footer).
print(ids)


    0     1     2  ...    152  4955  4150
   93   718   590  ...    170  6784   133
   27   930    42  ...    392  4864    26
       ...          ⋱          ...       
  997    42   507  ...    682  6849  6344
  392  5518  3034  ...   2264    42  3401
 4210   467  1496  ...   9999   119  1143
[torch.LongTensor of size 20x46479]



In [5]:
# Number of entries in corpus.dictionary.
print(vocab_size)

10000


In [6]:
# Count of seq_length-wide windows along dim 1 of ids.
print(num_batches)

1549


In [7]:
# RNN Based Language Model
class RNNLM(nn.Module):
    """LSTM language model: embedding -> LSTM -> linear projection to vocabulary scores.

    Args:
        vocab_size: number of distinct word ids (embedding rows and output columns).
        embed_size: dimensionality of each word embedding.
        hidden_size: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.init_weights()

    def init_weights(self):
        """Draw embedding and output weights from U(-0.1, 0.1) and zero the output bias."""
        bound = 0.1
        # Embedding is drawn before the linear weight so seeded runs stay reproducible.
        for weight in (self.embed.weight, self.linear.weight):
            weight.data.uniform_(-bound, bound)
        self.linear.bias.data.zero_()

    def forward(self, x, h):
        """Score every position of every sequence against the vocabulary.

        Args:
            x: word ids, laid out (batch, seq_len) because the LSTM is batch_first.
            h: initial LSTM state, handed to nn.LSTM unchanged.

        Returns:
            (scores, h): scores has shape (batch * seq_len, vocab_size); h is the
            state returned by the LSTM after the last time step.
        """
        embedded = self.embed(x)
        lstm_out, h = self.lstm(embedded, h)

        # Collapse batch and time into one axis so the linear layer sees 2-D input.
        rows = lstm_out.size(0) * lstm_out.size(1)
        flat = lstm_out.contiguous().view(rows, lstm_out.size(2))

        return self.linear(flat), h

In [8]:
# Build the language model from the hyper-parameter and vocabulary cells.
model = RNNLM(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)

In [9]:
# Loss and Optimizer
# Cross-entropy between the model's flattened per-position scores and the flattened target ids.
criterion = nn.CrossEntropyLoss()
# Adam over every model parameter, using the configured learning_rate.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Truncated Backpropagation
def detach(states):
    """Return a list with the detached version of every tensor in `states`.

    Detaching cuts the autograd history, so a later backward pass stops at
    these tensors instead of reaching back through earlier windows.
    """
    detached = []
    for state in states:
        detached.append(state.detach())
    return detached

In [10]:
# Training
# Train with truncated backpropagation through time: the LSTM state is carried
# from one window to the next within an epoch, but gradients stop at each
# window boundary.
# NOTE(review): Variable, clip_grad_norm (no trailing underscore) and
# loss.data[0] are legacy PyTorch idioms; newer releases document plain
# tensors, clip_grad_norm_ and loss.item() instead -- confirm the target
# PyTorch version before switching.
for epoch in range(num_epochs):
    # Fresh zero hidden and cell states at the start of every epoch,
    # shaped (num_layers, batch_size, hidden_size).
    states = (Variable(torch.zeros(num_layers, batch_size, hidden_size)),
              Variable(torch.zeros(num_layers, batch_size, hidden_size)))

    # Non-overlapping seq_length-wide windows; stopping before
    # size - seq_length leaves room for the targets, which are shifted by one.
    for i in range(0, ids.size(1) - seq_length, seq_length):
        # Targets are the inputs shifted one position to the right.
        inputs = Variable(ids[:, i:i+seq_length])
        targets = Variable(ids[:, (i+1):(i+1)+seq_length].contiguous())

        # Forward + Backward + Optimize
        model.zero_grad()
        # Drop the graph of earlier windows before reusing their final state.
        states = detach(states)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets.view(-1))
        loss.backward()
        # Cap the gradient norm at 0.5 before the optimizer step.
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.5)
        optimizer.step()

        # i is always a multiple of seq_length, so this is the exact 0-based
        # window index for every seq_length (the former (i+1) // seq_length
        # became 1-based when seq_length == 1).
        step = i // seq_length
        if step % 100 == 0:
            # Read the scalar loss once and reuse it for the perplexity.
            loss_value = loss.data[0]
            print('Epoch [%d/%d], Step[%d/%d], Loss: %.3f, Perplexity: %5.2f' % (epoch+1, num_epochs, step, num_batches, loss_value, np.exp(loss_value)))

Epoch [1/5], Step[0/1549], Loss: 9.206, Perplexity: 9955.61
Epoch [1/5], Step[100/1549], Loss: 6.027, Perplexity: 414.55
Epoch [1/5], Step[200/1549], Loss: 5.927, Perplexity: 375.21


KeyboardInterrupt: 

In [None]:
# Sampling
with open(sample_path, 'w') as f:
    state = (Variable(torch.zeros(num_layers, 1, hidden_size)),
            Variable(torch.zeros(num_layers, 1, hidden_size)))
    
    # Select one word id randomly
    prob = torch.ones(vocab_size)
    input = Variable(torch.multinomial(prob, num))