In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
from copy import deepcopy
from tqdm import tqdm
from pprint import pprint
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1994)

In [2]:
USE_CUDA = torch.cuda.is_available()
print(USE_CUDA)
if USE_CUDA:
    gpus = [0]
    torch.cuda.set_device(0)

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

True


In [3]:
def prepare_sequence(seq, to_index):
    idxs = list(map(lambda w: to_index[w] if to_index.get(w) is not None else to_index["<unk>"], seq))
    return LongTensor(idxs)

In [4]:
def prepare_ptb_dataset(filename, word2index = None):
    corpus = open(filename, 'r', encoding = 'cp1252').readlines()
    corpus = flatten([co.strip().split() + ['<\s'] for co in corpus])
    if word2index is None:
        vocab = list(set(corpus))
        word2index = {'<unk>' : 0}
        for vo in vocab:
            if word2index.get(vo) is None:
                word2index[vo] = len(word2index)
    return prepare_sequence(corpus, word2index), word2index

In [5]:
def batchify(data, bsz):
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).contiguous()
    if USE_CUDA:
        data = data.cuda()
    return data

In [6]:
def getBatch(data, seq_length):
    for i in range(0, data.size(1) - seq_length, seq_length):
        inputs = Variable(data[:, i: i + seq_length])
        targets = Variable(data[:, (i + 1): (i + 1) + seq_length].contiguous())
        yield inputs, targets

In [7]:
train_data, word2index = prepare_ptb_dataset("Harry Potter 1 - Sorcerer's Stone.txt",)
dev_data, _ = prepare_ptb_dataset("Harry Potter 2 - Chamber of Secrets.txt", word2index)
test_data, _ = prepare_ptb_dataset("Harry Potter 3 - The Prisoner Of Azkaban.txt", word2index)

In [8]:
len(word2index)

11899

In [9]:
index2word = {v: k for k, v in word2index.items()}

In [10]:
class LanguageModel(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_size, n_layers=1, dropout_p = 0.5):
        super(LanguageModel, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_size, n_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout_p)
        
    def init_weight(self):
        self.embed.weight = nn.init.xavier_uniform_(self.embed.weight)
        self.linear.weight = nn.init.xavier_uniform_(self.linear.weight)
        self.linear.bias.data.fill_(0)
        
    def init_hidden(self, batch_size):
        hidden = Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))
        context = Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))
        return (hidden.cuda(), context.cuda()) if USE_CUDA else (hidden, context)
    
    def detach_hidden(self, hiddens):
        return tuple([hidden.detach() for hidden in hiddens])
    
    def forward(self, inputs, hidden, is_training=False):
        embeds = self.embed(inputs)
        if is_training:
            embeds = self.dropout(embeds)
        out, hidden = self.rnn(embeds, hidden)
        return self.linear(out.contiguous().view(out.size(0) * out.size(1), -1)), hidden
        

In [11]:
EMBED_DIM = 128
HIDDEN_SIZE = 1024
NUM_LAYER = 1
LR = 0.01
SEQ_LENGTH = 30 
BATCH_SIZE = 20
EPOCH = 40
RESCHEDULED = False

In [12]:
train_data = batchify(train_data, BATCH_SIZE)
dev_data = batchify(dev_data, BATCH_SIZE//2)
test_data = batchify(test_data, BATCH_SIZE//2)

In [58]:
model = LanguageModel(len(word2index), EMBED_DIM, HIDDEN_SIZE, NUM_LAYER, 0.5)
model.init_weight()
if USE_CUDA:
    model = model.cuda()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = LR)

In [59]:
for epoch in range(EPOCH):
    total_loss = 0
    losses = []
    hidden = model.init_hidden(BATCH_SIZE)
    for i, batch in enumerate(getBatch(train_data, BATCH_SIZE)):
        inputs, targets = batch
        hidden = model.detach_hidden(hidden)
        model.zero_grad()
        preds, hidden = model(inputs, hidden, True)
        
        loss = loss_function(preds, targets.view(-1))
        losses.append(loss.data.cpu().numpy())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        if i > 0 and i % 100 == 0:  
            print("[%02d/%d] mean_loss : %0.2f, Perplexity : %0.2f" % (epoch,EPOCH, np.mean(losses), np.exp(np.mean(losses))))
            losses = [] 
    if RESCHEDULED == False and epoch == EPOCH//2:
        LR *= 0.1
        optimizer = optim.Adam(model.parameters(), lr=LR)
        RESCHEDULED = True

[00/40] mean_loss : 7.34, Perplexity : 1545.71
[00/40] mean_loss : 6.93, Perplexity : 1026.98
[01/40] mean_loss : 7.16, Perplexity : 1283.28
[01/40] mean_loss : 6.27, Perplexity : 530.85
[02/40] mean_loss : 6.02, Perplexity : 412.16
[02/40] mean_loss : 5.85, Perplexity : 346.58
[03/40] mean_loss : 5.65, Perplexity : 284.58
[03/40] mean_loss : 5.45, Perplexity : 233.81
[04/40] mean_loss : 5.29, Perplexity : 197.81
[04/40] mean_loss : 5.08, Perplexity : 161.46
[05/40] mean_loss : 4.95, Perplexity : 141.48
[05/40] mean_loss : 4.75, Perplexity : 115.79
[06/40] mean_loss : 4.64, Perplexity : 103.06
[06/40] mean_loss : 4.41, Perplexity : 82.64
[07/40] mean_loss : 4.26, Perplexity : 71.14
[07/40] mean_loss : 4.11, Perplexity : 61.08
[08/40] mean_loss : 3.94, Perplexity : 51.38
[08/40] mean_loss : 3.76, Perplexity : 43.09
[09/40] mean_loss : 3.61, Perplexity : 36.87
[09/40] mean_loss : 3.43, Perplexity : 30.95
[10/40] mean_loss : 3.33, Perplexity : 27.88
[10/40] mean_loss : 3.18, Perplexity : 

In [65]:
total_loss = 0
hidden = model.init_hidden(BATCH_SIZE//2)
for batch in getBatch(test_data, SEQ_LENGTH):
    inputs,targets = batch
        
    hidden = model.detach_hidden(hidden)
    model.zero_grad()
    preds, hidden = model(inputs, hidden)
    total_loss += inputs.size(1) * loss_function(preds, targets.view(-1)).data

total_loss = total_loss/test_data.size(1)
print("Test Perpelexity : %5.2f" % (np.exp(total_loss.data.cpu().numpy())))

Test Perpelexity : 24889.33


Our training set was Harry Potter 1st book, our dev set was Harry Potter 2nd book, and our test set was Harry Potter 3rd book. This shows that the books are different from each other not only in terms of events, but also in terms of language modeling. I hypothesize that since there are words in all books which are not easily identiable with "English" language modeling and they change from book to book, the algorithm had difficulties identifying structure on the test set when trained on a totally different book. The other conclusion is that we might be overfitting. 

Next step is to divide one book into train/dev/test to see if it make any improvements.