In [2]:
# Text text processing library
import torchtext
from torchtext.vocab import Vectors
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import itertools as it
from models import *
from helpers import *
debug = False

In [3]:
# Our input $x$
TEXT = torchtext.data.Field()

# Data distributed with the assignment
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path=".", 
    train="train.txt", validation="valid.txt", test="valid.txt", text_field=TEXT)

TEXT.build_vocab(train)
if debug:
    TEXT.build_vocab(train, max_size=1000)

train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=10, device=-1, bptt_len=32, repeat=False)

# Build the vocabulary with word embeddings
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))

In [4]:
it = iter(test_iter)
batch = next(it)
print(batch.text.size())
# print(batch.text[:,3])
print(' '.join([TEXT.vocab.itos[i] for i in batch.text[:,4].data]))
print(' '.join([TEXT.vocab.itos[i] for i in batch.text[:,5].data]))

torch.Size([32, 10])
makes some executives nervous <eos> last year the research and development division of weyerhaeuser co. the large <unk> concern invited a <unk> to its <unk> wash. offices <eos> phil <unk> a software
more expensive than direct treasury borrowing said rep. <unk> stark d. calif. the bill 's chief sponsor <eos> the complex financing plan in the s&l bailout law includes raising $ N billion


Perplexity goals:
count: 120-200
feedforward: 100-150
recurrent: below 100 (between 80-100)

In [5]:
train_iter, _, _  = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=1, device=-1, bptt_len=10000, repeat=False)
tgram = Trigram(TEXT)
tgram.train_counts(train_iter)
tgram.set_alpha(0.25, 0.25)

Iteration 0


KeyboardInterrupt: 

In [None]:
_, val_iter, _  = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=1, device=-1, bptt_len=10000, repeat=False)
for (a1, a2) in [(0.3, 0.5)]: #it.product(np.arange(0.1, 1, 0.1), repeat=2):
    if a1 + a2 >= 1:
        continue
    tgram.set_alpha(a1, a2)
    le = LangEvaluator(tgram, TEXT, evalmetric='perplexity')
    print(a1, a2, le.evaluate(val_iter))

In [8]:
train_iter, _, _  = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=10, device=-1, bptt_len=32, repeat=False)
model_nnlm = NNLM(TEXT)
trainer = LangTrainer(TEXT, model_nnlm)
trainer.train(train_iter)

Iteration 0, loss: 9.225551, norm: 0.000000
Iteration 10, loss: 9.177421, norm: 0.000000
Iteration 20, loss: 9.110247, norm: 0.000000
Iteration 30, loss: 9.021410, norm: 0.000000
Iteration 40, loss: 8.914558, norm: 0.000000
Iteration 50, loss: 8.717389, norm: 0.000000
Iteration 60, loss: 8.572845, norm: 0.000000
Iteration 70, loss: 8.263859, norm: 0.000000
Iteration 80, loss: 7.970739, norm: 0.000000
Iteration 90, loss: 7.974110, norm: 0.000000


In [9]:
params_train = list(filter(lambda p : p.requires_grad, model_nnlm.parameters()))
print([p.size() for p in params_train])

[torch.Size([10001, 300]), torch.Size([100, 1, 5, 300]), torch.Size([100]), torch.Size([10001, 100]), torch.Size([10001])]


In [20]:
class LangTrainer(object):
    def __init__(self, TEXT, model, **kwargs):
        # Settings:
        optimizer = kwargs.get('optimizer', optim.SGD)
        self._optimizer = optimizer(filter(lambda p : p.requires_grad,
                                           model.parameters()),
                                    lr=kwargs.get('lr', 0.1))        
        self.cuda = kwargs.get('cuda', True) and \
            torch.cuda.is_available()
        if self.cuda:
            print('Using CUDA...')
        self.clip_norm = kwargs.get('clip_norm', 5)
            
        self._TEXT = TEXT
        self.model = model
        # TODO: implement validation thing for early stopping
        self.training_losses = list()
        self.training_norms = list()
        if self.cuda:
            self.model.cuda()
    
    # Here batch is output from a RNN/NNLM/Trigram model:
    # [..., size_vocab], and output are the real words: [...]
    @staticmethod
    def loss_nll(batch, output, mode='mean'):
        # [batch_size * sent_len, size_vocab]
        vocab_len = output.size()[-1]
        output = output.view(-1, vocab_len)
        # [batch_size * sent_len]
        batch = batch.view(-1, 1)
        batch_probs = -1 * torch.gather(output, 1, 
                                        batch) #.type(torch.LongTensor))
        if mode == 'mean':
            return torch.mean(batch_probs)
        else:
            return torch.sum(batch_probs)
        return
    
    @staticmethod
    def loss_perplexity(*args):
        return torch.exp(self.loss_nll(*args))
    
    def get_feature(self, batch):
        return torch.t(batch.text.data).contiguous()

    # The labels we use as the true words: same as features
    def get_label(self, batch):
        return self.get_feature(batch)
    
    # We are doing a slightly funky thing of taking a 
    # variable's data and then making a new 
    # variable...this seems cleaner though
    def make_loss(self, batch):
        if self.cuda:
            feature, label = self.get_feature(batch).cuda(), \
                            self.get_label(batch).cuda()
        else:
            feature, label = self.get_feature(batch), \
                            self.get_label(batch)
        var_feature = autograd.Variable(feature)
        var_label = autograd.Variable(label)
        loss = self.loss_nll(var_label, self.model(var_feature))
        return loss
    
    def train(self, train_iter, **kwargs):
        train_iter = iter(train_iter)
        for i in range(kwargs.get('num_iter', 100)):
            batch = next(train_iter)
            self.model.zero_grad()
            loss = self.make_loss(batch)
            self.training_losses.append(loss.data.numpy()[0])

                
                
            # Norm clipping: returns a float
            norm = nn.utils.clip_grad_norm(filter(lambda p : p.requires_grad,
                                                  self.model.parameters()), self.clip_norm)
            self.training_norms.append(norm)    
            if i % kwargs.get('skip_iter', 10) == 0:
                print('Iteration %d, loss: %f, norm: %f' % (i, self.training_losses[-1],
                                                            self.training_norms[-1]))
            # Do gradient updates
            loss.backward(retain_graph=True)
            self._optimizer.step()
            

In [21]:
class LSTMLM(nn.Module):
    def __init__(self, TEXT, **kwargs):
        super(LSTMLM, self).__init__()
        
        # Save parameters:
        self.hidden_dim = kwargs.get('hidden_dim', 50)
        
        # V is size of vocab, D is dim of embedding
        V = TEXT.vocab.vectors.size()[0]
        D = TEXT.vocab.vectors.size()[1]
        self.embeddings = nn.Embedding(V, D)
        self.embeddings.weight = nn.Parameter(
            TEXT.vocab.vectors, requires_grad= \
            kwargs.get('train_embeddings', True))
        
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(D, self.hidden_dim)
        
        # The linear layer that maps from hidden state space to label space
        self.linear = nn.Linear(self.hidden_dim, V)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))
        
    def forward(self, x):
        sent_len = x.size(1)
        btch_sz = x.size(0)
        x = self.embeddings(x) # [btch_sz, sent_len, D]
        lstm_out, self.hidden = self.lstm(
            x.view(sent_len, btch_sz, -1), self.hidden)
        pred = self.linear(lstm_out.view(sent_len, btch_sz, -1))
        return F.log_softmax(pred, dim=2)


In [22]:
train_iter, _, _  = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=10, device=-1, bptt_len=32, repeat=False)
model_lstmlm = LSTMLM(TEXT)
trainer = LangTrainer(TEXT, model_lstmlm)
trainer.train(train_iter)

Iteration 0, loss: 9.208703, norm: 0.000000
Iteration 10, loss: 9.180125, norm: 0.000000
Iteration 20, loss: 9.161605, norm: 0.000000
Iteration 30, loss: 9.135946, norm: 0.000000
Iteration 40, loss: 9.127443, norm: 0.000000
Iteration 50, loss: 9.104375, norm: 0.000000
Iteration 60, loss: 9.091858, norm: 0.000000
Iteration 70, loss: 9.070498, norm: 0.000000
Iteration 80, loss: 9.032392, norm: 0.000000
Iteration 90, loss: 9.019320, norm: 0.000000


In [16]:
_, val_iter, _  = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=1, device=-1, bptt_len=10000, repeat=False)
le = LangEvaluator(model_lstmlm, TEXT, evalmetric='perplexity')
print(le.evaluate(val_iter))

RuntimeError: size mismatch, m1: [10000 x 500], m2: [50 x 10001] at /Users/soumith/code/builder/wheel/pytorch-src/torch/lib/TH/generic/THTensorMath.c:1416

## IGNORE STUFF BELOW HERE

In [128]:
    # NOT USED!
    # Here arr is a 1-D numpy array; this returns 
    # groups of n consecutive words (with overlapping)
    def get_ngrams(self, arr, n=3):
        len_ngrams = arr.shape[0] - n + 1
        ngram_inds = np.tile(np.reshape(np.arange(len_ngrams), [len_ngrams, 1]), [1, n]) + \
                    np.tile(np.reshape(np.arange(n), [1, n]), [len_ngrams, 1])
        return np.take(arr, ngram_inds)

In [288]:
a = [1,2,3]
b = [2 * x + i for i,x in enumerate(a)]
print(b)

[2, 5, 8]


In [285]:
print(TEXT.vocab.vectors.size())

torch.Size([10001, 300])
