In [131]:
# Text processing library
import torchtext
from torchtext.vocab import Vectors
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
debug = False

In [66]:
# Our input $x$: the torchtext Field whose vocabulary the models below index into.
TEXT = torchtext.data.Field()

# Data distributed with the assignment.
# NOTE(review): `test` is read from "valid.txt", the same file as `val` --
# presumably because no separate test file is available; confirm before
# reporting any "test" numbers.
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path=".", 
    train="train.txt", validation="valid.txt", test="valid.txt", text_field=TEXT)

# Build the vocabulary exactly once. Debug mode caps it at 1000 entries to keep
# experiments fast; previously the full vocabulary was built first and then
# immediately discarded by a second, capped build.
if debug:
    TEXT.build_vocab(train, max_size=1000)
else:
    TEXT.build_vocab(train)

# Batches have shape [bptt_len, batch_size] (see the size printed in the next cell).
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=10, device=-1, bptt_len=32, repeat=False)

In [111]:
# Pull a single batch from the test iterator and eyeball its shape and contents.
it = iter(test_iter)
batch = next(it)
print(batch.text.size())
# Each column of the batch is one sequence; decode two of them back into words.
for column in (4, 5):
    print(' '.join(TEXT.vocab.itos[i] for i in batch.text[:, column].data))

torch.Size([32, 10])
makes some executives nervous <eos> last year the research and development division of weyerhaeuser co. the large <unk> concern invited a <unk> to its <unk> wash. offices <eos> phil <unk> a software
more expensive than direct treasury borrowing said rep. <unk> stark d. calif. the bill 's chief sponsor <eos> the complex financing plan in the s&l bailout law includes raising $ N billion


Perplexity goals:
count: 120-200
feedforward: 100-150
recurrent: below 100 (between 80-100)

In [None]:
# Fit the count-based trigram model on the training iterator.
# NOTE(review): Trigram is defined in a later cell of this notebook; that cell
# has to be executed before this one.
tgram = Trigram(TEXT)
# Accumulates unigram/bigram/trigram counts over every training batch and
# prints a progress line every 1000 batches.
tgram.train_counts(train_iter)
# Interpolation weights for the unigram and bigram counts; set_alpha fills in
# the third (trigram) weight as 1 - (0.25 + 0.25) = 0.5.
tgram.set_alpha(0.25, 0.25)

Iteration 0
Iteration 1000
Iteration 2000


In [None]:
# Score the trigram model on the validation iterator.
# NOTE(review): LangEvaluator (and the LangTrainer it calls) are defined in
# later cells; run those first. The `evalmetric` value is stored by the
# evaluator but evaluate() always returns exp(total NLL / token count).
le = LangEvaluator(tgram, TEXT, evalmetric='perplexity')
le.evaluate(val_iter)

Iteration 0


In [132]:
class Trigram(nn.Module):
    """Interpolated unigram / bigram / trigram count language model.

    Counts are collected with `train_counts` (or `update_trigram_cnts`), the
    interpolation weights are set with `set_alpha`, and `forward` then returns
    per-position log-probabilities over the vocabulary.

    Attributes:
        cnts: list of three dicts, one per n-gram order. `cnts[n]` maps a
            context (a tuple of `n` word indices, `()` for unigrams) to a
            vocabulary-sized tensor of counts for the word following that
            context.
        alphas: interpolation weights, `alphas[n]` weighting `cnts[n]`.
            Only exists after `set_alpha` has been called.
    """

    def __init__(self, TEXT, **kwargs):
        super(Trigram, self).__init__()
        self._TEXT = TEXT
        self._text_vocab_len = len(TEXT.vocab)

        # Use dictionaries since we don't want to have to
        # store the vast majority of bi/tri gram contexts,
        # which are never observed.
        # NOTE(review): every observed context still owns a dense
        # vocabulary-sized vector, so memory grows as
        # (#distinct contexts * vocab size).
        self.cnts = [dict(), dict(), dict()]

    @staticmethod
    def _rows_as_ints(batch):
        """Return a 2-D index tensor (or Variable) as nested lists of Python ints.

        Plain ints hash by value, so tuples of them are reliable dictionary
        keys; tuples of tensor elements are not guaranteed to be.
        """
        return getattr(batch, 'data', batch).tolist()

    def set_alpha(self, *args):
        """Set the interpolation weights (unigram, bigram, trigram).

        When only two weights are given, the third is set to one minus
        their sum.
        """
        self.alphas = list(args)
        if len(self.alphas) < 3:
            assert len(self.alphas) == 2, 'set_alpha expects 2 or 3 weights'
            self.alphas.append(1 - sum(self.alphas))

    def train_counts(self, train_iter, num_iter=None):
        """Accumulate n-gram counts from at most `num_iter` batches.

        Args:
            train_iter: iterable of batches whose `.text` has shape
                [sent_len, batch_size].
            num_iter: number of batches to consume; defaults to
                `len(train_iter)`.
        """
        if num_iter is None:
            num_iter = len(train_iter)
        batches = iter(train_iter)
        for i in range(num_iter):
            try:
                batch = next(batches)
            except StopIteration:
                # Fewer batches than requested: stop cleanly instead of
                # letting StopIteration escape to the caller.
                break
            if i % 1000 == 0:
                print('Iteration %d' % i)
            self.update_trigram_cnts(torch.t(batch.text).data)

    def forward(self, batch):
        """Return interpolated log-probabilities for every position.

        Args:
            batch: word indices of shape [batch_size, sent_len].

        Returns:
            Tensor of shape [batch_size, sent_len, vocab_size]. Entry
            [i, j, w] is the log-probability that the word at position j of
            row i is w, given up to two preceding words of the same row.

        `set_alpha` must have been called first. Positions for which no
        counts at all are available (e.g. an untrained model) come out as NaN.
        """
        ret_arr = torch.zeros(batch.size()[0], batch.size()[1],
                              self._text_vocab_len)
        for i, row in enumerate(self._rows_as_ints(batch)):
            for j in range(len(row)):
                for n in range(0, 3):
                    if j < n:
                        # Not enough history in this row for an order-n context.
                        continue
                    counts = self.cnts[n].get(tuple(row[j - n:j]))
                    if counts is None:
                        # Context never seen in training: this order
                        # contributes nothing at this position.
                        continue
                    # Turn the counts into a distribution before mixing, so the
                    # alphas weight probabilities rather than raw frequencies.
                    ret_arr[i, j] += self.alphas[n] * (counts / counts.sum())
        # Renormalise per position: when a higher-order context is missing its
        # weight is redistributed over the orders that were found.
        mass = ret_arr.sum(dim=-1, keepdim=True)
        # Use log probabilities to make everything fit together
        return torch.log(ret_arr / mass)

    def update_trigram_cnts(self, batch):
        """Add the n-gram counts of one batch to `self.cnts`.

        Args:
            batch: word indices of shape [batch_size, bptt_len].
        """
        # We don't glue rows together since they may be shuffled
        # (this is all kind of silly since ideally we'd just do
        # this in one big 'sentence', but perhaps we want a 'fair
        # comparison'...)
        for row in self._rows_as_ints(batch):
            for n in range(0, 3):
                for k in range(len(row) - n):
                    # Context is the n words before position k+n; () for n == 0.
                    dict_key = tuple(row[k:k + n])
                    if dict_key not in self.cnts[n]:
                        # We never want to return 0 probability (or else PPL = infty),
                        # so the unigram vector starts from ones (add-one smoothing);
                        # higher orders start from zeros.
                        self.cnts[n][dict_key] = torch.zeros(self._text_vocab_len) if n > 0 \
                                else torch.ones(self._text_vocab_len)
                    # Here's where we increment the count
                    self.cnts[n][dict_key][row[k + n]] += 1

In [146]:
class LangEvaluator(object):
    """Scores a language model on an iterator of batches.

    Args:
        model: callable mapping an index tensor [batch_size, sent_len] to
            log-probabilities [batch_size, sent_len, size_vocab].
        TEXT: the field the data was built with (stored, not otherwise used here).
        evalmetric: optional keyword, stored as `eval_metric`.
            NOTE(review): it is not consulted by `evaluate`, which always
            returns perplexity.
    """

    def __init__(self, model, TEXT, **kwargs):
        self._TEXT = TEXT
        self.model = model
        self.eval_metric = kwargs.get('evalmetric', 'perplexity')

    def evaluate(self, test_iter):
        """Return the perplexity, exp(total NLL / number of tokens), over `test_iter`.

        Args:
            test_iter: iterable of batches whose `.text` has shape
                [sent_len, batch_size].

        Raises:
            ValueError: if `test_iter` yields no tokens.
        """
        sum_nll = 0.0
        cnt_nll = 0
        for i, batch in enumerate(test_iter):
            if i % 100 == 0:
                print('Iteration %d' % i)
            # Model output: [batch_size, sent_len, size_vocab]. It is expected
            # to hold log-probabilities; otherwise the value returned below is
            # not a perplexity.
            batch_transpose = torch.t(batch.text).contiguous() # [batch_size, sent_len]
            log_probs = self.model(batch_transpose)
            cnt_nll += batch_transpose.size()[0] * batch_transpose.size()[1]
            # Collapse to a Python float right away: this keeps no autograd
            # graph alive between batches and lets np.exp below work even
            # when the model output requires grad.
            sum_nll += float(LangTrainer.loss_nll(batch_transpose.data, log_probs, mode='sum'))

        if cnt_nll == 0:
            raise ValueError('evaluate() received no tokens to score')
        return np.exp(sum_nll / cnt_nll)

In [140]:
class LangTrainer(object):
    """Holds a model together with the loss functions used to train/score it."""

    def __init__(self, TEXT, model, **kwargs):
        self._TEXT = TEXT
        self._model = model

    @staticmethod
    def loss_nll(batch, output, mode='mean'):
        """Negative log-likelihood of the true words under the model output.

        Args:
            batch: the real word indices, shape [...].
            output: model output (RNN/NNLM/Trigram) of shape [..., size_vocab],
                holding one score per vocabulary entry for every position of
                `batch`; the result is an NLL when these are log-probabilities.
            mode: 'mean' averages over all positions; any other value sums.

        Returns:
            The mean or the sum of `-output[..., batch]` over all positions.
        """
        # [batch_size * sent_len, size_vocab]
        vocab_len = output.size()[-1]
        # .contiguous() lets transposed / sliced inputs through; .view alone
        # raises on tensors whose memory layout is not contiguous.
        output = output.contiguous().view(-1, vocab_len)
        # [batch_size * sent_len, 1]
        batch = batch.contiguous().view(-1, 1)
        # Pick, for every position, the score assigned to the true word.
        batch_probs = -1 * torch.gather(output, 1, batch)
        if mode == 'mean':
            return torch.mean(batch_probs)
        return torch.sum(batch_probs)

    @staticmethod
    def loss_perplexity(*args):
        """exp of `loss_nll(*args)`; a perplexity when the default 'mean' mode is used."""
        return torch.exp(LangTrainer.loss_nll(*args))
        

In [127]:
# Scratch check: does a tuple built from a LongTensor compare equal to a tuple
# built from a numpy array holding the same values? (The recorded output of
# this cell is True.) Presumably this was to confirm that such tuples can be
# used interchangeably as n-gram dictionary keys -- note that `==` on tuples
# says nothing about whether their hashes match.
a = torch.LongTensor([1,2,3])
c = tuple(a)
b = np.array([1,2,3])
d = tuple(b)
print(d == c)

True


## IGNORE STUFF BELOW HERE

In [128]:
    # NOT USED!
    # Here arr is a 1-D numpy array; this returns 
    # groups of n consecutive words (with overlapping)
    def get_ngrams(self, arr, n=3):
        """Return every run of `n` consecutive elements of `arr` (overlapping).

        Args:
            arr: 1-D numpy array.
            n: window length.

        Returns:
            Array of shape [len(arr) - n + 1, n]; row r is arr[r:r + n].
            When `arr` has fewer than `n` elements the result is an empty
            array of shape [0, n].
        """
        # Clamp at zero so inputs shorter than n give an empty result instead
        # of an error from a negative dimension.
        len_ngrams = max(arr.shape[0] - n + 1, 0)
        # Broadcast a column of window starts against a row of offsets:
        # entry [r, c] is the index r + c.
        ngram_inds = np.arange(len_ngrams)[:, np.newaxis] + np.arange(n)[np.newaxis, :]
        return np.take(arr, ngram_inds)

In [145]:
# Scratch check of np.exp: evaluates e**1 (recorded output is approximately 2.71828).
np.exp(1)

2.7182818284590451