In [270]:
# Text processing library
import torchtext
from torchtext.vocab import Vectors
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import itertools as it
from models import *
from helpers import *
debug = False

In [283]:
# Field object for the text, i.e. our input $x$
TEXT = torchtext.data.Field()

# Data distributed with the assignment.
# Note: the "test" split is read from valid.txt, exactly like the validation split.
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    text_field=TEXT,
    path=".",
    train="train.txt",
    validation="valid.txt",
    test="valid.txt")

# Build the vocabulary once: capped at 1000 entries in debug mode, uncapped otherwise
TEXT.build_vocab(train, max_size=1000 if debug else None)

# BPTT iterators: 10 parallel streams, 32 tokens per chunk, a single pass (no repeat)
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test),
    batch_size=10,
    bptt_len=32,
    device=-1,
    repeat=False)

# Attach pretrained word embeddings to the vocabulary
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))

In [111]:
# Peek at one batch from the test iterator.
# The iterator gets its own name: calling it `it` would overwrite the
# `itertools` alias created by `import itertools as it`.
batch_iter = iter(test_iter)
batch = next(batch_iter)
print(batch.text.size())
# Decode two columns of the batch back into words
for col in (4, 5):
    print(' '.join([TEXT.vocab.itos[i] for i in batch.text[:, col].data]))

torch.Size([32, 10])
makes some executives nervous <eos> last year the research and development division of weyerhaeuser co. the large <unk> concern invited a <unk> to its <unk> wash. offices <eos> phil <unk> a software
more expensive than direct treasury borrowing said rep. <unk> stark d. calif. the bill 's chief sponsor <eos> the complex financing plan in the s&l bailout law includes raising $ N billion


Perplexity goals:
count: 120-200
feedforward: 100-150
recurrent: below 100 (between 80-100)

In [245]:
# Rebind train_iter to a single-stream iterator with 10000-token chunks
# (this replaces any train_iter defined earlier in the kernel).
count_iter_opts = dict(batch_size=1, device=-1, bptt_len=10000, repeat=False)
train_iter, _, _ = torchtext.data.BPTTIterator.splits(
    (train, val, test), **count_iter_opts)

# Fit the trigram model on the training stream.
# NOTE(review): the two set_alpha arguments are presumably interpolation
# weights -- confirm against the Trigram implementation.
tgram = Trigram(TEXT)
tgram.train_counts(train_iter)
tgram.set_alpha(0.25, 0.25)

Iteration 0


In [269]:
# Single-stream validation iterator with 10000-token chunks
val_iter_opts = dict(batch_size=1, device=-1, bptt_len=10000, repeat=False)
_, val_iter, _ = torchtext.data.BPTTIterator.splits(
    (train, val, test), **val_iter_opts)

# Candidate (a1, a2) settings to score. For a full grid search use
# itertools.product(np.arange(0.1, 1, 0.1), repeat=2) instead.
alpha_candidates = [(0.3, 0.5)]
for (a1, a2) in alpha_candidates:
    # Only settings whose two weights sum to strictly less than 1 are evaluated
    if a1 + a2 < 1:
        tgram.set_alpha(a1, a2)
        le = LangEvaluator(tgram, TEXT, evalmetric='perplexity')
        print(a1, a2, le.evaluate(val_iter))

Iteration 0
0.3 0.5 191.339042201


In [294]:
class NNLM(nn.Module):
    """Feed-forward neural n-gram language model implemented with convolutions.

    Every position t of the input receives a distribution over the vocabulary
    computed from the K words that precede it (K = kernel size), in the spirit
    of Bengio et al.

    Args:
        TEXT: object exposing ``TEXT.vocab.vectors``, a [V, D] tensor of
            pretrained embeddings (V = vocabulary size, D = embedding dim).
        **kwargs:
            activation: non-linearity applied after the inner convolutions
                (default ``torch.tanh``).
            direct_cxn: if True, add a direct connection from the embedded
                context to the output scores (default False).
            train_embeddings: whether the embeddings receive gradients
                (default True).
            dropout: probability for ``self.dropout`` (default 0.5). The layer
                is created but not applied in ``forward``.
    """

    def __init__(self, TEXT, **kwargs):
        super(NNLM, self).__init__()

        # Save parameters:
        self.activation = kwargs.get('activation', torch.tanh)
        self.direct_cxn = kwargs.get('direct_cxn', False)

        # V is size of vocab, D is dim of embedding
        V, D = TEXT.vocab.vectors.size()
        self.embeddings = nn.Embedding(V, D)
        # Clone so that training never modifies TEXT.vocab.vectors in place
        # (otherwise a second model would start from already-trained vectors).
        self.embeddings.weight = nn.Parameter(
            TEXT.vocab.vectors.clone(),
            requires_grad=kwargs.get('train_embeddings', True))

        in_channels = 1
        out_channels = 60
        self.kernel_sizes_inner = [6]
        self.kernel_size_direct = 6

        # List of convolutional layers; each kernel spans K words and the
        # full embedding dimension.
        self.convs_inner = nn.ModuleList(
            [nn.Conv2d(in_channels, out_channels, (K, D), padding=(K, 0))
             for K in self.kernel_sizes_inner])

        # Direct connection: maps the K-word context straight to V scores.
        self.conv_direct = nn.Conv2d(
            in_channels, V, (self.kernel_size_direct, D),
            padding=(self.kernel_size_direct, 0))

        self.dropout = nn.Dropout(kwargs.get('dropout', 0.5))

        self.linear = nn.Linear(len(self.kernel_sizes_inner) * out_channels, V)

    @staticmethod
    def _causal_conv(conv, emb, kernel_size):
        """Apply ``conv`` to ``emb`` so that output t only sees words t-K..t-1.

        With padding K on both sides the convolution yields sent_len + K + 1
        positions; dropping the last K + 1 leaves sent_len positions, each
        looking only at the K preceding words (zeros before the sequence start).

        Returns a tensor of shape [btch_sz, conv.out_channels, sent_len].
        """
        return conv(emb).squeeze(3)[:, :, :-(kernel_size + 1)]

    # x is [batch_sz, sent_len]: words are encoded as integers (indices)
    def forward(self, x):
        """Return log-probabilities of shape [btch_sz, sent_len, V]."""
        # [btch_sz, in_channels, sent_len, D]
        emb = self.embeddings(x).unsqueeze(1)

        # [btch_sz, out_channels, sent_len] * len(kerns)
        hidden = [self.activation(self._causal_conv(conv, emb, K))
                  for K, conv in zip(self.kernel_sizes_inner, self.convs_inner)]
        # [btch_sz, sent_len, out_channels * len(kerns)]
        hidden = torch.cat(hidden, 1).permute(0, 2, 1)

        # hidden = self.dropout(hidden) # Bengio et al. doesn't mention dropout
        # (it hadn't been 'discovered')

        # [btch_sz, sent_len, V]
        logits = self.linear(hidden)  # has a bias term

        if self.direct_cxn:
            # The direct connection reads the embeddings, not the logits.
            # [btch_sz, V, sent_len] -> [btch_sz, sent_len, V]
            direct = self._causal_conv(self.conv_direct, emb,
                                       self.kernel_size_direct)
            logits = logits + direct.permute(0, 2, 1)

        return F.log_softmax(logits, dim=2)

In [296]:
class LangTrainer2(object):
    """Minimal training loop for a language model that maps word indices of
    shape [batch_sz, sent_len] to log-probabilities [batch_sz, sent_len, V].

    Args:
        TEXT: stored on the instance as ``_TEXT``; not otherwise used here.
        model: the ``nn.Module`` to train.
        **kwargs:
            optimizer: optimizer class (default ``optim.SGD``).
            lr: learning rate (default 0.1).
            cuda: use the GPU when one is available (default True).
            clip_norm: maximum gradient norm (default 5).

    Attributes:
        training_losses: loss value recorded at every iteration.
        training_norms: gradient norm (before clipping) at every iteration.
    """

    def __init__(self, TEXT, model, **kwargs):
        # Settings:
        optimizer = kwargs.get('optimizer', optim.SGD)
        self.cuda = kwargs.get('cuda', True) and \
            torch.cuda.is_available()
        self.clip_norm = kwargs.get('clip_norm', 5)

        self._TEXT = TEXT
        self.model = model
        # TODO: implement validation thing for early stopping
        self.training_losses = list()
        self.training_norms = list()
        if self.cuda:
            self.model.cuda()

        # Built after the (optional) move to GPU; frozen parameters are skipped.
        self._optimizer = optimizer(self._trainable_params(),
                                    lr=kwargs.get('lr', 0.1))

    def _trainable_params(self):
        """Return the list of model parameters that require gradients."""
        return [p for p in self.model.parameters() if p.requires_grad]

    # Here batch is output from a RNN/NNLM/Trigram model:
    # [..., size_vocab], and output are the real words: [...]
    @staticmethod
    def loss_nll(batch, output, mode='mean'):
        """Negative log-likelihood of the true words under the model.

        Args:
            batch: log-probabilities from the model, shape [..., size_vocab].
            output: indices of the true words, shape [...].
            mode: 'mean' averages over all words; anything else sums.
        """
        # [batch_size * sent_len, size_vocab]
        vocab_len = batch.size()[-1]
        log_probs = batch.contiguous().view(-1, vocab_len)
        # [batch_size * sent_len, 1]
        targets = output.contiguous().view(-1, 1)
        # Pick the log-probability assigned to each true word
        word_nll = -1 * torch.gather(log_probs, 1, targets)
        if mode == 'mean':
            return torch.mean(word_nll)
        return torch.sum(word_nll)

    @staticmethod
    def loss_perplexity(*args):
        """exp of ``loss_nll(*args)``; a static method, so no ``self`` here."""
        return torch.exp(LangTrainer2.loss_nll(*args))

    def get_feature(self, batch):
        """Transpose ``batch.text`` into a contiguous tensor."""
        return torch.t(batch.text.data).contiguous()

    # The labels we use as the true words: same as features
    def get_label(self, batch):
        return self.get_feature(batch)

    # We are doing a slightly funky thing of taking a
    # variable's data and then making a new
    # variable...this seems cleaner though
    def make_loss(self, batch):
        """Run the model on one batch and return its mean NLL loss."""
        feature, label = self.get_feature(batch), self.get_label(batch)
        if self.cuda:
            feature, label = feature.cuda(), label.cuda()
        var_feature = autograd.Variable(feature)
        var_label = autograd.Variable(label)
        # First argument: model log-probabilities; second: true word indices
        return self.loss_nll(self.model(var_feature), var_label)

    def train(self, train_iter, **kwargs):
        """Run up to ``num_iter`` (default 100) optimisation steps.

        Progress is printed every ``skip_iter`` (default 10) iterations. When
        ``train_iter`` runs out it is restarted; if it cannot be restarted
        (or is empty), training stops early.
        """
        num_iter = kwargs.get('num_iter', 100)
        skip_iter = kwargs.get('skip_iter', 10)
        batches = iter(train_iter)
        for i in range(num_iter):
            try:
                batch = next(batches)
            except StopIteration:
                # Start another pass over the data
                batches = iter(train_iter)
                try:
                    batch = next(batches)
                except StopIteration:
                    break

            self.model.zero_grad()
            loss = self.make_loss(batch)
            # Gradients must exist before they can be clipped
            loss.backward()

            # Norm clipping (returns the total norm measured before clipping)
            norm = nn.utils.clip_grad_norm_(self._trainable_params(),
                                            self.clip_norm)

            self.training_losses.append(loss.item())
            self.training_norms.append(float(norm))
            if i % skip_iter == 0:
                print('Iteration %d, loss: %f, norm: %f' % (i, self.training_losses[-1],
                                                           self.training_norms[-1]))
            self._optimizer.step()
            

In [127]:
# Sanity check: a tuple built from a LongTensor compares equal to a tuple
# built from a NumPy array holding the same integers.
a, b = torch.LongTensor([1, 2, 3]), np.array([1, 2, 3])
c, d = tuple(a), tuple(b)
print(d == c)

True


## IGNORE STUFF BELOW HERE

In [128]:
    # NOT USED!
    def get_ngrams(self, arr, n=3):
        """Return all groups of n consecutive items of a 1-D numpy array.

        Windows overlap: row r of the result is arr[r], arr[r+1], ..., arr[r+n-1].

        Args:
            arr: 1-D numpy array.
            n: window length (default 3).

        Returns:
            Array of shape [len(arr) - n + 1, n]. When arr holds fewer than
            n items the result is empty with shape [0, n].
        """
        # Clamp at zero so that short inputs give an empty result rather than
        # a confusing reshape/tile error.
        len_ngrams = max(arr.shape[0] - n + 1, 0)
        # Broadcasting a column of window starts against a row of offsets
        # yields the [len_ngrams, n] grid of indices.
        ngram_inds = np.arange(len_ngrams)[:, None] + np.arange(n)[None, :]
        return np.take(arr, ngram_inds)

In [288]:
# Scratch check of enumerate: each value is doubled and its position is added
a = [1, 2, 3]
b = list(map(lambda pair: pair[0] + 2 * pair[1], enumerate(a)))
print(b)

[2, 5, 8]


In [285]:
# Shape of the loaded embedding matrix: [vocabulary size, embedding dim]
print(TEXT.vocab.vectors.shape)

torch.Size([10001, 300])
