In [1]:
# Text processing library
import torchtext
from torchtext.vocab import Vectors
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import itertools as it
from models import *
from helpers import *
# Debug switch read by the data-loading cell: when True, the vocabulary is
# built with max_size=100 so that experiments run quickly.
debug = True

In [2]:
# Our input $x$
TEXT = torchtext.data.Field()

# Data distributed with the assignment.
# NOTE(review): `test` is read from "valid.txt", the same file as `validation`
# -- confirm this is intentional (e.g. no held-out test file is available).
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path=".",
    train="train.txt", validation="valid.txt", test="valid.txt", text_field=TEXT)

# Build the vocabulary exactly once. In debug mode it is capped with
# max_size=100; otherwise max_size=None leaves it uncapped. (Previously the
# full vocabulary was always built first and then rebuilt when debugging,
# which wasted a complete pass over the training corpus.)
TEXT.build_vocab(train, max_size=100 if debug else None)

train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=10, device=-1, bptt_len=32, repeat=False)

# Attach pretrained word embeddings to the vocabulary that was just built
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))

In [3]:
# Peek at the first test batch and decode two of its columns back into words.
# The iterator gets its own name so that it does not overwrite the
# `itertools as it` alias created in the import cell.
test_batches = iter(test_iter)
batch = next(test_batches)
print(batch.text.size())
# Each column of batch.text is one sequence of word indices
for col in (4, 5):
    print(' '.join([TEXT.vocab.itos[i] for i in batch.text[:,col].data]))

torch.Size([32, 10])
<unk> some <unk> <unk> <eos> last year the <unk> and <unk> <unk> of <unk> co. the <unk> <unk> <unk> <unk> a <unk> to its <unk> <unk> <unk> <eos> <unk> <unk> a <unk>
more <unk> than <unk> <unk> <unk> said <unk> <unk> <unk> <unk> <unk> the <unk> 's <unk> <unk> <eos> the <unk> <unk> <unk> in the <unk> <unk> <unk> <unk> <unk> $ N billion


Perplexity goals:
count: 120-200
feedforward: 100-150
recurrent: below 100 (between 80-100)

In [4]:
class EmbeddingsLM(nn.Module):
    """Base class for language models that start from a word-embedding layer.

    Sets up the pieces shared by the concrete models:

    * ``self.dropout``    -- ``nn.Dropout`` module
    * ``self.V``          -- vocabulary size (rows of ``TEXT.vocab.vectors``)
    * ``self.D``          -- embedding dimension
    * ``self.embeddings`` -- ``nn.Embedding(V, D)`` with a max-norm constraint

    Args:
        TEXT: object exposing ``TEXT.vocab.vectors`` as a ``[V, D]`` tensor.
        **kwargs: optional settings
            dropout (float): dropout probability, default 0.5.
            max_embed_norm (float): ``max_norm`` given to ``nn.Embedding``,
                default 10.
            pretrain_embeddings (bool): initialise the embedding weights from
                ``TEXT.vocab.vectors`` (default True); otherwise use a freshly
                initialised embedding of width ``word_features``.
            train_embeddings (bool): whether pretrained weights receive
                gradients, default True.
            word_features (int): embedding width when not using pretrained
                vectors, default 100.
    """

    def __init__(self, TEXT, **kwargs):
        super(EmbeddingsLM, self).__init__()
        # Initialize dropout
        self.dropout = nn.Dropout(kwargs.get('dropout', 0.5))

        # V is size of vocab, D is dim of embedding
        self.V = TEXT.vocab.vectors.size()[0]
        max_embed_norm = kwargs.get('max_embed_norm', 10)
        if kwargs.get('pretrain_embeddings', True):
            self.D = TEXT.vocab.vectors.size()[1]
            self.embeddings = nn.Embedding(self.V, self.D, max_norm=max_embed_norm)
            # Copy the pretrained vectors instead of wrapping them directly:
            # nn.Parameter shares storage with the tensor it is given, and an
            # Embedding with max_norm renormalises its weight in place on every
            # lookup (training updates it too). Without the clone, those writes
            # would silently corrupt TEXT.vocab.vectors for any later model.
            self.embeddings.weight = nn.Parameter(
                TEXT.vocab.vectors.clone(),
                requires_grad=kwargs.get('train_embeddings', True))
        else:
            self.D = kwargs.get('word_features', 100)
            self.embeddings = nn.Embedding(self.V, self.D, max_norm=max_embed_norm)
        

class NNLM(EmbeddingsLM):
    """Feed-forward neural language model implemented with convolutions.

    Each position's prediction is computed from a fixed window of preceding
    word embeddings: a ``Conv2d`` whose kernel spans ``K`` words and the full
    embedding width ``D`` plays the role of the hidden layer, followed by
    dropout and a linear projection to vocabulary logits. Optionally a second,
    bias-free convolution maps the embeddings straight to vocabulary logits
    (the "direct" connection) and is added to the projection.

    Args:
        TEXT: forwarded to ``EmbeddingsLM``.
        **kwargs: forwarded to ``EmbeddingsLM``; additionally
            activation (callable): hidden non-linearity, default ``F.tanh``.
            hidden_size (int): number of hidden units (conv output channels),
                default 100.
            kern_size_inner (int): context window of the hidden layer,
                default 5.
            kern_size_direct (int): context window of the direct connection;
                values <= 0 (default -1) disable it.
    """

    def __init__(self, TEXT, **kwargs):
        # sets up self.embeddings, self.D, self.V, self.dropout
        super(NNLM, self).__init__(TEXT, **kwargs)

        # Save parameters:
        self.activation = kwargs.get('activation', F.tanh)

        in_channels = 1
        out_channels = kwargs.get('hidden_size', 100)
        self.kernel_sizes_inner = [kwargs.get('kern_size_inner', 5)]
        self.kernel_size_direct = kwargs.get('kern_size_direct', -1)

        # List of convolutional layers. Padding by K rows on both ends and
        # trimming the last K+1 outputs in forward() leaves exactly sent_len
        # outputs, each computed from the K embeddings before its position.
        self.convs_inner = nn.ModuleList(
            [nn.Conv2d(in_channels, out_channels, (K, self.D),
                       padding=(K, 0)) for K in self.kernel_sizes_inner])
        if self.kernel_size_direct > 0:
            # Bias is already in self.linear, so don't put another here
            self.conv_direct = nn.Conv2d(
                in_channels, self.V, (self.kernel_size_direct, self.D),
                padding=(self.kernel_size_direct, 0), bias=False)

        self.linear = nn.Linear(len(self.kernel_sizes_inner) * out_channels,
                                self.V)

    # x is [batch_sz, sent_len]: words are encoded as integers (indices)
    def forward(self, x):
        """Return log-probabilities over the vocabulary, [btch_sz, sent_len, V]."""
        # [btch_sz, in_channels, sent_len, D]; kept around because the direct
        # connection must read the embeddings, not the projected logits.
        embedded = self.embeddings(x).unsqueeze(1)

        # [btch_sz, out_channels, sent_len] * len(kerns)
        hidden = [self.activation(conv(embedded)).squeeze(3)[:, :, :-(K + 1)]
                  for K, conv in zip(self.kernel_sizes_inner, self.convs_inner)]
        # [btch_sz, out_channels * len(kerns), sent_len]
        hidden = torch.cat(hidden, 1)
        # [btch_sz, sent_len, out_channels * len(kerns)]
        hidden = hidden.permute(0, 2, 1)

        # Bengio et al. doesn't mention dropout (it hadn't been 'discovered')
        hidden = self.dropout(hidden)

        # [btch_sz, sent_len, V]
        logits = self.linear(hidden)  # has a bias term

        if self.kernel_size_direct > 0:
            # conv_direct was built for 4-D embedded input (1 channel, width D).
            # Feeding it the 3-D logits, as the code used to, cannot work.
            # [btch_sz, V, sent_len]
            direct = self.conv_direct(embedded).squeeze(3)
            direct = direct[:, :, :-(self.kernel_size_direct + 1)]
            # [btch_sz, sent_len, V]
            logits = logits + direct.permute(0, 2, 1)

        return F.log_softmax(logits, dim=2)

class LSTMLM2(EmbeddingsLM):
    """LSTM language model: embeddings -> LSTM -> dropout -> linear -> log-softmax.

    Args:
        TEXT: forwarded to ``EmbeddingsLM``.
        **kwargs: forwarded to ``EmbeddingsLM``; additionally
            hidden_dim (int): LSTM hidden size, default 650.
            num_layers (int): number of stacked LSTM layers, default 1.
            dropout (float): also used as the LSTM's inter-layer dropout when
                ``num_layers > 1``, default 0.5.
    """

    def __init__(self, TEXT, **kwargs):
        # sets up self.D, self.V, self.embeddings, self.dropout
        super(LSTMLM2, self).__init__(TEXT, **kwargs)

        # Save parameters:
        self.hidden_dim = kwargs.get('hidden_dim', 650)
        self.num_layers = kwargs.get('num_layers', 1)

        # nn.LSTM applies its dropout only between stacked layers, so with a
        # single layer a non-zero value does nothing except trigger a warning.
        # Pass it only when there is more than one layer; the dropout on the
        # LSTM output is handled separately by self.dropout in forward().
        lstm_dropout = kwargs.get('dropout', 0.5) if self.num_layers > 1 else 0

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(self.D, self.hidden_dim,
                            num_layers=self.num_layers,
                            dropout=lstm_dropout,
                            batch_first=True)

        # The linear layer that maps from hidden state space to label space
        self.linear = nn.Linear(self.hidden_dim, self.V)

    def forward(self, x, hidden=None):
        """Run the model on a batch of word indices.

        Args:
            x: [batch_sz, sent_len] tensor of word indices.
            hidden: ``(h_0, c_0)`` tuple, each [num_layers, batch_sz,
                hidden_dim] -- ``batch_first`` only affects the input/output
                tensors, not the states. ``None`` (default) lets the LSTM
                start from zero states.

        Returns:
            ``(log_probs, hidden_out)`` where ``log_probs`` is
            [batch_sz, sent_len, V] and ``hidden_out`` is the ``(h_n, c_n)``
            tuple returned by the LSTM.
        """
        embedded = self.embeddings(x)  # [btch_sz, sent_len, D]

        # lstm_out is [batch_sz, sent_len, hidden_dim]
        lstm_out, hidden_out = self.lstm(embedded, hidden)
        lstm_out = self.dropout(lstm_out)

        pred = self.linear(lstm_out)  # [batch_sz, sent_len, V]
        return F.log_softmax(pred, dim=2), hidden_out

In [5]:
# Re-create the BPTT iterators (same settings as the data-loading cell) and
# keep only the training one; the validation/test iterators are discarded.
train_iter, _, _  = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=10, device=-1, bptt_len=32, repeat=False)
# Instantiate the LSTM language model with all of its default hyperparameters
model_lstmlm = LSTMLM2(TEXT)
# NOTE(review): LangTrainer is not defined in this notebook; presumably it
# comes from the star imports of `models` / `helpers` -- confirm. The meaning
# of `use_hidden` and `num_iter` is defined there.
trainer = LangTrainer(TEXT, model_lstmlm, use_hidden=True, num_iter=10)
trainer.train(train_iter)

CUDA is unavailable...
<torchtext.data.batch.Batch object at 0x102ff0a90>
<torchtext.data.batch.Batch object at 0x10a651908>
<torchtext.data.batch.Batch object at 0x102ff0a90>
<torchtext.data.batch.Batch object at 0x10a651a90>
<torchtext.data.batch.Batch object at 0x102ff0a90>
<torchtext.data.batch.Batch object at 0x10a651b38>
<torchtext.data.batch.Batch object at 0x102ff0a90>
<torchtext.data.batch.Batch object at 0x10a651080>
<torchtext.data.batch.Batch object at 0x102ff0a90>
<torchtext.data.batch.Batch object at 0x10a651908>
<torchtext.data.batch.Batch object at 0x102ff0a90>
<torchtext.data.batch.Batch object at 0x10a651a90>
<torchtext.data.batch.Batch object at 0x102ff0a90>
<torchtext.data.batch.Batch object at 0x10a651b38>
<torchtext.data.batch.Batch object at 0x102ff0a90>
<torchtext.data.batch.Batch object at 0x10a651080>
<torchtext.data.batch.Batch object at 0x102ff0a90>
<torchtext.data.batch.Batch object at 0x10a651908>
<torchtext.data.batch.Batch object at 0x102ff0a90>
<torchte

KeyboardInterrupt: 

In [16]:
# Re-create the iterators with batch_size=1 and a very long BPTT window
# (bptt_len=10000) and keep only the validation one. This rebinds the
# `val_iter` created in the data-loading cell.
_, val_iter, _  = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=1, device=-1, bptt_len=10000, repeat=False)
# NOTE(review): LangEvaluator is not defined in this notebook; presumably it
# comes from the star imports of `models` / `helpers` -- confirm.
le = LangEvaluator(model_lstmlm, TEXT, evalmetric='perplexity')
print(le.evaluate(val_iter))

RuntimeError: size mismatch, m1: [10000 x 500], m2: [50 x 10001] at /Users/soumith/code/builder/wheel/pytorch-src/torch/lib/TH/generic/THTensorMath.c:1416

## IGNORE STUFF BELOW HERE

In [128]:
    # NOT USED!
    # Here arr is a 1-D numpy array; this returns 
    # groups of n consecutive words (with overlapping)
    def get_ngrams(self, arr, n=3):
        """Return every window of ``n`` consecutive elements of ``arr``.

        Args:
            arr: 1-D numpy array.
            n: window length (default 3).

        Returns:
            Array of shape ``(max(len(arr) - n + 1, 0), n)`` whose row ``i`` is
            ``arr[i:i + n]``; consecutive rows overlap by ``n - 1`` elements.
            When ``arr`` has fewer than ``n`` elements there are no windows and
            an empty ``(0, n)`` array is returned.
        """
        # Clamp at zero so that a too-short array yields an empty result
        # rather than an obscure reshape/tile error on a negative count.
        num_windows = max(arr.shape[0] - n + 1, 0)
        # Broadcasting a column of window starts against a row of in-window
        # offsets gives the full (num_windows, n) index grid.
        starts = np.arange(num_windows)[:, np.newaxis]
        offsets = np.arange(n)[np.newaxis, :]
        return np.take(arr, starts + offsets)

In [288]:
# Scratch check: a comprehension can combine each element with its index
a = list(range(1, 4))
b = [idx + 2 * val for idx, val in enumerate(a)]
print(b)

[2, 5, 8]


In [285]:
# Shape of the embedding matrix attached to the vocabulary by load_vectors
print(TEXT.vocab.vectors.size())

torch.Size([10001, 300])
