## Setup

In [0]:
!pip install -q torch torchtext opt_einsum
!pip install -qU git+https://github.com/harvardnlp/namedtensor

[?25l[K    17% |█████▌                          | 10kB 19.6MB/s eta 0:00:01[K    34% |███████████                     | 20kB 2.3MB/s eta 0:00:01[K    51% |████████████████▌               | 30kB 3.3MB/s eta 0:00:01[K    68% |██████████████████████          | 40kB 2.1MB/s eta 0:00:01[K    86% |███████████████████████████▌    | 51kB 2.6MB/s eta 0:00:01[K    100% |████████████████████████████████| 61kB 2.8MB/s 
[?25h  Building wheel for opt-einsum (setup.py) ... [?25ldone
[?25h  Building wheel for namedtensor (setup.py) ... [?25ldone
[?25h

In [0]:
import torch
import torchtext
from torchtext.vocab import Vectors
from torch.autograd import Variable
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from namedtensor import ntorch
from google.colab import files
from namedtensor.text import NamedField
import numpy as np
from torchtext.data.iterator import BPTTIterator
from torchtext.data import Batch, Dataset
import math
import matplotlib.pyplot as plt
from google.colab import files # only needed to save and upload files on google colab
import itertools
from collections import defaultdict
from collections import Counter
import heapq


# Our input $x$
# The field declares a single named dimension, "seqlen"; NamedBpttIterator below
# stacks/splits on that name, so it must not be renamed independently.
TEXT = NamedField(names=("seqlen",))

# Fetch dataset
# NOTE(review): only train.txt and valid.txt are referenced by the splits() call
# below; input.txt and train.5k.txt are downloaded but not read in this cell.
!curl -qO https://raw.githubusercontent.com/harvard-ml-courses/cs287-s18/master/HW2/input.txt
!curl -qO https://raw.githubusercontent.com/harvard-ml-courses/cs287-s18/master/HW2/train.5k.txt
!curl -qO https://raw.githubusercontent.com/harvard-ml-courses/cs287-s18/master/HW2/train.txt
!curl -qO https://raw.githubusercontent.com/harvard-ml-courses/cs287-s18/master/HW2/valid.txt
    
# Data distributed with the assignment
# NOTE: the `test` split is built from valid.txt as well, so `val` and `test`
# are two datasets over the same file.
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path=".", 
    train="train.txt", validation="valid.txt", test="valid.txt", text_field=TEXT)

# Build vocab
# The vocabulary is built from the training split only.
TEXT.build_vocab(train)

class NamedBpttIterator(BPTTIterator):
    """BPTT iterator whose batches are built with named-dimension operations.

    Only ``__iter__`` is overridden. Each yielded ``Batch`` carries a ``text``
    window and a ``target`` window taken from the same data, with ``target``
    starting one position later along ``"seqlen"``.
    """
    def __iter__(self):
        """Yield successive ``(text, target)`` windows of at most ``bptt_len`` steps.

        The generator loops forever when ``self.repeat`` is true and returns
        after one pass otherwise.
        """
        # The first example's token list is used as the corpus to batch.
        text = self.dataset[0].text
        TEXT = self.dataset.fields['text']
        # NOTE: this mutates the field object stored on the dataset, so the
        # change is visible to every other user of that field.
        TEXT.eos_token = None
        # Append pad tokens so the token count is an exact multiple of batch_size.
        text = text + ([TEXT.pad_token] * int(math.ceil(len(text) / self.batch_size)
                                              * self.batch_size - len(text)))
        data = TEXT.numericalize(
            [text], device=self.device)
        # Flatten both named dims into one, cut it into `batch_size` pieces and
        # order the dims as ("seqlen", "batch").
        data = (data
            .stack(("seqlen", "batch"), "flat")
            .split("flat", ("batch", "seqlen"), batch=self.batch_size)
            .transpose("seqlen", "batch")
        )

        # Both batch attributes ('text' and 'target') share the same field.
        dataset = Dataset(examples=self.dataset.examples, fields=[
            ('text', TEXT), ('target', TEXT)])
        while True:
            for i in range(0, len(self) * self.bptt_len, self.bptt_len):
                self.iterations += 1
                # Shorten the final window so that `target` (which starts at
                # i + 1) stays inside the data.
                # assumes len(data) is the size of the "seqlen" dimension — TODO confirm
                seq_len = min(self.bptt_len, len(data) - i - 1)
                yield Batch.fromvars(
                    dataset, self.batch_size,
                    text = data.narrow("seqlen", i, seq_len),
                    target = data.narrow("seqlen", i+1, seq_len),
                )
                         
            # Single pass unless the iterator was created with repeat=True.
            if not self.repeat:
                return

# create batches and iterators
# All three iterators use 10 parallel streams and BPTT windows of 32 steps, and
# place their tensors on the GPU (a CUDA device is required). repeat=False makes
# NamedBpttIterator.__iter__ return after a single pass over the data.
train_iter, val_iter, test_iter = NamedBpttIterator.splits(
    (train, val, test), batch_size=10, device=torch.device("cuda"), bptt_len=32, repeat=False)

# global variable
# Size of the vocabulary built from the training split; read as a global by the
# LSTM model and by run_epoch.
vocab_size = len(TEXT.vocab)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  185k  100  185k    0     0  1626k      0 --:--:-- --:--:-- --:--:-- 1626k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  595k  100  595k    0     0  5003k      0 --:--:-- --:--:-- --:--:-- 5003k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4982k  100 4982k    0     0  31.3M      0 --:--:-- --:--:-- --:--:-- 31.3M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  390k  100  390k    0     0  3648k      0 --:--:-- --:--:-- --:--:-- 3683k


## Assignment Task

In this homework you will be building several varieties of language models.

We ask that you construct the following models in Torch:

1. A count-based trigram model with linear interpolation: $$p(y_t | y_{1:t-1}) = \alpha_1 p(y_t | y_{t-2}, y_{t-1}) + \alpha_2 p(y_t | y_{t-1}) + (1 - \alpha_1 - \alpha_2) p(y_t)$$
2. A neural network language model (consult *A Neural Probabilistic Language Model*, http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)
3. An LSTM language model (consult *Recurrent Neural Network Regularization*, https://arxiv.org/pdf/1409.2329.pdf)

 


## Utility Functions

In [0]:
# Repackage a hidden variable
def forget_hist(h):
    """Cut a recurrent state loose from the autograd graph that produced it.

    Used between BPTT windows so that back-propagation stops at the window
    boundary instead of reaching back through every previous batch.

    Args:
        h: a tensor, or a (possibly nested) tuple/list of tensors such as an
            ``(h, c)`` LSTM state pair.

    Returns:
        For a tensor: a tensor that shares storage with ``h`` but has no
        gradient history (``requires_grad`` is False). For a tuple/list: a
        container of the same type with every element detached.
    """
    if isinstance(h, (tuple, list)):
        return type(h)(forget_hist(part) for part in h)
    # detach() replaces the deprecated Variable(h.data) idiom.
    return h.detach()

# Convert an average cost into a perplexity
def perplexity(cost):
    """Return ``exp(cost)`` rounded to two decimal places, as a Python float."""
    exponentiated = np.exp(cost)
    two_decimals = '%.2f' % exponentiated
    return float(two_decimals)

# Plot the per-epoch perplexity curve for one model
def graph_perplexity(epoch_perplexity, model_name):
    """Print the lowest perplexity and draw perplexity against epoch index.

    Args:
        epoch_perplexity: sequence with one perplexity value per epoch.
        model_name: text used as the plot title.
    """
    epoch_indices = range(len(epoch_perplexity))
    print("min perplexity = ", np.min(epoch_perplexity))

    plt.plot(epoch_indices, epoch_perplexity)
    plt.title(model_name)
    plt.ylabel('Perplexity')
    plt.xlabel('Epoch')
    plt.show()
    
def download(file_name):
    """Hand ``file_name`` to ``files.download``.

    ``files`` is the module imported from ``google.colab`` at the top of the
    notebook, so this helper is only usable where that import succeeds.
    """
    files.download(file_name)


## Model 1 - Count-based Trigram with Linear Interpolation

$$p(y_t | y_{1:t-1}) =  \alpha_1 p(y_t | y_{t-2}, y_{t-1}) + \alpha_2 p(y_t | y_{t-1}) + (1 - \alpha_1 - \alpha_2) p(y_t) $$

In [0]:
def addToAnyDict(anyDict, previousWordsTuple,w):
    """Record one occurrence of word ``w`` after the context ``previousWordsTuple``.

    ``anyDict`` maps a context tuple to a ``defaultdict(int)`` of follower
    counts. Besides the per-word count, the reserved key ``-1`` of that inner
    table holds the total number of observations for the context.
    """
    if previousWordsTuple in anyDict:
        followers = anyDict[previousWordsTuple]
    else:
        followers = anyDict[previousWordsTuple] = defaultdict(int)

    # count for this particular follower
    followers[w] += 1
    # running total for the context (stored under the sentinel key -1)
    followers[-1] += 1
    

def getDicts(numDicts, data_iter=None):
    """Build n-gram count tables of order 1..numDicts from a batch iterator.

    Args:
        numDicts: highest n-gram order to count (3 for a trigram model).
        data_iter: iterable of batches whose ``text`` attribute is a named
            tensor with "seqlen" and "batch" dimensions. Defaults to the
            notebook-level ``train_iter``.

    Returns:
        A list ``dicts`` of length ``numDicts``:
        * ``dicts[0]`` is a ``Counter`` of token counts, with the total number
          of tokens stored under the key ``-1``;
        * ``dicts[z]`` for ``z >= 1`` maps a context tuple of ``z`` tokens to
          the follower-count table maintained by ``addToAnyDict``.

    Note:
        n-grams are collected inside each batch row separately, so n-grams that
        straddle two BPTT windows are not counted.
    """
    if data_iter is None:
        data_iter = train_iter

    dicts = [Counter()] + [{} for _ in range(1, numDicts)]
    totalNumWords = 0

    for b in iter(data_iter):
        for i in range(b.text.shape["batch"]):
            seq = b.text[{"batch": i}].tolist()

            # Update each ngram dict: context of z tokens -> following token
            for z in range(1, numDicts):
                for w in range(len(seq) - z):
                    addToAnyDict(dicts[z], tuple(seq[w:w + z]), seq[w + z])

        # Update the unigram counts once per batch. This used to sit inside the
        # per-row loop, which re-counted the whole batch once for every row.
        values = b.text.values.contiguous().view(-1).tolist()
        dicts[0].update(values)
        totalNumWords += len(values)

    dicts[0][-1] = totalNumWords
    return dicts


# Get prob of word given all the ngrams
def getProbYtFromGrams(dicts, previousWordsTup, yt, alphas):
    """Linearly interpolated n-gram probability of ``yt`` after a context.

    Args:
        dicts: count tables as returned by ``getDicts``; ``dicts[0]`` holds
            unigram counts and ``dicts[z]`` maps a ``z``-token context to its
            follower counts. Totals are stored under the key ``-1``.
        previousWordsTup: tuple with the preceding token ids, oldest first.
            Its length plus one is the n-gram order that is used.
        yt: id of the candidate next token.
        alphas: interpolation weights ordered from the highest order to the
            lowest, i.e. ``alphas[0]`` multiplies the full-context estimate and
            ``alphas[order - 1]`` the unigram estimate.

    Returns:
        The weighted sum of the per-order relative frequencies. An order whose
        context was never observed (or whose total is zero) contributes 0.

    The count tables are only read: lookups use ``.get`` because indexing a
    ``defaultdict(int)`` with an unseen word would insert a zero entry, and
    scoring the whole vocabulary would then bloat every visited table.
    """
    numDicts = len(previousWordsTup) + 1

    totalWords = dicts[0].get(-1, 0)
    gramProbs = [dicts[0].get(yt, 0) / totalWords if totalWords else 0]

    for z in range(1, numDicts):
        # the last z tokens of the context
        tupKey = previousWordsTup[numDicts - z - 1:]
        followers = dicts[z].get(tupKey)
        contextTotal = followers.get(-1, 0) if followers is not None else 0
        gramProbs.append(followers.get(yt, 0) / contextTotal if contextTotal else 0)

    # gramProbs runs from unigram upwards while alphas runs from the highest
    # order downwards, hence the reversed index.
    return sum(gramProbs[i] * alphas[numDicts - i - 1] for i in range(numDicts))


# Rank the vocabulary and return the numWords most likely continuations
def getNextWords(dicts, previousWordsTup, numWords, alphas):
    """Return the ``numWords`` highest-scoring next words as strings.

    Every vocabulary id is scored with ``getProbYtFromGrams`` for the context
    ``previousWordsTup`` and the best ones are mapped back through
    ``TEXT.vocab.itos``, most probable first.
    """
    vocab = TEXT.vocab
    scored = []
    for wordId in range(len(vocab)):
        score = getProbYtFromGrams(dicts, previousWordsTup, wordId, alphas)
        scored.append((wordId, score))

    best = heapq.nlargest(numWords, scored, key=lambda pair: pair[1])
    return [vocab.itos[wordId] for wordId, _ in best]

# Score every vocabulary entry as the next word
def getNextProbs(dicts, previousWordsTup, alphas):
    """Return ``(word_id, probability)`` pairs for the whole vocabulary.

    The list is ordered by word id, from 0 to ``len(TEXT.vocab) - 1``, and each
    probability comes from ``getProbYtFromGrams`` for ``previousWordsTup``.
    """
    scored = []
    for wordId in range(len(TEXT.vocab)):
        score = getProbYtFromGrams(dicts, previousWordsTup, wordId, alphas)
        scored.append((wordId, score))
    return scored

In [0]:
# Trigram
# n is the n-gram order: getDicts(n) returns n count tables (unigram counts
# plus one context table for each order 2..n).
n = 3
# Interpolation weights, highest order first: alphas[0] multiplies the full
# n-gram estimate and alphas[-1] the unigram estimate (see the reversed
# indexing at the end of getProbYtFromGrams).
alphas = [.1,.8,.1]

# 4-gram
# n = 4
# alphas = [.1,.1,.8,.1]
# NOTE(review): the 4-gram weights above add up to 1.1, not 1 — fix before
# enabling this configuration.

dicts = getDicts(n)


In [0]:
# computes ngram perplexity
#
# Scores a prefix of the validation set with the interpolated n-gram model and
# prints exp(mean loss). Reads the notebook-level names n, alphas, dicts and
# val_iter.

numDictsMinusOne = n - 1
criterion = nn.CrossEntropyLoss()
maxPredictions = 500  # stop once more than this many positions have been scored
totalTimes = 0
totalLoss = 0.0
for b in iter(val_iter):
    # Leave the batch loop too once the budget is used up; breaking out of the
    # row loop alone kept iterating over every remaining validation batch.
    if totalTimes > maxPredictions:
        break
    for i in range(b.text.shape["batch"]):
        if totalTimes > maxPredictions:
            break
        seq = b.text[{"batch": i}].tolist()

        for j in range(len(seq) - numDictsMinusOne):
            context = tuple(seq[j:j + numDictsMinusOne])
            nextProbs = getNextProbs(dicts, context, alphas)

            # getNextProbs returns (word_id, prob) pairs ordered by word id.
            # Keep only the probabilities so the tensor is (1, vocab); feeding
            # the pairs directly produced a (1, vocab, 2) tensor with the ids
            # mixed in.
            output = torch.FloatTensor([[prob for _, prob in nextProbs]]).log()
            target = torch.LongTensor([seq[j + n - 1]])

            # CrossEntropyLoss applies log-softmax to the log-probabilities,
            # i.e. it renormalises them over the vocabulary before scoring.
            loss = criterion(output, target)
            totalLoss += loss.item()
            totalTimes += 1

print(np.exp(totalLoss / totalTimes))

## Model 2 - Neural Network Language Model

In [0]:
# Model 2 - A Neural Probabilistic Language Model
class NPLM(nn.Module):
    """Feed-forward n-gram language model built from named-tensor layers.

    Token ids are embedded into ``m`` dimensions, a 1-d convolution with kernel
    width ``n - 1`` over "seqlen" produces ``h`` hidden features per position,
    and after a tanh a linear layer maps them to one score per vocabulary
    entry.

    Args:
        h: number of hidden ("state") features.
        m: embedding size.
        n: n-gram order; each prediction looks at a window of ``n - 1`` tokens.
    """

    def __init__(self, h, m, n):
        super().__init__()
        self.h, self.m, self.n = h, m, n
        self.vocab_size = len(TEXT.vocab)

        self.embedding = ntorch.nn.Embedding(self.vocab_size, self.m)
        self.embedding.weight.requires_grad = True

        # spec(input dim, time dim, output dim)
        window = ntorch.nn.Conv1d(self.m, self.h, self.n - 1)
        self.conv = window.spec("embedding", "seqlen", "state")

        projection = ntorch.nn.Linear(self.h, self.vocab_size)
        self.fc = projection.spec("state", "preds")

    def forward(self, text):
        """Score the next token at every position of ``text``.

        ``text`` is a named tensor with dims ("seqlen", "batch"). The result is
        transposed to ("seqlen", "batch", "preds").
        """
        # Prepend n - 2 rows of <pad> along "seqlen" before the width n - 1
        # convolution (presumably so the output keeps the input's seqlen —
        # TODO confirm against the Conv1d stride/padding defaults).
        pad_id = TEXT.vocab.stoi["<pad>"]
        front_pad = nn.ConstantPad2d((0, 0, self.n - 2, 0), pad_id)
        padded = ntorch.tensor(front_pad(text._tensor), names=("seqlen", "batch"))

        embedded = self.embedding(padded).transpose("batch", "embedding", "seqlen")
        hidden = self.conv(embedded).tanh()
        scores = self.fc(hidden)
        return scores.transpose("seqlen", "batch", "preds")
        
# Train the NPLM with Adadelta and report the mean batch loss after each epoch.
lr = 0.5

nplm = NPLM(h=50, m=60, n=6).cuda()
criterion = ntorch.nn.CrossEntropyLoss().spec("preds")
optimizer = optim.Adadelta(nplm.parameters(), lr=lr)

epochs = 75

for i in range(epochs):
    # Accumulate a plain Python float. Summing the loss tensors themselves
    # keeps every batch's autograd graph alive until the end of the epoch.
    total_loss = 0.0
    for batch in train_iter:
        nplm.zero_grad()
        preds = nplm(batch.text.cuda())
        # Merge "seqlen" and "batch" so every position is one classification example.
        loss = criterion(preds.stack(("seqlen", "batch"), "batch"), batch.target.stack(("seqlen", "batch"), "batch"))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print("total loss", total_loss/len(train_iter))

total loss 6.404776920180723
total loss 6.014144524526678
total loss 5.899896460843373
total loss 5.822559434165233
total loss 5.764310590576592
total loss 5.717507530120482
total loss 5.678421498493976
total loss 5.64483850580895
total loss 5.615340267319277
total loss 5.5889337483864026
total loss 5.564986956755594
total loss 5.543106914264199
total loss 5.522956782487091
total loss 5.504269309380379
total loss 5.486844139952668
total loss 5.470476549053356
total loss 5.45496954335198
total loss 5.440308667706541
total loss 5.426422991071429
total loss 5.413234859079174
total loss 5.400747969556798
total loss 5.388891055292599
total loss 5.3776002447289155
total loss 5.366818725796041
total loss 5.3564900225903616
total loss 5.34657110585198
total loss 5.33703743545611
total loss 5.3278456459767645
total loss 5.318955397482788
total loss 5.310372740963856
total loss 5.302085910606713
total loss 5.294073391781412
total loss 5.286319720847676
total loss 5.278820863812393
total loss 5.2

## Model 3 - LSTM Language Model

In [0]:
# LSTM Language Model
class LSTM(nn.Module):
    """Stacked LSTM language model with dropout around the recurrent layers.

    The module also carries the training hyperparameters (``lr``,
    ``grad_clip``, ``num_epochs``, ``lr_decay``, ``epoch_flat_lr``) as
    attributes so the training helpers can read them from the model.
    The vocabulary size comes from the notebook-level ``vocab_size``.
    """

    def __init__(self):
        super().__init__()

        # Settings shared by every configuration
        self.num_layers = 2
        self.lr = 1

        # Zaremba et al. "medium" configuration
        self.input_size = 650
        self.drop_prob = 0.50
        self.grad_clip = 5
        self.num_epochs = 39
        self.lr_decay = 1.2
        self.epoch_flat_lr = 6.0
        init_range = 0.05

        # Zaremba et al. "large" configuration, kept for reference:
        #   input_size = 1500, init_range = 0.04, drop_prob = 0.65,
        #   grad_clip = 10, num_epochs = 55, lr_decay = 1.5,
        #   epoch_flat_lr = 14.0

        width = self.input_size  # embedding size and hidden size are tied
        self.dropout = nn.Dropout(self.drop_prob)
        self.lstm = nn.LSTM(
            input_size=width,
            hidden_size=width,
            num_layers=self.num_layers,
            dropout=self.drop_prob,
        )
        self.embeddings = nn.Embedding(vocab_size, width)
        self.embeddings.weight.data.uniform_(-init_range, init_range)
        self.lin_trans = nn.Linear(width, vocab_size)
        self.lin_trans.weight.data.uniform_(-init_range, init_range)

    def forward(self, inputs, h, c):
        """Return per-position vocabulary scores and the updated state.

        Args:
            inputs: tensor of token ids fed to the embedding layer.
            h, c: hidden and cell state passed to ``nn.LSTM``.

        Returns:
            ``(scores, (h, c))`` where ``scores`` has been flattened to two
            dimensions with ``vocab_size`` columns.
        """
        embedded = self.embeddings(inputs)
        lstm_out, (h, c) = self.lstm(self.dropout(embedded), (h, c))
        flat = self.dropout(lstm_out).view(-1, self.input_size)
        return self.lin_trans(flat), (h, c)

In [0]:
def run_epoch(lstm_model, data_iter, is_training):
    """Run one pass over ``data_iter`` and return the resulting perplexity.

    Args:
        lstm_model: model exposing ``num_layers``, ``input_size``, ``lr`` and
            ``grad_clip`` and callable as ``model(inputs, h, c)``.
        data_iter: BPTT iterator to consume; its ``batch_size`` and
            ``bptt_len`` define the batch geometry.
        is_training: when true the model is put in train mode and updated with
            clipped plain SGD after every batch; otherwise it is put in eval
            mode and no gradients are tracked.

    Returns:
        ``perplexity`` of the mean per-batch loss.

    Uses the notebook-level ``criterion`` as the loss and requires CUDA.
    """
    # Take the batch geometry from the iterator that is actually consumed
    # rather than from the global training iterator.
    batch_size = data_iter.batch_size
    num_steps = data_iter.bptt_len

    if is_training:
        lstm_model.train()
    else:
        lstm_model.eval()

    # Init hidden and cell states
    h = torch.zeros(lstm_model.num_layers, batch_size, lstm_model.input_size).cuda()
    c = torch.zeros(lstm_model.num_layers, batch_size, lstm_model.input_size).cuda()

    costs = 0.0
    # Only build autograd graphs when they are needed for a backward pass.
    with torch.set_grad_enabled(is_training):
        for batch in data_iter:

            # Pad text and target at the front up to num_steps positions.
            # The filler id is 1 (presumably the <pad> index — TODO confirm);
            # padded positions still contribute to the loss.
            if batch.text.shape["seqlen"] != num_steps:
                pad_text = ntorch.ones((num_steps - batch.text.shape["seqlen"],batch.text.shape["batch"]), names=("seqlen","batch")).long().cuda()
                batch.text = ntorch.cat((pad_text, batch.text), dim="seqlen")
                pad_target = ntorch.ones((num_steps - batch.target.shape["seqlen"],batch.target.shape["batch"]), names=("seqlen","batch")).long().cuda()
                batch.target = ntorch.cat((pad_target, batch.target), dim="seqlen")

            lstm_model.zero_grad()
            # Carry the state across windows but cut its gradient history.
            h, c = forget_hist(h), forget_hist(c)
            inputs = batch.text.values.contiguous()
            outputs, (h, c) = lstm_model(inputs, h, c)

            # The model already returns (positions, vocab); flatten the targets to match.
            outputs = outputs.view(-1, outputs.size(-1))
            targets = batch.target.values.contiguous().view(-1)

            loss = criterion(outputs, targets)
            costs += loss.item()

            if is_training:
                loss.backward()
                nn.utils.clip_grad_norm_(lstm_model.parameters(), lstm_model.grad_clip)

                # Plain SGD step. sub_() avoids the deprecated
                # add_(scalar, tensor) overload; parameters that received no
                # gradient are skipped.
                for p in lstm_model.parameters():
                    if p.grad is not None:
                        p.data.sub_(lstm_model.lr * p.grad.data)

    return perplexity(costs/len(data_iter))

In [0]:
# Train model
def train_lstm(lstm_model):
    """Train ``lstm_model`` on ``train_iter`` for ``lstm_model.num_epochs`` epochs.

    From the first epoch whose zero-based index exceeds
    ``lstm_model.epoch_flat_lr`` onwards, ``lstm_model.lr`` is divided by
    ``lstm_model.lr_decay`` before each epoch (the model is mutated in place).

    Returns:
        ``(lstm_model, perplexities)`` with one training perplexity per epoch.
    """
    print("Training...")

    history = []
    total_epochs = lstm_model.num_epochs
    for epoch_index in range(total_epochs):

        # Learning-rate schedule: flat at first, then geometric decay
        if epoch_index > lstm_model.epoch_flat_lr:
            lstm_model.lr = lstm_model.lr / lstm_model.lr_decay

        epoch_perp = run_epoch(lstm_model, train_iter, True)
        history.append(epoch_perp)
        print('epoch:{}/{} perplexity:{}'.format(epoch_index + 1, lstm_model.num_epochs, epoch_perp))

    print("Done Training")
    return lstm_model, history
 

# Initialize and Train LSTM model
# run_epoch reads `criterion` as a global, so it has to be rebound here before
# training: the Model 2 cell binds the same name to a different loss object.
criterion = nn.CrossEntropyLoss()
# The model is moved to the GPU; run_epoch creates its state tensors with
# .cuda() as well.
lstm_model = LSTM().cuda()
lstm_model, epoch_perplexity = train_lstm(lstm_model)

# Epoch vs Perplexity visualization
# epoch_perplexity holds the training perplexity returned for each epoch.
model_name = "LSTM LM - Zaremba (Medium) Params"
graph_perplexity(epoch_perplexity, model_name)

Training...
shapes:
torch.Size([320, 10001])
torch.Size([320])
epoch:1/39 perplexity:1.0
shapes:
torch.Size([320, 10001])
torch.Size([320])
epoch:2/39 perplexity:1.0
shapes:
torch.Size([320, 10001])
torch.Size([320])
epoch:3/39 perplexity:1.0
shapes:
torch.Size([320, 10001])
torch.Size([320])
epoch:4/39 perplexity:1.0
shapes:
torch.Size([320, 10001])
torch.Size([320])
epoch:5/39 perplexity:1.0
shapes:
torch.Size([320, 10001])
torch.Size([320])
epoch:6/39 perplexity:1.0
shapes:
torch.Size([320, 10001])
torch.Size([320])
epoch:7/39 perplexity:1.0
shapes:
torch.Size([320, 10001])
torch.Size([320])
epoch:8/39 perplexity:1.0
shapes:
torch.Size([320, 10001])
torch.Size([320])
epoch:9/39 perplexity:1.0
shapes:
torch.Size([320, 10001])
torch.Size([320])
epoch:10/39 perplexity:1.0


KeyboardInterrupt: ignored

In [0]:
# Validate the trained LSTM
def validate_lstm(lstm_model, num_steps, batch_size):
    """Print the perplexity of ``lstm_model`` on ``val_iter``.

    The model is evaluated with ``run_epoch`` in non-training mode.
    ``num_steps`` and ``batch_size`` are accepted but not used.
    """
    val_perplexity = run_epoch(lstm_model, val_iter, False)
    print('LSTM Validation Perplexity:{}'.format(val_perplexity))
    
validate_lstm(lstm_model, 32, 10)

In [0]:
# Generate and download Kaggle predictions
def test_lstm(model, batch_text):
    """Write the top-20 next-word predictions for every test context to a file.

    Args:
        model: trained model exposing ``num_layers`` and ``input_size`` and
            callable as ``model(inputs, h, c)``.
        batch_text: named tensor of token ids with a "batch" dimension. Its raw
            values are transposed before being fed to the model
            (assumes the values are laid out (batch, seqlen) — TODO confirm).

    Side effects:
        Writes ``predictions.txt`` (header ``id,word``, then one line per
        context with 20 space-separated words, ids starting at 1) and passes it
        to ``download``. Requires CUDA.
    """
    predictions = []
    model.eval()

    # Size everything from the argument. The previous version read the global
    # `test_data` and hard-coded the batch and vocabulary sizes.
    num_contexts = batch_text.shape['batch']

    # Init hidden and cell
    h = torch.zeros(model.num_layers, num_contexts, model.input_size).cuda()
    c = torch.zeros(model.num_layers, num_contexts, model.input_size).cuda()

    inputs = batch_text.values.contiguous()

    # Inference only: no autograd graph needed.
    with torch.no_grad():
        outputs, (h, c) = model(inputs.t(), h, c)

    # (positions * contexts, vocab) -> (positions, contexts, vocab)
    last_words = outputs.view(-1, num_contexts, outputs.size(-1))

    # Ids that must never be predicted. The original comment lists them as
    # (unk), (pad) and (eos) — TODO confirm against TEXT.vocab.itos.
    excluded_ids = (0, 1, 3)

    # Fetch the 20 highest scores at the final position of every context
    for word_prob in last_words[-1]:
        word_prob = np.array(word_prob.tolist())

        # The model outputs raw scores, which can be negative, so zeroing an
        # entry does not exclude it. -inf guarantees it ranks last.
        for word_id in excluded_ids:
            word_prob[word_id] = -np.inf

        word_ids = word_prob.argsort()[-20:][::-1]
        entry_pred = ' '.join([TEXT.vocab.itos[i] for i in word_ids])
        predictions.append(entry_pred)

    with open("predictions.txt", "w") as fout:
        print("id,word", file=fout)
        for i, l in enumerate(predictions, 1):
            print("%d,%s"%(i, l), file=fout)

    download('predictions.txt')

# Generate/download Kaggle predictions
test_lstm(lstm_model, test_data)
