## Linear Language Model with Data Loader

Status of Notebook: Work in Progress

Difference from `loglin-lm.ipynb` is that we use a data loader to load the data.

In [1]:
import torch
import random
import torch
import torch.nn as nn
import math
import time
import numpy as np

### Download the Data

In [6]:
# uncomment to download the datasets
#!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/ptb/test.txt
#!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/ptb/train.txt
#!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/ptb/valid.txt

### Process the Data

In [2]:
# function to read in data, process each line and split columns by " ||| "
def read_data(filename):
    data = []
    with open(filename, "r") as f:
        for line in f:
            line = line.strip().split(" ")
            data.append(line)
    return data

# read the data
train_data = read_data('data/ptb/train.txt')
val_data = read_data('data/ptb/valid.txt')

# creating the word and tag indices and special tokens
word_to_index = {}
index_to_word = {}
word_to_index["<s>"] = len(word_to_index)
index_to_word[len(word_to_index)-1] = "<s>"
word_to_index["<unk>"] = len(word_to_index) # add <UNK> to dictionary
index_to_word[len(word_to_index)-1] = "<unk>"

# create word to index dictionary and tag to index dictionary from data
def create_dict(data, check_unk=False):
    for line in data:
        for word in line:
            if check_unk == False:
                if word not in word_to_index:
                    word_to_index[word] = len(word_to_index)
                    index_to_word[len(word_to_index)-1] = word
            
            # has no effect because data already comes with <unk>
            # should work with data without <unk> already processed
            else: 
                if word not in word_to_index:
                    word_to_index[word] = word_to_index["<unk>"]
                    index_to_word[len(word_to_index)-1] = word

create_dict(train_data)
create_dict(val_data, check_unk=True)

# create word and tag tensors from data
def create_tensor(data):
    for line in data:
        yield [word_to_index[word] for word in line]

train_data = [*create_tensor(train_data)]
val_data = [*create_tensor(val_data)]

number_of_words = len(word_to_index)

### Convert data to PyTorch Dataset

In [3]:
from torch.utils.data import Dataset, DataLoader

class PTB(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.as_tensor(self.data[idx])

train_dataset = PTB(train_data)
val_dataset = PTB(val_data)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True)

In our implementation we are using batched training. There are a few differences from the original implementation found [here](https://github.com/neubig/nn4nlp-code/blob/master/02-lm/loglin-lm.py). 

### Define the Model

In [6]:
## define the model

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# length of the n-gram
N = 2

# logilinear model
class LogLinear(nn.Module):
    def __init__(self, number_of_words, ngram_length):
        super(LogLinear, self).__init__()

        # different lookups for each position in the n-gram
        self.embeddings = nn.ModuleList([nn.Embedding(number_of_words, number_of_words) for _ in range(ngram_length)])
        self.bias = torch.zeros(number_of_words, requires_grad=True).type(torch.FloatTensor).to(device)

        # initialize
        for i in range(N):
            nn.init.xavier_uniform_(self.embeddings[i].weight)

    def forward(self, x):
        # calculate score
        embs = torch.cat([lookup(x) for x, lookup in zip(x.T, self.embeddings)]).view(N, x.shape[0], -1) # N x batch_size x embedding_size
        embs = torch.sum(embs, dim=0) # batch_size x embedding_size
        scores = embs + self.bias
        
        return scores

### Model Settings and Functions

In [7]:
model = LogLinear(number_of_words, N)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
criterion = torch.nn.CrossEntropyLoss()

if torch.cuda.is_available():
    model.to(device)

# function to calculate the sentence loss
def calc_sent_loss(sent):
    S = word_to_index["<s>"]
    
    # initial history is equal to end of sentence symbols
    hist = [S] * N
    
    # collect all target and histories
    all_targets = []
    all_histories = []
    
    # step through the sentence, including the end of sentence token
    for next_word in sent + torch.Tensor([S]):
        all_histories.append(list(hist))
        all_targets.append(next_word)
        hist = hist[1:] + [next_word]

    logits = model(torch.LongTensor(all_histories).to(device))
    loss = criterion(logits, torch.LongTensor(all_targets).to(device))

    return loss

MAX_LEN = 100
# Function to generate a sentence
def generate_sent():
    S = word_to_index["<s>"]
    hist = [S] * N
    sent = []
    while True:
        logits = model(torch.LongTensor([hist]).to(device))
        p = torch.nn.functional.softmax(logits) # 1 x number_of_words
        next_word = p.multinomial(num_samples=1).item()
        if next_word == S or len(sent) == MAX_LEN:
            break
        sent.append(next_word)
        hist = hist[1:] + [next_word]
    return sent

### Train the Model

In [9]:
# start training
for ITER in range (10): # CHANGE to 100
    # training

    model.train()
    train_words, train_loss  = 0, 0.0
    for sent_id, sent in enumerate(train_loader):
        
        my_loss = calc_sent_loss(sent[0])
        
        train_loss += my_loss.item()
        train_words += len(sent)

        optimizer.zero_grad()
        my_loss.backward()
        optimizer.step()

        if (sent_id+1) % 5000 == 0:
            print("--finished %r sentences" % (sent_id+1))
    print("iter %r: train loss/word=%.4f, ppl=%.4f" % (ITER, train_loss/train_words, math.exp(train_loss/train_words)))

    # evaluation
    model.eval()
    dev_words, dev_loss = 0, 0.0
    start = time.time()
    for sent_id, sent in enumerate(val_loader):
        my_loss = calc_sent_loss(sent[0])
        dev_loss += my_loss.item()
        dev_words += len(sent)
    print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start))

    # Generate a few sentences
    for _ in range(5):
        sent = generate_sent()
        print(" ".join([index_to_word[x] for x in sent]))

--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 0: train loss/word=9.0947, ppl=8907.6500
iter 0: dev loss/word=9.7668, ppl=17444.9221, time=1.76s
in this case of the trade deficit of the globe weeks columnist months <unk> from a <unk> character succeed reflects as an effort will teaching mr. chestman was essentially flat to deal with the board is this time the <unk> an international <unk> machines are n't being any at this time you were n't disclosed this week it to take over a company said it will introduce a new york <unk> that since friday 's sharp swings in the field sales were down on N at a <unk> company said it will invest in quarterly profit by the new securities
on monday at N yen $ N million navy contract for advanced there were <unk> when he 's no decision has been done by the bush administration has of ne



advertisers and advertising rates for the s&p N issue of the issues <unk> pace with rival very small amounts to veto the constitution <unk> sen coordinator of the big three <unk> the las vegas 's increased <unk> activity is only one or for one thing is important as of as many as N million navy contract for the government is <unk> by mr. <unk> has <unk> business conditions and the earnings or N on the firm of that this is that mr. gorbachev 's economic activity and only half of the proposal to reduce interest rates in the <unk> he
--finished 5000 sentences


KeyboardInterrupt: 