# Word-level Language Modeling

## Overview

LAMBADA is a collection of narrative passages sharing the characteristic that human subjects are able to guess the target word accurately when given sufficient context, but not if they only see the last sentence containing the target word. On average, the context contains 4.6 sentences, and the testing performance is evaluated by having the model predict the last element of the target sentence (i.e. the very last word).

Most existing computational models fail on this task (without the help of an external memory unit, such as a neural cache). See the original LAMBADA [paper](https://arxiv.org/pdf/1606.06031.pdf) for more results on applying RNNs to LAMBADA.

**Examples**:

```
Context: "Yes, I thought I was going to lose the baby." "I was scared too," he stated, sincerity flooding his eyes. "You were?" "Yes, of course. Why do you even ask?" "This baby wasn't exactly planned for."

Target sentence: "Do you honestly think that I would want you to have a ____"

Target word: miscarriage
```

**NOTE**: 

- Just like in a recurrent network implementation, where it is common to repackage hidden units when a new sequence begins, we pass into TCN a sequence `T` consisting of two parts: 
    1) effective history `L1`.
    2) valid sequence `L2`.
```
Sequence [------T------>] = [--L1--> ------L2------>]
```

- In the forward pass, the whole sequence is passed into TCN, but only the `L2` portion is used for training. This ensures that the training data are also provided with sufficient history. The sizes of `T` and `L2` can be adjusted via `SEQ_LEN` and `VALID_SEQ_LEN` in the Settings cell (the `seq_len` and `validseqlen` flags).

- The data to load is specified via `DATA_ROOT` in the Settings cell (the `data` flag), which is the path to the directory containing the data.

## Settings

In [1]:
import torch as th
import torch.nn as nn
from tqdm.notebook import tqdm
import torch.nn.functional as F

# --- Hyperparameters and run configuration for this notebook ---
# Number of rows the training token stream is reshaped into by batchify().
BATCH_SIZE = 20
# Device string passed to `.to(...)` for the training data and the model.
DEVICE = "cuda:1"
# Dropout rate forwarded to TemporalConvNet.
DROPOUT = 0.1
# Dropout rate applied to the embedding output inside TCN.
EMB_DROPOUT = 0.1
# Max gradient norm for clip_grad_norm_; clipping is skipped unless this is > 0.
CLIP = 0.4
# Number of times train() is called (one call per epoch).
EPOCHS = 10
# Convolution kernel size forwarded to TemporalConvNet.
KSIZE = 4
# Directory holding the vocabulary file, the LAMBADA text files and the
# pickled corpus cache.
DATA_ROOT = "/home/densechen/dataset"
# Embedding dimension; also the width of the last TCN level (see CHANNEL_SIZES).
EMSIZE = 500
# Number of entries in CHANNEL_SIZES.
LEVELS = 5
# Learning rate handed to the optimizer.
LR = 4
# Channel width of every TCN level except the last.
NHID = 500
# Seed for torch's global RNG.
SEED = 1111
# Whether the decoder shares its weight matrix with the embedding.
TIED = True
# Name of the optimizer class looked up on th.optim.
OPTIM = "SGD"
# Stride between training windows; for a full-length window only the last
# VALID_SEQ_LEN positions contribute to the loss.
VALID_SEQ_LEN = 50
# Length of each training window, and the length that evaluation passages
# are left-padded to.
SEQ_LEN = 100
# When True, the corpus is rebuilt from the raw text even if a pickled cache
# already exists.
CORPUS = False

# Per-level channel widths for the TCN: NHID for all but the last level,
# EMSIZE for the last one.
CHANNEL_SIZES = [NHID] * (LEVELS - 1) + [EMSIZE]

th.manual_seed(SEED)

<torch._C.Generator at 0x7ff5e80a1990>

## Data Generation

The meaning of batch size here (as in PTB) is different from that in the MNIST example. In MNIST, batch size is the number of samples considered in each iteration; here, however, it is the number of parallel segments the corpus is split into to speed up computation.

As in PTB, the goal is to train a language model to predict the next word.

**NOTE**: You will need to download the lambada dataset from [here](http://clic.cimec.unitn.it/lambada/).

In [3]:
import pickle
import os
import re

def data_generator():
    """Load (or build and cache) the corpus and prepare the data splits.

    The pickled corpus at ``DATA_ROOT/corpus`` is reused when it exists and
    ``CORPUS`` is falsy; otherwise the corpus is rebuilt from the raw files
    and the cache file is (re)written.

    Returns:
        A tuple ``(train_data, val_data, test_data, corpus)`` where
        ``train_data`` is the result of ``batchify`` on the training ids,
        ``val_data`` / ``test_data`` are lists of token-id lists, each
        left-padded with id 0 up to ``SEQ_LEN`` (longer passages are kept
        unchanged), and ``corpus`` is the ``Corpus`` instance.
    """
    cache_path = os.path.join(DATA_ROOT, "corpus")
    if os.path.exists(cache_path) and not CORPUS:
        # NOTE: unpickling can execute arbitrary code; only load a cache file
        # that was produced by this notebook.
        with open(cache_path, "rb") as cache_file:
            corpus = pickle.load(cache_file)
    else:
        print("Creating Corpus...")
        corpus = Corpus(os.path.join(DATA_ROOT,  "lambada-vocab-2.txt"), DATA_ROOT)
        # Context manager guarantees the cache is flushed and closed.
        with open(cache_path, "wb") as cache_file:
            pickle.dump(corpus, cache_file)

    train_data = batchify(corpus.train, BATCH_SIZE)
    # A negative pad count yields an empty prefix, so over-long passages are
    # passed through untouched.
    val_data = [[0] * (SEQ_LEN - len(line)) + line for line in corpus.valid]
    test_data = [[0] * (SEQ_LEN - len(line)) + line for line in corpus.test]
    return train_data, val_data, test_data, corpus

class Dictionary(object):
    """Two-way mapping between words and consecutive integer ids."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        """Return the id of ``word``, assigning the next free id if it is new."""
        index = self.word2idx.get(word)
        if index is None:
            # New word: its id is its position in idx2word.
            index = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = index
        return index

    def __len__(self):
        """Number of distinct words stored."""
        return len(self.idx2word)

class Corpus(object):
    """Vocabulary plus tokenized train / validation / test splits.

    Attributes set by ``__init__``:
        dictionary: word <-> id mapping filled from the vocabulary file.
        train: 1-D ``th.LongTensor`` of all training token ids, concatenated.
        valid, test: lists of token-id lists, one inner list per input line.
    """

    # Fragments that occur in the text but not in the vocabulary, mapped to
    # the vocabulary word used in their place.
    _CONTRACTIONS = {
        "n't": "not",
        "'s": "is",
        "'re": "are",
        "'ve": "have",
        "wo": "will",
    }
    # Any character that is neither a word character nor whitespace.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(self, dict_path, path):
        """Build the dictionary from ``dict_path`` and tokenize the splits under ``path``."""
        self.dictionary = Dictionary()
        self.prep_dict(dict_path)
        self.train = th.LongTensor(self.tokenize(os.path.join(path, "train-novels")))
        self.valid = self.tokenize(os.path.join(path, "lambada_development_plain_text.txt"), eval=True)
        self.test = self.tokenize(os.path.join(path, "lambada_test_plain_text.txt"), eval=True)

    def prep_dict(self, dict_path):
        """Fill ``self.dictionary`` from a vocabulary file (one word per line).

        ``"<eos>"`` is added when the file does not already provide it.

        Raises:
            FileNotFoundError: if ``dict_path`` does not exist.
        """
        # Explicit raise instead of assert: asserts disappear under ``-O``.
        if not os.path.exists(dict_path):
            raise FileNotFoundError(dict_path)

        # Explicit encoding so the result does not depend on the locale.
        with open(dict_path, "r", encoding="utf-8") as f:
            for line in f:
                self.dictionary.add_word(line.strip())

        if "<eos>" not in self.dictionary.word2idx:
            self.dictionary.add_word("<eos>")

        # Report the number of distinct entries rather than the number of
        # lines read, which over-counts duplicated words.
        print(f"The dictionary captured a vocabulary of size {len(self.dictionary)}")

    def tokenize(self, path, eval=False):
        """Tokenize a single ``.txt`` file or a two-level directory of them.

        Args:
            path: a file ending in ``.txt``, or a directory whose
                sub-directories contain ``.txt`` files.
            eval: forwarded to ``_tokenize_file``; selects per-line output.

        Returns:
            The token ids as produced by ``_tokenize_file`` (concatenated
            over all files in the directory case).

        Raises:
            FileNotFoundError: if ``path`` does not exist.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(path)

        if path.endswith(".txt"):
            ids, token, misses = self._tokenize_file(path, eval=eval)
        else:
            ids, token, misses = [], 0, 0
            # os.listdir returns entries in arbitrary order; sorting keeps the
            # token stream (and the pickled cache) reproducible across runs.
            for subdir in sorted(os.listdir(path)):
                subdir_path = os.path.join(path, subdir)
                if not os.path.isdir(subdir_path):
                    # Stray files next to the sub-directories are ignored.
                    continue
                for filename in sorted(os.listdir(subdir_path)):
                    if not filename.endswith(".txt"):
                        continue
                    delta_ids, delta_token, delta_miss = self._tokenize_file(
                        os.path.join(subdir_path, filename), eval=eval)
                    ids += delta_ids
                    token += delta_token
                    misses += delta_miss

        print(token, misses)
        return ids

    def _tokenize_file(self, path, eval=False):
        """Convert one text file to token ids.

        Words are whitespace-separated. A word that is missing from the
        dictionary is retried with punctuation stripped and is dropped (and
        counted as a miss) if it is still unknown.

        Args:
            path: file to read.
            eval: when false, ``"<eos>"`` is appended to every line and all
                ids are returned as one flat list; when true, no ``"<eos>"``
                is added and one id list per line is returned.

        Returns:
            ``(ids, token, misses)``: the ids, the number of ids produced and
            the number of dropped words.
        """
        word2idx = self.dictionary.word2idx
        ids = []
        token = 0
        misses = 0
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                words = line.strip().split()
                if not eval:
                    words.append("<eos>")
                line_ids = []
                for word in words:
                    # these words are in the text but not vocabulary
                    word = self._CONTRACTIONS.get(word, word)
                    if word not in word2idx:
                        word = self._PUNCTUATION.sub('', word)
                    if word not in word2idx:
                        misses += 1
                        continue
                    line_ids.append(word2idx[word])
                    token += 1
                if eval:
                    ids.append(line_ids)
                else:
                    ids.extend(line_ids)
        return ids, token, misses

def batchify(data, batch_size):
    """Reshape a 1-D token tensor into ``batch_size`` contiguous rows.

    Trailing elements that do not fill a complete column are dropped, the
    remainder is viewed with ``batch_size`` rows, its size is printed, and
    the result is moved to ``DEVICE``.
    """
    # Number of whole columns available once the data is split into rows.
    columns = len(data) // batch_size
    usable = columns * batch_size
    # Keep only the evenly divisible prefix, then fold it into rows.
    batched = data.narrow(0, 0, usable).view(batch_size, -1)
    print(batched.size())
    return batched.to(DEVICE)

def get_batch(source, i, seq_len=None, evaluation=False):
    """Slice an input window and its next-token targets out of ``source``.

    Args:
        source: 2-D tensor indexed as ``[row, position]``.
        i: start position of the window along dimension 1.
        seq_len: maximum window length; ``None`` (the default) means
            ``SEQ_LEN``.
        evaluation: unused; kept so existing call sites keep working.

    Returns:
        ``(data, target)`` where ``data`` is ``source[:, i:i+n]`` and
        ``target`` is the same window shifted one position to the right,
        with ``n = min(seq_len, source.size(1) - 1 - i)``.
    """
    # Honour an explicit seq_len instead of silently overwriting it.
    limit = SEQ_LEN if seq_len is None else seq_len
    # Leave room for the one-step-shifted target at the end of the sequence.
    window = min(limit, source.size(1) - 1 - i)
    data = source[:, i:i + window]
    target = source[:, i + 1:i + 1 + window]

    return data, target

# Load (or build) the corpus and derive the three data splits.
print("Producing data...")
train_data, val_data, test_data, corpus = data_generator()

# Vocabulary size; used as the model's output dimension and when reshaping
# logits for the loss.
n_words = len(corpus.dictionary)
print(f"Total # of words: {n_words}")
print("Finished.")

Producing data...
torch.Size([20, 737368])
Total # of words: 112747
Finished.


## Define Model

In [4]:
from core.tcn import TemporalConvNet

class TCN(nn.Module):
    """Word-level language model: embedding -> TemporalConvNet -> linear decoder."""

    def __init__(self, input_size, output_size, num_channels, 
                 kernel_size=2, dropout=0.3, emb_dropout=0.1, tied_weights=False):
        """Create the embedding, the temporal conv stack and the decoder.

        Args:
            input_size: embedding dimension (second argument of ``nn.Embedding``).
            output_size: number of embeddings and number of decoder outputs.
            num_channels: forwarded to ``TemporalConvNet``; its last entry is
                the decoder's input width.
            kernel_size: forwarded to ``TemporalConvNet``.
            dropout: forwarded to ``TemporalConvNet``.
            emb_dropout: dropout probability applied to the embedding output.
            tied_weights: when true, the decoder reuses the embedding weight.

        Raises:
            ValueError: if ``tied_weights`` is set and ``num_channels[-1]``
                differs from ``input_size``.
        """
        super().__init__()
        self.encoder = nn.Embedding(output_size, input_size)
        self.tcn = TemporalConvNet(input_size, num_channels, kernel_size, dropout=dropout)
        
        self.decoder = nn.Linear(num_channels[-1], output_size)
        if tied_weights:
            # Sharing one matrix requires the decoder input width to equal
            # the embedding dimension.
            if num_channels[-1] != input_size:
                raise ValueError("When using the tied flag, nhid must be equal to emsize")
            self.decoder.weight = self.encoder.weight
            print("weight tied")
        
        self.drop = nn.Dropout(emb_dropout)
        self.emb_dropout = emb_dropout
    
    def forward(self, input):
        """Map token ids to per-position scores over ``output_size`` classes.

        The embedded input has dims 1 and 2 swapped before it is handed to
        ``self.tcn`` and swapped back afterwards, so the decoder is applied
        along the last dimension. The callers in this file pass ``input`` as
        a 2-D tensor of token ids shaped (N, L).
        NOTE(review): TemporalConvNet presumably expects (N, C_in, L_in) --
        confirm against core.tcn.

        Returns:
            The decoder output, made contiguous.
        """
        emb = self.drop(self.encoder(input))
        y = self.tcn(emb.transpose(1, 2)).transpose(1, 2)
        y = self.decoder(y)
        return y.contiguous()
    


# Instantiate the model from the settings above and move it to DEVICE.
print("Building model...")
model = TCN(EMSIZE, n_words, CHANNEL_SIZES, dropout=DROPOUT,
            emb_dropout=EMB_DROPOUT, kernel_size=KSIZE, tied_weights=TIED)
model = model.to(DEVICE)

# Look the optimizer class up by name on th.optim (e.g. "SGD").
optimizer = getattr(th.optim, OPTIM)(model.parameters(), lr=LR)
print("Finished.")

Building model...
weight tied
Finished.


## Run

In [None]:
def evaluate(data_source):
    """Return the mean cross-entropy of predicting each passage's last token.

    For every passage the model reads all tokens except the last one, and the
    scores at the final input position are compared against the held-out last
    token. Training teaches position ``t`` to predict token ``t + 1``, so the
    target token must not be part of the input.

    Args:
        data_source: iterable of token-id lists (one list per passage).

    Returns:
        The average loss over all scored passages, as a Python float.

    Raises:
        ValueError: if no passage could be scored (empty ``data_source`` or
            only passages with fewer than two tokens).
    """
    model.eval()
    processed_data_size = 0
    total_loss = 0.0
    with th.no_grad():
        for token_ids in data_source:
            if len(token_ids) < 2:
                # Need at least one context token and one target token.
                continue
            # Context is everything but the last token; the last token is the
            # label and is withheld from the model.
            data = th.LongTensor(token_ids[:-1]).view(1, -1).to(DEVICE)
            final_target = th.LongTensor(token_ids[-1:]).to(DEVICE)
            output = model(data)
            final_output = output[:, -1].contiguous().view(-1, n_words)
            loss = F.cross_entropy(final_output, final_target)
            processed_data_size += 1
            total_loss += loss.item()
    if processed_data_size == 0:
        raise ValueError("evaluate() received no passage with at least two tokens")
    return total_loss / processed_data_size

def train(ep):
    """Run one training epoch over ``train_data``.

    Windows of up to ``SEQ_LEN`` positions are taken every ``VALID_SEQ_LEN``
    positions. The whole window is fed to the model, but only the positions
    after the first ``SEQ_LEN - VALID_SEQ_LEN`` (the effective history)
    contribute to the loss.

    Args:
        ep: epoch number, used only in the progress-bar description.

    Raises:
        ValueError: if ``VALID_SEQ_LEN`` is larger than ``SEQ_LEN`` or is not
            positive.
    """
    # Validate the (loop-invariant) configuration once, before any work.
    eff_history = SEQ_LEN - VALID_SEQ_LEN
    if eff_history < 0:
        raise ValueError("Valid sequence length must be smaller than sequence length!")
    if VALID_SEQ_LEN <= 0:
        raise ValueError("Valid sequence length must be positive!")

    model.train()
    last_start = train_data.size(1) - 1
    process = tqdm(range(0, last_start, VALID_SEQ_LEN))
    for i in process:
        if i + eff_history >= last_start:
            # The window would end inside the history part and leave no
            # position to score.
            continue
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        output = model(data)
        # Drop the history prefix from both targets and predictions.
        final_target = targets[:, eff_history:].contiguous().view(-1)
        final_output = output[:, eff_history:].contiguous().view(-1, n_words)
        loss = F.cross_entropy(final_output, final_target)
        loss.backward()
        if CLIP > 0:
            th.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()

        process.set_description(f"Train Epoch: {ep:2d}, loss: {loss.item():.4f}")

# Train for EPOCHS epochs; after each one, report the loss on the validation
# and test passages.
for epoch in range(1, EPOCHS+1):
    train(epoch)
    val_loss = evaluate(val_data)
    test_loss = evaluate(test_data)
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | valid loss {val_loss:5.2f}')
    print(f'| end of epoch {epoch:3d} | test loss {test_loss:5.2f}')
    print('-' * 89)

  0%|          | 0/14748 [00:00<?, ?it/s]