 The following Python libraries are required for this part, and have been tested on Python 3.9 and Python 3.7.
 If you use Google Colab, PyTorch is already installed.
  - [PyTorch](https://pytorch.org/get-started/locally/) (tested with 1.10)

## Data

In [1]:
# You may prefer to upload the data to your google drive and mount your google drive to this colab, 
# because the data will be erased if you stop using this colab for a while.
# Uncomment the code below to do so. After mounting, navigate to the appropriate folder, right click, and "copy path".
# Assign DATA_DIR global variable to that path.
# For more mounting instructions: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=XDg9OBaYqRMd
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# If imported from google drive, config for your file directory. Mine is 'lm_data'.
DATA_DIR = "/content/drive/MyDrive/a3_data/lm_data"

# the goal is that DATA_DIR points to where the training/validation/test data is. 

In [3]:
import os
from io import open
import torch
import math
import torch.nn as nn
import time

In [4]:
SEED = 0
TRAIN_BATCH_SIZE = 100
TEST_BATCH_SIZE = 100
WORD_EMBED_DIM = 200
HID_EMBED_DIM = 200 
N_LAYERS = 2 
DROPOUT = 0.5 
LOG_INTERVAL = 100
EPOCHS = 10
BPTT = 50 # sequence length
CLIP = 0.25
TIED = False
SAVE_BEST = os.path.join(DATA_DIR, 'model.pt')

## Build vocabulary and convert text in corpus to lists of word index

In [5]:
import re
import math

def read_text_file_to_list(file):
    """
    Read txt file to list of string
    Splits on newline
    Input:
        file: (str) filename
    Output:
        lines: list of (str)
    """
    with open(file) as f:
        lines = f.read().splitlines()
        return lines

In [6]:
class WordDict(object):
    def __init__(self):
      # mapping between word type to its index
      self.word2idx = {}
      # mapping between index to word type
      self.idx2word = {}
      self.next_id = 0 # default value
      self.add_word("<sos>")
      self.add_word("<eos>")

    def add_word(self, word):
      """
      Add word and id to word2ids and idx2word
      """
      if word not in self.word2idx:
        self.word2idx[word] = self.next_id
        self.idx2word[self.next_id] = word
        self.next_id += 1

    def __len__(self):
      return len(self.idx2word)

class Corpus(object):
    def __init__(self, path):
      self.train_file = read_text_file_to_list(os.path.join(path, 'train.txt'))
      self.valid_file = read_text_file_to_list(os.path.join(path, 'valid.txt'))
      self.test_file = read_text_file_to_list(os.path.join(path, 'test.txt'))

      self.dictionary = WordDict() 
      self.add_data_to_vocab()

      self.train = self.tokenize(self.train_file)
      self.valid = self.tokenize(self.valid_file)
      self.test = self.tokenize(self.test_file)
    
    def add_data_to_vocab(self):
      """
      Add tokens from train, validation, and test set to vocab
      """
      for data in [self.train_file, self.valid_file, self.test_file]:
        for line in data:
          line = re.sub('\s+',' ',line)
          line = line.strip()
          if line:
            line = line.lower()
            for token in line.split():
              self.dictionary.add_word(token)
              
    def tokenize(self, text):
        ################################
        ## TODO: 
        ## (1) build vocabulary on three given files, using class WordDict 
        ## (2) tokenize each file content with the vocabulary, return a list of token ids
        ## Note that in this implementation, we add words in validation and test file into the vocabulary,
        ## so there is no unknown word.
        ################################

        all_token_ids = []

        for line in text:
          line = re.sub('\s+',' ',line)
          line = line.strip()
          if line:
            line = line.lower()
            line_tokens = ['<sos>'] + line.split() + ['<eos>']
            all_token_ids.extend([self.dictionary.word2idx[token] for token in line_tokens])
        return all_token_ids

corpus = Corpus(DATA_DIR)
print(len(corpus.train))
print(len(corpus.valid))
print(len(corpus.test))
print(len(corpus.dictionary))
assert len(corpus.dictionary) == 28913
assert len(corpus.train) == 2099444
assert len(corpus.valid) == 218808
assert len(corpus.test) == 246993

2099444
218808
246993
28913


In [7]:
def batchify(ids, batch_size):
    """
    batchify arranges the dataset into columns.
    # Parameters
    ids : Tensor
        1-dimensional tensor of token ids
    batch_size : Int
        batch_size
    # Returns
    data: a torch.LongTensor with shape of (len(ids)//batch_size, batch_size)
        batchified corpus data

    For example, the input ids [1,2,3,4,5,6,7,8,9] and batch_size=2
    output is:
    [ [1, 5],
      [2, 6],
      [3, 7],
      [4, 8] ]
    The shape of the tensor is 4x2. 
    We trim off any extra elements (9 in this example) that wouldn't cleanly fit.
    ***Again, note that the text order is in the column.***
    """ 
    num_cols = math.floor(len(ids)/batch_size)
    list_subsets = [ids[i:i+num_cols] for i in 
                    range(0, len(ids)-num_cols+1, num_cols)]

    batchified = torch.transpose(torch.LongTensor(list_subsets),0,1)
    return batchified
    
# print(batchify([1,2,3,4,5,6,7,8,9], 2))
# print(batchify([1,2,3,4,5,6,7,8,9], 3))
# print(batchify([1,2,3,4,5,6,7,8,9,10], 3))
# print(batchify([1,2,3,4,5,6,7,8,9,10], 2))

train_data = batchify(corpus.train, TRAIN_BATCH_SIZE)
val_data = batchify(corpus.valid, TEST_BATCH_SIZE)
test_data = batchify(corpus.test, TEST_BATCH_SIZE)

print(train_data.shape)
print(val_data.shape)
print(test_data.shape)

torch.Size([20994, 100])
torch.Size([2188, 100])
torch.Size([2469, 100])


In [8]:
print(train_data)

tensor([[    0,   701,    10,  ...,    18, 28809,   272],
        [    2,  1791,    14,  ...,   438,  8623, 20553],
        [    3,   130,   119,  ...,   984,    18,   300],
        ...,
        [  794,    18,  7030,  ...,   436,   407,    17],
        [ 3536,  5282,    11,  ...,   843,   206,   157],
        [ 1566,   725,  3917,  ...,    16,   290,   857]])


In [9]:
def get_batch(source, i, bptt = BPTT):
    """
    # Parameters
    source : Tensor
        corpus as 2-dimensional tensor
    i : Int
        minibatch index

    # Returns
    data : 2D tensor 
        LSTM input
    target : 1D tensor
        LSTM output target

    Consider the following example where "source" is a 2d tensor of shape (4, 2).
    In this example we have 4 batches, each of size 2.
    [ [1, 5],
      [2, 6],
      [3, 7],
      [4, 8] ]

    Suppose we set BPTT (backpropagation through time, see A3 pdf for details) to 2.
    At index i = 0, the input to our LSTM becomes:
    [ [1, 5],
      [2, 6] ]
    This corresponds to the first 2 batches in the sequence.
    The target would correspondingly be (again, since BPTT is 2): 
    [ [2, 6],
      [3, 7] ]
    However, we need to reshape it from 2-dimensions to 1-dimension:
    [2, 6, 3, 7]

    For the next batch, index i = prev_i + BPTT = 2. 
    However, i + BPTT = 2 + 2 = 4 and 4 >= len(source). This wouldn't work.
    So, to account for this edge case, we consider BPTT to be:
    len(source) - 1 - i = 4 - 1 - 2 = 1
    As such, our input now becomes:
    [ [3, 7] ]
    and target is 
    [4, 8]. 
    """ 
    ###################################################
    # TODO Assign these variables to the right values #
    ###################################################
    seq_len =  (len(source) - 1 - i) if (i + bptt  >= len(source)) else bptt
    data = source[i:i+seq_len]
    target = torch.flatten(source[i+1:i+1+seq_len])
    return data, target


test = batchify([1,2,3,4,5,6,7,8,9], 2)
print(get_batch(test, i=0, bptt=2))
print(get_batch(test, i=2, bptt=2))

data, targets = get_batch(train_data, 0)
print(data.shape)
print(targets.shape)
print(data)

(tensor([[1, 5],
        [2, 6]]), tensor([2, 6, 3, 7]))
(tensor([[3, 7]]), tensor([4, 8]))
torch.Size([50, 100])
torch.Size([5000])
tensor([[    0,   701,    10,  ...,    18, 28809,   272],
        [    2,  1791,    14,  ...,   438,  8623, 20553],
        [    3,   130,   119,  ...,   984,    18,   300],
        ...,
        [   35,    17,  5419,  ...,  5099,    16,    14],
        [   36,   346,    62,  ...,    14,     1,  1625],
        [   37,  3544,    38,  ...,  7773,     0,  1654]])


In [10]:
################################
## TODO: Implement RNN LSTM
## documentation of pytorch LSTM interface: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
################################

class LSTMModel(nn.Module):

    def __init__(self, vocab_size, word_embedding_size, nhid, nlayers, dropout=0.5, tied_weights=False):
        super(LSTMModel, self).__init__()

        self.nhid = nhid # hidden dimension of LSTM
        self.nlayers = nlayers # number of LSTM layers
        # TODO: initialize the required modules for the LSTM model
        # HINT: batch_first should be False in LSTM since our data structure is not batch first.
        self.encoder = self.embedding = torch.nn.Embedding(vocab_size, word_embedding_size)
        self.dropout = nn.Dropout(0.5)
        self.lstm = nn.LSTM(input_size = word_embedding_size, 
                          hidden_size = self.nhid, 
                          num_layers = self.nlayers,
                          batch_first = False)
        self.decoder = nn.Linear(self.nhid, vocab_size)

        self.init_weights()

    def init_weights(self):
        """
        For example:
        # initrange = 0.1
        # nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        This is not all that you need!
        """
        # TODO: initialize the parameters
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)   
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)   

    def forward(self, input, hidden):
        """
        # Parameters
        input: input embedding
        hidden: hidden states in LSTM
        # Returns
        decoded: refers to the output of decoder layer over the vocabulary. Note that you don't need to pass it through the softmax layer
        hidden: stores the hidden states in LSTM
        """
        embeds = self.embedding(input)
        out = self.dropout(embeds)
        out, hidden = self.lstm(out, hidden)
        out = self.dropout(embeds)
        decoded = self.decoder(out).flatten(0,1)
        return decoded, hidden

    # initialize parameters in LSTM
    def init_hidden(self, bsz):
        weight = next(self.parameters())
        return (weight.new_zeros(self.nlayers, bsz, self.nhid),
            weight.new_zeros(self.nlayers, bsz, self.nhid))

In [11]:
# Set the random seed for reproducibility.
torch.manual_seed(SEED)
# set device as GPU/CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
print(device)

cuda


In [13]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

def train():
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)

    hidden = model.init_hidden(TRAIN_BATCH_SIZE)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, BPTT)):
        data, targets = get_batch(train_data, i)
        # print(torch.max(data))
        # print("data shape: ", data.shape)
        # print("targets shape: ", targets.shape)
        data = data.to(device)
        targets = targets.to(device)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        optimizer.zero_grad()
        hidden = repackage_hidden(hidden) # Note that the main advantage here is that the hidden value is continual from the previous forward pass
        output, hidden = model(data, hidden)
        # print("output shape: ", output.shape)
        # print("target shape: ", targets.shape)
        # print("output type: ", type(output))
        # print("target type: ", type(targets))
        loss = criterion(output.to(torch.float), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()

        total_loss += loss.item()

        if batch % LOG_INTERVAL == 0 and batch > 0:
            cur_loss = total_loss / LOG_INTERVAL
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // BPTT,
                elapsed * 1000 / LOG_INTERVAL, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [14]:
# TODO: Compute the loss of model on data_source
def evaluate(data_source, expected_size = BPTT ):
    model.eval()
    # TODO: get the average negative log likelihood on the data_source
    # return average_log_loss
    epoch_loss, epoch_acc = 0, 0
    epoch_tp, epoch_fp, epoch_fn = 0, 0, 0
    
    with torch.no_grad():
      hidden = model.init_hidden(TEST_BATCH_SIZE)
      batch_starts = range(0, data_source.size(0) - 1, BPTT)
      for batch, i in enumerate(batch_starts):
        data, targets = get_batch(data_source, i)
        data = data.to(device)
        batch_size = data.shape[0]
        targets = targets.to(device)
        out, hidden = model.forward(data, hidden)
        loss = criterion(out.to(torch.float), targets)
        loss_weight = batch_size/expected_size # downweight shorter sequence?
        epoch_loss += ((loss_weight*loss.item())/len(batch_starts))
    print("epoch loss: ", epoch_loss)
    return epoch_loss

In [15]:
# prepare the model, loss, and optimizer
ntokens = len(corpus.dictionary)
model = LSTMModel(ntokens, WORD_EMBED_DIM, HID_EMBED_DIM, N_LAYERS, DROPOUT, TIED).to(device)
criterion = nn.CrossEntropyLoss() # use crossentropy loss
optimizer = torch.optim.Adam(model.parameters()) # use adam optimizer with default setting
best_val_loss = None

# Training framework
for epoch in range(1, EPOCHS+1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
        'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
        val_loss, math.exp(val_loss)))
    print('-' * 89)
    
    # Save the model if the validation loss is the best we've seen so far.
    if not best_val_loss or val_loss < best_val_loss:
        with open(SAVE_BEST, 'wb') as f:
            torch.save(model, f)
            print("save new best model!")
        best_val_loss = val_loss

| epoch   1 |   100/  419 batches | ms/batch 181.49 | loss  9.92 | ppl 20284.83
| epoch   1 |   200/  419 batches | ms/batch 179.67 | loss  7.91 | ppl  2732.76
| epoch   1 |   300/  419 batches | ms/batch 180.99 | loss  7.01 | ppl  1108.00
| epoch   1 |   400/  419 batches | ms/batch 182.87 | loss  6.64 | ppl   763.68
epoch loss:  6.138001237782566
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 79.04s | valid loss  6.14 | valid ppl   463.13
-----------------------------------------------------------------------------------------
save new best model!
| epoch   2 |   100/  419 batches | ms/batch 184.32 | loss  6.42 | ppl   614.58
| epoch   2 |   200/  419 batches | ms/batch 182.73 | loss  6.24 | ppl   510.83
| epoch   2 |   300/  419 batches | ms/batch 182.56 | loss  6.15 | ppl   468.07
| epoch   2 |   400/  419 batches | ms/batch 182.26 | loss  6.08 | ppl   435.83
epoch loss:  5.782747320478613
-----------------------

In [16]:
# Load the best saved model.
with open(SAVE_BEST, 'rb') as f:
    model = torch.load(f)
    # After loading the RNN params, they are not a continuous chunk of memory.
    # flatten_paramters() makes them a continuous chunk, and will speed up the forward pass.
    # Currently, only RNN model supports flatten_parameters function.
    model.lstm.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

epoch loss:  5.341324213409423
| End of training | test loss  5.34 | test ppl   208.79


In [17]:
# Generation with GPT-2
# Check this tutorial: https://huggingface.co/blog/how-to-generate
# It comes with a notebook. You need to run through that notebook and understand different sampling procedures.

In [17]:
def generate_text(prompt, sampling_func):
    # # Generation with LSTM lm given a sampling function and a prompt
    max_length = 30
    ids = []
    for word in prompt.split():
        ids.append(corpus.dictionary.word2idx[word])
    hidden = model.init_hidden(1)
    with torch.no_grad():  # no tracking history
        output, hidden = model(torch.LongTensor([[wid] for wid in ids]).to(device), hidden)
        word_prob = torch.nn.functional.softmax(output[-1,:], dim=0).cpu()
        generations = []
        for i in range(max_length):
            word_idx = sampling_func(word_prob)
            word = corpus.dictionary.idx2word[word_idx]
            generations.append(word)
            if word == "<eos>":
                break
            new_word = torch.LongTensor([[word_idx]]).to(device)
            output, hidden = model(new_word, hidden)
            word_prob = torch.nn.functional.softmax(output[-1,:], dim=0).cpu()
    return generations

In [18]:
import random
def greedy_sampling(word_prob):
    # TODO: return the word with the max probability
    return torch.argmax(word_prob).item()

def random_sampling(word_prob):
    # TODO: sample a random word based on the probability vector
    # weights are cumulative
    id = random.choices([x for x in range(0,word_prob.shape[0])], cum_weights=word_prob.tolist(), k=1)[0]
    return id

def topk_sampling(word_prob, k=10):
    probs, indices = torch.topk(word_prob, k)
    # weights are relative
    id = random.choices(indices.tolist(), weights=probs.tolist(), k=1)[0]
    return id


In [19]:
prompt = "i went to".lower()
generations = generate_text(prompt, greedy_sampling) # replace sample_func with the sampling function that you would like to try
print('prompt: ' + prompt)
print(' '.join(generations))

prompt: i went to
the <unk> , and the <unk> , and the <unk> , and the <unk> , and the <unk> , and the <unk> , and the <unk> , and the <unk>


In [20]:
prompt = "i went to".lower()
generations = generate_text(prompt, random_sampling) # replace sample_func with the sampling function that you would like to try
print('prompt: ' + prompt)
print(' '.join(generations))

prompt: i went to
<sos> t tristan <sos> t <sos> <sos> t tristan schmoke <sos> <sos> <sos> t civic <sos> <sos> <sos> t <sos> <sos> t <sos> t civic painting <sos> t tristan advisers


In [22]:
prompt = "i went to".lower()
generations = generate_text(prompt, topk_sampling) # replace sample_func with the sampling function that you would like to try
print('prompt: ' + prompt)
print(' '.join(generations))

prompt: i went to
a large number of the united states , but was the game , <unk> , <unk> . <eos>
