In [None]:
!pip install datasets

In [2]:
from typing import List
import re
import string
import math
import os

import torch

from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from datasets import load_dataset

import numpy as np
import pandas as pd
from itertools import chain

import gensim

import gc

In [None]:
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip /content/glove.6B.zip

In [4]:
# Absolute path of the extracted 300-dimensional GloVe text file.
path_to_glove_embedding = '/content/glove.6B.300d.txt'
# File name for the word2vec-format copy of the same vectors (relative to the working directory).
path_to_glove_embeddings_in_gensim_word2vec_format = './glove-word2vec.6B.300d.txt'

In [5]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
# Convert the GloVe text file to word2vec text format (adds the header line gensim expects).
# The input/output locations come from the path constants defined above rather than from
# gensim's test helpers, and the conversion is skipped when the output file already exists
# so that re-running the notebook does not rewrite the large file.
# NOTE: an interrupted conversion can leave a partial file behind; delete it to force a redo.
glove_file = path_to_glove_embedding
tmp_file = path_to_glove_embeddings_in_gensim_word2vec_format
if not os.path.exists(tmp_file):
    _ = glove2word2vec(glove_file, tmp_file)

  _ = glove2word2vec(glove_file, tmp_file)


In [6]:
# Select CUDA when PyTorch reports it as available, otherwise the CPU; the bare name displays the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# Load the `ptb_text_only` dataset through `datasets.load_dataset`.
ptb = load_dataset('ptb_text_only')

In [None]:
# "RNN and LSTM have to be used for Assignment 3"
# V = [10, 768] (context paragraph + question) C: NLP 243 is a class taught at UCSC. In NLP 243 we have to do assignments + final project + exams. Q: Where is the class taught?
# K = [10, 100]
# Q = [10, 100] (Answer) -> "UCSC <PAD> <PAD>"

# softmax(K x Q^T) x V => [10, 10] x [10, 768] = [10, 768]

# [
#     1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 (RNN)
#     0.99 0.01 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.01
#     0.91 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
#     0.91 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
#     0.91 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
#     0.91 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
#     0.91 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
# ]



In [None]:
# Display the loaded dataset object for inspection.
ptb

In [9]:
# Raw sentence strings of the train and validation splits, in dataset order
train_data, val_data = (
    [record['sentence'] for record in ptb[split_name]]
    for split_name in ('train', 'validation')
)

In [12]:
# Load the word2vec-format GloVe 6B vectors written to `tmp_file` into a gensim KeyedVectors object
word2vec_weights = gensim.models.KeyedVectors.load_word2vec_format(tmp_file)


In [13]:
# Turns raw text into lists of vocabulary indices and pads such lists into a batch tensor
class Sequencer(object):
    """Vocabulary built from a corpus by splitting each sentence on single spaces.

    Attributes:
        word2idx: token -> index.
        idx2word: index -> token.
        unk_index, pad_index, bos_index, eos_index: indices of the special tokens,
            registered in exactly that order before any corpus token.
        tokenizer: callable mapping a string to its list of tokens.
    """

    def __init__(self, corpus, bos_token='<s>', eos_token='</s>', unk_token='<unk>', pad_token='<pad>'):
        """Register the special tokens, then every token of every sentence in `corpus`."""
        self.word2idx = {}
        self.idx2word = {}

        # Special tokens come first so their indices do not depend on the corpus.
        self.unk_index = self.add_token(unk_token)
        self.pad_index = self.add_token(pad_token)
        self.bos_index = self.add_token(bos_token)
        self.eos_index = self.add_token(eos_token)

        # Splitting on ' ' (not on arbitrary whitespace) keeps empty strings for repeated spaces.
        self.tokenizer = lambda text: text.split(' ')

        for sentence in corpus:
            for word in self.tokenizer(sentence):
                self.add_token(word)

    def add_token(self, token):
        """Return the index of `token`, assigning the next free index if it is new."""
        known = self.word2idx.get(token)
        if known is not None:
            return known

        fresh = len(self.word2idx)
        self.word2idx[token] = fresh
        self.idx2word[fresh] = token
        return fresh

    def encode(self, text):
        """Return `[bos, *token indices, eos]` for `text`; unknown tokens map to `unk_index`."""
        body = [self.word2idx.get(word, self.unk_index) for word in self.tokenizer(text)]
        return [self.bos_index, *body, self.eos_index]

    def create_padded_tensor(self, sequences):
        """Stack index sequences into one long tensor, right-padded with `pad_index`.

        Returns:
            (tensor, lengths): `tensor` has one row per sequence and as many columns as the
            longest sequence; `lengths` is a tensor holding the unpadded length of each row.
        """
        lengths = list(map(len, sequences))
        width = max(lengths)
        padded = torch.full((len(sequences), width), self.pad_index, dtype=torch.long)

        # Copy each sequence into the left part of its row; the remainder stays as padding.
        for row, sequence in enumerate(sequences):
            padded[row, :len(sequence)] = torch.tensor(sequence, dtype=torch.long)

        return padded, torch.tensor(lengths)

In [14]:
# Maps text to vocabulary indices and builds the matching matrix of pre-trained embeddings
class PreTrainedSequencer(object):
    """Vocabulary plus an embedding matrix looked up from a gensim-style keyed-vector store.

    Attributes:
        word2idx / idx2word: token <-> index mappings.
        bos_index, eos_index, unk_index, pad_index: indices of the special tokens,
            registered in exactly that order before any corpus token.
        pre_trained_embeddings: float tensor with one row per vocabulary index; tokens
            missing from the vector store receive the vector stored for `unk_token`.
    """

    def __init__(self, corpus, gensim_w2v, embedding_dim, bos_token='<s>', eos_token='</s>', unk_token='<unk>', pad_token='<pad>'):
        """Build the vocabulary from `corpus` and look up a vector for every entry.

        Note that `gensim_w2v` is modified in place: a uniformly random vector in [-1, 1)
        is added to it for each of the four special tokens.
        """
        self.idx2word = {}
        self.word2idx = {}
        self.embedding_dim = embedding_dim
        self.w2v = gensim_w2v

        specials = (bos_token, eos_token, unk_token, pad_token)

        # Give every special token a random vector in the store (same draw order as the token order).
        for special in specials:
            random_vector = np.random.uniform(low=-1, high=1.0, size=(self.embedding_dim,))
            self.w2v.add_vectors([special], [random_vector])

        # Special tokens take the first vocabulary slots.
        self.bos_index, self.eos_index, self.unk_index, self.pad_index = [
            self.add_token(special) for special in specials
        ]

        # Splitting on ' ' (not on arbitrary whitespace) keeps empty strings for repeated spaces.
        self.tokenizer = lambda text: text.split(' ')

        for sentence in corpus:
            for word in self.tokenizer(sentence):
                self.add_token(word)

        self.pre_trained_embeddings = torch.zeros([len(self.idx2word), self.embedding_dim])

        for idx, word in self.idx2word.items():
            # Fall back to the unknown-token vector when the store has no entry for the word.
            lookup_key = word if self.w2v.has_index_for(word) else unk_token
            vector = self.w2v.get_vector(self.w2v.key_to_index.get(lookup_key), norm=True)
            self.pre_trained_embeddings[idx] = torch.tensor(vector)

    def add_token(self, token):
        """Return the index of `token`, assigning the next free index if it is new."""
        known = self.word2idx.get(token)
        if known is not None:
            return known

        fresh = len(self.word2idx)
        self.word2idx[token] = fresh
        self.idx2word[fresh] = token
        return fresh

    def encode(self, text):
        """Return `[bos, *token indices, eos]` for `text`; unknown tokens map to `unk_index`."""
        body = [self.word2idx.get(word, self.unk_index) for word in self.tokenizer(text)]
        return [self.bos_index, *body, self.eos_index]

    def create_padded_tensor(self, sequences):
        """Stack index sequences into one long tensor, right-padded with `pad_index`.

        Returns:
            (tensor, lengths): `tensor` has one row per sequence and as many columns as the
            longest sequence; `lengths` is a plain Python list of the unpadded lengths.
        """
        lengths = list(map(len, sequences))
        width = max(lengths)
        padded = torch.full((len(sequences), width), self.pad_index, dtype=torch.long)

        # Copy each sequence into the left part of its row; the remainder stays as padding.
        for row, sequence in enumerate(sequences):
            padded[row, :len(sequence)] = torch.tensor(sequence, dtype=torch.long)

        return padded, lengths

In [15]:
# Penn Treebank sentences as next-token prediction pairs of vocabulary indices
class PennTreebankDataset(Dataset):
    """Wraps a list of sentences and a sequencer exposing `encode(text) -> list of indices`."""

    def __init__(self, data, text_sequencer):
        self.sequencer = text_sequencer
        self.data = data

    def __len__(self):
        """Number of sentences."""
        return len(self.data)

    def __getitem__(self, index):
        """Return `(x, y)`: the encoded sentence without its last index, and without its first.

        `y` is therefore `x` shifted one position to the left, i.e. the next-token targets.
        """
        encoded = self.sequencer.encode(self.data[index])
        return encoded[:-1], encoded[1:]

In [16]:
# RNN implementation
class RNNLM(nn.Module):
    """Recurrent language model: embedding -> dropout -> nn.RNN -> fc1 -> dropout -> fc2 -> log-softmax.

    Args:
        vocab_size: number of output classes (and embedding rows when no pre-trained matrix is given).
        pad_index: vocabulary index of the padding token (stored on the module).
        embedding_dim: width of one embedding vector and of the fc1 output.
        batch_size: default batch dimension used by `init_hidden()` when it is called without
            arguments; `forward` always uses the actual batch size of its input.
        hidden_size: width of the RNN hidden state.
        bidirectional: passed through to `nn.RNN`.
        num_layers: number of stacked RNN layers.
        dropout_p: dropout probability applied to the embeddings and to the fc1 output.
        pre_trained_embeddings: optional 2-D float tensor used to initialise a frozen embedding.
        tunable_pre_trained_embedding: when True (and a pre-trained matrix is given), a second,
            trainable copy of the matrix is added and concatenated with the frozen one.
    """

    def __init__(self, vocab_size, pad_index, embedding_dim, batch_size, hidden_size=100, bidirectional=False, num_layers=2, dropout_p=0.1, pre_trained_embeddings=None, tunable_pre_trained_embedding=False):
        super(RNNLM, self).__init__()

        self.pad_index = pad_index
        self.bidirectional = bidirectional
        self.num_directions = 1 if not self.bidirectional else 2
        self.hidden_size = hidden_size
        self.dropout_p = dropout_p
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.pre_trained_embeddings = pre_trained_embeddings

        # The tunable copy only makes sense when there is a pre-trained matrix to copy.
        self.tunable_pre_trained_embedding = tunable_pre_trained_embedding if self.pre_trained_embeddings is not None else False

        # add learnable embedding + frozen embedding or just frozen embedding
        if self.pre_trained_embeddings is not None:
            if self.tunable_pre_trained_embedding:
                self.tunable_embedding = nn.Embedding.from_pretrained(self.pre_trained_embeddings, freeze=False)
            self.embedding = nn.Embedding.from_pretrained(self.pre_trained_embeddings, freeze=True)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.dropout = nn.Dropout(dropout_p)

        # Input is twice as wide when the tunable and frozen embeddings are concatenated.
        embedding_multiplier = 2 if self.tunable_pre_trained_embedding else 1

        self.rnn = nn.RNN(
            bidirectional=self.bidirectional,
            input_size=embedding_multiplier * self.embedding_dim,  # tunable embedding.concat(frozen embedding)
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            bias=False
        )

        self.fc1 = nn.Linear(self.hidden_size * self.num_directions, self.embedding_dim, bias=False)
        self.fc2 = nn.Linear(self.embedding_dim, self.vocab_size, bias=False)

    def init_hidden(self, batch_size=None, device=None):
        """Draw a fresh standard-normal initial hidden state.

        Args:
            batch_size: batch dimension of the state; defaults to the `batch_size` given at
                construction time.
            device: device to create the state on; defaults to the device of the model's
                parameters.

        Returns:
            Tensor of shape (num_layers * num_directions, batch_size, hidden_size). It is a plain
            tensor that takes no part in optimisation.

        NOTE(review): the state is random on every call, also in eval mode, so evaluation is
        not deterministic unless the RNG is seeded; a zero state is the usual alternative.
        """
        if batch_size is None:
            batch_size = self.batch_size
        if device is None:
            device = next(self.parameters()).device
        return torch.randn(self.num_layers * self.num_directions, batch_size, self.hidden_size, device=device)

    def forward(self, x, lengths):
        """Compute next-token log-probabilities.

        Args:
            x: long tensor of token indices, shape (batch, time), right-padded.
            lengths: unpadded length of every row of `x` (list or tensor).

        Returns:
            Tensor of shape (batch, time, vocab_size) holding log-probabilities.
        """
        # Size the initial state from the actual input: the last batch of an epoch is usually
        # smaller than the configured batch size and would otherwise be rejected by nn.RNN.
        h_0 = self.init_hidden(batch_size=x.size(0), device=x.device)

        if self.pre_trained_embeddings is not None:
            embed_frozen = self.embedding(x)
            if self.tunable_pre_trained_embedding:
                embed_tunable = self.tunable_embedding(x)
                embed = self.dropout(torch.cat((embed_tunable, embed_frozen), dim=2))
            else:
                embed = self.dropout(embed_frozen)
        else:
            embed = self.dropout(self.embedding(x))

        # pack_padded_sequence requires the lengths to live on the CPU.
        if torch.is_tensor(lengths):
            lengths = lengths.cpu()

        # batch_first -> input is B x T x *; enforce_sorted=False -> rows need not be sorted by length
        packed_input = nn.utils.rnn.pack_padded_sequence(embed, lengths, batch_first=True, enforce_sorted=False)

        output, _ = self.rnn(packed_input, h_0)

        # Unpack to a dense B x T x H tensor for the feed-forward layers. Padded time steps are
        # filled with 0.0 (they hold hidden activations, not token ids), and total_length keeps
        # the time dimension equal to that of `x` so the output always lines up with the targets.
        seq_unpacked, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True, padding_value=0.0, total_length=x.size(1))

        decoded = self.fc1(seq_unpacked)
        decoded = self.dropout(decoded)
        decoded = self.fc2(decoded)

        # Log-probabilities over the vocabulary, as expected by nn.NLLLoss.
        logits = F.log_softmax(decoded, dim=-1)

        return logits

In [17]:
# Parameters
embedding_dim = 300  # width of one token embedding
batch_size = 16  # sentences per mini-batch
hidden_size=512  # width of the RNN hidden state
bidirectional = False  # single-direction RNN
num_layers = 2  # stacked RNN layers
dropout_p = 0.5  # dropout probability used inside the model
tunable_pre_trained_embedding = True  # RNNLM ignores this flag when no pre-trained matrix is passed

In [19]:
# Build the vocabulary from the training sentences only.
sequencer = Sequencer(train_data)
# No pre-trained embedding matrix is supplied to the model in this run.
pre_trained_embeddings = None
# One class per index registered by the sequencer (special tokens included).
vocab_size = len(sequencer.idx2word)

In [20]:
# Both splits share the sequencer, so validation tokens missing from its vocabulary encode as the unknown index.
train_dataset = PennTreebankDataset(train_data, sequencer)
val_dataset = PennTreebankDataset(val_data, sequencer)

In [21]:

def prepare_batch(batch, sequencer):
    """Collate `(input, target)` index-sequence pairs into padded batch tensors.

    Args:
        batch: iterable of `(input_sequence, target_sequence)` pairs.
        sequencer: object exposing `create_padded_tensor(sequences) -> (tensor, lengths)`.

    Returns:
        `(padded_inputs, input_lengths, padded_targets)`; the target lengths are discarded.
    """
    inputs, targets = zip(*batch)
    padded_inputs, input_lengths = sequencer.create_padded_tensor(inputs)
    padded_targets, _ = sequencer.create_padded_tensor(targets)
    return padded_inputs, input_lengths, padded_targets

def _collate(batch):
    """Collate function shared by both loaders; pads a batch with the notebook-level `sequencer`."""
    return prepare_batch(batch, sequencer)

# NOTE(review): the training loader keeps DataLoader's default of not shuffling — confirm this is intended.
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=_collate)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=_collate, shuffle=False)

In [22]:
# Instantiate the language model from the hyper-parameters above.
# `tie_weights` is not passed: it is defined nowhere in this notebook and RNNLM.__init__
# has no such parameter, so the previous call could not run on a fresh kernel.
model = RNNLM(
    vocab_size,
    sequencer.pad_index,
    embedding_dim=embedding_dim,
    batch_size=batch_size,
    hidden_size=hidden_size,
    bidirectional=bidirectional,
    num_layers=num_layers,
    dropout_p=dropout_p,
    pre_trained_embeddings=pre_trained_embeddings,
    tunable_pre_trained_embedding=tunable_pre_trained_embedding,
)

In [23]:
# Move the model to the selected device; the returned module is displayed as the cell output.
model.to(device)

RNNLM(
  (embedding): Embedding(10002, 300)
  (dropout): Dropout(p=0.5, inplace=False)
  (rnn): RNN(300, 512, num_layers=2, bias=False)
  (fc1): Linear(in_features=512, out_features=300, bias=False)
  (fc2): Linear(in_features=300, out_features=10002, bias=False)
)

In [32]:
# Step size for the Adam optimizer created below.
learning_rate = 0.0005

loss_function = nn.NLLLoss(ignore_index=sequencer.pad_index) # negative log likelihood loss; targets equal to the pad index are ignored
# Perplexity = exp(mean negative log-likelihood) for natural-log probabilities (2^NLL only holds for base-2 logs)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [25]:
def train(model, optimizer, loss_function, loader, device):
    """Run one optimisation pass over `loader` and report a summary loss.

    Args:
        model: module called as `model(inputs, lengths)` returning a (batch, time, classes) tensor.
        optimizer: optimizer over the model's parameters.
        loss_function: callable taking `(predictions of shape (N, classes), targets of shape (N,))`.
        loader: iterable of batches indexable as `(inputs, lengths, targets)`.
        device: device the inputs and targets are moved to; `lengths` is passed through unchanged.

    Returns:
        The mean over all steps of the running-average batch loss. This is the average of the
        per-step running means (as before), not the plain mean of the batch losses.
        `nan` when the loader yields no batches.
    """
    model.train()

    running_loss = 0.
    running_loss_history = []

    for step, batch in enumerate(loader):
        inputs, lengths, targets = batch[0], batch[1], batch[2]

        optimizer.zero_grad()

        pred = model(inputs.to(device), lengths)

        # Flatten batch and time so every position is scored as one classification example.
        loss = loss_function(pred.reshape(-1, pred.size(2)), targets.reshape(-1).to(device))

        # Incremental update of the mean batch loss seen so far in this pass.
        running_loss += (loss.item() - running_loss) / (step + 1)
        running_loss_history.append(running_loss)

        loss.backward()

        # Clip the global gradient norm at 3.0 to guard against exploding RNN gradients.
        nn.utils.clip_grad_norm_(model.parameters(), 3.0)

        optimizer.step()

    if not running_loss_history:
        return float("nan")

    return np.mean(running_loss_history)

In [29]:
def run_training(model, optimizer, loss_function, train_loader, valid_loader, device, n_epochs=20):
    """Train for `n_epochs` passes over `train_loader` and collect the reported losses.

    Args:
        model, optimizer, loss_function, device: forwarded unchanged to `train`.
        train_loader: batches used for optimisation.
        valid_loader: accepted for interface compatibility; no validation pass is run here.
        n_epochs: number of passes over `train_loader`.

    Returns:
        A flat list of floats with the training loss values in epoch order. `train` may return
        either one scalar per epoch or a sequence of values; both are flattened into the list.
    """
    all_train_running_losses = []

    for epoch in range(n_epochs):
        print(f"Epoch: {epoch}")

        epoch_result = train(model, optimizer, loss_function, train_loader, device)

        # np.ravel turns a scalar into a 1-element array and leaves sequences flat, so the
        # returned list has the same shape whichever of the two `train` hands back.
        all_train_running_losses.extend(np.ravel(epoch_result).tolist())

    return all_train_running_losses

In [30]:
# Run Python garbage collection first, then ask PyTorch to empty its CUDA cache
gc.collect()
torch.cuda.empty_cache()

In [31]:
# Launch training for 30 epochs and keep the returned training losses.
train_loss = run_training(model, optimizer, loss_function, train_loader, val_loader, device, n_epochs=30)

Epoch: 0


KeyboardInterrupt: ignored