# Project 3 - Sequence models

igu011 and edj001

In [1]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from datetime import datetime
from torch.utils.data import DataLoader, TensorDataset
from rich import print

import numpy as np
import torchtext
from os import listdir
import re
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Seed PyTorch's global RNG so that random operations below start from a fixed state.
torch.manual_seed(265)
# Use float64 as the default floating-point dtype for tensors and parameters created below.
torch.set_default_dtype(torch.double)

# Every model and batch in this notebook is moved to this device; it is hard-coded to the CPU.
device = torch.device("cpu")
print(f"Device {device}.")


## 2.1 Word embedding

#### 1 - Read txt files and tokenize them to obtain train/validation/test lists of words.


In [2]:
# torchtext's "basic_english" tokenizer; shared by tokenize() and yield_tokens() below.
tokenizer = get_tokenizer("basic_english")


def read_files(datapath="./", encoding="utf-8"):
    """
    Read every ``.txt`` file found directly inside ``datapath``.

    Args:
        datapath: Directory to scan. A trailing path separator is optional.
        encoding: Text encoding used to decode the files.

    Returns:
        A single list holding the lines (line endings included) of all files,
        concatenated in alphabetical order of the file names so that the result
        does not depend on the order in which the OS lists the directory.
    """
    import os  # local import: only ``listdir`` is imported from ``os`` at the top of the file

    file_names = sorted(name for name in os.listdir(datapath) if name.endswith(".txt"))

    lines = []
    for f_name in file_names:
        # os.path.join works whether or not ``datapath`` ends with a separator.
        with open(os.path.join(datapath, f_name), encoding=encoding) as f:
            lines.extend(f.readlines())
    return lines


# Regular-expression fragments used by yield_tokens() to clean a line before tokenizing.
# Raw strings are required: "\w" and "\s" are not valid Python string escapes and trigger
# an invalid-escape-sequence warning when written in a normal string literal.
no_digits = r"\w*[0-9]+\w*"  # any word containing at least one digit
no_names = r"\w*[A-Z]+\w*"  # any word containing at least one upper-case ASCII letter
no_spaces = r"\s+"  # any run of whitespace


def tokenize(lines):
    """
    Tokenize each line with the module-level ``tokenizer`` and return all
    resulting tokens as one flat list, in reading order.
    """
    return [token for line in lines for token in tokenizer(line)]


def yield_tokens(lines):
    """
    Generator yielding one list of tokens per input line.

    Before tokenizing, every match of ``no_digits`` or ``no_names`` is replaced
    by a space and each match of ``no_spaces`` is collapsed into a single space.
    """
    unwanted_words = no_digits + "|" + no_names
    for raw_line in lines:
        without_unwanted = re.sub(unwanted_words, " ", raw_line)
        single_spaced = re.sub(no_spaces, " ", without_unwanted)
        yield tokenizer(single_spaced)


def count_freqs(data, vocab):
    """
    Count how often each vocabulary index occurs in ``data``.

    Args:
        data: Iterable of words.
        vocab: Mapping supporting ``vocab[word] -> int`` and ``len(vocab)``.

    Returns:
        An ``int32`` tensor of length ``len(vocab)`` whose entry ``k`` is the
        number of words in ``data`` that map to index ``k``.
    """
    # Look every word up once, then count all indices in a single vectorised call
    # instead of incrementing tensor elements one by one from Python.
    indices = torch.tensor([vocab[w] for w in data], dtype=torch.long)
    return torch.bincount(indices, minlength=len(vocab)).to(torch.int)


# Raw lines of every book, one list per split (train / validation / test).
train_books, val_books, test_books = (
    read_files(datapath=split_dir)
    for split_dir in ("./data_train/", "./data_val/", "./data_test/")
)

# Flat token lists for the same three splits.
train_tokenized, val_tokenized, test_tokenized = (
    tokenize(books) for books in (train_books, val_books, test_books)
)


#### 2 - Define a vocabulary based on the training dataset
To avoid getting too large a vocabulary, a solution can be to keep only words that appear at least 100 times in the training dataset.
Report the total number of words in the training dataset, the number of distinct words in the
training dataset and the size of the defined vocabulary. Comment on your results.

In [3]:
specials = ["<unk>", ",", ".", "!", "?"]
vocab = build_vocab_from_iterator(yield_tokens(train_books), min_freq=100, specials=specials)

# yield_tokens() blanks out every word containing an upper-case letter, which removes
# the pronoun "I" before the vocabulary is counted, so it is added back by hand.
# The membership check keeps this safe if "i" made it into the vocabulary anyway.
if "i" not in vocab:
    vocab.append_token("i")

# Words outside the vocabulary are mapped to the index of "<unk>".
vocab.set_default_index(vocab["<unk>"])
vocab_size = len(vocab)

print("Total number of words in the dataset:   ", len(train_tokenized))
print("Number of distinct words in the dataset:", len(set(train_tokenized)))
print("Size the defined vocabular:             ", vocab_size)


freqs = count_freqs(train_tokenized, vocab)
# The special tokens occupy the first len(specials) indices; report the 20 entries that
# follow them (the slice previously ended at +19 and therefore showed only 19 words).
top20 = freqs[len(specials) : len(specials) + 20]
print(
    "Top 20 occurences (without special characters):\n",
    [
        (f.item(), w)
        for (f, w) in zip(top20, vocab.lookup_tokens(range(len(specials), len(specials) + 20)))
    ],
)


We see that the top 20 occurrences match what we would expect, with words like *and*, *a*, *that*, *it* and *the*. The vocabulary has a total of 1050 words when the minimum frequency is set to 100. There are a total of 30374 distinct words, so keeping only 1050 of them may not be enough to build a complete text generation model.

In [4]:
def create_dataset(text, vocab, context_size=3):
    """
    Build a next-word prediction dataset from a tokenized text.

    Sample ``r`` pairs the ``context_size`` consecutive words starting at
    position ``r`` (the context) with the word that immediately follows them
    (the target). All words are converted to indices with ``vocab[word]``.

    Args:
        text: List of words.
        vocab: Mapping supporting ``vocab[word] -> int``.
        context_size: Number of words preceding each target.

    Returns:
        ``TensorDataset(contexts, targets)`` where ``contexts`` is an int64
        tensor of shape ``(len(text) - context_size, context_size)`` and
        ``targets`` an int64 tensor of shape ``(len(text) - context_size,)``.
        When the text is not longer than ``context_size`` the dataset is empty
        instead of raising.
    """
    ids = torch.tensor([vocab[w] for w in text], dtype=torch.long)
    n_samples = max(ids.numel() - context_size, 0)

    # Row r of `window_idx` is [r, r + 1, ..., r + context_size - 1]; indexing `ids`
    # with it gathers all sliding windows at once instead of one tensor per window.
    window_idx = torch.arange(n_samples).unsqueeze(1) + torch.arange(context_size).unsqueeze(0)
    contexts = ids[window_idx]
    targets = ids[context_size:]
    return TensorDataset(contexts, targets)


context_size = 3

train_data = create_dataset(train_tokenized, vocab, context_size=context_size)
val_data = create_dataset(val_tokenized, vocab, context_size=context_size)
test_data = create_dataset(test_tokenized, vocab, context_size=context_size)

batch_size = 64

# Consecutive samples are overlapping windows of the same sentence, so the training
# batches are shuffled to keep each mini-batch from containing only neighbouring,
# highly correlated windows. Evaluation loaders stay in order; order does not affect
# accuracy.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)


#### 3 - Define a n-gram language model architecture based on this vocabulary that contains an embedding layer.
To drastically reduce computational cost, the dimension of the embedding can be as low as 16 even though in a real setting a larger space would be used.

In [5]:
class NGram(nn.Module):
    """
    N-gram language model: embeds each context word, concatenates the embeddings
    and predicts the next word with a one-hidden-layer network.

    Args:
        vocab_size: Number of words in the vocabulary (also the output size).
        embedding_dim: Dimension of each word embedding.
        context_size: Number of context words per sample.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=128):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """
        Args:
            inputs: Integer tensor of word indices with shape (batch, context_size).

        Returns:
            Log-probabilities over the vocabulary, shape (batch, vocab_size).
        """
        # Concatenate the context embeddings into one vector per sample.
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1))
        # Without a non-linearity the two linear layers would collapse into a single
        # linear map, so a ReLU is applied between them.
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        return F.log_softmax(out, dim=1)


#### 4 - Train several models, select the best one and evaluate its performance.
Note that the accuracy here is potentially extremely low, but the real objective is not to train a good predictor, only
to have a good representation of the semantic of each word in the vocabulary

In [6]:
def train(n_epochs, optimizer, model, loss_fn, train_loader):
    """
    Train ``model`` in place and report the mean training loss after each epoch.

    Batches are moved to the module-level ``device`` before the forward pass.

    Args:
        n_epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer built over the parameters of ``model``.
        model: Module mapping a batch of contexts to per-class scores.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        train_loader: Iterable of ``(contexts, labels)`` batches that supports ``len``.

    Returns:
        List with the mean training loss of each epoch, in epoch order.
    """
    n_batch = len(train_loader)
    losses_train = []
    model.train()
    optimizer.zero_grad(set_to_none=True)

    for epoch in range(1, n_epochs + 1):

        loss_train = 0.0
        for contexts, labels in train_loader:

            contexts = contexts.to(device=device)
            labels = labels.to(device=device)
            outputs = model(contexts)

            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            loss_train += loss.item()

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        mean_loss = loss_train / max(n_batch, 1)
        losses_train.append(mean_loss)

        print(
            "{}  |  Epoch {}  |  Training loss {:.5f}".format(
                datetime.now().time(), epoch, mean_loss
            )
        )
    # Previously the collected losses were discarded and None was returned.
    return losses_train


def relative_error(a, b):
    """Return ``norm(a - b) / norm(a)``, with both norms computed by ``torch.norm``."""
    difference_norm = torch.norm(a - b)
    reference_norm = torch.norm(a)
    return difference_norm / reference_norm


In [7]:
# NLLLoss expects log-probabilities, which NGram.forward returns via log_softmax.
loss_fn = nn.NLLLoss()

# Hyper-parameters of the first (small) n-gram candidate.
n_epochs = 4
lr = 0.1
embedding_dim = 16

NGram_model = NGram(vocab_size, embedding_dim, context_size).to(device=device)
optimizer = optim.SGD(NGram_model.parameters(), lr=lr)

# train() updates NGram_model in place through the optimizer.
weight = train(
    n_epochs=n_epochs,
    optimizer=optimizer,
    model=NGram_model,
    loss_fn=loss_fn,
    train_loader=train_loader,
)


In [8]:
# Second n-gram candidate: same architecture and learning rate, but a 64-dimensional
# embedding and 20 epochs. Reuses `loss_fn` from the previous cell.
n_epochs = 20
lr = 0.1
embedding_dim = 64

NGram_larger_model = NGram(vocab_size, embedding_dim, context_size).to(device=device)
NGram_larger_optimizer = optim.SGD(NGram_larger_model.parameters(), lr=lr)

# train() updates NGram_larger_model in place through the optimizer.
NGram_larger_model_weight = train(
    n_epochs=n_epochs,
    optimizer=NGram_larger_optimizer,
    model=NGram_larger_model,
    loss_fn=loss_fn,
    train_loader=train_loader,
)


In [9]:
def accuracy(model, loader):
    """
    Return the fraction of samples in ``loader`` whose highest-scoring class
    (along dimension 1 of the model output) equals the label.

    The model is switched to evaluation mode, gradients are disabled, and
    batches are moved to the module-level ``device``.
    """
    model.eval()
    n_correct = 0
    with torch.no_grad():
        for batch_contexts, batch_labels in loader:
            batch_contexts = batch_contexts.to(device=device)
            batch_labels = batch_labels.to(device=device)

            scores = model(batch_contexts)
            predicted = torch.max(scores.data, dim=1).indices
            n_correct += (predicted == batch_labels).sum().item()

    n_samples = len(loader.dataset)
    return n_correct / n_samples

In [10]:
def model_selection(models, titles, train_loader, val_loader):
    """
    Print train/validation accuracy for every model and return the one with
    the highest validation accuracy.

    Args:
        models: Candidate models.
        titles: Display names, index-aligned with ``models``.
        train_loader: Loader used only to report the training accuracy.
        val_loader: Loader whose accuracy decides the winner.

    Returns:
        The model with the highest validation accuracy (the first one on ties),
        or ``None`` if ``models`` is empty.
    """
    best_model = None
    # Start below any reachable accuracy so that a model is always selected, even
    # when every candidate scores exactly 0 on the validation set.
    best_acc = float("-inf")
    for model, title in zip(models, titles):
        acc = accuracy(model, val_loader)
        train_acc = accuracy(model, train_loader)
        print(f"{title} | Train accuracy {train_acc:.2%} |  Validation accuracy {acc:.2%}")
        if acc > best_acc:
            best_model = model
            best_acc = acc
    return best_model


In [11]:
# Keep the n-gram model with the highest validation accuracy; its embedding layer is
# reused by the cells further down.
best_model = model_selection(
    [NGram_model, NGram_larger_model], ["NGram", "NGram_larger"], train_loader, val_loader
)
best_model


NGram(
  (embeddings): Embedding(1050, 64)
  (linear1): Linear(in_features=192, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=1050, bias=True)
)

In [12]:
# Test accuracy is computed only for the model chosen on validation accuracy.
best_model_acc = accuracy(best_model, test_loader)
print(f"Best model | Test accuracy {best_model_acc:.2%}")


#### 5 - Compute the cosine similarity matrix of the vocabulary based on the trained embedding.
For some words of your choice (e.g. me, white, man, have, be, child, yes, what etc.), report the 10
most similar words. Comment your results.

In [13]:
# Embedding matrix of the selected model, one row per vocabulary index, detached from
# the autograd graph.
best_model_weights = best_model.embeddings.weight.detach()

# Cosine similarity of every pair of words: after scaling each row to unit L2 norm,
# the dot product of two rows is their cosine similarity, so a single matrix product
# replaces the nested Python loop over all (w1, w2) pairs.
unit_embeddings = F.normalize(best_model_weights, p=2, dim=1)
sim_matrix = (unit_embeddings @ unit_embeddings.T).cpu().numpy()


In [14]:
testing_words = ["me", "white", "man", "have", "be", "child", "yes", "what"]

# Build the word<->index tables once instead of on every lookup.
stoi = vocab.get_stoi()
itos = vocab.get_itos()
testing_words_idx = [(i, stoi[i]) for i in testing_words]

n_most_similar = 10
for w in testing_words_idx:

    # Rank all words by decreasing similarity to the query word, then drop the query
    # word itself so that the list really contains n_most_similar *other* words.
    ranked = (-sim_matrix[w[1]]).argsort()
    indices = ranked[ranked != w[1]][:n_most_similar]
    sim_words = [(itos[i], sim_matrix[w[1]][i]) for i in indices]
    print(f"{w} most similar words:")

    for sim_w in sim_words:
        print(f"\t{sim_w}")


We find some related words among the 10 most similar words according to the similarity matrix, for example *be* – *am*, *have* – *had* and *what* – *how*.

Let us take a look at the embedding space visually

#### 6 - Visualize the embedding space.
To do so, upload the vocabulary and their corresponding values in the embedding space as tsv files. Try to find
and select clusters. Report both plots and their corresponding selections for some meaningful
clusters.

In [15]:
import csv

# Export the embedding matrix (one row per word, tab-separated values) and the matching
# word list, in vocabulary-index order, as the two TSV files used for visualisation.
# newline="" lets the csv module control the line endings itself; without it every row
# is followed by an extra blank line on platforms that translate "\n".
with open("./weights.tsv", "wt", newline="", encoding="utf-8") as out_file:
    tsv_writer = csv.writer(out_file, delimiter="\t")
    for i in best_model_weights:
        tsv_writer.writerow(i.numpy())


with open("./metadata.tsv", "wt", newline="", encoding="utf-8") as out_file:
    tsv_writer = csv.writer(out_file, delimiter="\t")
    vocab_dict = vocab.get_itos()
    for i in vocab_dict:
        tsv_writer.writerow([i])


![Battlefield](./visualizations/battlefield.png)
![Princess](./visualizations/princess.png)
![Happiness](./visualizations/happiness.png)
![Work](./visualizations/work.png)


Compared with task 5, the visualization shows better clusters than the words we checked previously. It seems that outliers and special words that are used in fewer contexts are clustered better, whereas more common words are less similar to each other in the cosine similarity matrix. For example, the *Battlefield* cluster is surprisingly well formed.

## 2.2 Conjugating *be* and *have*

#### 1 - Use your trained word embedding and define a MLP architecture as well as a RNN architecture to predict be and have conjugation given the context around the missing target. 
Use the same context size for both MLPs and RNNs, even though RNNs could take a context size of arbitrary
length.

- Your trained word embedding can be integrated in your next models in different ways. For
example, you can transform inputs before feeding them to your models, or have a first layer in
your models with the same weight values as your trained embedding matrix, or simply define
the computations between the input and the embedding matrix in the forward function.
- To predict be and have conjugation, the (contexts, targets) datasets must be such that
the targets are be, am, are, is, was, were, been, being, have, has, had, having. The output
layer must match the number of classes to predict and the labels must be mapped from their
original index in the vocabulary to integers between 0 and 11.

In [16]:
def create_conjugate_dataset(text, vocab, around_context_size=2):
    """
    Build a dataset for predicting the conjugated form of *be* / *have*.

    For every occurrence of one of the 12 target forms that has at least
    ``around_context_size`` words on each side, the context is made of the
    ``around_context_size`` words immediately before it followed by the
    ``around_context_size`` words immediately after it; the target word itself
    is left out.

    Args:
        text: List of words.
        vocab: Mapping supporting ``vocab[word] -> int``.
        around_context_size: Number of words taken on each side of the target.

    Returns:
        ``TensorDataset(contexts, targets)`` where ``contexts`` is an int64
        tensor of shape ``(n_samples, 2 * around_context_size)`` holding
        vocabulary indices and ``targets`` an int64 tensor of shape
        ``(n_samples,)`` holding class labels in ``0..11`` (position of the form
        in ``target_possibilities``). The dataset is empty when no target occurs.
    """
    target_possibilities = [
        "be",
        "am",
        "are",
        "is",
        "was",
        "were",
        "been",
        "being",
        "have",
        "has",
        "had",
        "having",
    ]

    target_to_idx = {target: i for i, target in enumerate(target_possibilities)}

    contexts = []
    targets = []
    for i in range(around_context_size, len(text) - around_context_size):
        word = text[i]
        if word not in target_to_idx:
            continue

        # Left window ends right before position i and right window starts right after
        # it. The previous slice `i - size - 1 : i - 1` skipped the adjacent word and
        # produced a truncated context when it reached a negative start index.
        around = text[i - around_context_size : i] + text[i + 1 : i + around_context_size + 1]

        contexts.append([vocab[w] for w in around])
        targets.append(target_to_idx[word])

    if not contexts:
        # No target form found: return an empty dataset with the expected shapes.
        return TensorDataset(
            torch.empty((0, 2 * around_context_size), dtype=torch.long),
            torch.empty((0,), dtype=torch.long),
        )

    contexts = torch.tensor(contexts, dtype=torch.long)
    targets = torch.tensor(targets, dtype=torch.long)
    return TensorDataset(contexts, targets)


train_conjugate_data = create_conjugate_dataset(train_tokenized, vocab)
val_conjugate_data = create_conjugate_dataset(val_tokenized, vocab)
test_conjugate_data = create_conjugate_dataset(test_tokenized, vocab)

batch_size = 64

# Samples are stored in the order the targets occur in the books, so the training
# batches are shuffled; validation and test loaders keep the default, unshuffled order.
train_conjugate_data_loader = DataLoader(train_conjugate_data, batch_size=batch_size, shuffle=True)
val_conjugate_data_loader = DataLoader(val_conjugate_data, batch_size=batch_size)
test_conjugate_data_loader = DataLoader(test_conjugate_data, batch_size=batch_size)


In [17]:
class MLP(nn.Module):
    """
    MLP that predicts one of the 12 *be* / *have* forms from the words around it.

    The pre-trained embedding of every context word is looked up, the embeddings
    are concatenated and fed to a one-hidden-layer classifier.

    Args:
        vocab_size: Number of rows of the embedding matrix.
        embedding_dim: Number of columns of the embedding matrix.
        embedding_weights: Pre-trained embedding matrix of shape
            (vocab_size, embedding_dim); it is copied into the model and frozen.
        context_size: Number of context words per sample. The default of 4
            matches ``create_conjugate_dataset`` with ``around_context_size=2``.
    """

    def __init__(self, vocab_size, embedding_dim, embedding_weights, context_size=4):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embeddings.load_state_dict({"weight": embedding_weights})
        # Freeze the pre-trained embedding. Setting `requires_grad` on the module only
        # creates an unused attribute; the flag has to be set on the parameter itself.
        self.embeddings.weight.requires_grad_(False)

        self.classifier = nn.Sequential(
            nn.Linear(in_features=context_size * embedding_dim, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=12),
            nn.LogSoftmax(dim=1),
        )

    def forward(self, inputs):
        """
        Args:
            inputs: Integer tensor of word indices with shape (batch, context_size).

        Returns:
            Log-probabilities over the 12 classes, shape (batch, 12).
        """
        # (batch, context, dim) -> (batch, context * dim): the classifier sees the whole
        # context at once, and LogSoftmax(dim=1) then normalises over the class axis.
        embeds = self.embeddings(inputs).flatten(start_dim=1)
        return self.classifier(embeds)


In [18]:
class RNN(nn.Module):
    """
    LSTM classifier that predicts one of the 12 *be* / *have* forms from the
    sequence of context words.

    Args:
        vocab_size: Number of rows of the embedding matrix.
        embedding_dim: Number of columns of the embedding matrix.
        embedding_weights: Pre-trained embedding matrix of shape
            (vocab_size, embedding_dim); it is copied into the model and frozen.
        hidden_size: Size of the LSTM hidden state.
        lstm_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, embedding_weights, hidden_size, lstm_layers,):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embeddings.load_state_dict({"weight": embedding_weights})
        # Freeze the pre-trained embedding. Setting `requires_grad` on the module only
        # creates an unused attribute; the flag has to be set on the parameter itself.
        self.embeddings.weight.requires_grad_(False)

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=lstm_layers,
            batch_first=True,
        )

        self.classifier = nn.Sequential(
            # The classifier consumes the LSTM hidden state, so its input width is
            # hidden_size (it used embedding_dim, which only works when both are equal).
            nn.Linear(in_features=hidden_size, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=12),
            nn.LogSoftmax(dim=1),
        )

    def forward(self, input):
        """
        Args:
            input: Integer tensor of word indices with shape (batch, sequence).

        Returns:
            Log-probabilities over the 12 classes, shape (batch, 12).
        """
        embeds = self.embeddings(input)
        _, (hn, _) = self.lstm(embeds)
        # hn has shape (num_layers, batch, hidden_size). Classify the final hidden state
        # of the LAST layer; passing the 2-D slice also makes LogSoftmax(dim=1) normalise
        # over the classes instead of over the batch axis.
        return self.classifier(hn[-1])


#### 2 - Train several models, select the best one and evaluate its performance. 
Comment the differences in terms of performances/training time between the MLP architecture and the RNN
architecture.

In [19]:
# Candidate conjugation models and their display names. The cells below append to both
# lists in the same order, and the lists are later passed together to model_selection().
models = []
titles = [] 

In [20]:
# First MLP candidate: SGD with lr 0.1 for 4 epochs.
loss_fn = nn.NLLLoss()
n_epochs = 4
lr = 0.1
# Passed to nn.Embedding(vocab_size, embedding_dim), into which best_model_weights is loaded.
embedding_dim = 64

MLP_model = MLP(vocab_size, embedding_dim, best_model_weights).to(device=device)
optimizer = optim.SGD(MLP_model.parameters(), lr=lr)


train(n_epochs,optimizer,MLP_model,loss_fn,train_conjugate_data_loader)

# Register the trained model for the model selection step.
models.append(MLP_model)
titles.append("MLP")

In [21]:
# Second MLP candidate: same architecture, trained with Adam (lr 0.001) for 20 epochs.
loss_fn = nn.NLLLoss()
n_epochs = 20
lr = 0.001
# Passed to nn.Embedding(vocab_size, embedding_dim), into which best_model_weights is loaded.
embedding_dim = 64

MLP_model_2 = MLP(vocab_size, embedding_dim, best_model_weights).to(device=device)
optimizer = optim.Adam(MLP_model_2.parameters(), lr=lr)


train(n_epochs,optimizer,MLP_model_2,loss_fn,train_conjugate_data_loader)

# Register the trained model for the model selection step.
models.append(MLP_model_2)
titles.append("MLP_2")

In [22]:
# First RNN candidate: one LSTM layer, SGD with lr 0.001 for 4 epochs.
# Note that CrossEntropyLoss is used here although the RNN classifier already ends
# with a LogSoftmax layer.
loss_fn = nn.CrossEntropyLoss()
n_epochs = 4
lr = 0.001
# Passed to nn.Embedding(vocab_size, embedding_dim), into which best_model_weights is loaded.
embedding_dim = 64

RNN_model = RNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    embedding_weights=best_model_weights,
    hidden_size=64,
    lstm_layers=1,
).to(device=device)
optimizer = optim.SGD(RNN_model.parameters(), lr=lr)


train(n_epochs, optimizer, RNN_model, loss_fn, train_conjugate_data_loader)

# Register the trained model for the model selection step.
models.append(RNN_model)
titles.append("RNN")


In [23]:
# Second RNN candidate: two stacked LSTM layers, Adam with lr 0.001 for 20 epochs.
# Note that CrossEntropyLoss is used here although the RNN classifier already ends
# with a LogSoftmax layer.
loss_fn = nn.CrossEntropyLoss()
n_epochs = 20
lr = 0.001
# Passed to nn.Embedding(vocab_size, embedding_dim), into which best_model_weights is loaded.
embedding_dim = 64

RNN_model_2 = RNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    embedding_weights=best_model_weights,
    hidden_size=64,
    lstm_layers=2,
).to(device=device)
optimizer = optim.Adam(RNN_model_2.parameters(), lr=lr)


train(n_epochs, optimizer, RNN_model_2, loss_fn, train_conjugate_data_loader)

# Register the trained model for the model selection step.
models.append(RNN_model_2)
titles.append("RNN_2")

In [24]:
# Pick the conjugation model with the highest accuracy on the conjugation validation set.
best_conjugating_model = model_selection(models, titles, train_conjugate_data_loader, val_conjugate_data_loader)
best_conjugating_model

RNN(
  (embeddings): Embedding(1050, 64)
  (lstm): LSTM(64, 64, num_layers=2, batch_first=True)
  (classifier): Sequential(
    (0): Linear(in_features=64, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=12, bias=True)
    (3): LogSoftmax(dim=1)
  )
)

In [25]:
# Evaluate on the conjugation test set. `test_loader` holds the next-word data of
# section 2.1, whose context length and labels do not match this model. Also report the
# accuracy just computed rather than the n-gram model's `best_model_acc`.
best_conjugating_model_acc = accuracy(best_conjugating_model, test_conjugate_data_loader)
print(f"Best conjugating model | Test accuracy {best_conjugating_model_acc:.2%}")

## 2.3 Text generation

#### 1 -  Use your trained word embedding and define a RNN architecture that can predict the next word given the context before the target.


In [26]:
class RNN_2(nn.Module):
    """
    LSTM language model that predicts the next word from the preceding context.

    Args:
        vocab_size: Number of rows of the embedding matrix.
        embedding_dim: Number of columns of the embedding matrix.
        embedding_weights: Pre-trained embedding matrix of shape
            (vocab_size, embedding_dim); it is copied into the model and frozen.
        hidden_size: Size of the LSTM hidden state.
        lstm_layers: Number of stacked LSTM layers.
        output_size: Number of classes to predict.
    """

    def __init__(self, vocab_size, embedding_dim, embedding_weights, hidden_size, lstm_layers, output_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embeddings.load_state_dict({"weight": embedding_weights})
        # Freeze the pre-trained embedding. Setting `requires_grad` on the module only
        # creates an unused attribute; the flag has to be set on the parameter itself.
        self.embeddings.weight.requires_grad_(False)

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=lstm_layers,
            batch_first=True,
        )

        self.classifier = nn.Sequential(
            # The classifier consumes the LSTM hidden state, so its input width is
            # hidden_size (it used embedding_dim, which only works when both are equal).
            nn.Linear(in_features=hidden_size, out_features=4096),
            nn.ReLU(),
            nn.Linear(in_features=4096, out_features=2048),
            nn.ReLU(),
            nn.Linear(in_features=2048, out_features=output_size),
            nn.LogSoftmax(dim=1),
        )

    def forward(self, input):
        """
        Args:
            input: Integer tensor of word indices with shape (batch, sequence).

        Returns:
            Log-probabilities over the classes, shape (batch, output_size).
        """
        embeds = self.embeddings(input)
        _, (hn, _) = self.lstm(embeds)
        # hn has shape (num_layers, batch, hidden_size). Classify the final hidden state
        # of the LAST layer; passing the 2-D slice also makes LogSoftmax(dim=1) normalise
        # over the classes instead of over the batch axis.
        return self.classifier(hn[-1])


In [27]:
# Text-generation model: one LSTM layer predicting a word of the full vocabulary
# (output_size=vocab_size), to be optimised with Adam.
loss_fn = nn.CrossEntropyLoss()
n_epochs = 4
lr = 0.001
# Passed to nn.Embedding(vocab_size, embedding_dim), into which best_model_weights is loaded.
embedding_dim = 64

RNN_text_gen = RNN_2(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    embedding_weights=best_model_weights,
    hidden_size=64,
    lstm_layers=1,
    output_size=vocab_size,
).to(device=device)
optimizer = optim.Adam(RNN_text_gen.parameters(), lr=lr)

# The training call is left disabled because training took too long to finish.
#train(n_epochs, optimizer, RNN_text_gen, loss_fn, train_loader)


After 24 minutes the training loop had not completed a single epoch, so I did not have time to train the text generation model.

![Long training time](./visualizations/long_waIT.png)

#### 2 - Train several models, select the best one and evaluate its performance.
*The models could not be trained because of the long training times, but I would simply define several models with different hyper-parameters and reuse the general `model_selection` and `accuracy` functions already implemented for the two previous tasks.*

#### 3 - Implement the beam search algorithm.

*We did not have time to implement this part of the task*

#### 4 - Have fun playing with your model. 

*We did not have time to implement this part of the task*