## Required Packages Import

In [None]:
import copy
import json
import pickle
import random
from os import path

import numpy
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from time import time
from torch import nn, optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm


# File the document vectors and labels are pickled to / read back from (see load()).
CACHE_FILE = "vecors.cache"
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Upper bound on the number of training epochs of a single run.
MAX_EPOCH = 100
# Where the hyperparameter search stores the best model and its stats/hyperparameters (JSON).
MODEL_FILE = "best.model"
MODEL_INFO_FILE = "best.model.info"
# Seed used for torch and for the train_test_split calls.
# NOTE: Python's `random` module (used by the hyperparameter search) is not seeded here.
SEED = 42
torch.manual_seed(SEED)


## Task 1: Document Classification using Attention

Here we will implement the document classification model shown in Figure 1 with PyTorch using Multi-Head Attention.

<img src= >

<center><b>Note regarding what we did for better understanding</b></center>

**Encoder Word Embedding**: We use the GloVe vectors that ship with spaCy's `en_core_web_md` model; the model is loaded in the `load` function of the Data Loading cell.

**RNN Layer**: We decided to use a bidirectional LSTM after running a hyperparameter search; the results can be seen in the pandas DataFrame table below.

**Dropout**: We used the hyperparameter search to test different dropout rates. The same dropout rate is applied to both the RNN layer and the Attention layer; we could have experimented with separate dropout rates for the two layers, but this was not directly mentioned in the task description.


## Data Loading 

In [None]:
# Map-style dataset for variable-length sequences; TensorDataset cannot be used
# because it requires all sequences to have the same length.
class MyDataset(Dataset):
    """Pair the i-th sample of ``x`` with the i-th label of ``y``."""

    def __init__(self, x, y):
        # Both containers are stored as-is; they are only indexed and measured.
        self.x, self.y = x, y

    def __len__(self):
        """Return the number of samples (the length of ``x``)."""
        return len(self.x)

    def __getitem__(self, i):
        """Return the ``(sample, label)`` tuple stored at index ``i``."""
        sample = self.x[i]
        label = self.y[i]
        return sample, label


# Collate function: turns a list of (sequence, label) samples into one padded batch.
def pad(batch):
    """Collate variable-length samples into a zero-padded batch on DEVICE.

    Args:
        batch: iterable of ``(sequence, label)`` pairs as produced by the dataset.

    Returns:
        ``(padded, labels)`` where ``padded`` is the batch-first, zero-padded float
        tensor of all sequences and ``labels`` is a LongTensor of the labels.
    """
    sequences, labels = [], []
    for features, label in batch:
        sequences.append(torch.FloatTensor(features).to(DEVICE))
        labels.append(label)
    targets = torch.LongTensor(labels).to(DEVICE)
    # pad_sequence fills the shorter sequences with zeros up to the longest one in the batch
    padded = pad_sequence(sequences, batch_first=True)
    return padded.to(DEVICE), targets


def load():
    """Load (or build and cache) the document vectors and wrap them in dataloaders.

    If CACHE_FILE exists, the vectors, labels and maximum document length are
    unpickled from it. Otherwise 'sst5.data.txt' is read, every text is run through
    the spaCy ``en_core_web_md`` pipeline, the per-token ``tensor`` arrays are
    collected per document, and the result is pickled to CACHE_FILE.

    The data is split 60:20:20 into train / validation / test.

    Returns:
        ``(train_loader, test_loader, val_loader)`` -- note the order: the test
        loader comes *before* the validation loader, whereas ``train`` takes
        ``(train_loader, val_loader, test_loader)``.
    """
    # create or load vectors from cache file
    if path.exists(CACHE_FILE):
        # NOTE: pickle must only be used on trusted files; the cache is expected
        # to have been written locally by the else-branch below.
        with open(CACHE_FILE, "rb") as f:
            x_all, y_all, max_doc_len = pickle.load(f)
        print("Loaded vectors and labels from cache")
    else:
        import spacy
        nlp = spacy.load("en_core_web_md")
        dataset = pd.read_csv('sst5.data.txt')
        sentence_lengths = dataset["text"].str.split(" ").apply(len)
        documents = nlp.pipe(dataset["text"].to_list())
        x_all = [[word.tensor for word in doc] for doc in documents]
        y_all = (dataset["label"] + 2).to_list()  # shift labels from -2..2 to 0..4
        max_doc_len = max(sentence_lengths)
        with open(CACHE_FILE, "wb") as f:
            pickle.dump((x_all, y_all, max_doc_len), f)

    # Split 60:40 first, then split the 40% hold-out 50:50 -> 60:20:20.
    # (Previously the *training* part was halved, which gave 30:30:40.)
    x_train, x_hold, y_train, y_hold = train_test_split(x_all, y_all, test_size=0.4, random_state=SEED)
    x_val, x_test, y_val, y_test = train_test_split(x_hold, y_hold, test_size=0.5, random_state=SEED)

    # build dataloaders
    train_loader = DataLoader(MyDataset(x_train, y_train), batch_size=1000, shuffle=True, collate_fn=pad)
    test_loader = DataLoader(MyDataset(x_test, y_test), batch_size=1000, collate_fn=pad)
    val_loader = DataLoader(MyDataset(x_val, y_val), batch_size=1000, collate_fn=pad)
    return train_loader, test_loader, val_loader

## Model

In [None]:
class Seq2SeqAttention(nn.Module):
    """Document classifier: (optional) LSTM encoder -> multi-head attention -> linear decoder.

    A single trainable query vector attends over the encoder states of a document;
    the attention output is the document embedding, which is projected to
    ``nr_classes`` log-probabilities.

    Args:
        input_size: size of each input word vector.
        nr_classes: number of output classes.
        hidden_size: LSTM hidden size; ``0`` disables the LSTM so attention is
            applied directly to the input vectors.
        num_heads: number of attention heads (must divide the embedding size).
        dropout: dropout probability applied after the LSTM and after attention.
        bidir: truthy for a bidirectional LSTM.
        mask: truthy to exclude padded (all-zero) positions from attention.
    """

    def __init__(self, input_size, nr_classes=5, hidden_size=128, num_heads=1, dropout=0.1, bidir=True, mask=True):
        super().__init__()
        nr_dir = 1 + bool(bidir)
        if hidden_size > 0:
            self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=bool(bidir))
        # Without an LSTM the attention works on the raw inputs, so its width is the
        # input width (was hard-coded to 96 before).
        embedding_dim = hidden_size * nr_dir if hidden_size else input_size
        self.attention = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads)
        self.decoder = nn.Linear(embedding_dim, nr_classes)
        # Registered as a Parameter so it shows up in model.parameters(), is updated
        # by the optimizer and follows the module in .to(device). A plain tensor
        # (as used before) is never trained.
        self.query = nn.Parameter(torch.zeros(embedding_dim))
        self.softmax = nn.LogSoftmax(dim=-1)
        self.mask = mask
        self.hidden_size = hidden_size

        # Apply dropout to the RNN and Attention layers
        self.lstm_dr = nn.Dropout(dropout)
        self.attention_dr = nn.Dropout(dropout)

    def forward(self, x):
        """Classify a padded batch of documents.

        Args:
            x: batch-first tensor of word vectors, ``(batch, seq_len, input_size)``;
                padded positions are expected to be all-zero vectors.

        Returns:
            ``(log_probs, attn_weights)`` with ``log_probs`` of shape
            ``(batch, nr_classes)`` and ``attn_weights`` as returned by
            ``nn.MultiheadAttention`` for the single query.
        """
        batch_size, seq_len, _ = x.size()
        # The embeddings are already looked up by the data loading code, so the
        # model starts at the RNN layer.
        if self.hidden_size:
            enc_out, _ = self.lstm(x)
            enc_out = self.lstm_dr(enc_out)
        else:
            enc_out = x
        # nn.MultiheadAttention is used in its default sequence-first layout: (S, N, E)
        enc_out = enc_out.permute(1, 0, 2)

        # One query per document: (L=1, N=batch_size, E)
        query = self.query.repeat(1, batch_size, 1)
        # Padding mask of shape (N, S): True marks positions that must be ignored.
        # key_padding_mask is per sample, so it is valid for any number of heads;
        # the previous hand-built 3-D attn_mask was tiled in batch order and thus
        # assigned masks to the wrong samples whenever num_heads > 1.
        padding_mask = (x == 0).all(dim=-1) if self.mask else None
        attn_output, attn_output_weights = self.attention(
            query, enc_out, enc_out, key_padding_mask=padding_mask
        )
        document_embedding = self.attention_dr(attn_output)  # Apply dropout

        # Project the document embedding to class scores; [0] drops the query dimension.
        return self.softmax(self.decoder(document_embedding))[0], attn_output_weights





## Training 

In [None]:
# training loop with optional early stopping; uses step() for every pass over a loader
def train(train_loader, val_loader, test_loader, hidden, batch_s, drop, bidir, estopnr, lr_ex, num_heads, mask):
    """Train one Seq2SeqAttention model and report its statistics.

    Args:
        train_loader, val_loader, test_loader: dataloaders for the three splits;
            only the dataset of ``train_loader`` is used (it is re-batched).
        hidden: LSTM hidden size (0 disables the LSTM).
        batch_s: training batch size.
        drop: dropout probability.
        bidir: truthy for a bidirectional LSTM.
        estopnr: early-stopping patience in epochs; 0 disables early stopping.
        lr_ex: learning-rate exponent, the learning rate is ``0.1 ** lr_ex``.
        num_heads: number of attention heads.
        mask: truthy to mask padded positions in the attention.

    Returns:
        ``(stats, model)``. ``stats`` maps, in this order, ``tr_acc%``, ``tr_lo``,
        ``te_acc%``, ``te_lo``, ``va_acc%``, ``va_lo``, ``duration`` and ``epoch``
        to formatted strings (the columns the hyperparameter search prints and
        reads). With early stopping enabled, ``model`` is the copy with the lowest
        validation loss and the train/validation stats belong to that epoch;
        otherwise it is the model after the last epoch.
    """
    start = time()
    # re-batch the training data with the requested batch size, keeping it shuffled
    train_loader = DataLoader(train_loader.dataset, batch_size=batch_s, shuffle=True, collate_fn=pad)
    input_size = len(train_loader.dataset[0][0][0])  # 0th sample, its sequence, 0th word vector
    model = Seq2SeqAttention(input_size=input_size, hidden_size=hidden, dropout=drop, bidir=bidir,
                             num_heads=num_heads, mask=mask).to(DEVICE)
    # Loss Function. Loss is calculated using Negative Log Likelihood.
    criterion = nn.NLLLoss()
    # Optimization. Adam with the given learning rate.
    optimizer = optim.Adam(model.parameters(), lr=0.1 ** lr_ex)

    epoch_stats = dict.fromkeys(("tr_acc%", "tr_lo", "va_acc%", "va_lo"), "nan")
    best_model, best_stats, best_val_loss = None, None, float("inf")
    stale_epochs, epochs_run = 0, 0
    tq = tqdm(range(MAX_EPOCH), leave=False)
    for epoch in tq:
        # step() returns (mean loss per batch, accuracy); accuracy is computed
        # afresh for every pass, so nothing leaks from one epoch into the next
        train_loss, train_acc = step(model, train_loader, criterion, optimizer)
        val_loss, val_acc = step(model, val_loader, criterion)
        epochs_run = epoch + 1

        epoch_stats = {
            "tr_acc%": f"{train_acc * 100:.1f}", "tr_lo": f"{train_loss:.4f}",
            "va_acc%": f"{val_acc * 100:.1f}", "va_lo": f"{val_loss:.4f}",
        }
        tq.set_postfix_str(str(epoch_stats)[1:-1].replace("'", ""))

        if not estopnr:  # set 0 to disable early stopping
            continue
        # Early Stopping. If the validation loss improves, keep a copy of the model
        # as the best performing model so far ...
        if val_loss < best_val_loss:
            best_val_loss, stale_epochs = val_loss, 0
            best_model, best_stats = copy.deepcopy(model), epoch_stats
        else:
            # ... and terminate once it has not improved for `estopnr` evaluations.
            stale_epochs += 1
            if stale_epochs >= estopnr:
                break

    if estopnr and best_model is not None:
        final_model, summary = best_model, best_stats
    else:
        final_model, summary = model, epoch_stats

    # evaluate the model that is actually returned
    test_loss, test_acc = step(final_model, test_loader, criterion)
    print("\r", end="")  # remove tqdm
    duration = time() - start
    stats = {
        "tr_acc%": summary["tr_acc%"], "tr_lo": summary["tr_lo"],
        "te_acc%": f"{test_acc * 100:.1f}", "te_lo": f"{test_loss:.4f}",
        "va_acc%": summary["va_acc%"], "va_lo": summary["va_lo"],
        "duration": f"{duration:.1f}", "epoch": str(epochs_run),
    }
    return stats, final_model


def step(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader``: a training epoch or an evaluation.

    Args:
        model: module whose call returns ``(output, extra)``; ``output`` holds
            one row of class scores per sample.
        loader: iterable of ``(x, y)`` batches with ``len()`` and a ``dataset``.
        criterion: loss taking ``(output, targets)``.
        optimizer: if given, the model is trained with it; if ``None`` the pass
            is evaluation only.

    Returns:
        ``(mean loss per batch, accuracy over loader.dataset)``.
    """
    training = optimizer is not None
    was_training = model.training
    # train mode enables dropout; eval mode disables it
    model.train(training)
    correct, loss_t = 0, 0.0
    try:
        # eval() alone does not stop autograd from recording, so gradients are
        # switched off explicitly for evaluation passes
        with torch.set_grad_enabled(training):
            for x, y in loader:
                if training:
                    optimizer.zero_grad()
                output, _ = model(x)
                loss = criterion(output, y.long())
                loss_t += float(loss)
                pred = output.argmax(1, keepdim=True)
                correct += int(pred.eq(y.view_as(pred)).sum().item())
                if training:
                    loss.backward()
                    optimizer.step()
    finally:
        if not training:
            # leave the model in whatever mode the caller had it in
            model.train(was_training)
    return loss_t / len(loader), correct / len(loader.dataset)



# load() returns (train, test, val) while train() takes (train, val, test), so the
# loaders are unpacked by name instead of being splatted positionally (which
# silently swapped the validation and test sets).
loaders = load()
train_loader, test_loader, val_loader = loaders

# resume from the best result of an earlier search, if there is one
if path.exists(MODEL_INFO_FILE):
    with open(MODEL_INFO_FILE, "r") as f:
        best_info = json.load(f)
else:
    best_info = {"te_lo": "10000"}  # baseline: 30%
# search space: every run draws one value per hyperparameter
hyper_params = {"hidden": [0, 32, 64, 128, 256, 512], "bidir": [1, 0], "batch_s": [64, 256, 1024], "mask": [1, 0],
                "drop": [.0, .2, .4], "estopnr": [0, 5, 10, 15], "lr_ex": [2, 3, 4], "num_heads": [1, 2, 4, 8, 16]}
# column names of the statistics printed after the hyperparameters
extra_params = ["tr_acc%", "tr_lo", "te_acc%", "te_lo", "va_acc%", "va_lo", "duration", "epoch"]
print(*hyper_params.keys(), *extra_params, sep="\t")
while True:  # infinite hyperparameter search, stop it manually
    hyper = {k: random.choice(v) for k, v in hyper_params.items()}
    # print(*hyper.values(), sep="\t\t")  # print before training for debugging purposes
    stats, model = train(train_loader, val_loader, test_loader, **hyper)
    print(*hyper.values(), *stats.values(), sep="\t")
    # if the model performs better than the best, save it!
    # NOTE(review): the best model is selected on the *test* loss; selecting on the
    # validation loss would keep the test set unbiased -- confirm what is intended.
    if float(stats["te_lo"]) < float(best_info["te_lo"]):
        torch.save(model, MODEL_FILE)
        # remember the new best, otherwise every later run that merely beats the
        # initial baseline would overwrite the saved model
        best_info = {**stats, **hyper}
        with open(MODEL_INFO_FILE, "w") as f:
            json.dump(best_info, f, indent=4, sort_keys=True)
    del model, stats

## Hyperparameter Search: extra exploration
Below we visualize the results of our hyperparameter search and how each parameter correlates with the training and validation accuracy, respectively.
We observe the following:
* The optimal hidden size is 128 for training; however, it is only the third best for validation.
* Bidirectional LSTM and dropout have no significant effect on the performance.
* The training accuracy of the model decreases as the batch size increases, while the validation accuracy stays constant, indicating some level of overfitting.
* Masking has a negligible or probably even slightly negative impact.
* Dropout does not affect the validation accuracy and training accuracy only slightly.
* No early stopping leads to overfitting, a way better training accuracy and only 1% less test accuracy.
* The learning-rate exponent (`lr_ex`) does increase the accuracy of both training and validation, however not significantly.
* A number of heads equal to two seems to give the best training and validation accuracy.



<img src=  >

## Task 2: Interpreting Attention Weights
Which word(s) has the higher contribution(s) to the prediction results of some specific documents? 
To explore this question, we first selected four documents from the test set such that two are classified correctly and the other two incorrectly by the model.
We then conducted a forward pass for each of these documents to obtain the attention weights (luckily, they were the first 4 sequences in the test set). Finally, we looked at the words with the highest attention weights and visualized the weights over each document's words.

In [None]:
def col_vis(words, values):
    """Colour each word by its attention percentage using ANSI 256-colour codes.

    Args:
        words: the words to print.
        values: one percentage (0-100) per word; extra words or values are
            ignored because the two sequences are zipped.

    Returns:
        The words joined by spaces, each wrapped in a foreground-colour escape
        sequence and followed by a reset.
    """
    # https://stackoverflow.com/questions/287871/how-to-print-colored-text-in-terminal-in-python
    # https://askubuntu.com/questions/558280/changing-colour-of-text-and-background-of-terminal
    # 21, 27, 33, 39, 45, 51: blue tones of the 256-colour palette, one per 20% bucket
    # ([0:20), [20:40), [40:60), [60:80), [80:100), and 100 itself)
    TERM_BLUETONES = list(range(21, 52, 6))
    last = len(TERM_BLUETONES) - 1
    coloured = []
    for word, value in zip(words, values):
        # clamp so that out-of-range percentages still map to a valid tone
        bucket = min(max(int(value) // 20, 0), last)
        # "38;5;<n>" selects foreground colour <n> of the 256-colour palette
        coloured.append(f"\033[38;5;{TERM_BLUETONES[bucket]}m{word}\033[m")
    return " ".join(coloured)
# print(col_vis(["abc"] * 10, list(range(0, 100, 10)))) # for testing this
# Visualise the attention weights of the saved best model on a few documents.
dataset = pd.read_csv('sst5.data.txt')
# NOTE: these are the first four rows of the whole data file (and of the cached
# vectors), not rows picked out of the test split.
relevant_sentences = range(4)
dataset = dataset.iloc[relevant_sentences]
# print(dataset)
CACHE_FILE = "vecors.cache"
# NOTE: pickle must only be used on trusted files; the cache is expected to have
# been written locally by load().
with open(CACHE_FILE, "rb") as f:
    x_all, y_all, max_doc_len = pickle.load(f)
x_all = [x_all[s] for s in relevant_sentences]
y_all = [y_all[s] for s in relevant_sentences]
# print("vector lengths", *[len(v) for v in x_all])

# batch_size=1 and no shuffling, so batch idx corresponds to row idx of `dataset`
loader = DataLoader(MyDataset(x_all, y_all), collate_fn=pad, shuffle=False, batch_size=1)
# map_location lets a model that was saved on a GPU be loaded on a CPU-only machine.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled modules -- pass weights_only=False there if needed.
model = torch.load(MODEL_FILE, map_location=DEVICE)
model.eval()  # model will not use dropout
if hasattr(model, "lstm"):  # models trained with hidden=0 have no LSTM
    model.lstm.flatten_parameters()
with torch.no_grad():  # inference only, no autograd graph needed
    for idx, (x, y) in enumerate(loader):
        out, attn = model(x)
        pred = int(out.argmax())
        sentence = dataset.iloc[idx]["text"]
        # labels were shifted by +2 for training, shift them back for display
        print(f"real: {int(y)-2}\tpredicted: {pred-2}")
        # attention of the single query over the sequence, as integer percentages
        attn = (attn[0][0]*100).int().tolist()
        # show attention per word
        # NOTE(review): the weights are per spaCy token while the sentence is split
        # on spaces here, so the two can be misaligned -- confirm.
        print(col_vis(sentence.split(" "), attn))

<img src = >

## Analysis 

### Correctly predicted:
**Document 1**: We observe that the highest attention is on the word "success". The word "success" is a definitive part of the sentence and is the most positive one. However, we also observe that the model does not seem to notice that "himself" and "take" could be important to the sentence as well, as they indicate some negativity in the sentence.

**Document 2**: The model is able to determine that "film" and "shortcomings" are definitive parts of the document. Unlike in the first document, it is able to see that there is more than one word that should be noted.

### Incorrectly predicted:

**Document 3**: The highest attention in the document is on the word "wit". However, no attention is present on the words "dry", "suffers" and "maddeningly", which would be more definitive of the document.


**Document 4**: The word "character" was seen as the most definitive in the document. However, it can be argued that the words "isn't", "complex" or "interest" are more definitive of the document.
 
### Final Thoughts
From the analysis we can conclude the following:
* The model seems to put the most attention weight on the nouns in the document.
* In both cases, we can observe that complex or rare words are not recognized by the model as contributing to the document classification.
* A larger hyperparameter space, a larger training set and a better embedding could have improved the performance of the model.[1][2]

[1]Luong, M. T., Pham, H., & Manning, C. D. (2015). Effective approaches to attention-based neural machine translation. Conference Proceedings - EMNLP 2015: Conference on Empirical Methods in Natural Language Processing, 1412–1421. https://doi.org/10.18653/v1/d15-1166


[2]Zhang, Y., Zhong, V., Chen, D., Angeli, G., & Manning, C. D. (2017). Position-aware attention and supervised data improve slot filling. EMNLP 2017 - Conference on Empirical Methods in Natural Language Processing, Proceedings, 35–45. https://doi.org/10.18653/v1/d17-1004