## 2. Model Implementation


Running the data wrangling bit can be quite computationally intensive, and


In [20]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# load in the np data from the cleaneddata folder
final_doc_test = np.load("cleaneddata/final_doc_test.npy")
final_doc_train = np.load("cleaneddata/final_doc_train.npy")
final_qn_train = np.load("cleaneddata/final_qn_train.npy")
final_qn_test = np.load("cleaneddata/final_qn_test.npy")
tr_labels = np.load("cleaneddata/tr_labels.npy")
ts_labels = np.load("cleaneddata/ts_labels.npy")

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
# check the shape of all the above
print(
    final_doc_test.shape,
    final_doc_train.shape,
    final_qn_train.shape,
    final_qn_test.shape,
    tr_labels.shape,
    ts_labels.shape,
)

(630, 200, 156) (2117, 200, 156) (2117, 23, 156) (630, 23, 156) (2117, 200) (630, 200)


In [23]:
# convert the numpy arrays to tensors
final_doc_test = torch.from_numpy(final_doc_test).to(device=device, dtype=torch.float32)
final_doc_train = torch.from_numpy(final_doc_train).to(
    device=device, dtype=torch.float32
)
final_qn_train = torch.from_numpy(final_qn_train).to(device=device, dtype=torch.float32)
final_qn_test = torch.from_numpy(final_qn_test).to(device=device, dtype=torch.float32)

In [24]:
# check the shapes of the tensors
print(
    final_doc_test.shape,
    final_doc_train.shape,
    final_qn_train.shape,
    final_qn_test.shape,
)

torch.Size([630, 200, 156]) torch.Size([2117, 200, 156]) torch.Size([2117, 23, 156]) torch.Size([630, 23, 156])


**Input Embedding Ablation Study**

In the model input embedding Ablation study, we are given 3 variations of input embeddings to test. We will test 3 options:

1. Word2Vec only # 100 dims
2. Word2Vec + Tf-IDF # 101 dims
3. Word2Vec + all features (TF-IDF, POS, NER) # 156 dims

Since we are using tensors, we can use tensor slicing to take out the relevant features.
Our tensor of embeddings are built as follows (w2v, TF-IDF, POS, NER)


In [25]:
def convert_tensors(tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test, option=3):
    if option == 3:
        return tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test
    elif option == 1:
        tf_doc_train = tf_doc_train[:, :, :100]
        tf_doc_test = tf_doc_test[:, :, :100]
        tf_qn_train = tf_qn_train[:, :, :100]
        tf_qn_test = tf_qn_test[:, :, :100]
        return tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test
    elif option == 2:
        tf_doc_train = tf_doc_train[:, :, :101]
        tf_doc_test = tf_doc_test[:, :, :101]
        tf_qn_train = tf_qn_train[:, :, :101]
        tf_qn_test = tf_qn_test[:, :, :101]
        return tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test

In [87]:
# Create a mapping from label to index
label2index = {"N": 0, "S": 1, "I": 2, "E": 3}

# Find the maximum length of the label lists
max_len = final_doc_train.shape[1]

# Create a tensor to hold the one-hot encoded labels
train_labels = torch.zeros(
    len(tr_labels), max_len, len(label2index), device=device, dtype=torch.float32
)
test_labels = torch.zeros(
    len(ts_labels),
    max_len,
    len(label2index),
    device=device,
    dtype=torch.float32,
)

# Sets the first element of the third dimension of the target_labels tensor to 1
train_labels[:, :, 0] = 1
test_labels[:, :, 0] = 1

# Iterate over the label lists and one-hot encode the labels
for i, label_list in enumerate(tr_labels):
    for j, label in enumerate(label_list):
        index = label2index[label]
        # Sets all elements of the target_labels tensor at position (i,j) to 0
        train_labels[i, j] = 0
        train_labels[i, j, index] = 1

for i, label_list in enumerate(ts_labels):
    for j, label in enumerate(label_list):
        index = label2index[label]
        # Sets all elements of the target_labels tensor at position (i,j) to 0
        test_labels[i, j] = 0
        test_labels[i, j, index] = 1

In [88]:
from sklearn.utils.class_weight import compute_class_weight

# Reshape the target labels tensor
reshaped_target_labels = (
    train_labels.view(-1, 4).cpu().numpy()
)  # Assuming it's on the GPU

# Flatten the reshaped target labels
flattened_target_labels = reshaped_target_labels.argmax(axis=1)

# Calculate the class weights
class_weights = compute_class_weight(
    class_weight="balanced", classes=[0, 1, 2, 3], y=flattened_target_labels
)

# Convert the class weights to a PyTorch tensor
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

Data Preprocessing complete at this stage, we should check again the shapes of the tensors


In [89]:
# prior to training
as_doc_train, as_doc_test, as_qn_train, as_qn_test = convert_tensors(
    final_doc_train, final_doc_test, final_qn_train, final_qn_test, 3
)
# if not running any ablation, use the free up space by deleting np arrays:
# del final_doc_test, final_doc_train, final_qn_train, final_qn_test, tr_labels, ts_labels

In [90]:
# check final tensor shapes of all the tensors, note we use ablation study tensors here, change option to 3 if using full tensors, or 2 if using 101
print(
    as_doc_train.shape,
    as_doc_test.shape,
    as_qn_train.shape,
    as_qn_test.shape,
    train_labels.shape,
    test_labels.shape,
)

torch.Size([2117, 200, 156]) torch.Size([630, 200, 156]) torch.Size([2117, 23, 156]) torch.Size([630, 23, 156]) torch.Size([2117, 200, 4]) torch.Size([630, 200, 4])


## 3. Model Architecture


In [91]:
from torch import Tensor
from enum import Enum
from typing import Literal

In [126]:
# Architecture of the model for the Document BiLSTM


class DocumentBiRNN(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers=1,
    ):
        super(DocumentBiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )

    def forward(self, input: Tensor):
        input = input.unsqueeze(1)
        output: Tensor
        output, _ = self.lstm(input)
        # print("document output shape: ", output.shape)
        return output


# Architecture of the model for the Question BiLSTM


class QuestionBiRNN(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers=1,
    ):
        super(QuestionBiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )

    def forward(self, input: Tensor):
        input = input.unsqueeze(1)
        output, (hn, cn) = self.lstm(input)
        forward_hn = hn[-2, :, :]
        backward_hn = hn[-1, :, :]
        hidden = torch.cat((forward_hn, backward_hn), dim=-1).unsqueeze(0)
        # print("question hidden shape: ", hidden.shape)
        return hidden


# Architecture of the model for the Attention Calculation


class AttentionMethod(Enum):
    DOT_PRODUCT = "dot_product"
    SCALE_DOT_PRODUCT = "scale_dot_product"
    COSINE_SIMILARITY = "cosine_similarity"

    def __str__(self):
        return self.value


class Attention(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        attention_method: Literal[
            "dot_product",
            "scale_dot_product",
            "cosine_similarity",
        ] = "dot_product",
    ):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.attention_method = AttentionMethod(attention_method)

    def forward(self, document_output, question_summary):
        if self.attention_method == AttentionMethod.DOT_PRODUCT:
            document_output = document_output.permute(
                1, 0, 2
            )  # torch.Size([200, 1, 16])
            question_summary = question_summary.permute(
                1, 2, 0
            )  # torch.Size([1, 1, 16])
            # [1, 200, 16], [1, 16, 1] -> [1, 200, 1]
            attention_scores = torch.bmm(document_output, question_summary).squeeze(0)
            # print("attention scores: ", attention_scores)
            # print("attention scores shape: ", attention_scores.shape)
            return attention_scores

        elif self.attention_method == AttentionMethod.COSINE_SIMILARITY:
            cosine_similarity = nn.CosineSimilarity(dim=-1)
            # torch.Size([200, 16])
            document_output = document_output.squeeze(1)
            # torch.Size([16])
            question_summary = question_summary.squeeze(0).squeeze(0)

            # [200]
            attention_scores = cosine_similarity(
                document_output, question_summary
            ).unsqueeze(-1)

            return attention_scores

        else:
            document_output = document_output.permute(
                1, 0, 2
            )  # torch.Size([200, 1, 16])
            question_summary = question_summary.permute(
                1, 2, 0
            )  # torch.Size([1, 1, 16])
            # [1, 200, 16], [1, 16, 1] -> [1, 200, 1]
            attention_scores = torch.bmm(document_output, question_summary) / np.sqrt(
                self.hidden_size
            )
            attention_scores = attention_scores.squeeze(0)
            # print("attention scores: ", attention_scores)
            # print("attention scores shape: ", attention_scores.shape)
            return attention_scores


# Architecture of the model for the Attention Weighted Document Representation a.k.a ReadingComprehension


class ReadingComprehensionModel(nn.Module):
    def __init__(self, document_rnn, question_rnn, attention, hidden_size, output_size):
        super(ReadingComprehensionModel, self).__init__()
        self.document_rnn = document_rnn
        self.question_rnn = question_rnn
        self.attention = attention
        self.linear = nn.Linear(1, output_size)

    def predict_label(self, attention_output):
        pred = self.linear(attention_output)
        # print("prediction shape: ", pred.shape)
        pred_weights = nn.functional.softmax(pred, dim=1)
        # print("prediction weights shape: ", pred_weights.shape)
        # shape of the context vector: (batch_size, 1, hidden_size)
        return pred_weights

In [93]:
# Function to train the model


def trainIter(
    model,
    document_inputs,
    question_inputs,
    target_labels,
    num_epochs,
    criterion,
    optimizer,
):
    model.train()
    for epoch in range(num_epochs):
        loss = 0
        for document_input, question_input, target_label in zip(
            document_inputs, question_inputs, target_labels
        ):
            optimizer.zero_grad()

            document_output = model.document_rnn(document_input)
            question_summary = model.question_rnn(question_input)

            attention_output = model.attention(document_output, question_summary)

            token_label_logits = model.predict_label(attention_output).to(device)

            # print("token label logits shape: ", token_label_logits.shape)
            # print("target label shape: ", target_label.shape)
            # print("token label logits: ", token_label_logits)

            # print(token_label_logits[0])
            # print(target_label[0])
            # raise TypeError("stop")

            loss += criterion(token_label_logits, target_label)
            # print(loss)

        loss.backward()
        optimizer.step()

        avg_loss = loss.item() / len(document_inputs)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

In [94]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)

In [95]:
# Evalutation of the model

START_LABEL = 1
END_LABEL = 3


def evaluate(model, document_inputs, question_inputs, target_labels, criterion):
    model.eval()
    with torch.no_grad():
        loss = 0
        all_predictions = []
        all_targets = []
        for document_input, question_input, target_label in zip(
            document_inputs, question_inputs, target_labels
        ):
            document_output = model.document_rnn(document_input)
            question_summary = model.question_rnn(question_input)
            attention_output = model.attention(document_output, question_summary)
            token_label_logits = model.predict_label(attention_output).to(device)
            loss += criterion(token_label_logits, target_label)

            # print(token_label_logits)

            predictions = token_label_logits.argmax(dim=-1).cpu().numpy()
            targets = target_label.argmax(dim=-1).cpu().numpy()
            # print(predictions == 1)

            if any(targets == START_LABEL) and any(targets == END_LABEL):
                # Find indices of start and end tokens
                start_token_idx = np.where(targets == START_LABEL)[0]
                end_token_idx = np.where(targets == END_LABEL)[0]

                # print("target: ", targets[start_token_idx[0] : end_token_idx[0] + 1])
                # print(
                #     "prediction: ",
                #     predictions[start_token_idx[0] : end_token_idx[0] + 1],
                # )
                # print()

                # Take slice of predictions and target_labels for sentence tokens
                sentence_prediction = predictions[
                    start_token_idx[0] : end_token_idx[0] + 1
                ]
                sentence_target = targets[start_token_idx[0] : end_token_idx[0] + 1]

                all_predictions.extend(sentence_prediction)
                all_targets.extend(sentence_target)

            else:
                # Use the whole document since there is no answer
                all_predictions.extend(predictions)
                all_targets.extend(targets)

        # print(all_predictions)
        # print(all_targets)

        # avg_loss = loss.item() / len(document_inputs)
        # accuracy = accuracy_score(all_targets, all_predictions)
        # precision = precision_score(all_targets, all_predictions, average="macro")
        # recall = recall_score(all_targets, all_predictions, average="macro")
        # f1 = f1_score(all_targets, all_predictions, average="macro")
        cr = classification_report(all_targets, all_predictions)

        # print(
        #     f"Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}"
        # )

        return cr

In [123]:
from sklearn.utils.class_weight import compute_class_weight

# Reshape the target labels tensor
reshaped_target_labels = (
    train_labels.view(-1, 4).cpu().numpy()
)  # Assuming it's on the GPU

# Flatten the reshaped target labels
flattened_target_labels = reshaped_target_labels.argmax(axis=1)


# Calculate the class weights
class_weights = compute_class_weight(
    class_weight="balanced", classes=[0, 1, 2, 3], y=flattened_target_labels
)

# Convert the class weights to a PyTorch tensor
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

In [124]:
# Start of the training

from torch import optim


def train(
    hidden_size=128,
    epochs=10,
    learning_rate=0.05,
    num_layers=1,
    token_labels=4,
    attention_method: Literal[
        "dot_product",
        "scale_dot_product",
        "cosine_similarity",
    ] = "dot_product",
):
    # note the names of the tensors are changed to:
    # as_doc_train, as_doc_test, as_qn_train, as_qn_test, train_labels, test_labels are called before in the ablation part
    # to avoid confusion with the original tensors

    # as_doc_train, as_doc_test, as_qn_train, as_qn_test

    document_num_embeddings = as_doc_train.shape[2]
    question_num_embeddings = as_qn_train.shape[2]

    document_rnn = DocumentBiRNN(
        hidden_size=hidden_size,
        input_size=document_num_embeddings,
        num_layers=num_layers,
    ).to(device)
    question_rnn = QuestionBiRNN(
        input_size=question_num_embeddings,
        hidden_size=hidden_size,
        num_layers=num_layers,
    ).to(device)
    attention = Attention(hidden_size, attention_method).to(device)
    reading_comp = ReadingComprehensionModel(
        document_rnn,
        question_rnn,
        attention,
        hidden_size=hidden_size,
        output_size=token_labels,
    ).to(device)
    reading_comp_optimizer = optim.AdamW(reading_comp.parameters(), lr=learning_rate)

    # class_weights = torch.tensor([1.0, 20.0, 10.0, 20.0], dtype=torch.float32).to(
    #     device
    # )

    criterion = nn.CrossEntropyLoss(weight=class_weights)

    # # Create TensorDatasets from your data tensors
    # dataset = TensorDataset(tf_final_doc_train, tf_final_qn_train)

    # # Define batch size and create DataLoader
    # batch_size = 32
    # dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # note the names of the tensors are changed to:
    # as_doc_train, as_doc_test, as_qn_train, as_qn_test, train_labels, test_labels

    trainIter(
        reading_comp,
        as_doc_train,
        as_qn_train,
        train_labels,
        epochs,
        criterion,
        reading_comp_optimizer,
    )

    return reading_comp, criterion

In [None]:
print(class_weights)

In [121]:
reading_comp_dot, criterion_dot = train(attention_method="dot_product")

# Model evaluation dot product # ignore if rerun
train_report_dot, test_report_dot = evaluate(
    reading_comp_dot, as_doc_train, as_qn_train, train_labels, criterion_dot
), evaluate(reading_comp_dot, as_doc_test, as_qn_test, test_labels, criterion_dot)

# Model evaluation for train and test set
print("Evaluation on train set")
print(train_report_dot)
print("----------------------------------------------------------")
print("Evaluation on test set")
print(test_report_dot)
print("----------------------------------------------------------")

Epoch 1/10, Loss: 2.0819
Epoch 2/10, Loss: 1.6655
Epoch 3/10, Loss: 1.6621
Epoch 4/10, Loss: 1.6621
Epoch 5/10, Loss: 1.6621
Epoch 6/10, Loss: 1.6621
Epoch 7/10, Loss: 1.6621
Epoch 8/10, Loss: 1.6621
Epoch 9/10, Loss: 1.6621
Epoch 10/10, Loss: 1.6621


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluation on train set
              precision    recall  f1-score   support

           0       0.92      1.00      0.96    260691
           1       0.00      0.00      0.00       826
           2       0.00      0.00      0.00     19947
           3       0.00      0.00      0.00       812

    accuracy                           0.92    282276
   macro avg       0.23      0.25      0.24    282276
weighted avg       0.85      0.92      0.89    282276

----------------------------------------------------------
Evaluation on test set
              precision    recall  f1-score   support

           0       0.93      1.00      0.97     80177
           1       0.00      0.00      0.00       230
           2       0.00      0.00      0.00      5295
           3       0.00      0.00      0.00       229

    accuracy                           0.93     85931
   macro avg       0.23      0.25      0.24     85931
weighted avg       0.87      0.93      0.90     85931

------------------------

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [128]:
reading_comp_dot, criterion_dot = train(attention_method="cosine_similarity")

# Model evaluation dot product # ignore if rerun
train_report_dot, test_report_dot = evaluate(
    reading_comp_dot, as_doc_train, as_qn_train, train_labels, criterion_dot
), evaluate(reading_comp_dot, as_doc_test, as_qn_test, test_labels, criterion_dot)

# Model evaluation for train and test set
print("Evaluation on train set")
print(train_report_dot)
print("----------------------------------------------------------")
print("Evaluation on test set")
print(test_report_dot)
print("----------------------------------------------------------")

Epoch 1/10, Loss: 1.3886
Epoch 2/10, Loss: 1.3828
Epoch 3/10, Loss: 1.3835
Epoch 4/10, Loss: 1.3813
Epoch 5/10, Loss: 1.3758
Epoch 6/10, Loss: 1.3686
Epoch 7/10, Loss: 1.3637
Epoch 8/10, Loss: 1.3588
Epoch 9/10, Loss: 1.3528
Epoch 10/10, Loss: 1.3473


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluation on train set
              precision    recall  f1-score   support

           0       0.00      0.00      0.00    260691
           1       0.00      0.00      0.00       826
           2       0.12      0.80      0.21     19947
           3       0.00      0.88      0.01       812

    accuracy                           0.06    282276
   macro avg       0.03      0.42      0.05    282276
weighted avg       0.01      0.06      0.01    282276

----------------------------------------------------------
Evaluation on test set
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     80177
           1       0.00      0.00      0.00       230
           2       0.10      0.79      0.17      5295
           3       0.00      0.81      0.01       229

    accuracy                           0.05     85931
   macro avg       0.03      0.40      0.05     85931
weighted avg       0.01      0.05      0.01     85931

------------------------

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [115]:
reading_comp_dot, criterion_dot = train(attention_method="dot_product", num_layers=2)

# Model evaluation dot product # ignore if rerun
train_report_dot, test_report_dot = evaluate(
    reading_comp_dot, as_doc_train, as_qn_train, train_labels, criterion_dot
), evaluate(reading_comp_dot, as_doc_test, as_qn_test, test_labels, criterion_dot)

# Model evaluation for train and test set
print("Evaluation on train set")
print(train_report_dot)
print("----------------------------------------------------------")
print("Evaluation on test set")
print(test_report_dot)
print("----------------------------------------------------------")

Epoch 1/10, Loss: 1.3908
Epoch 2/10, Loss: 1.4861
Epoch 3/10, Loss: 1.4936


KeyboardInterrupt: 

In [108]:
reading_comp_scale, criterion_scale = train(attention_method="scale_dot_product")

# Model evaluation dot product # ignore if rerun
train_report_scale, test_report_scale = evaluate(
    reading_comp_scale, as_doc_train, as_qn_train, train_labels, criterion_scale
), evaluate(reading_comp_scale, as_doc_test, as_qn_test, test_labels, criterion_scale)

# Model evaluation for train and test set
print("Evaluation on train set")
print(train_report_scale)
print("----------------------------------------------------------")
print("Evaluation on test set")
print(test_report_scale)
print("----------------------------------------------------------")

Epoch 1/10, Loss: 1.3931
Epoch 2/10, Loss: 1.3705
Epoch 3/10, Loss: 1.3809
Epoch 4/10, Loss: 1.3576
Epoch 5/10, Loss: 1.3389
Epoch 6/10, Loss: 1.3212
Epoch 7/10, Loss: 1.3019
Epoch 8/10, Loss: 1.2871
Epoch 9/10, Loss: 1.2681
Epoch 10/10, Loss: 1.2517


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluation on train set
              precision    recall  f1-score   support

           0       0.95      0.69      0.80    260691
           1       0.01      0.92      0.02       826
           2       0.11      0.14      0.12     19947
           3       0.00      0.00      0.00       812

    accuracy                           0.65    282276
   macro avg       0.27      0.44      0.24    282276
weighted avg       0.88      0.65      0.74    282276

----------------------------------------------------------
Evaluation on test set
              precision    recall  f1-score   support

           0       0.95      0.67      0.79     80177
           1       0.01      0.88      0.02       230
           2       0.08      0.14      0.10      5295
           3       0.00      0.00      0.00       229

    accuracy                           0.64     85931
   macro avg       0.26      0.42      0.23     85931
weighted avg       0.89      0.64      0.74     85931

------------------------

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
