## 2. Model Implementation

Running the data wrangling bit can be quite computationally intensive, and

In [1]:
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F

#load in the np data from the cleaneddata folder
final_doc_test = np.load('cleaneddata/final_doc_test.npy')
final_doc_train = np.load('cleaneddata/final_doc_train.npy')
final_qn_train = np.load('cleaneddata/final_qn_train.npy')
final_qn_test = np.load('cleaneddata/final_qn_test.npy')
tr_labels = np.load('cleaneddata/tr_labels.npy')
ts_labels = np.load('cleaneddata/ts_labels.npy')

In [2]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# check the shape of all the above
print(final_doc_test.shape, final_doc_train.shape, final_qn_train.shape, final_qn_test.shape, tr_labels.shape, ts_labels.shape)

(630, 200, 156) (2117, 200, 156) (2117, 23, 156) (630, 23, 156) (2117, 200) (630, 200)


In [4]:
# convert the numpy arrays to tensors
final_doc_test = torch.from_numpy(final_doc_test)
final_doc_train = torch.from_numpy(final_doc_train)
final_qn_train = torch.from_numpy(final_qn_train)
final_qn_test = torch.from_numpy(final_qn_test)

In [5]:
# check the shapes of the tensors
print(final_doc_test.shape, final_doc_train.shape, final_qn_train.shape, final_qn_test.shape)

torch.Size([630, 200, 156]) torch.Size([2117, 200, 156]) torch.Size([2117, 23, 156]) torch.Size([630, 23, 156])


**Input Embedding Ablation Study**

In the model input embedding Ablation study, we are given 3 variations of input embeddings to test. We will test 3 options:

1. Word2Vec only # 100 dims
2. Word2Vec + Tf-IDF # 101 dims
3. Word2Vec + all features (TF-IDF, POS, NER) # 156 dims

Since we are using tensors, we can use tensor slicing to take out the relevant features.
Our tensor of embeddings are built as follows (w2v, TF-IDF, POS, NER)


In [6]:
def convert_tensors(tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test, option=3):
    if option == 3:
        return tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test
    elif option == 1:
        tf_doc_train = tf_doc_train[:, :, :100]
        tf_doc_test = tf_doc_test[:, :, :100]
        tf_qn_train = tf_qn_train[:, :, :100]
        tf_qn_test = tf_qn_test[:, :, :100]
        return tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test
    elif option == 2:
        tf_doc_train = tf_doc_train[:, :, :101]
        tf_doc_test = tf_doc_test[:, :, :101]
        tf_qn_train = tf_qn_train[:, :, :101]
        tf_qn_test = tf_qn_test[:, :, :101]
        return tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test

In [7]:
# Create a mapping from label to index
label2index = {"N": 0, "S": 1, "I": 2, "E": 3}

# Find the maximum length of the label lists
max_len = final_doc_train.shape[1]

# Create a tensor to hold the one-hot encoded labels
train_labels = torch.zeros(
    len(tr_labels), max_len, len(label2index), device=device, dtype=torch.float32
)
test_labels = torch.zeros(
    len(ts_labels),
    max_len,
    len(label2index),
    device=device,
    dtype=torch.float32,
)

# Iterate over the label lists and one-hot encode the labels
for i, label_list in enumerate(tr_labels):
    for j, label in enumerate(label_list):
        index = label2index[label]
        # Sets all elements of the target_labels tensor at position (i,j) to 0
        train_labels[i, j] = 0
        train_labels[i, j, index] = 1

for i, label_list in enumerate(ts_labels):
    for j, label in enumerate(label_list):
        index = label2index[label]
        # Sets all elements of the target_labels tensor at position (i,j) to 0
        test_labels[i, j] = 0
        test_labels[i, j, index] = 1

In [8]:
from sklearn.utils.class_weight import compute_class_weight

# Reshape the target labels tensor
reshaped_target_labels = (
    train_labels.view(-1, 4).cpu().numpy()
)  # Assuming it's on the GPU

# Flatten the reshaped target labels
flattened_target_labels = reshaped_target_labels.argmax(axis=1)

# Calculate the class weights
class_weights = compute_class_weight(
    class_weight="balanced", classes=[0, 1, 2, 3], y=flattened_target_labels
)

# Convert the class weights to a PyTorch tensor
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

Data Preprocessing complete at this stage, we should check again the shapes of the tensors

## 3. Model Architecture

In [9]:

from torch import Tensor
from enum import Enum
from typing import Literal

In [67]:
# Architecture of the model for the Document BiLSTM

class DocumentBiRNN(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers=1,
    ):
        super(DocumentBiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )

    def forward(self, input: Tensor):
        input = input.unsqueeze(1)
        output: Tensor
        output, _ = self.lstm(input)
        # print("document output shape: ", output.shape)
        return output
    
# Architecture of the model for the Question BiLSTM

class QuestionBiRNN(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers=1,
    ):
        super(QuestionBiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )

    def forward(self, input: Tensor):
        input = input.unsqueeze(1)
        output, (hn, cn) = self.lstm(input)
        forward_hn = hn[-2, :, :]
        backward_hn = hn[-1, :, :]
        hidden = torch.cat((forward_hn, backward_hn), dim=-1).unsqueeze(0)
        # print("question hidden shape: ", hidden.shape)
        return hidden
    
# attention methods
class AttentionMethod(Enum):
    DOT_PRODUCT = "dot_product"
    SCALE_DOT_PRODUCT = "scale_dot_product"
    COSINE_SIMILARITY = "cosine_similarity"

    def __str__(self):
        return self.value
   
    
# Architecture of the model for the Attention Calculation

class Attention(nn.Module):
    def __init__(self, ques_len, hidden_size:int, attention_method: Literal[
            "dot_product",
            "scale_dot_product",
            "cosine_similarity",
        ] = "dot_product",):
        super(Attention, self).__init__()
        self.out = nn.Linear(ques_len, hidden_size)
        self.hidden_size = hidden_size
        self.attention_method = AttentionMethod(attention_method)
        
    def forward(self, document_output, question_summary):
        if self.attention_method == AttentionMethod.DOT_PRODUCT:
            document_output = document_output.permute(1, 0, 2)  # torch.Size([200, 1, 16])
            question_summary = question_summary.permute(1, 2, 0)  # torch.Size([1, 16, 1])

            attention_scores = torch.bmm(document_output, question_summary).permute(1, 0, 2)
            # get attention weights
            attention_weights = nn.functional.softmax(attention_scores, dim=1)
             #attention_scores = torch.bmm(document_output, question_summary) / np.sqrt(self.hidden_size)
            # get context vector
            context_scores = torch.bmm(document_output.permute(1, 2, 0), attention_weights).permute(0, 2, 1)
            return context_scores
        
        elif self.attention_method == AttentionMethod.SCALE_DOT_PRODUCT:
            document_output = document_output.permute(1, 0, 2)
            question_summary = question_summary.permute(1, 2, 0)
            attention_scores = torch.bmm(document_output, question_summary).permute(1,0,2) / np.sqrt(self.hidden_size)
            attention_weights = nn.functional.softmax(attention_scores, dim=1)
            context_scores = torch.bmm(document_output.permute(1, 2, 0), attention_weights).permute(0, 2, 1)
            return context_scores
        
        elif self.attention_method == AttentionMethod.COSINE_SIMILARITY:
            document_output = document_output.permute(1, 0, 2)
            question_summary = question_summary.permute(1, 2, 0)
            question_summary = question_summary.squeeze(-1)
            # cosine similarity attention:
            cos_sim = F.cosine_similarity(document_output, question_summary.unsqueeze(0), dim=-1).T.unsqueeze(1)
            attention_weights = nn.functional.softmax(cos_sim, dim=1)
            context_scores = torch.bmm(document_output.permute(1, 2, 0), attention_weights).permute(0, 2, 1)
            return context_scores


# Architecture of the model for the Attention Weighted Document Representation a.k.a ReadingComprehension
class ReadingComprehensionModel(nn.Module):
    def __init__(self, document_rnn, question_rnn, attention, hidden_size, output_size):
        super(ReadingComprehensionModel, self).__init__()
        self.document_rnn = document_rnn
        self.question_rnn = question_rnn
        self.attention = attention
        self.linear = nn.Linear(hidden_size*2, hidden_size*2)
        self.linear2 = nn.Linear(hidden_size*2, output_size)

    def predict_label(self, attention_output):
        attention_output = torch.squeeze(attention_output,1)
        # pass to linear
        pred_weights = self.linear(attention_output)
        pred_weights = self.linear2(pred_weights)
        # get the softmax
        #pred_weights = nn.functional.softmax(pred_weights, dim=1)
        return pred_weights

In [26]:
# Function to train the model

def trainIter(
    model,
    document_inputs,
    question_inputs,
    target_labels,
    num_epochs,
    criterion,
    optimizer,
):
    model.train()
    for epoch in range(num_epochs):
        loss = 0
        for document_input, question_input, target_label in zip(
            document_inputs, question_inputs, target_labels
        ):
            #optimizer.zero_grad()

            document_output = model.document_rnn(document_input)
            question_summary = model.question_rnn(question_input)

            attention_output = model.attention(document_output, question_summary)

            token_label_logits = model.predict_label(attention_output).to(device)

            #print("token label logits shape: ", token_label_logits)
            # print("target label shape: ", target_label.shape)
            # print("token label logits: ", token_label_logits)

            # print(token_label_logits[0])
            # print(target_label[0])
            # raise TypeError("stop")

            loss += criterion(token_label_logits, target_label)
            optimizer.zero_grad()
            # print(loss)

        loss.backward()
        optimizer.step()

        avg_loss = loss.item() / len(document_inputs)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [27]:
# Evalutation of the model

START_LABEL = 1
END_LABEL = 3


def evaluate(model, document_inputs, question_inputs, target_labels, criterion):
    model.eval()
    with torch.no_grad():
        loss = 0
        all_predictions = []
        all_targets = []
        for document_input, question_input, target_label in zip(
            document_inputs, question_inputs, target_labels
        ):
            document_output = model.document_rnn(document_input)
            question_summary = model.question_rnn(question_input)
            attention_output = model.attention(document_output, question_summary)
            token_label_logits = model.predict_label(attention_output).to(device)
            loss += criterion(token_label_logits, target_label)

            # print(token_label_logits)

            predictions = token_label_logits.argmax(dim=-1).cpu().numpy()
            targets = target_label.argmax(dim=-1).cpu().numpy()
            # print(predictions == 1)

            if any(targets == START_LABEL) and any(targets == END_LABEL):
                # Find indices of start and end tokens
                start_token_idx = np.where(targets == START_LABEL)[0]
                end_token_idx = np.where(targets == END_LABEL)[0]

                #print("target: ", targets[start_token_idx[0] : end_token_idx[0] + 1])
                #print(
                #    "prediction: ",
                #    predictions[start_token_idx[0] : end_token_idx[0] + 1],
                #)
                #print()

                # Take slice of predictions and target_labels for sentence tokens
                sentence_prediction = predictions[
                    start_token_idx[0] : end_token_idx[0] + 1
                ]
                sentence_target = targets[start_token_idx[0] : end_token_idx[0] + 1]

                all_predictions.extend(sentence_prediction)
                all_targets.extend(sentence_target)
            else:
                # Use the whole document since there is no answer
                all_predictions.extend(predictions)
                all_targets.extend(targets)

        # print(all_predictions)
        # print(all_targets)

        avg_loss = loss.item() / len(document_inputs)
        accuracy = accuracy_score(all_targets, all_predictions)
        precision = precision_score(all_targets, all_predictions, average="macro")
        recall = recall_score(all_targets, all_predictions, average="macro")
        f1 = f1_score(all_targets, all_predictions, average="macro")
        cr = classification_report(all_targets, all_predictions)

        print(
            f"Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}"
        )

        return cr

In [22]:
# prior to training
as_doc_train, as_doc_test, as_qn_train, as_qn_test = convert_tensors(final_doc_train, final_doc_test, final_qn_train, final_qn_test, 3)
# if not running any ablation, use the free up space by deleting np arrays:
#del final_doc_test, final_doc_train, final_qn_train, final_qn_test, tr_labels, ts_labels

In [70]:
# Start of the training

from torch import optim


def train(
    hidden_size = 64,
    epochs = 10,
    learning_rate = 0.01,
    num_layers = 1,
    token_labels = 4,
    attention_method: Literal[
            "dot_product",
            "scale_dot_product",
            "cosine_similarity",
        ] = "dot_product",
    ):

    # note the names of the tensors are changed to:
    # as_doc_train, as_doc_test, as_qn_train, as_qn_test, train_labels, test_labels are called before in the ablation part
    # to avoid confusion with the original tensors

    #as_doc_train, as_doc_test, as_qn_train, as_qn_test

    document_num_embeddings = as_doc_train.shape[2]
    question_num_embeddings = as_qn_train.shape[2]
    ques_len = as_qn_train.shape[1]

    document_rnn = DocumentBiRNN(
        hidden_size=hidden_size, input_size=document_num_embeddings, num_layers=num_layers,
    ).to(device)
    question_rnn = QuestionBiRNN(
        input_size=question_num_embeddings,
        hidden_size=hidden_size,
        num_layers=num_layers,
    ).to(device)
    attention = Attention(ques_len, hidden_size, attention_method).to(device)
    reading_comp = ReadingComprehensionModel(
        document_rnn,
        question_rnn,
        attention,
        hidden_size=hidden_size,
        output_size=token_labels,
    ).to(device)
    reading_comp_optimizer = optim.AdamW(reading_comp.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(weight=class_weights) # to account for imbalanced class weights

    trainIter(
        reading_comp,
        as_doc_train,
        as_qn_train,
        train_labels,
        epochs,
        criterion,
        reading_comp_optimizer,
    )

    return reading_comp, criterion

## 3a. Attention Ablation Study

In this section, we study 3 different type of attention mechanisms between the question model and the document model. We ensured that the 3 attention mechanisms are ran on the same model hyperparameters, so as to keep things interpretable and standardized across the study. 

The hyperparameters of the training model are as follows:
- RNN (Bi-LSTM) Hidden Size: 64,
- Number of epochs: 10,
- Learning Rate: 0.01,
- Number of RNN (Bi-LSTM) layers: 1

**Attention Ablation Study - Dot Product**

In [77]:
# Model evaluation
reading_comp_dot, criterion_dot = train(attention_method="dot_product")
train_report, test_report = evaluate(reading_comp_dot, as_doc_train, as_qn_train, train_labels, criterion_dot), evaluate(reading_comp_dot, as_doc_test, as_qn_test, test_labels, criterion_dot)

Epoch 1/10, Loss: 1.3877
Epoch 2/10, Loss: 1.2985
Epoch 3/10, Loss: 1.1640
Epoch 4/10, Loss: 1.0213
Epoch 5/10, Loss: 0.8929
Epoch 6/10, Loss: 0.8886
Epoch 7/10, Loss: 0.8762
Epoch 8/10, Loss: 0.7103
Epoch 9/10, Loss: 0.7208
Epoch 10/10, Loss: 0.6745
Loss: 0.6243, Accuracy: 0.4794, Precision: 0.2890, Recall: 0.7330, F1: 0.2368
Loss: 0.7285, Accuracy: 0.4514, Precision: 0.2805, Recall: 0.6719, F1: 0.2169


In [79]:
# Model evaluation for train and test set
print("Evaluation on train set")
print(train_report)
print('----------------------------------------------------------')
print("Evaluation on test set")
print(test_report)
print('----------------------------------------------------------')

Evaluation on train set
              precision    recall  f1-score   support

           0       0.97      0.46      0.63    260691
           1       0.03      0.90      0.05       826
           2       0.12      0.67      0.21     19947
           3       0.03      0.90      0.06       812

    accuracy                           0.48    282276
   macro avg       0.29      0.73      0.24    282276
weighted avg       0.91      0.48      0.59    282276

----------------------------------------------------------
Evaluation on test set
              precision    recall  f1-score   support

           0       0.97      0.44      0.60     80177
           1       0.02      0.85      0.05       230
           2       0.10      0.65      0.17      5295
           3       0.02      0.75      0.04       229

    accuracy                           0.45     85931
   macro avg       0.28      0.67      0.22     85931
weighted avg       0.92      0.45      0.57     85931

------------------------

**Attention Ablation Study - Scaled Dot Product**

In [75]:
# testing with scaled dot product attention
reading_comp_scaled, criterion_scaled = train(attention_method="scale_dot_product")

Epoch 1/10, Loss: 1.3930
Epoch 2/10, Loss: 1.2962
Epoch 3/10, Loss: 1.1624
Epoch 4/10, Loss: 1.0126
Epoch 5/10, Loss: 0.8761
Epoch 6/10, Loss: 0.8303
Epoch 7/10, Loss: 0.8849
Epoch 8/10, Loss: 0.7509
Epoch 9/10, Loss: 0.7617
Epoch 10/10, Loss: 0.6999


In [76]:
scaled_train_report, scaled_test_report = evaluate(reading_comp_scaled, as_doc_train, as_qn_train, train_labels, criterion_scaled), evaluate(reading_comp_scaled, as_doc_test, as_qn_test, test_labels, criterion_scaled)

# model evaluation for train and test set
print("Evaluation on train set")
print(scaled_train_report)
print('----------------------------------------------------------')
print("Evaluation on test set")
print(scaled_test_report)
print('----------------------------------------------------------')


Loss: 0.6344, Accuracy: 0.4300, Precision: 0.2885, Recall: 0.7360, F1: 0.2194
Loss: 0.7066, Accuracy: 0.4040, Precision: 0.2813, Recall: 0.6986, F1: 0.2018
Evaluation on train set
              precision    recall  f1-score   support

           0       0.98      0.41      0.58    260691
           1       0.03      0.93      0.05       826
           2       0.13      0.66      0.21     19947
           3       0.02      0.95      0.04       812

    accuracy                           0.43    282276
   macro avg       0.29      0.74      0.22    282276
weighted avg       0.92      0.43      0.55    282276

----------------------------------------------------------
Evaluation on test set
              precision    recall  f1-score   support

           0       0.98      0.39      0.55     80177
           1       0.02      0.89      0.04       230
           2       0.10      0.64      0.18      5295
           3       0.02      0.88      0.03       229

    accuracy                   

**Attention Ablation Study - Cosine Similarity**

In [73]:
# testing with cosine similarity attention
reading_comp_cosine, criterion_cosine = train(attention_method="cosine_similarity")

Epoch 1/10, Loss: 1.3923
Epoch 2/10, Loss: 1.2907
Epoch 3/10, Loss: 1.1422
Epoch 4/10, Loss: 0.9780
Epoch 5/10, Loss: 0.8600
Epoch 6/10, Loss: 0.9419
Epoch 7/10, Loss: 0.8626
Epoch 8/10, Loss: 0.8444
Epoch 9/10, Loss: 0.7202
Epoch 10/10, Loss: 0.7252


In [74]:
# model evaluation for train and test set
cosine_train_report, cosine_test_report = evaluate(reading_comp_cosine, as_doc_train, as_qn_train, train_labels, criterion_cosine), evaluate(reading_comp_cosine, as_doc_test, as_qn_test, test_labels, criterion_cosine)

print("Evaluation on train set")
print(cosine_train_report)
print('----------------------------------------------------------')
print("Evaluation on test set")
print(cosine_test_report)
print('----------------------------------------------------------')

Loss: 0.7232, Accuracy: 0.3062, Precision: 0.2883, Recall: 0.7168, F1: 0.1751
Loss: 0.8248, Accuracy: 0.2832, Precision: 0.2822, Recall: 0.6633, F1: 0.1586
Evaluation on train set
              precision    recall  f1-score   support

           0       1.00      0.26      0.42    260691
           1       0.02      0.91      0.05       826
           2       0.10      0.81      0.19     19947
           3       0.03      0.89      0.05       812

    accuracy                           0.31    282276
   macro avg       0.29      0.72      0.18    282276
weighted avg       0.93      0.31      0.40    282276

----------------------------------------------------------
Evaluation on test set
              precision    recall  f1-score   support

           0       1.00      0.25      0.39     80177
           1       0.02      0.84      0.04       230
           2       0.09      0.81      0.16      5295
           3       0.02      0.76      0.04       229

    accuracy                   

### Input Embeddings Ablation Study

The above model used the full context vector with all word embeddings taken (Word2Vec, POS, NER, TF-IDF). In this section, we want to study the results of:
1. Word2Vec Word embeddings only
2. Word2Vec + TF-IDF
3. Full vector, which we have ran the results above

As we have seen from the above study that the best attention was the dot product, we will standardise across the 3 experiments using this. Additionally, we use the same hyperparameters as above in the Attention Study.

In [82]:
# Word embeds only, important to run this as it also makes sure that the input size is correct
as_doc_train, as_doc_test, as_qn_train, as_qn_test = convert_tensors(final_doc_train, final_doc_test, final_qn_train, final_qn_test, 1)
# check the tensor sizes
print(as_doc_train.shape, as_doc_test.shape, as_qn_train.shape, as_qn_test.shape)

torch.Size([2117, 200, 100]) torch.Size([630, 200, 100]) torch.Size([2117, 23, 100]) torch.Size([630, 23, 100])


**Word2Vec Word Embeddings Ablation Study**

In [83]:
# Model evaluation for Word Embeds only
reading_comp_word, criterion_word = train(attention_method="dot_product")
word_train_report, word_test_report = evaluate(reading_comp_word, as_doc_train, as_qn_train, train_labels, criterion_word), evaluate(reading_comp_word, as_doc_test, as_qn_test, test_labels, criterion_word)

Epoch 1/10, Loss: 1.3862
Epoch 2/10, Loss: 1.2843
Epoch 3/10, Loss: 1.1349
Epoch 4/10, Loss: 0.9795
Epoch 5/10, Loss: 0.8905
Epoch 6/10, Loss: 0.9311
Epoch 7/10, Loss: 0.7805
Epoch 8/10, Loss: 0.7982
Epoch 9/10, Loss: 0.6880
Epoch 10/10, Loss: 0.6777
Loss: 0.6712, Accuracy: 0.5077, Precision: 0.2971, Recall: 0.7111, F1: 0.2562
Loss: 0.7897, Accuracy: 0.4777, Precision: 0.2871, Recall: 0.6466, F1: 0.2326


In [84]:
# Model evaluation for train and test set
print("Evaluation on train set")
print(word_train_report)
print('----------------------------------------------------------')
print("Evaluation on test set")
print(word_test_report)
print('----------------------------------------------------------')

Evaluation on train set
              precision    recall  f1-score   support

           0       0.98      0.49      0.65    260691
           1       0.03      0.89      0.06       826
           2       0.13      0.75      0.22     19947
           3       0.05      0.72      0.10       812

    accuracy                           0.51    282276
   macro avg       0.30      0.71      0.26    282276
weighted avg       0.91      0.51      0.62    282276

----------------------------------------------------------
Evaluation on test set
              precision    recall  f1-score   support

           0       0.98      0.46      0.62     80177
           1       0.03      0.81      0.05       230
           2       0.10      0.75      0.18      5295
           3       0.04      0.56      0.07       229

    accuracy                           0.48     85931
   macro avg       0.29      0.65      0.23     85931
weighted avg       0.92      0.48      0.59     85931

------------------------

**Word2Vec + TF-IDF**

In [85]:
# Word embeds + TF-IDF, important to run this as it also makes sure that the input size is correct
as_doc_train, as_doc_test, as_qn_train, as_qn_test = convert_tensors(final_doc_train, final_doc_test, final_qn_train, final_qn_test, 2)
# check the tensor sizes
print(as_doc_train.shape, as_doc_test.shape, as_qn_train.shape, as_qn_test.shape)

torch.Size([2117, 200, 101]) torch.Size([630, 200, 101]) torch.Size([2117, 23, 101]) torch.Size([630, 23, 101])


In [86]:
reading_comp_tfidf, criterion_tfidf = train(attention_method="dot_product")
tfidf_train_report, tfidf_test_report = evaluate(reading_comp_tfidf, as_doc_train, as_qn_train, train_labels, criterion_tfidf), evaluate(reading_comp_tfidf, as_doc_test, as_qn_test, test_labels, criterion_tfidf)

Epoch 1/10, Loss: 1.3894
Epoch 2/10, Loss: 1.2855
Epoch 3/10, Loss: 1.1432
Epoch 4/10, Loss: 0.9871
Epoch 5/10, Loss: 0.8744
Epoch 6/10, Loss: 0.9173
Epoch 7/10, Loss: 0.9402
Epoch 8/10, Loss: 0.9034
Epoch 9/10, Loss: 0.8007
Epoch 10/10, Loss: 0.7634
Loss: 0.7210, Accuracy: 0.4227, Precision: 0.2837, Recall: 0.6919, F1: 0.2123
Loss: 0.7827, Accuracy: 0.4018, Precision: 0.2775, Recall: 0.6506, F1: 0.1973


In [87]:
# Model evaluation for train and test set
print("Evaluation on train set")
print(tfidf_train_report)
print('----------------------------------------------------------')
print("Evaluation on test set")
print(tfidf_test_report)
print('----------------------------------------------------------')

Evaluation on train set
              precision    recall  f1-score   support

           0       0.98      0.41      0.58    260691
           1       0.01      0.94      0.03       826
           2       0.12      0.51      0.20     19947
           3       0.02      0.91      0.05       812

    accuracy                           0.42    282276
   macro avg       0.28      0.69      0.21    282276
weighted avg       0.91      0.42      0.55    282276

----------------------------------------------------------
Evaluation on test set
              precision    recall  f1-score   support

           0       0.98      0.39      0.56     80177
           1       0.01      0.91      0.02       230
           2       0.10      0.51      0.17      5295
           3       0.02      0.79      0.04       229

    accuracy                           0.40     85931
   macro avg       0.28      0.65      0.20     85931
weighted avg       0.92      0.40      0.53     85931

------------------------

### Hyperparameter Ablation Study