## 2. Model Implementation


Running the data wrangling bit can be quite computationally intensive, and


In [1]:
%pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# load in the np data from the cleaneddata folder
final_doc_test = np.load("cleaneddata/final_doc_test.npy")
final_doc_train = np.load("cleaneddata/final_doc_train.npy")
final_qn_train = np.load("cleaneddata/final_qn_train.npy")
final_qn_test = np.load("cleaneddata/final_qn_test.npy")
tr_labels = np.load("cleaneddata/tr_labels.npy")
ts_labels = np.load("cleaneddata/ts_labels.npy")

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# check the shape of all the above
print(
    final_doc_test.shape,
    final_doc_train.shape,
    final_qn_train.shape,
    final_qn_test.shape,
    tr_labels.shape,
    ts_labels.shape,
)

(630, 200, 156) (2117, 200, 156) (2117, 23, 156) (630, 23, 156) (2117, 200) (630, 200)


In [5]:
# convert the numpy arrays to tensors
final_doc_test = torch.from_numpy(final_doc_test).to(device=device, dtype=torch.float32)
final_doc_train = torch.from_numpy(final_doc_train).to(
    device=device, dtype=torch.float32
)
final_qn_train = torch.from_numpy(final_qn_train).to(device=device, dtype=torch.float32)
final_qn_test = torch.from_numpy(final_qn_test).to(device=device, dtype=torch.float32)

In [6]:
# check the shapes of the tensors
print(
    final_doc_test.shape,
    final_doc_train.shape,
    final_qn_train.shape,
    final_qn_test.shape,
)

torch.Size([630, 200, 156]) torch.Size([2117, 200, 156]) torch.Size([2117, 23, 156]) torch.Size([630, 23, 156])


**Input Embedding Ablation Study**

In the model input embedding Ablation study, we are given 3 variations of input embeddings to test. We will test 3 options:

1. Word2Vec only # 100 dims
2. Word2Vec + Tf-IDF # 101 dims
3. Word2Vec + all features (TF-IDF, POS, NER) # 156 dims

Since we are using tensors, we can use tensor slicing to take out the relevant features.
Our tensor of embeddings are built as follows (w2v, TF-IDF, POS, NER)


In [7]:
def convert_tensors(tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test, option=3):
    if option == 3:
        return tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test
    elif option == 1:
        tf_doc_train = tf_doc_train[:, :, :100]
        tf_doc_test = tf_doc_test[:, :, :100]
        tf_qn_train = tf_qn_train[:, :, :100]
        tf_qn_test = tf_qn_test[:, :, :100]
        return tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test
    elif option == 2:
        tf_doc_train = tf_doc_train[:, :, :101]
        tf_doc_test = tf_doc_test[:, :, :101]
        tf_qn_train = tf_qn_train[:, :, :101]
        tf_qn_test = tf_qn_test[:, :, :101]
        return tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test

In [8]:
# Create a mapping from label to index
label2index = {"N": 0, "S": 1, "I": 2, "E": 3}

# Find the maximum length of the label lists
max_len = final_doc_train.shape[1]

# Create a tensor to hold the one-hot encoded labels
train_labels = torch.zeros(
    len(tr_labels), max_len, len(label2index), device=device, dtype=torch.float32
)
test_labels = torch.zeros(
    len(ts_labels),
    max_len,
    len(label2index),
    device=device,
    dtype=torch.float32,
)

# Sets the first element of the third dimension of the target_labels tensor to 1
train_labels[:, :, 0] = 1
test_labels[:, :, 0] = 1

# Iterate over the label lists and one-hot encode the labels
for i, label_list in enumerate(tr_labels):
    for j, label in enumerate(label_list):
        index = label2index[label]
        # Sets all elements of the target_labels tensor at position (i,j) to 0
        train_labels[i, j] = 0
        train_labels[i, j, index] = 1

for i, label_list in enumerate(ts_labels):
    for j, label in enumerate(label_list):
        index = label2index[label]
        # Sets all elements of the target_labels tensor at position (i,j) to 0
        test_labels[i, j] = 0
        test_labels[i, j, index] = 1

In [9]:
from sklearn.utils.class_weight import compute_class_weight

# Reshape the target labels tensor
reshaped_target_labels = (
    train_labels.view(-1, 4).cpu().numpy()
)  # Assuming it's on the GPU

# Flatten the reshaped target labels
flattened_target_labels = reshaped_target_labels.argmax(axis=1)

# Calculate the class weights
class_weights = compute_class_weight(
    class_weight="balanced", classes=[0, 1, 2, 3], y=flattened_target_labels
)

# Convert the class weights to a PyTorch tensor
class_weights = torch.tensor(class_weights, dtype=torch.float32, device=device)

Data Preprocessing complete at this stage, we should check again the shapes of the tensors


## 3. Model Architecture


In [10]:
from torch import Tensor
from enum import Enum
from typing import Literal

In [11]:
# Architecture of the model for the Document BiLSTM


class DocumentBiRNN(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers=1,
    ):
        super(DocumentBiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )

    def forward(self, input: Tensor):
        input = input.unsqueeze(1)
        output: Tensor
        output, _ = self.lstm(input)
        # print("document output shape: ", output.shape)
        return output


# Architecture of the model for the Question BiLSTM


class QuestionBiRNN(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers=1,
    ):
        super(QuestionBiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )

    def forward(self, input: Tensor):
        input = input.unsqueeze(1)
        output, (hn, cn) = self.lstm(input)
        forward_hn = hn[-2, :, :]
        backward_hn = hn[-1, :, :]
        hidden = torch.cat((forward_hn, backward_hn), dim=-1).unsqueeze(0)
        # print("question hidden shape: ", hidden.shape)
        return hidden


# attention methods
class AttentionMethod(Enum):
    DOT_PRODUCT = "dot_product"
    SCALE_DOT_PRODUCT = "scale_dot_product"
    COSINE_SIMILARITY = "cosine_similarity"

    def __str__(self):
        return self.value


# Architecture of the model for the Attention Calculation


class Attention(nn.Module):
    def __init__(
        self,
        ques_len,
        hidden_size: int,
        attention_method: Literal[
            "dot_product",
            "scale_dot_product",
            "cosine_similarity",
        ] = "dot_product",
    ):
        super(Attention, self).__init__()
        self.out = nn.Linear(ques_len, hidden_size)
        self.hidden_size = hidden_size
        self.attention_method = AttentionMethod(attention_method)

    def forward(self, document_output, question_summary):
        if self.attention_method == AttentionMethod.DOT_PRODUCT:
            document_output = document_output.permute(
                1, 0, 2
            )  # torch.Size([200, 1, 16])
            question_summary = question_summary.permute(
                1, 2, 0
            )  # torch.Size([1, 16, 1])

            attention_scores = torch.bmm(document_output, question_summary).permute(
                1, 0, 2
            )
            # get attention weights
            attention_weights = nn.functional.softmax(attention_scores, dim=1)
            # attention_scores = torch.bmm(document_output, question_summary) / np.sqrt(self.hidden_size)
            # get context vector
            context_scores = torch.bmm(
                document_output.permute(1, 2, 0), attention_weights
            ).permute(0, 2, 1)
            return context_scores

        elif self.attention_method == AttentionMethod.SCALE_DOT_PRODUCT:
            document_output = document_output.permute(1, 0, 2)
            question_summary = question_summary.permute(1, 2, 0)
            attention_scores = torch.bmm(document_output, question_summary).permute(
                1, 0, 2
            ) / np.sqrt(self.hidden_size)
            attention_weights = nn.functional.softmax(attention_scores, dim=1)
            context_scores = torch.bmm(
                document_output.permute(1, 2, 0), attention_weights
            ).permute(0, 2, 1)
            return context_scores

        elif self.attention_method == AttentionMethod.COSINE_SIMILARITY:
            document_output = document_output.permute(1, 0, 2)
            question_summary = question_summary.permute(1, 2, 0)
            question_summary = question_summary.squeeze(-1)
            # cosine similarity attention:
            cos_sim = F.cosine_similarity(
                document_output, question_summary.unsqueeze(0), dim=-1
            ).T.unsqueeze(1)
            attention_weights = nn.functional.softmax(cos_sim, dim=1)
            context_scores = torch.bmm(
                document_output.permute(1, 2, 0), attention_weights
            ).permute(0, 2, 1)
            return context_scores


# Architecture of the model for the Attention Weighted Document Representation a.k.a ReadingComprehension
class ReadingComprehensionModel(nn.Module):
    def __init__(self, document_rnn, question_rnn, attention, hidden_size, output_size):
        super(ReadingComprehensionModel, self).__init__()
        self.document_rnn = document_rnn
        self.question_rnn = question_rnn
        self.attention = attention
        self.linear = nn.Linear(hidden_size * 2, hidden_size * 2)
        self.linear2 = nn.Linear(hidden_size * 2, output_size)

    def predict_label(self, attention_output):
        attention_output = torch.squeeze(attention_output, 1)
        # pass to linear
        pred_weights = self.linear(attention_output)
        pred_weights = self.linear2(pred_weights)
        # get the softmax
        # pred_weights = nn.functional.softmax(pred_weights, dim=1)
        return pred_weights

In [12]:
# Function to train the model


def trainIter(
    model,
    document_inputs,
    question_inputs,
    target_labels,
    num_epochs,
    criterion,
    optimizer,
    verbose=True,
):
    model.train()

    for epoch in range(num_epochs):
        loss = 0
        for document_input, question_input, target_label in zip(
            document_inputs, question_inputs, target_labels
        ):
            # optimizer.zero_grad()

            document_output = model.document_rnn(document_input)
            question_summary = model.question_rnn(question_input)

            attention_output = model.attention(document_output, question_summary)

            token_label_logits = model.predict_label(attention_output).to(device)

            # print("token label logits shape: ", token_label_logits)
            # print("target label shape: ", target_label.shape)
            # print("token label logits: ", token_label_logits)

            # print(token_label_logits[0])
            # print(target_label[0])
            # raise TypeError("stop")

            loss += criterion(token_label_logits, target_label)
            optimizer.zero_grad()
            # print(loss)

        loss.backward()
        optimizer.step()

        avg_loss = loss.item() / len(document_inputs)

        if verbose:
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

In [13]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)

In [14]:
# Evalutation of the model

START_LABEL = 1
END_LABEL = 3


def evaluate(
    model,
    document_inputs,
    question_inputs,
    target_labels,
    criterion,
    output_dict=False,
    verbose=True,
):
    model.eval()
    with torch.no_grad():
        loss = 0
        all_predictions = []
        all_targets = []
        for document_input, question_input, target_label in zip(
            document_inputs, question_inputs, target_labels
        ):
            document_output = model.document_rnn(document_input)
            question_summary = model.question_rnn(question_input)
            attention_output = model.attention(document_output, question_summary)
            token_label_logits = model.predict_label(attention_output).to(device)
            loss += criterion(token_label_logits, target_label)

            # print(token_label_logits)

            predictions = token_label_logits.argmax(dim=-1).cpu().numpy()
            targets = target_label.argmax(dim=-1).cpu().numpy()
            # print(predictions == 1)

            if any(targets == START_LABEL) and any(targets == END_LABEL):
                # Find indices of start and end tokens
                start_token_idx = np.where(targets == START_LABEL)[0]
                end_token_idx = np.where(targets == END_LABEL)[0]

                # print("target: ", targets[start_token_idx[0] : end_token_idx[0] + 1])
                # print(
                #    "prediction: ",
                #    predictions[start_token_idx[0] : end_token_idx[0] + 1],
                # )
                # print()

                # Take slice of predictions and target_labels for sentence tokens
                sentence_prediction = predictions[
                    start_token_idx[0] : end_token_idx[0] + 1
                ]
                sentence_target = targets[start_token_idx[0] : end_token_idx[0] + 1]

                all_predictions.extend(sentence_prediction)
                all_targets.extend(sentence_target)
            else:
                # Use the whole document since there is no answer
                all_predictions.extend(predictions)
                all_targets.extend(targets)

        # print(all_predictions)
        # print(all_targets)

        avg_loss = loss.item() / len(document_inputs)
        accuracy = accuracy_score(all_targets, all_predictions)
        precision = precision_score(all_targets, all_predictions, average="macro")
        recall = recall_score(all_targets, all_predictions, average="macro")
        f1 = f1_score(all_targets, all_predictions, average="macro")
        cr = classification_report(
            all_targets, all_predictions, output_dict=output_dict
        )

        if verbose:
            print(
                f"Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}"
            )

        return cr

In [15]:
# prior to training
as_doc_train, as_doc_test, as_qn_train, as_qn_test = convert_tensors(
    final_doc_train, final_doc_test, final_qn_train, final_qn_test, 3
)
# if not running any ablation, use the free up space by deleting np arrays:
# del final_doc_test, final_doc_train, final_qn_train, final_qn_test, tr_labels, ts_labels

In [16]:
# Start of the training

from torch import optim


def train(
    as_doc_train=as_doc_train,
    as_qn_train=as_qn_train,
    train_labels=train_labels,
    hidden_size=64,
    epochs=10,
    learning_rate=0.01,
    num_layers=1,
    token_labels=4,
    attention_method: Literal[
        "dot_product",
        "scale_dot_product",
        "cosine_similarity",
    ] = "dot_product",
    verbose=True,
):
    # note the names of the tensors are changed to:
    # as_doc_train, as_doc_test, as_qn_train, as_qn_test, train_labels, test_labels are called before in the ablation part
    # to avoid confusion with the original tensors

    # as_doc_train, as_doc_test, as_qn_train, as_qn_test

    document_num_embeddings = as_doc_train.shape[2]
    question_num_embeddings = as_qn_train.shape[2]
    ques_len = as_qn_train.shape[1]

    document_rnn = DocumentBiRNN(
        hidden_size=hidden_size,
        input_size=document_num_embeddings,
        num_layers=num_layers,
    ).to(device)
    question_rnn = QuestionBiRNN(
        input_size=question_num_embeddings,
        hidden_size=hidden_size,
        num_layers=num_layers,
    ).to(device)
    attention = Attention(ques_len, hidden_size, attention_method).to(device)
    reading_comp = ReadingComprehensionModel(
        document_rnn,
        question_rnn,
        attention,
        hidden_size=hidden_size,
        output_size=token_labels,
    ).to(device)
    reading_comp_optimizer = optim.AdamW(reading_comp.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(
        weight=class_weights
    )  # to account for imbalanced class weights

    trainIter(
        reading_comp,
        as_doc_train,
        as_qn_train,
        train_labels,
        epochs,
        criterion,
        reading_comp_optimizer,
        verbose=verbose,
    )

    return reading_comp, criterion

In [26]:
import os

model_path = "./pytorch/dot_product_model.pt"

if os.path.exists(model_path):
    reading_comp_dot = torch.load(model_path)
    criterion_dot = nn.CrossEntropyLoss(
        weight=class_weights
    )  # to account for imbalanced class weights
else:
    reading_comp_dot, criterion_dot = train(attention_method="dot_product")
    torch.save(reading_comp_dot, model_path)

train_report, test_report = evaluate(
    reading_comp_dot, as_doc_train, as_qn_train, train_labels, criterion_dot
), evaluate(reading_comp_dot, as_doc_test, as_qn_test, test_labels, criterion_dot)

Loss: 0.6519, Accuracy: 0.3150, Precision: 0.2869, Recall: 0.7228, F1: 0.1774
Loss: 0.7326, Accuracy: 0.2916, Precision: 0.2810, Recall: 0.6869, F1: 0.1615


## 3a. Attention Ablation Study

In this section, we study 3 different type of attention mechanisms between the question model and the document model. We ensured that the 3 attention mechanisms are ran on the same model hyperparameters, so as to keep things interpretable and standardized across the study.

The hyperparameters of the training model are as follows:

-   RNN (Bi-LSTM) Hidden Size: 64,
-   Number of epochs: 10,
-   Learning Rate: 0.01,
-   Number of RNN (Bi-LSTM) layers: 1


**Attention Ablation Study - Dot Product**


In [None]:
# Model evaluation
reading_comp_dot, criterion_dot = train(attention_method="dot_product")
train_report, test_report = evaluate(
    reading_comp_dot, as_doc_train, as_qn_train, train_labels, criterion_dot
), evaluate(reading_comp_dot, as_doc_test, as_qn_test, test_labels, criterion_dot)

Epoch 1/10, Loss: 1.3877
Epoch 2/10, Loss: 1.2985
Epoch 3/10, Loss: 1.1640
Epoch 4/10, Loss: 1.0213
Epoch 5/10, Loss: 0.8929
Epoch 6/10, Loss: 0.8886
Epoch 7/10, Loss: 0.8762
Epoch 8/10, Loss: 0.7103
Epoch 9/10, Loss: 0.7208
Epoch 10/10, Loss: 0.6745
Loss: 0.6243, Accuracy: 0.4794, Precision: 0.2890, Recall: 0.7330, F1: 0.2368
Loss: 0.7285, Accuracy: 0.4514, Precision: 0.2805, Recall: 0.6719, F1: 0.2169


In [None]:
# Model evaluation for train and test set
print("Evaluation on train set")
print(train_report)
print("----------------------------------------------------------")
print("Evaluation on test set")
print(test_report)
print("----------------------------------------------------------")

Evaluation on train set
              precision    recall  f1-score   support

           0       0.97      0.46      0.63    260691
           1       0.03      0.90      0.05       826
           2       0.12      0.67      0.21     19947
           3       0.03      0.90      0.06       812

    accuracy                           0.48    282276
   macro avg       0.29      0.73      0.24    282276
weighted avg       0.91      0.48      0.59    282276

----------------------------------------------------------
Evaluation on test set
              precision    recall  f1-score   support

           0       0.97      0.44      0.60     80177
           1       0.02      0.85      0.05       230
           2       0.10      0.65      0.17      5295
           3       0.02      0.75      0.04       229

    accuracy                           0.45     85931
   macro avg       0.28      0.67      0.22     85931
weighted avg       0.92      0.45      0.57     85931

------------------------

**Attention Ablation Study - Scaled Dot Product**


In [None]:
# testing with scaled dot product attention
reading_comp_scaled, criterion_scaled = train(attention_method="scale_dot_product")

Epoch 1/10, Loss: 1.3930
Epoch 2/10, Loss: 1.2962
Epoch 3/10, Loss: 1.1624
Epoch 4/10, Loss: 1.0126
Epoch 5/10, Loss: 0.8761
Epoch 6/10, Loss: 0.8303
Epoch 7/10, Loss: 0.8849
Epoch 8/10, Loss: 0.7509
Epoch 9/10, Loss: 0.7617
Epoch 10/10, Loss: 0.6999


In [None]:
scaled_train_report, scaled_test_report = evaluate(
    reading_comp_scaled, as_doc_train, as_qn_train, train_labels, criterion_scaled
), evaluate(reading_comp_scaled, as_doc_test, as_qn_test, test_labels, criterion_scaled)

# model evaluation for train and test set
print("Evaluation on train set")
print(scaled_train_report)
print("----------------------------------------------------------")
print("Evaluation on test set")
print(scaled_test_report)
print("----------------------------------------------------------")

Loss: 0.6344, Accuracy: 0.4300, Precision: 0.2885, Recall: 0.7360, F1: 0.2194
Loss: 0.7066, Accuracy: 0.4040, Precision: 0.2813, Recall: 0.6986, F1: 0.2018
Evaluation on train set
              precision    recall  f1-score   support

           0       0.98      0.41      0.58    260691
           1       0.03      0.93      0.05       826
           2       0.13      0.66      0.21     19947
           3       0.02      0.95      0.04       812

    accuracy                           0.43    282276
   macro avg       0.29      0.74      0.22    282276
weighted avg       0.92      0.43      0.55    282276

----------------------------------------------------------
Evaluation on test set
              precision    recall  f1-score   support

           0       0.98      0.39      0.55     80177
           1       0.02      0.89      0.04       230
           2       0.10      0.64      0.18      5295
           3       0.02      0.88      0.03       229

    accuracy                   

**Attention Ablation Study - Cosine Similarity**


In [None]:
# testing with cosine similarity attention
reading_comp_cosine, criterion_cosine = train(attention_method="cosine_similarity")

Epoch 1/10, Loss: 1.3923
Epoch 2/10, Loss: 1.2907
Epoch 3/10, Loss: 1.1422
Epoch 4/10, Loss: 0.9780
Epoch 5/10, Loss: 0.8600
Epoch 6/10, Loss: 0.9419
Epoch 7/10, Loss: 0.8626
Epoch 8/10, Loss: 0.8444
Epoch 9/10, Loss: 0.7202
Epoch 10/10, Loss: 0.7252


In [None]:
# model evaluation for train and test set
cosine_train_report, cosine_test_report = evaluate(
    reading_comp_cosine, as_doc_train, as_qn_train, train_labels, criterion_cosine
), evaluate(reading_comp_cosine, as_doc_test, as_qn_test, test_labels, criterion_cosine)

print("Evaluation on train set")
print(cosine_train_report)
print("----------------------------------------------------------")
print("Evaluation on test set")
print(cosine_test_report)
print("----------------------------------------------------------")

Loss: 0.7232, Accuracy: 0.3062, Precision: 0.2883, Recall: 0.7168, F1: 0.1751
Loss: 0.8248, Accuracy: 0.2832, Precision: 0.2822, Recall: 0.6633, F1: 0.1586
Evaluation on train set
              precision    recall  f1-score   support

           0       1.00      0.26      0.42    260691
           1       0.02      0.91      0.05       826
           2       0.10      0.81      0.19     19947
           3       0.03      0.89      0.05       812

    accuracy                           0.31    282276
   macro avg       0.29      0.72      0.18    282276
weighted avg       0.93      0.31      0.40    282276

----------------------------------------------------------
Evaluation on test set
              precision    recall  f1-score   support

           0       1.00      0.25      0.39     80177
           1       0.02      0.84      0.04       230
           2       0.09      0.81      0.16      5295
           3       0.02      0.76      0.04       229

    accuracy                   

### Input Embeddings Ablation Study

The above model used the full context vector with all word embeddings taken (Word2Vec, POS, NER, TF-IDF). In this section, we want to study the results of:

1. Word2Vec Word embeddings only
2. Word2Vec + TF-IDF
3. Full vector, which we have ran the results above


In [None]:
# Word embeds only
as_doc_train, as_doc_test, as_qn_train, as_qn_test = convert_tensors(
    final_doc_train, final_doc_test, final_qn_train, final_qn_test, 1
)

In [None]:
# Model evaluation for train and test set
print("Evaluation on train set")
print(train_report)
print("----------------------------------------------------------")
print("Evaluation on test set")
print(test_report)
print("----------------------------------------------------------")

### Hyperparameter Ablation Study


In [17]:
import optuna

In [18]:
from sklearn.model_selection import KFold


def objective(trial):
    # Define the hyperparameters to search over
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)

    # Create a KFold object for cross-validation
    kf = KFold(n_splits=3)

    # Initialize a list to store the cross-validation scores
    cv_scores = []

    # Perform cross-validation
    for train_index, val_index in kf.split(as_doc_train):
        # Split the data into training and validation sets
        doc_train, doc_val = as_doc_train[train_index], as_doc_train[val_index]
        que_train, que_val = as_qn_train[train_index], as_qn_train[val_index]
        label_train, label_val = train_labels[train_index], train_labels[val_index]

        # Train the model on the training set
        reading_comp_dot, criterion_dot = train(
            doc_train,
            que_train,
            label_train,
            attention_method="dot_product",
            learning_rate=learning_rate,
            verbose=False,
        )

        # Evaluate the model on the validation set
        val_report = evaluate(
            reading_comp_dot,
            doc_val,
            que_val,
            label_val,
            criterion_dot,
            output_dict=True,
            verbose=False,
        )
        # Store the validation score
        cv_scores.append(val_report["macro avg"]["f1-score"])
    print(f"Learning rate: {learning_rate:.4f}, F1: {np.mean(cv_scores):.4f}")

    # Return the average cross-validation score
    return np.mean(cv_scores)


# Create a study object and optimize the objective function
study_name = "reading_comprehension"  # Unique identifier of the study.
storage_name = f"sqlite:///./optuna/{study_name}.db"
sampler = optuna.samplers.GridSampler({"learning_rate": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]})
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
    direction="maximize",
    sampler=sampler,
)
study.optimize(objective, n_trials=5, n_jobs=2, show_progress_bar=True)

[32m[I 2023-05-20 02:10:09,784][0m A new study created in RDB with name: reading_comprehension[0m
  self._init_valid()


  0%|          | 0/5 [00:00<?, ?it/s]

Learning rate: 0.0000, F1: 0.1789
Learning rate: 0.0001, F1: 0.0726
[32m[I 2023-05-20 02:18:45,065][0m Trial 1 finished with value: 0.17893999001974925 and parameters: {'learning_rate': 1e-05}. Best is trial 1 with value: 0.17893999001974925.[0m
[32m[I 2023-05-20 02:18:45,109][0m Trial 0 finished with value: 0.0726473386810972 and parameters: {'learning_rate': 0.0001}. Best is trial 1 with value: 0.17893999001974925.[0m
Learning rate: 0.1000, F1: 0.2102
Learning rate: 0.0100, F1: 0.2531
[32m[I 2023-05-20 02:28:15,891][0m Trial 2 finished with value: 0.21015808025039706 and parameters: {'learning_rate': 0.1}. Best is trial 2 with value: 0.21015808025039706.[0m
[32m[I 2023-05-20 02:28:15,900][0m Trial 3 finished with value: 0.253149620219665 and parameters: {'learning_rate': 0.01}. Best is trial 3 with value: 0.253149620219665.[0m
Learning rate: 0.0010, F1: 0.1692
[32m[I 2023-05-20 02:33:19,426][0m Trial 4 finished with value: 0.16921768934114767 and parameters: {'learning_

In [44]:
# %pip install optuna-dashboard
# %optuna-dashboard sqlite:///db.sqlite3

Collecting optuna-dashboard
  Downloading optuna_dashboard-0.9.2-py3-none-any.whl (4.4 MB)
     ---------------------------------------- 4.4/4.4 MB 1.8 MB/s eta 0:00:00
Collecting bottle
  Downloading bottle-0.12.25-py3-none-any.whl (90 kB)
     ---------------------------------------- 90.2/90.2 kB 5.0 MB/s eta 0:00:00
Installing collected packages: bottle, optuna-dashboard
Successfully installed bottle-0.12.25 optuna-dashboard-0.9.2
Note: you may need to restart the kernel to use updated packages.


UsageError: Line magic function `%optuna-dashboard` not found.


In [19]:
trial = study.best_trial
print("Best Score: ", trial.value)
print("Best Params: ")
for key, value in trial.params.items():
    print("  {}: {}".format(key, value))

Best Score:  0.253149620219665
Best Params: 
  learning_rate: 0.01


In [57]:
# %pip install plotly
# %pip install optuna-dashboard
# %optuna-dashboard sqlite:///./optuna/reading_comprehension.db

Note: you may need to restart the kernel to use updated packages.


UsageError: Line magic function `%optuna-dashboard` not found.


In [24]:
# Get a DataFrame containing the results of all trials
df = study.trials_dataframe()

# Display the DataFrame
print(df)

   number     value             datetime_start          datetime_complete  \
0       0  0.072647 2023-05-20 02:10:09.848122 2023-05-20 02:18:45.060281   
1       1  0.178940 2023-05-20 02:10:09.864671 2023-05-20 02:18:45.028565   
2       2  0.210158 2023-05-20 02:18:45.119037 2023-05-20 02:28:15.830017   
3       3  0.253150 2023-05-20 02:18:45.209923 2023-05-20 02:28:15.856555   
4       4  0.169218 2023-05-20 02:28:15.935802 2023-05-20 02:33:19.404487   

                duration  params_learning_rate  system_attrs_grid_id  \
0 0 days 00:08:35.212159               0.00010                     1   
1 0 days 00:08:35.163894               0.00001                     0   
2 0 days 00:09:30.710980               0.10000                     4   
3 0 days 00:09:30.646632               0.01000                     3   
4 0 days 00:05:03.468685               0.00100                     2   

                           system_attrs_search_space     state  
0  {'learning_rate': [1e-05, 0.0001, 0.

In [20]:
optuna.visualization.plot_optimization_history(study)

In [22]:
optuna.visualization.plot_slice(study)

In [23]:
optuna.visualization.plot_edf(study)