In [42]:
import matplotlib

# Anti-Grain Geometry: Essential for clusters without screens (X11 forwarding)
matplotlib.use('Agg')

In [43]:
import os
import re
import time
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split, Subset
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification, BertModel, get_linear_schedule_with_warmup


In [2]:
seed = 551
np.random.seed(seed)
torch.manual_seed(seed)

# Cluster Optimization: Determine workers based on CPU count, capped at 4 for safety
NUM_WORKERS = min(4, os.cpu_count() if os.cpu_count() else 0)

In [3]:
def getDevice():
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = torch.device("mps")

    print(f"Using device: {device}")
    return device

In [4]:
DEVICE = getDevice()
PIN_MEMORY = True if str(DEVICE) == "cuda" else False

Using device: cuda


## Task 1: Acquire and Pre-process the Web of Science Dataset

In [5]:
DATA_DIR = "WebOfScienceDataset/WOS11967"
GLOVE_PATH = "glove.6B/glove.6B.300d.txt"

MAX_SEQ_LEN = 200
EMBED_DIM = 300
MAX_VOCAB = 10000
BATCH_SIZE = 32

In [6]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9 ]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def load_data(data_dir):
    with open(os.path.join(data_dir, 'X.txt'), 'r', encoding='utf-8') as f:
        texts = [clean_text(line) for line in f.readlines()]

    with open(os.path.join(data_dir, 'Y.txt'), 'r', encoding='utf-8') as f:
        y_sub = [int(line.strip()) for line in f.readlines()]

    with open(os.path.join(data_dir, 'YL1.txt'), 'r', encoding='utf-8') as f:
        y_domain = [int(line.strip()) for line in f.readlines()]

    return texts, y_sub, y_domain


def get_dynamic_max_len(texts, percentile=95):
    lengths = [len(t.split()) for t in texts]
    limit = int(np.percentile(lengths, percentile))
    print(f"95th percentile length is {limit}. Mean is {int(np.mean(lengths))}.")
    return limit

In [7]:
def build_vocab(texts, max_words=MAX_VOCAB, min_freq=2):
    word_counts = Counter()
    for text in texts:
        word_counts.update(text.split())

    vocab = {"<PAD>": 0, "<UNK>": 1}

    for word, count in word_counts.most_common(max_words - 2):
        if count >= min_freq:
            vocab[word] = len(vocab)

    return vocab


def encode_texts(texts, vocab, max_len=MAX_SEQ_LEN):
    tensor_data = []
    unk_idx = vocab["<UNK>"]
    pad_idx = vocab["<PAD>"]

    for text in texts:
        tokens = text.split()
        seq = [vocab.get(t, unk_idx) for t in tokens]

        if len(seq) < max_len:
            seq = seq + [pad_idx] * (max_len - len(seq))
        else:
            seq = seq[:max_len]

        tensor_data.append(seq)

    return torch.tensor(tensor_data, dtype=torch.long)


def load_glove_matrix(path, vocab, embed_dim=EMBED_DIM):
    weights = np.random.uniform(-0.25, 0.25, (len(vocab), embed_dim))

    if "<PAD>" in vocab:
        weights[vocab["<PAD>"]] = 0

    hits = 0
    # Optimization: Using a set for O(1) lookups
    vocab_set = set(vocab.keys())

    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]

            if word in vocab_set:
                vector = np.array(values[1:], dtype=float)
                if len(vector) == embed_dim:
                    weights[vocab[word]] = vector
                    hits += 1

    print(f"GloVe loaded. Found {hits} / {len(vocab)} words.")
    return torch.tensor(weights, dtype=torch.float32)

In [8]:
def prepare_data():
    texts, y_sub, y_domain = load_data(DATA_DIR)
    dynamic_max_len = get_dynamic_max_len(texts)
    vocab = build_vocab(texts)

    X_tensor = encode_texts(texts, vocab, max_len=dynamic_max_len)
    Y_sub_tensor = torch.tensor(y_sub, dtype=torch.long)
    Y_domain_tensor = torch.tensor(y_domain, dtype=torch.long)
    embedding_weights = load_glove_matrix(GLOVE_PATH, vocab)

    dataset = TensorDataset(X_tensor, Y_domain_tensor, Y_sub_tensor)

    test_size = int(0.2 * len(dataset))
    remaining_size = len(dataset) - test_size
    val_size = int(0.2 * remaining_size)
    train_size = remaining_size - val_size

    train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

    # Cluster Optimization: workers and pinning
    train_ldr = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS,
                           pin_memory=PIN_MEMORY)
    val_ldr = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS,
                         pin_memory=PIN_MEMORY)
    test_ldr = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS,
                          pin_memory=PIN_MEMORY)

    return train_ldr, val_ldr, test_ldr, embedding_weights, vocab, train_dataset, val_dataset, test_dataset


## Task 2: Implement LSTM and BERT models

In [37]:
class CustomLSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size, init_type):
        super(CustomLSTMCell, self).__init__()
        self.hidden_size = hidden_size
        self.weight_ih = nn.Linear(input_size, 4 * hidden_size)
        self.weight_hh = nn.Linear(hidden_size, 4 * hidden_size)

        if init_type == 'xavier':
            nn.init.xavier_uniform_(self.weight_ih.weight)
            nn.init.xavier_uniform_(self.weight_hh.weight)
        elif init_type == 'zero':
            nn.init.zeros_(self.weight_ih.weight)
            nn.init.zeros_(self.weight_hh.weight)
        elif init_type == 'random':
            nn.init.normal_(self.weight_ih.weight, mean=0, std=0.01)
            nn.init.normal_(self.weight_hh.weight, mean=0, std=0.01)

    def forward(self, x, state):
        # type: (Tensor, Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tensor]
        h_prev, c_prev = state
        gates = self.weight_ih(x) + self.weight_hh(h_prev)
        i_gate, f_gate, g_gate, o_gate = gates.chunk(4, 1)

        i = torch.sigmoid(i_gate)
        f = torch.sigmoid(f_gate)
        g = torch.tanh(g_gate)
        o = torch.sigmoid(o_gate)

        c_next = (f * c_prev) + (i * g)
        h_next = o * torch.tanh(c_next)

        return h_next, c_next

In [38]:
class JittedLSTMLayer(nn.Module):
    """
    Wraps the cell and the loop so the ENTIRE loop is compiled to C++.
    This eliminates the Python interpreter overhead during the sequence loop.
    """
    def __init__(self, cell):
        super().__init__()
        self.cell = cell

    def forward(self, x_emb):
        # type: (Tensor) -> Tensor
        # We must explicitly type inputs for JIT to work
        batch_size = x_emb.size(0)
        seq_len = x_emb.size(1)

        # Initialize states on the correct device
        h_t = torch.zeros(batch_size, self.cell.hidden_size, device=x_emb.device)
        c_t = torch.zeros(batch_size, self.cell.hidden_size, device=x_emb.device)

        # This loop now runs in C++!
        for t in range(seq_len):
            x_t = x_emb[:, t, :]
            h_t, c_t = self.cell(x_t, (h_t, c_t))
            
        return h_t

In [39]:
# Optimization: JIT Script compiles this class to C++, speeding up the manual loop significantly
class CustomLSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, output_dim, hidden_size=512, embedding_matrix=None, dropout=0.5, init_type='xavier'):
        super(CustomLSTMModel, self).__init__()
        self.device = getDevice()
        self.hidden_size = hidden_size
        self.output_dim = output_dim

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        if embedding_matrix is not None:
            self.embedding.weight = nn.Parameter(embedding_matrix)

        cell = CustomLSTMCell(embed_dim, hidden_size, init_type)
        self.lstm_layer = torch.jit.script(JittedLSTMLayer(cell))

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, output_dim)
        self.to(self.device)

    def forward(self, x):
        # Embed
        x_emb = self.embedding(x)
        
        # Fast C++ Loop
        h_t = self.lstm_layer(x_emb)

        # Classify
        out = self.dropout(h_t)
        logits = self.fc(out)
        return logits

    def fit(self, train_ldr, val_ldr, epochs=10, lr=0.001, l1_lambda=0.0, weight_decay=0.0):
        optimizer = optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = nn.CrossEntropyLoss()

        history = {"train_loss": [], "val_loss": [], "val_acc": []}
        print(f"\ntraining LSTM (output dim: {self.output_dim}) for {epochs} epochs")

        for epoch in range(epochs):
            self.train()
            total_loss = 0

            for X_batch, Y_domain, Y_sub in train_ldr:
                X_batch = X_batch.to(self.device)
                target = Y_domain.to(self.device) if self.output_dim == 7 else Y_sub.to(self.device)

                optimizer.zero_grad()
                outputs = self(X_batch)
                loss = criterion(outputs, target)

                if l1_lambda > 0:
                    l1_norm = sum(p.abs().sum() for p in self.parameters())
                    loss += l1_lambda * l1_norm

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.parameters(), 1.0)
                optimizer.step()
                total_loss += loss.item()

            val_acc = self.evaluate_acc(val_ldr)
            val_loss = self.evaluate_loss(val_ldr, criterion)

            history["val_acc"].append(val_acc)
            history["val_loss"].append(val_loss)
            print(f"epoch {epoch + 1}/{epochs} | loss: {total_loss / len(train_ldr):.4f} | val acc: {val_acc:.2f}%")

        return history

    def predict(self, X):
        self.eval()
        with torch.no_grad():
            X = X.to(self.device)
            outputs = self(X)
            predictions = torch.argmax(outputs, dim=1)
        return predictions

    def evaluate_acc(self, data_loader):
        self.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for X_batch, Y_domain, Y_sub in data_loader:
                X_batch = X_batch.to(self.device)
                target = Y_domain.to(self.device) if self.output_dim == 7 else Y_sub.to(self.device)
                preds = self.predict(X_batch)
                correct += (preds == target).sum().item()
                total += target.size(0)
        return 100 * correct / total

    def evaluate_loss(self, data_loader, criterion):
        self.eval()
        total_loss = 0
        total = 0
        with torch.no_grad():
            for X_batch, Y_domain, Y_sub in data_loader:
                X_batch = X_batch.to(self.device)
                target = Y_domain.to(self.device) if self.output_dim == 7 else Y_sub.to(self.device)
                outputs = self(X_batch)
                loss = criterion(outputs, target)
                total_loss += loss.item() * target.size(0)
                total += target.size(0)
        return total_loss / total

In [12]:
class BERTClassifier(nn.Module):
    def __init__(self, output_dim, model_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        self.device = getDevice()
        self.bert = BertModel.from_pretrained(model_name, output_attentions=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.bert.config.hidden_size, output_dim)
        self.to(self.device)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooler_output = outputs.pooler_output
        pooled_output = self.dropout(pooler_output)
        logits = self.fc(pooled_output)
        return logits

    def get_attention_maps(self, input_ids, attention_mask):
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids.to(self.device),
                                attention_mask=attention_mask.to(self.device))
        return outputs.attentions

    def fit(self, train_ldr, val_ldr, lr=2e-5, epochs=3, patience=2, weight_decay=0.0, l1_lambda=0.0):
        optimizer = optim.AdamW(self.parameters(), lr=lr, weight_decay=weight_decay)
        total_steps = len(train_ldr) * epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        criterion = nn.CrossEntropyLoss()

        history = {"val_acc": []}
        best_val_loss = float('inf')
        patience_counter = 0

        print(f"starting BERT fine-tuning")

        for epoch in range(epochs):
            self.train()
            total_train_loss = 0
            correct_train = 0
            total_samples = 0
            start_time = time.time()

            for batch in train_ldr:
                b_input_ids = batch[0].to(self.device)
                b_input_mask = batch[1].to(self.device)
                b_labels = batch[2].to(self.device)

                self.zero_grad()
                logits = self.forward(b_input_ids, b_input_mask)
                loss = criterion(logits, b_labels)

                if l1_lambda > 0:
                    l1_norm = sum(p.abs().sum() for p in self.parameters())
                    loss += l1_lambda * l1_norm

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

                total_train_loss += loss.item() * b_input_ids.size(0)
                preds = torch.argmax(logits, dim=1)
                correct_train += (preds == b_labels).sum().item()
                total_samples += b_input_ids.size(0)

            avg_train_loss = total_train_loss / total_samples
            train_acc = correct_train / total_samples
            val_loss, val_acc = self.evaluate(val_ldr, criterion)

            epoch_time = time.time() - start_time
            print(
                f"epoch {epoch + 1}/{epochs} [{epoch_time:.1f}s] | train loss: {avg_train_loss:.4f} acc: {train_acc:.4f} | val loss: {val_loss:.4f} acc: {val_acc:.4f}")

            history["val_acc"].append(val_acc)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(self.state_dict(), "best_bert_model.pth")
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break

        self.load_state_dict(torch.load("best_bert_model.pth"))
        return history

    def evaluate(self, val_ldr, criterion):
        self.eval()
        total_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_ldr:
                b_input_ids = batch[0].to(self.device)
                b_input_mask = batch[1].to(self.device)
                b_labels = batch[2].to(self.device)

                logits = self.forward(b_input_ids, b_input_mask)
                loss = criterion(logits, b_labels)

                total_loss += loss.item() * b_input_ids.size(0)
                preds = torch.argmax(logits, dim=1)
                correct += (preds == b_labels).sum().item()
                total += b_input_ids.size(0)

        return total_loss / total, correct / total

## Task 3: Run experiments

In [40]:
def evaluate_lstm(train_ds, val_ds, vocab_size, output_dim, batch_size=BATCH_SIZE, embedding_matrix=None, epochs=10,
                  hidden_size=512, dropout=0.5, l1_lambda=0.0, weight_decay=0.0, lr=0.001, init_type='xavier'):
    # Optim: Workers and pinning
    train_ldr = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS,
                           pin_memory=PIN_MEMORY)
    val_ldr = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)

    model = CustomLSTMModel(vocab_size, embedding_matrix.shape[1], output_dim, hidden_size=hidden_size, dropout=dropout,
                            embedding_matrix=embedding_matrix, init_type=init_type)

    history = model.fit(train_ldr, val_ldr, lr=lr, epochs=epochs, l1_lambda=l1_lambda, weight_decay=weight_decay)
    return max(history.get("val_acc", [0]))


In [14]:
def get_bert_test_data(all_texts, all_labels, test_indices, max_length=256):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    test_texts = [all_texts[i] for i in test_indices]
    test_labels = [all_labels[i] for i in test_indices]

    test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt')
    test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'],
                                 torch.tensor(test_labels))
    # Optim: Workers and pinning
    test_ldr = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
    return test_ldr, test_texts, test_labels, tokenizer

In [15]:
def plot_combined_results(values, model_1_scores, model_2_scores, title, xlabel, label_model_1="LSTM Val Acc",
                          label_model_2="BERT Val Acc", save_dir="plots"):
    os.makedirs(save_dir, exist_ok=True)
    fig, ax = plt.subplots()
    ax.plot(values, model_1_scores, label=label_model_1, marker='o')
    ax.plot(values, model_2_scores, label=label_model_2, marker='x')
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Validation Accuracy")
    ax.set_title(title)
    ax.legend()
    ax.grid(True)

    clean_title = title.replace(" ", "_").replace(":", "")
    filename = f"{clean_title}.png"
    filepath = os.path.join(save_dir, filename)
    fig.savefig(filepath, bbox_inches='tight')
    plt.close(fig)
    print(f"saved plot to: {filepath}")

In [35]:
def plot_single_result(values, scores, title, xlabel, ylabel="Validation Accuracy", save_dir="plots"):
    os.makedirs(save_dir, exist_ok=True)
    fig, ax = plt.subplots()

    ax.plot(values, scores, marker='o', label='LSTM')

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True)

    clean_title = title.replace(" ", "_").replace(":", "")
    filename = f"{clean_title}.png"
    filepath = os.path.join(save_dir, filename)

    fig.savefig(filepath, bbox_inches='tight')
    plt.close(fig)
    print(f"saved plot to: {filepath}")

In [16]:
print("loading data")
train_loader, val_loader, test_loader, embedding_matrix, vocab, train_ds, val_ds, test_ds = prepare_data()

# reload raw texts for BERT
all_texts, y_sub_raw, y_domain_raw = load_data(DATA_DIR)

vocab_size = len(vocab)
output_dim_domain = 7
output_dim_sub = 33

train_indices = train_ds.indices
val_indices = val_ds.indices
test_indices = test_ds.indices

loading data
95th percentile length is 333. Mean is 201.
GloVe loaded. Found 9853 / 10000 words.


In [17]:
# Pre-tokenize BERT inputs once to save time in loops
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_train_texts = [all_texts[i] for i in train_indices]
bert_val_texts = [all_texts[i] for i in val_indices]

print("Tokenizing BERT data...")
bert_train_enc = bert_tokenizer(bert_train_texts, truncation=True, padding=True, max_length=256, return_tensors='pt')
bert_val_enc = bert_tokenizer(bert_val_texts, truncation=True, padding=True, max_length=256, return_tensors='pt')

bert_sub_train_labels = torch.tensor([y_sub_raw[i] for i in train_indices])
bert_sub_val_labels = torch.tensor([y_sub_raw[i] for i in val_indices])
bert_domain_train_labels = torch.tensor([y_domain_raw[i] for i in train_indices])
bert_domain_val_labels = torch.tensor([y_domain_raw[i] for i in val_indices])

Tokenizing BERT data...


In [18]:
# 1) test dropout
dropout_list = [0.5, 0.1]
lstm_res_domain = []
lstm_res_sub = []

for d in dropout_list:
    print(f"testing dropout: {d}")
    lstm_res_domain.append(
        evaluate_lstm(train_ds, val_ds, vocab_size, output_dim_domain, embedding_matrix=embedding_matrix, dropout=d))
    lstm_res_sub.append(
        evaluate_lstm(train_ds, val_ds, vocab_size, output_dim_sub, embedding_matrix=embedding_matrix, dropout=d,
                      epochs=20))

plot_combined_results(dropout_list, lstm_res_domain, lstm_res_sub, "Effect of Dropout", "Dropout Rate")

BEST_DROPOUT_LSTM_DOMAIN = dropout_list[np.argmax(lstm_res_domain)]
BEST_DROPOUT_LSTM_SUB = dropout_list[np.argmax(lstm_res_sub)]

testing dropout: 0.5
Using device: cuda

training LSTM (output dim: 7) for 10 epochs
epoch 1/10 | loss: 1.9308 | val acc: 17.19%
epoch 2/10 | loss: 1.7173 | val acc: 29.47%
epoch 3/10 | loss: 1.9004 | val acc: 23.51%
epoch 4/10 | loss: 1.8514 | val acc: 26.70%
epoch 5/10 | loss: 1.6863 | val acc: 28.06%
epoch 6/10 | loss: 1.6920 | val acc: 29.41%
epoch 7/10 | loss: 1.3997 | val acc: 51.83%
epoch 8/10 | loss: 0.9659 | val acc: 73.41%
epoch 9/10 | loss: 0.7064 | val acc: 78.94%
epoch 10/10 | loss: 0.4460 | val acc: 84.54%
Using device: cuda

training LSTM (output dim: 33) for 20 epochs
epoch 1/20 | loss: 3.4918 | val acc: 3.40%
epoch 2/20 | loss: 3.4725 | val acc: 3.45%
epoch 3/20 | loss: 3.3910 | val acc: 2.93%
epoch 4/20 | loss: 3.4797 | val acc: 5.49%
epoch 5/20 | loss: 3.1792 | val acc: 7.89%
epoch 6/20 | loss: 2.9105 | val acc: 9.72%
epoch 7/20 | loss: 2.7374 | val acc: 10.76%
epoch 8/20 | loss: 2.5957 | val acc: 12.75%
epoch 9/20 | loss: 2.4416 | val acc: 16.14%
epoch 10/20 | loss:

In [19]:
# 2) test learning rate
lr_list = [1e-3, 1e-4]
lstm_res_domain_lr = []
lstm_res_sub_lr = []

for lr in lr_list:
    print(f"testing learning rate: {lr}")
    lstm_res_domain_lr.append(
        evaluate_lstm(train_ds, val_ds, vocab_size, output_dim_domain, embedding_matrix=embedding_matrix,
                      dropout=BEST_DROPOUT_LSTM_DOMAIN, lr=lr))
    lstm_res_sub_lr.append(
        evaluate_lstm(train_ds, val_ds, vocab_size, output_dim_sub, embedding_matrix=embedding_matrix,
                      dropout=BEST_DROPOUT_LSTM_SUB, lr=lr, epochs=20))

plot_combined_results(lr_list, lstm_res_domain_lr, lstm_res_sub_lr, "Effect of Learning Rate", "Learning Rate")

BEST_LR_LSTM_DOMAIN = lr_list[np.argmax(lstm_res_domain_lr)]
BEST_LR_LSTM_SUB = lr_list[np.argmax(lstm_res_sub_lr)]

testing learning rate: 0.001
Using device: cuda

training LSTM (output dim: 7) for 10 epochs
epoch 1/10 | loss: 1.9401 | val acc: 18.13%
epoch 2/10 | loss: 1.9555 | val acc: 18.50%
epoch 3/10 | loss: 1.7543 | val acc: 27.85%
epoch 4/10 | loss: 1.5714 | val acc: 34.80%
epoch 5/10 | loss: 1.3190 | val acc: 54.13%
epoch 6/10 | loss: 0.9941 | val acc: 68.03%
epoch 7/10 | loss: 0.6345 | val acc: 77.90%
epoch 8/10 | loss: 0.3339 | val acc: 83.86%
epoch 9/10 | loss: 0.2041 | val acc: 88.14%
epoch 10/10 | loss: 0.1299 | val acc: 89.18%
Using device: cuda

training LSTM (output dim: 33) for 20 epochs
epoch 1/20 | loss: 3.4899 | val acc: 4.44%
epoch 2/20 | loss: 3.4681 | val acc: 3.76%
epoch 3/20 | loss: 3.4197 | val acc: 5.54%
epoch 4/20 | loss: 3.3842 | val acc: 5.80%
epoch 5/20 | loss: 3.3866 | val acc: 4.49%
epoch 6/20 | loss: 3.2220 | val acc: 5.07%
epoch 7/20 | loss: 2.9917 | val acc: 7.78%
epoch 8/20 | loss: 2.9477 | val acc: 11.39%
epoch 9/20 | loss: 2.8991 | val acc: 8.36%
epoch 10/20 |

In [20]:
# 3) test hidden size
hidden_size_list = [256, 512]
lstm_res_domain_hidden = []
lstm_res_sub_hidden = []

for hs in hidden_size_list:
    print(f"testing hidden size: {hs}")
    lstm_res_domain_hidden.append(
        evaluate_lstm(train_ds, val_ds, vocab_size, output_dim_domain, embedding_matrix=embedding_matrix,
                      dropout=BEST_DROPOUT_LSTM_DOMAIN, lr=BEST_LR_LSTM_DOMAIN, hidden_size=hs))
    lstm_res_sub_hidden.append(
        evaluate_lstm(train_ds, val_ds, vocab_size, output_dim_sub, embedding_matrix=embedding_matrix,
                      dropout=BEST_DROPOUT_LSTM_SUB, lr=BEST_LR_LSTM_SUB, hidden_size=hs, epochs=20))

plot_combined_results(hidden_size_list, lstm_res_domain_hidden, lstm_res_sub_hidden, "Effect of Hidden Size",
                      "Hidden Size")

BEST_HS_LSTM_DOMAIN = hidden_size_list[np.argmax(lstm_res_domain_hidden)]
BEST_HS_LSTM_SUB = hidden_size_list[np.argmax(lstm_res_sub_hidden)]

testing hidden size: 256
Using device: cuda

training LSTM (output dim: 7) for 10 epochs
epoch 1/10 | loss: 1.9142 | val acc: 27.22%
epoch 2/10 | loss: 1.8457 | val acc: 28.11%
epoch 3/10 | loss: 1.6159 | val acc: 23.41%
epoch 4/10 | loss: 1.6962 | val acc: 30.72%
epoch 5/10 | loss: 1.3851 | val acc: 54.02%
epoch 6/10 | loss: 0.9582 | val acc: 66.67%
epoch 7/10 | loss: 0.7153 | val acc: 68.91%
epoch 8/10 | loss: 0.5049 | val acc: 77.64%
epoch 9/10 | loss: 0.3702 | val acc: 79.36%
epoch 10/10 | loss: 0.2644 | val acc: 85.95%
Using device: cuda

training LSTM (output dim: 33) for 20 epochs
epoch 1/20 | loss: 3.4889 | val acc: 3.61%
epoch 2/20 | loss: 3.4601 | val acc: 5.64%
epoch 3/20 | loss: 3.3935 | val acc: 3.87%
epoch 4/20 | loss: 3.2428 | val acc: 7.26%
epoch 5/20 | loss: 3.1559 | val acc: 7.42%
epoch 6/20 | loss: 3.0550 | val acc: 8.46%
epoch 7/20 | loss: 2.9720 | val acc: 11.86%
epoch 8/20 | loss: 2.7380 | val acc: 13.74%
epoch 9/20 | loss: 2.6605 | val acc: 15.52%
epoch 10/20 | l

In [41]:
# 4) test init type for LSTM
init_types = ['xavier', 'random', 'zero']
init_results = []

for hs in init_types:
    print(f"testing initialization type: {hs}")

    l_acc_domain = evaluate_lstm(
        train_ds, val_ds, vocab_size, output_dim_domain,
        embedding_matrix=embedding_matrix,
        dropout=BEST_DROPOUT_LSTM_DOMAIN,
        lr=BEST_LR_LSTM_DOMAIN,
        hidden_size=BEST_HS_LSTM_DOMAIN,
    )
    init_results.append(l_acc_domain)

plot_single_result(init_types, init_results, "Effect of Initialization", "Init Type")

testing initialization type: xavier
Using device: cuda

training LSTM (output dim: 7) for 10 epochs
epoch 1/10 | loss: 1.9276 | val acc: 19.54%
epoch 2/10 | loss: 1.7110 | val acc: 29.41%
epoch 3/10 | loss: 1.7796 | val acc: 18.91%
epoch 4/10 | loss: 1.5704 | val acc: 32.92%
epoch 5/10 | loss: 1.1343 | val acc: 45.82%
epoch 6/10 | loss: 0.7884 | val acc: 74.19%
epoch 7/10 | loss: 0.4282 | val acc: 84.54%
epoch 8/10 | loss: 0.2420 | val acc: 88.14%
epoch 9/10 | loss: 0.1471 | val acc: 88.14%
epoch 10/10 | loss: 0.0986 | val acc: 89.24%
testing initialization type: random
Using device: cuda

training LSTM (output dim: 7) for 10 epochs
epoch 1/10 | loss: 1.9405 | val acc: 16.88%
epoch 2/10 | loss: 1.8959 | val acc: 27.38%
epoch 3/10 | loss: 1.6271 | val acc: 28.63%
epoch 4/10 | loss: 1.6080 | val acc: 28.16%
epoch 5/10 | loss: 1.5728 | val acc: 25.97%
epoch 6/10 | loss: 1.4489 | val acc: 30.77%
epoch 7/10 | loss: 1.4709 | val acc: 30.77%
epoch 8/10 | loss: 1.4904 | val acc: 42.58%
epoch 9

In [34]:
# 6) Embedding Dimension Comparison (50d vs 100d vs 200d vs 300d)
print("GloVe dimension comparison")
GLOVE_DIR = "glove.6B"
glove_dims = [50, 100, 200, 300]
dim_results = []

for dim in glove_dims:
    filename = f"glove.6B.{dim}d.txt"
    path = os.path.join(GLOVE_DIR, filename)

    print(f"\ntesting GloVe dimension: {dim}d")

    # load specific matrix for this dimension
    # pass 'dim' so the parser knows the vector size
    current_matrix = load_glove_matrix(path, vocab, embed_dim=dim)

    acc = evaluate_lstm(
        train_ds, val_ds, vocab_size, output_dim_domain,
        embedding_matrix=current_matrix,
        dropout=BEST_DROPOUT_LSTM_DOMAIN,
        lr=BEST_LR_LSTM_DOMAIN,
        hidden_size=BEST_HS_LSTM_DOMAIN,
        epochs=10
    )
    dim_results.append(acc)

GloVe dimension comparison

testing GloVe dimension: 50d
GloVe loaded. Found 9853 / 10000 words.
Using device: cuda

training LSTM (output dim: 7) for 10 epochs
epoch 1/10 | loss: 1.9299 | val acc: 16.61%
epoch 2/10 | loss: 1.9168 | val acc: 27.38%
epoch 3/10 | loss: 1.6502 | val acc: 30.25%
epoch 4/10 | loss: 1.8772 | val acc: 13.58%
epoch 5/10 | loss: 1.8021 | val acc: 23.09%
epoch 6/10 | loss: 1.5975 | val acc: 35.84%
epoch 7/10 | loss: 1.3907 | val acc: 46.39%
epoch 8/10 | loss: 1.1344 | val acc: 53.92%
epoch 9/10 | loss: 0.9322 | val acc: 64.79%
epoch 10/10 | loss: 0.7123 | val acc: 72.05%

testing GloVe dimension: 100d
GloVe loaded. Found 9853 / 10000 words.
Using device: cuda

training LSTM (output dim: 7) for 10 epochs
epoch 1/10 | loss: 1.9281 | val acc: 17.92%
epoch 2/10 | loss: 1.8269 | val acc: 29.05%
epoch 3/10 | loss: 1.7592 | val acc: 18.08%
epoch 4/10 | loss: 1.9046 | val acc: 16.98%
epoch 5/10 | loss: 1.8366 | val acc: 14.99%
epoch 6/10 | loss: 1.3199 | val acc: 56.17%

In [36]:
dim_labels = [str(d) for d in glove_dims]
plot_single_result(dim_labels, dim_results, "Effect of Embedding Dimension", "GloVe Dimension")

saved plot to: plots/Effect_of_Embedding_Dimension.png


In [21]:
# Clear VRAM before heavy lifting
torch.cuda.empty_cache()

In [22]:
# 1) LSTM Domain
print("\n--- EXP 1/4: Custom LSTM (Domain) ---")
lstm_domain = CustomLSTMModel(vocab_size, EMBED_DIM, output_dim_domain, hidden_size=BEST_HS_LSTM_DOMAIN,
                              embedding_matrix=embedding_matrix, dropout=BEST_DROPOUT_LSTM_DOMAIN)
lstm_domain.fit(train_loader, val_loader, epochs=10)
test_acc_domain_lstm = lstm_domain.evaluate_acc(test_loader)
print(f"Final LSTM Domain Test Acc: {test_acc_domain_lstm:.2f}%")


--- EXP 1/4: Custom LSTM (Domain) ---
Using device: cuda

training LSTM (output dim: 7) for 10 epochs
epoch 1/10 | loss: 1.8983 | val acc: 23.56%
epoch 2/10 | loss: 1.8022 | val acc: 16.35%
epoch 3/10 | loss: 1.9315 | val acc: 24.92%
epoch 4/10 | loss: 1.6113 | val acc: 46.19%
epoch 5/10 | loss: 1.0921 | val acc: 56.74%
epoch 6/10 | loss: 0.8923 | val acc: 72.05%
epoch 7/10 | loss: 0.6221 | val acc: 80.41%
epoch 8/10 | loss: 0.4084 | val acc: 84.27%
epoch 9/10 | loss: 0.2715 | val acc: 82.03%
epoch 10/10 | loss: 0.2941 | val acc: 86.99%
Final LSTM Domain Test Acc: 86.46%


In [23]:
print("\n--- EXP 2/4: BERT (Domain) ---")
bert_domain_model = BERTClassifier(output_dim=output_dim_domain)
bert_domain_train_ldr = DataLoader(
    TensorDataset(bert_train_enc['input_ids'], bert_train_enc['attention_mask'], bert_domain_train_labels),
    batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
bert_domain_val_ldr = DataLoader(
    TensorDataset(bert_val_enc['input_ids'], bert_val_enc['attention_mask'], bert_domain_val_labels),
    batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)

bert_domain_model.fit(bert_domain_train_ldr, bert_domain_val_ldr, epochs=3)
bert_test_loader, _, _, _ = get_bert_test_data(all_texts, y_domain_raw, test_indices)
_, bert_test_acc = bert_domain_model.evaluate(bert_test_loader, nn.CrossEntropyLoss())
test_acc_domain_bert = bert_test_acc * 100
print(f"Final BERT Domain Test Acc: {test_acc_domain_bert:.2f}%")


--- EXP 2/4: BERT (Domain) ---
Using device: cuda
starting BERT fine-tuning
epoch 1/3 [81.3s] | train loss: 0.7663 acc: 0.7649 | val loss: 0.2998 acc: 0.9175
epoch 2/3 [81.2s] | train loss: 0.2372 acc: 0.9333 | val loss: 0.2645 acc: 0.9211
epoch 3/3 [81.2s] | train loss: 0.1472 acc: 0.9617 | val loss: 0.2560 acc: 0.9248
Final BERT Domain Test Acc: 92.77%


In [24]:
torch.cuda.empty_cache()

In [25]:
# 3) LSTM Sub-field
print("\n--- EXP 3/4: Custom LSTM (Sub-field) ---")
lstm_sub = CustomLSTMModel(vocab_size, EMBED_DIM, output_dim_sub, hidden_size=BEST_HS_LSTM_SUB,
                           embedding_matrix=embedding_matrix, dropout=BEST_DROPOUT_LSTM_SUB)
lstm_sub.fit(train_loader, val_loader, epochs=25)
test_acc_sub_lstm = lstm_sub.evaluate_acc(test_loader)
print(f"Final LSTM Sub-field Test Acc: {test_acc_sub_lstm:.2f}%")


--- EXP 3/4: Custom LSTM (Sub-field) ---
Using device: cuda

training LSTM (output dim: 33) for 25 epochs
epoch 1/25 | loss: 3.4881 | val acc: 3.50%
epoch 2/25 | loss: 3.4661 | val acc: 3.29%
epoch 3/25 | loss: 3.4476 | val acc: 4.55%
epoch 4/25 | loss: 3.3263 | val acc: 6.17%
epoch 5/25 | loss: 3.3416 | val acc: 3.61%
epoch 6/25 | loss: 3.2445 | val acc: 6.48%
epoch 7/25 | loss: 3.1756 | val acc: 4.34%
epoch 8/25 | loss: 3.2061 | val acc: 6.27%
epoch 9/25 | loss: 2.9977 | val acc: 7.31%
epoch 10/25 | loss: 2.9820 | val acc: 5.12%
epoch 11/25 | loss: 3.1118 | val acc: 11.44%
epoch 12/25 | loss: 2.6768 | val acc: 16.35%
epoch 13/25 | loss: 2.4092 | val acc: 21.79%
epoch 14/25 | loss: 2.1525 | val acc: 25.55%
epoch 15/25 | loss: 1.9664 | val acc: 31.09%
epoch 16/25 | loss: 1.7298 | val acc: 38.45%
epoch 17/25 | loss: 1.4613 | val acc: 43.83%
epoch 18/25 | loss: 1.2064 | val acc: 54.70%
epoch 19/25 | loss: 0.8828 | val acc: 63.90%
epoch 20/25 | loss: 0.6495 | val acc: 69.96%
epoch 21/25 

In [26]:
# 4) BERT Sub-field
print("\n--- EXP 4/4: BERT (Sub-field) ---")
bert_sub_model = BERTClassifier(output_dim=output_dim_sub)
bert_sub_train_ldr = DataLoader(
    TensorDataset(bert_train_enc['input_ids'], bert_train_enc['attention_mask'], bert_sub_train_labels),
    batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
bert_sub_val_ldr = DataLoader(
    TensorDataset(bert_val_enc['input_ids'], bert_val_enc['attention_mask'], bert_sub_val_labels),
    batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)

bert_sub_model.fit(bert_sub_train_ldr, bert_sub_val_ldr, epochs=3)
bert_sub_test_loader, _, _, _ = get_bert_test_data(all_texts, y_sub_raw, test_indices)
_, bert_sub_test_acc = bert_sub_model.evaluate(bert_sub_test_loader, nn.CrossEntropyLoss())
test_acc_sub_bert = bert_sub_test_acc * 100
print(f"Final BERT Sub-field Test Acc: {test_acc_sub_bert:.2f}%")


--- EXP 4/4: BERT (Sub-field) ---
Using device: cuda
starting BERT fine-tuning
epoch 1/3 [81.3s] | train loss: 2.5182 acc: 0.4018 | val loss: 1.4264 acc: 0.7759
epoch 2/3 [81.3s] | train loss: 1.1118 acc: 0.8292 | val loss: 0.8206 acc: 0.8459
epoch 3/3 [81.3s] | train loss: 0.7386 acc: 0.8843 | val loss: 0.7160 acc: 0.8558
Final BERT Sub-field Test Acc: 85.96%


In [27]:
def get_sample_data(model, data_loader):
    model.eval()
    correct_data = None
    incorrect_data = None
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Iterate through batch to find specific cases
    with torch.no_grad():
        for batch in data_loader:
            b_input_ids, b_input_mask, b_labels = [b.to(model.device) for b in batch]
            logits = model(b_input_ids, b_input_mask)
            preds = torch.argmax(logits, dim=1)

            correct_mask = preds == b_labels
            incorrect_mask = preds != b_labels

            if correct_data is None and correct_mask.any():
                idx = torch.where(correct_mask)[0][0]
                correct_data = (b_input_ids[idx].unsqueeze(0), b_input_mask[idx].unsqueeze(0))

            if incorrect_data is None and incorrect_mask.any():
                idx = torch.where(incorrect_mask)[0][0]
                incorrect_data = (b_input_ids[idx].unsqueeze(0), b_input_mask[idx].unsqueeze(0))

            if correct_data is not None and incorrect_data is not None:
                break

    # Fallback to avoid NoneType crash if model is 100% correct or 0% correct
    if correct_data is None:
        # Just grab the first available batch
        for batch in data_loader:
            b_input_ids, b_input_mask, _ = [b.to(model.device) for b in batch]
            correct_data = (b_input_ids[0].unsqueeze(0), b_input_mask[0].unsqueeze(0))
            break

    if incorrect_data is None:
        # Just grab the first available batch
        for batch in data_loader:
            b_input_ids, b_input_mask, _ = [b.to(model.device) for b in batch]
            incorrect_data = (b_input_ids[0].unsqueeze(0), b_input_mask[0].unsqueeze(0))
            break

    return correct_data, incorrect_data, tokenizer

In [28]:
def visualize_attention(model, tokenizer, input_ids, attention_mask, title_prefix, device, layer_idx=11, head_idx=0,
                        save_dir="plots"):
    model.eval()
    os.makedirs(save_dir, exist_ok=True)
    attentions = model.get_attention_maps(input_ids, attention_mask)
    attention_matrix = attentions[layer_idx][0, head_idx, :, :].cpu().detach().numpy()
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    seq_len = attention_mask.sum().item()
    attention_matrix = attention_matrix[:seq_len, :seq_len]
    tokens = tokens[:seq_len]

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(attention_matrix, xticklabels=tokens, yticklabels=tokens, cmap='viridis', ax=ax)
    title = f"{title_prefix} - Layer {layer_idx + 1}, Head {head_idx + 1}"
    ax.set_title(title)
    ax.set_xlabel("Key")
    ax.set_ylabel("Query")
    plt.xticks(rotation=90)

    clean_title = title.replace(" ", "_").replace(":", "").replace(",", "")
    fig.savefig(os.path.join(save_dir, f"{clean_title}.png"), bbox_inches='tight')
    plt.close(fig)
    print(f"saved attention map: {clean_title}.png")


In [29]:
def visualize_token_importance(model, tokenizer, input_ids, attention_mask, title_prefix, device, layer_idx=11,
                               head_idx=0, save_dir="plots"):
    model.eval()
    os.makedirs(save_dir, exist_ok=True)
    attentions = model.get_attention_maps(input_ids, attention_mask)
    cls_attention = attentions[layer_idx][0, head_idx, 0, :].cpu().detach().numpy()
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    valid_tokens = []
    valid_scores = []
    for token, score in zip(tokens, cls_attention):
        if token not in ['[CLS]', '[SEP]', '[PAD]']:
            valid_tokens.append(token)
            valid_scores.append(score)

    sorted_indices = np.argsort(valid_scores)[::-1]
    top_n = 15
    top_tokens = [valid_tokens[i] for i in sorted_indices[:top_n]]
    top_scores = [valid_scores[i] for i in sorted_indices[:top_n]]

    fig, ax = plt.subplots(figsize=(10, 6))
    y_pos = np.arange(len(top_tokens))
    ax.barh(y_pos, top_scores, align='center', color='skyblue')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(top_tokens)
    ax.invert_yaxis()
    title = f"{title_prefix} Top Tokens"
    ax.set_title(title)

    clean_title = title.replace(" ", "_")
    fig.savefig(os.path.join(save_dir, f"{clean_title}.png"), bbox_inches='tight')
    plt.close(fig)
    print(f"Saved importance plot: {clean_title}.png")

In [30]:
# Run Analysis
bert_test_loader_domain, _, _, _ = get_bert_test_data(all_texts, y_domain_raw, test_indices)
correct_data, incorrect_data, tokenizer = get_sample_data(bert_domain_model, bert_test_loader_domain)

visualize_attention(bert_domain_model, tokenizer, correct_data[0], correct_data[1], "Correct", DEVICE)
visualize_attention(bert_domain_model, tokenizer, incorrect_data[0], incorrect_data[1], "Incorrect", DEVICE)

visualize_token_importance(bert_domain_model, tokenizer, correct_data[0], correct_data[1], "Correct", DEVICE)
visualize_token_importance(bert_domain_model, tokenizer, incorrect_data[0], incorrect_data[1], "Incorrect", DEVICE)

saved attention map: Correct_-_Layer_12_Head_1.png
saved attention map: Incorrect_-_Layer_12_Head_1.png
Saved importance plot: Correct_Top_Tokens.png
Saved importance plot: Incorrect_Top_Tokens.png


In [31]:
print("\n--- RESULTS SUMMARY TABLE ---")
print("| Model | Task | Test Accuracy | Winner? |")
print("|---|---|---|---|")
print(
    f"| Custom LSTM (GloVe) | Domain (7 Classes) | {test_acc_domain_lstm:.2f}% | {'<--' if test_acc_domain_lstm > test_acc_domain_bert else ''} |")
print(
    f"| BERT Classifier | Domain (7 Classes) | {test_acc_domain_bert:.2f}% | {'<--' if test_acc_domain_bert > test_acc_domain_lstm else ''} |")
print(
    f"| Custom LSTM (GloVe) | Sub-field (33 Classes) | {test_acc_sub_lstm:.2f}% | {'<--' if test_acc_sub_lstm > test_acc_sub_bert else ''} |")
print(
    f"| BERT Classifier | Sub-field (33 Classes) | {test_acc_sub_bert:.2f}% | {'<--' if test_acc_sub_bert > test_acc_sub_lstm else ''} |")


--- RESULTS SUMMARY TABLE ---
| Model | Task | Test Accuracy | Winner? |
|---|---|---|---|
| Custom LSTM (GloVe) | Domain (7 Classes) | 86.46% |  |
| BERT Classifier | Domain (7 Classes) | 92.77% | <-- |
| Custom LSTM (GloVe) | Sub-field (33 Classes) | 75.14% |  |
| BERT Classifier | Sub-field (33 Classes) | 85.96% | <-- |


In [32]:
def plot_model_comparison(test_acc_domain_lstm, test_acc_domain_bert,
                          test_acc_sub_lstm, test_acc_sub_bert,
                          save_path="plots/model_comparison.png"):

    # Data preparation
    tasks = ['Domain (7 Classes)', 'Sub-field (33 Classes)']
    lstm_scores = [test_acc_domain_lstm, test_acc_sub_lstm]
    bert_scores = [test_acc_domain_bert, test_acc_sub_bert]

    x = np.arange(len(tasks))  # label locations
    width = 0.35  # width of the bars

    fig, ax = plt.subplots(figsize=(8, 6))

    # Plotting the bars
    rects1 = ax.bar(x - width/2, lstm_scores, width, label='Custom LSTM (GloVe)', color='skyblue')
    rects2 = ax.bar(x + width/2, bert_scores, width, label='BERT Classifier', color='salmon')

    # Formatting
    ax.set_ylabel('Test Accuracy (%)')
    ax.set_title('Model Performance Comparison')
    ax.set_xticks(x)
    ax.set_xticklabels(tasks)
    ax.set_ylim(0, 100)  # Set y-axis to 0-100% for context
    ax.legend()
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Helper function to put text labels on top of bars
    def autolabel(rects):
        for rect in rects:
            height = rect.get_height()
            ax.annotate(f'{height:.2f}%',
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),  # 3 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom', fontweight='bold')

    autolabel(rects1)
    autolabel(rects2)

    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()
    print(f"Comparison plot saved to {save_path}")

In [33]:
plot_model_comparison(test_acc_domain_lstm, test_acc_domain_bert, test_acc_sub_lstm, test_acc_sub_bert)

Comparison plot saved to plots/model_comparison.png


In [46]:
# 2. Confusion Matrix
def plot_confusion_matrix(model, data_loader, title, save_dir="plots"):
    os.makedirs(save_dir, exist_ok=True)
    model.eval()
    
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in data_loader:
            # Handle different batch structures
            # LSTM: [X, Y_domain, Y_sub] -> inputs=batch[0], target depends on output_dim
            # BERT: [ids, mask, labels] -> inputs=batch[0], mask=batch[1], labels=batch[2]
            
            if len(batch) == 3 and isinstance(model, BERTClassifier):
                input_ids = batch[0].to(model.device)
                mask = batch[1].to(model.device)
                labels = batch[2].to(model.device)
                outputs = model(input_ids, mask)
            else:
                inputs = batch[0].to(model.device)
                # Infer label index: if output_dim is 7, use batch[1], else batch[2]
                if model.output_dim == 7:
                    labels = batch[1].to(model.device)
                else:
                    labels = batch[2].to(model.device)
                outputs = model(inputs)

            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Create Matrix
    cm = confusion_matrix(all_labels, all_preds)
    
    # Plot
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title(title)
    
    clean_title = title.replace(" ", "_")
    fig.savefig(os.path.join(save_dir, f"{clean_title}.png"), bbox_inches='tight')
    plt.close(fig)
    print(f"Saved confusion matrix: {clean_title}.png")

In [52]:
plot_confusion_matrix(lstm_domain, val_loader, "LSTM_Domain_Confusion_Matrix")
plot_confusion_matrix(lstm_sub, val_loader, "LSTM_Sub_Confusion_Matrix")

plot_confusion_matrix(bert_domain_model, bert_domain_val_ldr, "BERT_Domain_Confusion_Matrix")
plot_confusion_matrix(bert_sub_model, bert_sub_val_ldr, "BERT_Sub_Confusion_Matrix")

Generating confusion matrix for LSTM_Domain_Confusion_Matrix...
Saved confusion matrix: LSTM_Domain_Confusion_Matrix.png
Generating confusion matrix for LSTM_Sub_Confusion_Matrix...
Saved confusion matrix: LSTM_Sub_Confusion_Matrix.png
Generating confusion matrix for BERT_Domain_Confusion_Matrix...
Saved confusion matrix: BERT_Domain_Confusion_Matrix.png
Generating confusion matrix for BERT_Sub_Confusion_Matrix...
Saved confusion matrix: BERT_Sub_Confusion_Matrix.png


In [50]:
def plot_class_distribution(y_data, title, save_dir="plots"):
    os.makedirs(save_dir, exist_ok=True)
    
    counts = Counter(y_data)
    # Sort by class ID
    labels, values = zip(*sorted(counts.items()))
    
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(labels, values, color='skyblue', edgecolor='black')
    
    ax.set_xlabel('Class ID')
    ax.set_ylabel('Number of Samples')
    ax.set_title(title)
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    
    clean_title = title.replace(" ", "_")
    fig.savefig(os.path.join(save_dir, f"{clean_title}.png"), bbox_inches='tight')
    plt.close(fig)
    print(f"Saved distribution plot: {clean_title}.png")

In [51]:
plot_class_distribution(y_domain_raw, "Domain_Class_Distribution")
plot_class_distribution(y_sub_raw, "Subfield_Class_Distribution")

Saved distribution plot: Domain_Class_Distribution.png
Saved distribution plot: Subfield_Class_Distribution.png


In [58]:
def plot_length_vs_accuracy(model, data_loader, model_type="LSTM", bins=5, save_dir="plots"):
    model.eval()
    correct_by_len = [] # Stores tuple (length, is_correct)
    
    with torch.no_grad():
        for batch in data_loader:
            # FIX: Check if "BERT" is part of the string (e.g., "BERT_Domain")
            if "BERT" in model_type:
                input_ids = batch[0].to(model.device)
                mask = batch[1].to(model.device)
                labels = batch[2].to(model.device)
                
                outputs = model(input_ids, mask)
                # Length = sum of attention mask (1s)
                lengths = mask.sum(dim=1).cpu().numpy()
            else:
                inputs = batch[0].to(model.device)
                
                # Check output dimension to pick correct label from the batch
                # (Batch structure: [X, Y_domain, Y_sub])
                # If model has 7 outputs, use Y_domain (index 1), else Y_sub (index 2)
                if hasattr(model, 'output_dim') and model.output_dim == 7:
                    target_idx = 1
                else:
                    target_idx = 2
                    
                labels = batch[target_idx].to(model.device)
                
                outputs = model(inputs)
                # Length = count of non-padding tokens (assume 0 is pad)
                lengths = (inputs != 0).sum(dim=1).cpu().numpy()
                
            preds = torch.argmax(outputs, dim=1)
            is_correct = (preds == labels).cpu().numpy()
            
            for l, c in zip(lengths, is_correct):
                correct_by_len.append((l, c))

    # Convert to dataframe for easy binning
    df = pd.DataFrame(correct_by_len, columns=["Length", "Correct"])
    
    # Create bins
    df['Bin'] = pd.cut(df['Length'], bins=bins)
    
    # Calculate accuracy per bin
    summary = df.groupby('Bin')['Correct'].mean()
    
    # Plot
    fig, ax = plt.subplots(figsize=(10, 6))
    summary.plot(kind='bar', ax=ax, color='teal', alpha=0.7)
    
    ax.set_title(f'{model_type} Accuracy by Sequence Length')
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Sequence Length Range')
    ax.set_ylim(0, 1.0)
    plt.xticks(rotation=45)
    
    os.makedirs(save_dir, exist_ok=True)
    filename = f"{model_type}_Length_vs_Acc.png"
    fig.savefig(os.path.join(save_dir, filename), bbox_inches='tight')
    plt.close(fig)
    print(f"Saved length analysis: {filename}")

In [59]:
# Run for both (using Domain task as the example)
plot_length_vs_accuracy(lstm_domain, val_loader, model_type="LSTM_Domain")
plot_length_vs_accuracy(bert_domain_model, bert_domain_val_ldr, model_type="BERT_Domain")

Saved length analysis: LSTM_Domain_Length_vs_Acc.png


  summary = df.groupby('Bin')['Correct'].mean()


Saved length analysis: BERT_Domain_Length_vs_Acc.png


  summary = df.groupby('Bin')['Correct'].mean()


In [56]:
def print_worst_mistakes(model, data_loader, tokenizer, raw_texts, indices, k=3):
    """
    Finds the top k most confident errors.
    Requires raw_texts and indices to map back to original string.
    """
    model.eval()
    errors = [] # (confidence, pred, true, text_idx)
    
    with torch.no_grad():
        batch_start_idx = 0
        for batch in data_loader:
            if isinstance(model, BERTClassifier):
                input_ids = batch[0].to(model.device)
                mask = batch[1].to(model.device)
                labels = batch[2].to(model.device)
                outputs = model(input_ids, mask)
            else:
                inputs = batch[0].to(model.device)
                labels = batch[1].to(model.device) # Assuming domain task
                outputs = model(inputs)
            
            probs = F.softmax(outputs, dim=1)
            confidences, preds = torch.max(probs, dim=1)
            
            # Find errors
            incorrect_mask = preds != labels
            
            if incorrect_mask.any():
                # Get indices of errors in this batch
                batch_err_indices = torch.nonzero(incorrect_mask).squeeze()
                if batch_err_indices.dim() == 0: batch_err_indices = batch_err_indices.unsqueeze(0)
                
                for idx in batch_err_indices:
                    global_idx = indices[batch_start_idx + idx.item()]
                    conf = confidences[idx].item()
                    p = preds[idx].item()
                    t = labels[idx].item()
                    errors.append((conf, p, t, global_idx))
            
            batch_start_idx += labels.size(0)

    # Sort by confidence (descending)
    errors.sort(key=lambda x: x[0], reverse=True)
    
    print(f"\n--- Top {k} Confident Failures for {model.__class__.__name__} ---")
    for i in range(min(k, len(errors))):
        conf, p, t, text_idx = errors[i]
        print(f"Confidence: {conf*100:.2f}% | Pred: {p} | True: {t}")
        print(f"Text snippet: {raw_texts[text_idx][:200]}...\n")

In [57]:
# Need to pass the dataset indices to map back to raw text
# Example for BERT Domain model
print_worst_mistakes(bert_domain_model, bert_domain_val_ldr, bert_tokenizer, all_texts, val_indices)


--- Top 3 Confident Failures for BERTClassifier ---
Confidence: 98.89% | Pred: 4 | True: 0
Text snippet: ultraviolet spectrophotometry has been widely applied in determination of water quality parameters because of its advantagous properties compared to chemical method such as high efficiency easy operat...

Confidence: 98.85% | Pred: 4 | True: 0
Text snippet: object detection and classification have countless applications in human robot interacting systems it is a necessary skill for autonomous robots that perform tasks in household scenarios despite the g...

Confidence: 98.76% | Pred: 3 | True: 4
Text snippet: longitudinal dispersion coefficient can be determined by experimental procedures in natural streams many theoretical and empirical equations that are based on hydraulic and geometric characteristics h...



In [60]:
from sklearn.manifold import TSNE
import matplotlib.cm as cm

def plot_tsne(model, data_loader, title, max_samples=1000, save_dir="plots"):
    """
    Runs t-SNE on the features just before the classification layer.
    """
    model.eval()
    features = []
    labels = []
    
    # Register a hook to capture the input to the final fully connected layer
    # This works for both your LSTM (self.fc) and BERT (self.fc) without changing class code
    captured_feats = []
    def hook_fn(module, input, output):
        # input is a tuple, we want the first element
        captured_feats.append(input[0].detach().cpu())

    handle = model.fc.register_forward_hook(hook_fn)

    print(f"Collecting features for {title} (limit {max_samples} samples)...")
    
    try:
        with torch.no_grad():
            count = 0
            for batch in data_loader:
                if count >= max_samples:
                    break
                
                # Standard BERT vs LSTM batch handling
                if isinstance(model, BERTClassifier):
                    input_ids = batch[0].to(model.device)
                    mask = batch[1].to(model.device)
                    batch_labels = batch[2].to(model.device)
                    model(input_ids, mask) # Forward pass triggers the hook
                else:
                    inputs = batch[0].to(model.device)
                    target_idx = 1 if model.output_dim == 7 else 2
                    batch_labels = batch[target_idx].to(model.device)
                    model(inputs) # Forward pass triggers the hook

                labels.extend(batch_labels.cpu().numpy())
                count += inputs.size(0) if not isinstance(model, BERTClassifier) else input_ids.size(0)
                
    finally:
        handle.remove() # Clean up the hook so it doesn't slow down future runs

    # Concatenate all features
    X = torch.cat(captured_feats, dim=0).numpy()[:len(labels)]
    y = np.array(labels)

    # Run t-SNE
    print("Running t-SNE (this might take a moment)...")
    tsne = TSNE(n_components=2, random_state=42, perplexity=30)
    X_2d = tsne.fit_transform(X)
    
    # Plotting
    plt.figure(figsize=(10, 8))
    unique_labels = np.unique(y)
    colors = cm.rainbow(np.linspace(0, 1, len(unique_labels)))
    
    for label, color in zip(unique_labels, colors):
        indices = y == label
        plt.scatter(X_2d[indices, 0], X_2d[indices, 1], c=[color], label=f'Class {label}', alpha=0.6, s=10)
    
    plt.legend()
    plt.title(f"t-SNE of {title} Representations")
    plt.grid(True, alpha=0.3)
    
    os.makedirs(save_dir, exist_ok=True)
    clean_title = title.replace(" ", "_")
    plt.savefig(os.path.join(save_dir, f"{clean_title}_tSNE.png"), bbox_inches='tight')
    plt.close()
    print(f"Saved t-SNE plot to {save_dir}/{clean_title}_tSNE.png")

In [61]:
# Run on Domain task (most distinct classes)
plot_tsne(bert_domain_model, bert_domain_val_ldr, "BERT Domain")
plot_tsne(lstm_domain, val_loader, "LSTM Domain")

Collecting features for BERT Domain (limit 1000 samples)...
Running t-SNE (this might take a moment)...
Saved t-SNE plot to plots/BERT_Domain_tSNE.png
Collecting features for LSTM Domain (limit 1000 samples)...
Running t-SNE (this might take a moment)...
Saved t-SNE plot to plots/LSTM_Domain_tSNE.png


In [63]:
def plot_confidence_hist(model, data_loader, title, save_dir="plots"):
    model.eval()
    confidences = []
    
    with torch.no_grad():
        for batch in data_loader:
            if isinstance(model, BERTClassifier):
                inputs, mask = batch[0].to(model.device), batch[1].to(model.device)
                outputs = model(inputs, mask)
            else:
                inputs = batch[0].to(model.device)
                outputs = model(inputs)
            
            # Apply Softmax to get probabilities (0 to 1)
            probs = F.softmax(outputs, dim=1)
            
            # Get the probability of the predicted class (the max prob)
            max_probs, _ = torch.max(probs, dim=1)
            confidences.extend(max_probs.cpu().numpy())
            
    plt.figure(figsize=(8, 6))
    plt.hist(confidences, bins=20, range=(0,1), color='purple', alpha=0.7, edgecolor='black')
    plt.xlabel('Prediction Confidence (Probability)')
    plt.ylabel('Count')
    plt.title(f'{title} Confidence Distribution')
    plt.grid(axis='y', alpha=0.5)
    
    os.makedirs(save_dir, exist_ok=True)
    clean_title = title.replace(" ", "_")
    plt.savefig(os.path.join(save_dir, f"{clean_title}_Confidence.png"))
    plt.close()
    print(f"Saved confidence plot: {clean_title}_Confidence.png")

In [64]:
plot_confidence_hist(bert_domain_model, bert_domain_val_ldr, "BERT Domain")
plot_confidence_hist(lstm_domain, val_loader, "LSTM Domain")

Saved confidence plot: BERT_Domain_Confidence.png
Saved confidence plot: LSTM_Domain_Confidence.png
