## Imports

In [1]:
import os
import zipfile
import nltk
import json
import optuna
from tqdm import tqdm
from collections import Counter
from datasets import load_dataset
from pprint import pprint
from PIL import Image
from pathlib import Path

import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from huggingface_hub import hf_hub_download

nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

True

## Constants

In [2]:
IMG_SIZE = (224, 224)
VOCAB_SIZE = 5000
BATCH_SIZE = 32
MAX_NODES_PER_QUESTION = 10

# Directory Information
DATA_DIR = "/kaggle/working/"
DATASET_PATH = os.path.join(DATA_DIR, 'dataset/')
IMAGE_PATH = os.path.join(DATA_DIR, 'imgs/')
VOCABS_PATH = os.path.join(DATA_DIR, 'vocabs/')
HYPERPARAMETERS_RESULT_PATH = os.path.join(DATA_DIR, 'tuning/')

# Huggingface Repository Information
repo_id = "BoKelvin/SLAKE"
repo_type = "dataset"
img_file = "imgs.zip"

# Seeding
GLOBAL_SEED = 42

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


## Dataset Setup

### Dataset Download

In [3]:
# Utility function for downloading and extracting ZIP file
def download_and_store_ZIP(filename, save_dir):
    print(f"Fetching file {filename} from {repo_id} repo")

    try:
        # Caches the file locally and returns the path to the cached file
        cached_zip_path = hf_hub_download(
          repo_id=repo_id,
          filename=filename,
          repo_type=repo_type
        )
        print(f"{filename} download complete. Cached at: {cached_zip_path}")

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Extract the contents
        print(f"Extracting to {save_dir}...")
        with zipfile.ZipFile(cached_zip_path, 'r') as zip_ref:
            zip_ref.extractall(save_dir)

        print("Extraction complete.")
        print(f"{filename} files are located in: {os.path.abspath(save_dir)}")
    except Exception as e:
        print(f"Failed to download or extract {filename}: {e}")

# Scoping to English only
def filter_language(original):
    return original.filter(lambda data: data['q_lang'] == 'en')

# Download and store the dataset
def download_and_store_english_dataset():
    print(f"Downloading dataset from {repo_id} repo")

    # Load from Hugging Face
    original = load_dataset(repo_id)

    # Scope to English Only
    original = filter_language(original)

    # Show the dataset formatting
    pprint(original)

    # Save the original dataset
    if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)

    if not os.path.exists(DATASET_PATH):
        os.makedirs(DATASET_PATH)

    original.save_to_disk(DATASET_PATH)
    return original

# Download and store the image files
def download_and_store_image():
    download_and_store_ZIP(img_file, DATA_DIR)

# Download necessary files
def download_and_store_slake():
    dataset = download_and_store_english_dataset()
    download_and_store_image()

    return dataset

### Vocabulary Builder

In [4]:
class VocabularyBuilder:
    def __init__(self, min_freq=1):
        self.min_freq = min_freq
        self.itos = {0: "<pad>", 1: "<start>", 2: "<end>", 3: "<unk>"}
        self.stoi = {v: k for k, v in self.itos.items()}

    def tokenize(self, text):
        return nltk.word_tokenize(text.lower())
    
    def __len__(self):
        return len(self.stoi)
    
    def build_word_vocabs(self, sentences):
        counter = Counter()
        start_index = len(self.stoi)

        # 1. Count frequencies of all tokens in the tokenized sentences
        for sentence in sentences:
            tokens = self.tokenize(sentence)
            counter.update(tokens)

        # 2. Add words that meet the frequency threshold
        for word, count in counter.items():
            if count >= self.min_freq and word not in self.stoi:
                self.stoi[word] = start_index
                self.itos[start_index] = word
                start_index += 1

        print(f"Vocabulary Built. Vocabulary Size: {len(self.stoi)}")

    def numericalize(self, text):
        tokens = self.tokenize(text)
        return [
            self.stoi[token] if token in self.stoi else self.stoi["<unk>"]
            for token in tokens
        ]

In [5]:
# Build vocabularies for questions and answers
def build_vocabs(dataset):
    questions = [item['question'] for item in dataset]
    answers = [item['answer'] for item in dataset]

    # Question Vocabulary
    questvocab_builder = VocabularyBuilder(min_freq=1)
    questvocab_builder.build_word_vocabs(questions)
    
    # Answer Vocabulary
    ansvocab_builder = VocabularyBuilder(min_freq=1)

    # Use a dummy tokenizer that just returns the whole lowercased string as one token
    identity_tokenizer = lambda x: [x.lower().strip()]
    ansvocab_builder.tokenize = identity_tokenizer

    ansvocab_builder.build_word_vocabs(answers)

    return questvocab_builder, ansvocab_builder

# Save vocabularies to JSON files
def save_vocabs(quest_vocab, ans_vocab):
    if not os.path.exists(VOCABS_PATH):
        os.makedirs(VOCABS_PATH)

    # Save Question Vocabulary
    with open(os.path.join(VOCABS_PATH, 'question_vocab.json'), 'w') as f:
        json.dump({'stoi': quest_vocab.stoi, 'itos': quest_vocab.itos}, f)

    # Save Answer Vocabulary
    with open(os.path.join(VOCABS_PATH, 'answer_vocab.json'), 'w') as f:
        json.dump({'stoi': ans_vocab.stoi, 'itos': ans_vocab.itos}, f)

    print("Vocabularies saved successfully.")

### Dataset Class

In [6]:
class SlakeDataset(Dataset):
    def __init__(self, dataset, question_vocab, answer_vocab, transform=None, cache_images=True):
        self.data = dataset
        self.question_vocab = question_vocab
        self.answer_vocab = answer_vocab
        self.transform = transform
        self.cache_images = cache_images

        # Caching
        self.image_cache = {}
        if self.cache_images:
            print(f"Caching images for into RAM...")
            # Get unique image names to avoid duplicate loading
            unique_imgs = set(item['img_name'] for item in self.data)
            
            for img_name in unique_imgs:
                path = os.path.join(IMAGE_PATH, img_name)
                # Load and convert to RGB
                img = Image.open(path).convert('RGB')
                
                # Resize immediately to save RAM and CPU later
                img = img.resize((224, 224)) 
                
                self.image_cache[img_name] = img
            print(f"Cached {len(self.image_cache)} images.")

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]

        # 1. Image Processing
        image_path = item['img_name']

        if self.cache_images:
            # Get from RAM
            image = self.image_cache[image_path]
        else:
            # Load from Disk and Resize
            img_path = os.path.join(IMAGE_PATH, image_path)
            image = Image.open(img_path).convert('RGB')
            image = image.resize((224, 224))

        if self.transform:
            image = self.transform(image)

        # 2. Question Processing
        question = item['question']
        question_indices = self.question_vocab.numericalize(question)

        # 3. Answer Processing
        answer = str(item.get('answer', '')) # Answer may be missing in test set
        answer_index = self.answer_vocab.numericalize(answer)

        return {
            'image': image,
            'question' : torch.tensor(question_indices),
            'answer' : torch.tensor(answer_index, dtype=torch.long),
            # Add original items for reference
            'original_question': question,
            'original_answer': answer,
            # Add ID for tracking
            'id': item['qid']
        }

### Collate Function

In [7]:
def slake_collate_fn(batch, pad_index=0):
    # Separate different components
    images = []
    questions = []
    answers = []
    original_questions = []
    original_answers = []
    ids = []
    
    for item in batch:
        images.append(item['image'])
        questions.append(item['question'])
        answers.append(item['answer'])
        original_questions.append(item['original_question'])
        original_answers.append(item['original_answer'])
        ids.append(item['id'])
    
    # Stack images
    images = torch.stack(images)  # [batch_size, 3, H, W]
    
    # Get question lengths BEFORE padding
    question_lengths = torch.tensor([len(q) for q in questions])
    
    # Pad questions to the longest sequence in THIS batch
    # pad_sequence expects list of tensors, pads with 0 by default
    questions_padded = pad_sequence(questions, batch_first=True, padding_value=pad_index)
    # questions_padded: [batch_size, max_len_in_batch]
    
    # Handling answers
    # Handling each answer as a single class
    # answers = torch.stack(answers)
    answers = torch.tensor([item['answer'] for item in batch])
    
    return {
        'image': images,
        'question': questions_padded,
        'question_lengths': question_lengths,
        'answer': answers,
        'original_question': original_questions,
        'original_answer': original_answers,
        'id': ids
    }

## Preparation

In [8]:
# Comment out if dataset is already downloaded
dataset = download_and_store_slake()

# Uncomment if dataset is already downloaded
# dataset = load_from_disk(DATASET_PATH)

# Build vocabularies for training
train_data = dataset['train']
validation_data = dataset['validation']
question_vocab, answer_vocab = build_vocabs(train_data)

# Define image transformations
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Create train dataset and dataloader
train_dataset = SlakeDataset(train_data, question_vocab, answer_vocab, transform=transform)
train_loader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    collate_fn=slake_collate_fn
)

validation_dataset = SlakeDataset(validation_data, question_vocab, answer_vocab, transform=transform)
validation_loader = DataLoader(
    validation_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    collate_fn=slake_collate_fn
)

Downloading dataset from BoKelvin/SLAKE repo


README.md:   0%|          | 0.00/568 [00:00<?, ?B/s]

train.json: 0.00B [00:00, ?B/s]

validation.json: 0.00B [00:00, ?B/s]

test.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/9835 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2099 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2094 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9835 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2099 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2094 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['img_name', 'location', 'answer', 'modality', 'base_type', 'answer_type', 'question', 'qid', 'content_type', 'triple', 'img_id', 'q_lang'],
        num_rows: 4919
    })
    validation: Dataset({
        features: ['img_name', 'location', 'answer', 'modality', 'base_type', 'answer_type', 'question', 'qid', 'content_type', 'triple', 'img_id', 'q_lang'],
        num_rows: 1053
    })
    test: Dataset({
        features: ['img_name', 'location', 'answer', 'modality', 'base_type', 'answer_type', 'question', 'qid', 'content_type', 'triple', 'img_id', 'q_lang'],
        num_rows: 1061
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/4919 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1053 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1061 [00:00<?, ? examples/s]

Fetching file imgs.zip from BoKelvin/SLAKE repo


imgs.zip:   0%|          | 0.00/212M [00:00<?, ?B/s]

imgs.zip download complete. Cached at: /root/.cache/huggingface/hub/datasets--BoKelvin--SLAKE/snapshots/a9083ce6c34ac3ffb17671a605962924d8a8f9e9/imgs.zip
Extracting to /kaggle/working/...
Extraction complete.
imgs.zip files are located in: /kaggle/working
Vocabulary Built. Vocabulary Size: 281
Vocabulary Built. Vocabulary Size: 225
Caching images for into RAM...
Cached 450 images.
Caching images for into RAM...
Cached 96 images.


## Modeling Baseline

1. Basic CNN-LSTM
2. CNN with Bidirectional LSTM with Self-Attention

In [9]:
class MedicalVQABaseline(nn.Module):
    def __init__(self, vocab_size, num_classes, embed_dim=256, hidden_dim=512, fusion_dropout=0.5, fusion_dim=1024):
        super(MedicalVQABaseline, self).__init__()

        # 1. CNN - ResNet
        resnet = models.resnet34(pretrained=True)
        num_features = 512
        # Remove the last classification layer
        self.resnet_features = nn.Sequential(*list(resnet.children())[:-1])
        self.img_projector = nn.Linear(num_features, hidden_dim)
        self.bn_img = nn.BatchNorm1d(hidden_dim)

        # 2. LSTM
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim, 
            hidden_dim, 
            batch_first=True
        )

        # 4. Classifier
        # hidden_dim * 2 => One for the CNN, one for the LSTM
        total_dim = hidden_dim * 2

        self.attention = nn.Linear(embed_dim, 1)
        self.kg_gate = nn.Parameter(torch.tensor(0.0))

        self.classifier = nn.Sequential(
            nn.Linear(total_dim, fusion_dim),
            nn.ReLU(),
            nn.Dropout(fusion_dropout),
            nn.Linear(fusion_dim, num_classes)
        )

    def forward(self, images, questions, question_lengths=None):
        # CNN
        # Extract features
        img_feats = self.resnet_features(images).view(images.size(0), -1)
        img_feats = self.img_projector(img_feats)
        img_feats = self.bn_img(img_feats) # Normalize
        img_feats = torch.relu(img_feats)

        # LSTM
        embeds = self.embedding(questions) # (Batch, Seq, Embed_Dim)
        # LSTM output: (Batch, Seq, Hidden), (h_n, c_n)
        # Take and modify the final hidden state h_n: (1, Batch, Hidden)
        _, (h_n, _) = self.lstm(embeds)
        text_feats = h_n.squeeze(0) # (Batch, Hidden)
        
        # Fusion
        combined = torch.cat((img_feats, text_feats), dim=1)

        # Classification
        logits = self.classifier(combined)
        return logits

In [10]:
# Bidirectional LSTM with Self-Attention for question encoding
class BiLSTMWithSelfAttention(nn.Module):
    def __init__(self, vocab_size, embed_dim=300, hidden_dim=512, num_layers=1, 
                 dropout=0.5, pooling_strategy='mean', attention_heads=8):
        super(BiLSTMWithSelfAttention, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.pooling_strategy = pooling_strategy
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        
        # Bidirectional LSTM
        self.bilstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        
        # Self-attention mechanism
        # BiLSTM outputs hidden_dim * 2 (forward + backward)
        self.attention = nn.MultiheadAttention(
            embed_dim=hidden_dim * 2,
            num_heads=attention_heads,
            dropout=dropout,
            batch_first=True
        )
        
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        
    def forward(self, questions, question_lengths=None):
        # Embed questions
        embeds = self.embedding(questions)  # [B, seq_len, embed_dim]
        embeds = self.dropout(embeds)
        
        # Pack sequence if lengths provided (for efficiency)
        if question_lengths is not None:
            embeds = nn.utils.rnn.pack_padded_sequence(
                embeds, question_lengths.cpu(), 
                batch_first=True, enforce_sorted=False
            )
        
        # BiLSTM encoding
        lstm_out, (hidden, cell) = self.bilstm(embeds)
        
        # Unpack if needed
        if question_lengths is not None:
            lstm_out, _ = nn.utils.rnn.pad_packed_sequence(
                lstm_out, batch_first=True
            )
        
        # lstm_out: [B, seq_len, hidden_dim * 2]
        
        # Self-attention: query = key = value = lstm_out
        attn_out, attn_weights = self.attention(
            query=lstm_out,
            key=lstm_out,
            value=lstm_out,
            need_weights=True
        )
        
        # Residual connection + Layer Norm
        attn_out = self.layer_norm(lstm_out + attn_out)
        attn_out = self.dropout(attn_out)
        
        # Pooling strategy - you can experiment with these:
        if self.pooling_strategy == 'mean':
            question_feature = attn_out.mean(dim=1)  # [B, hidden_dim * 2]
        elif self.pooling_strategy == 'max':
            question_feature = attn_out.max(dim=1)[0]
        else:
            # Last hidden state (concatenate forward and backward)
            question_feature = torch.cat([hidden[-2], hidden[-1]], dim=1)
        
        return question_feature, attn_weights

In [11]:
# Complete VQA model: ResNet34 + BiLSTM with Self-Attention
class VQA_ResNet_BiLSTM_Attention(nn.Module):
    def __init__(self, vocab_size, num_classes, embed_dim=300, 
                 lstm_hidden=512, fusion_dim=1024, lstm_dropout=0.5, 
                 lstm_num_layers=1, attention_heads=8, fusion_dropout=0.5,
                 pooling_strategy='mean'):
        super(VQA_ResNet_BiLSTM_Attention, self).__init__()
        
        # Image encoder: ResNet34
        resnet = models.resnet34(pretrained=True)
        # Remove the final FC layer
        self.image_encoder = nn.Sequential(*list(resnet.children())[:-1])
        self.image_feature_dim = 512  # ResNet34 final layer
        
        # Question encoder: BiLSTM + Self-Attention
        self.question_encoder = BiLSTMWithSelfAttention(
            vocab_size=vocab_size,
            embed_dim=embed_dim,
            hidden_dim=lstm_hidden,
            num_layers=lstm_num_layers,
            dropout=lstm_dropout,
            attention_heads=attention_heads,
            pooling_strategy=pooling_strategy
        )
        self.question_feature_dim = lstm_hidden * 2  # Bidirectional
        
        # Multimodal fusion
        self.fusion = nn.Sequential(
            nn.Linear(self.image_feature_dim + self.question_feature_dim, fusion_dim),
            nn.BatchNorm1d(fusion_dim),
            nn.ReLU(),
            nn.Dropout(fusion_dropout),
            nn.Linear(fusion_dim, fusion_dim // 2),
            nn.BatchNorm1d(fusion_dim // 2),
            nn.ReLU(),
            nn.Dropout(fusion_dropout)
        )
        
        # Classifier
        self.classifier = nn.Linear(fusion_dim // 2, num_classes)
        
    def forward(self, images, questions, question_lengths=None):
        # Extract image features
        img_features = self.image_encoder(images)  # [B, 512, 1, 1]
        img_features = img_features.squeeze(-1).squeeze(-1)  # [B, 512]
        
        # Extract question features with attention
        q_features, attn_weights = self.question_encoder(questions, question_lengths) # [B, lstm_hidden * 2]
        
        # Concatenate image and question features
        combined = torch.cat([img_features, q_features], dim=1)
        # combined: [B, 512 + lstm_hidden*2]
        
        # Fusion
        fused = self.fusion(combined)  # [B, fusion_dim // 2]
        
        # Classification
        logits = self.classifier(fused)  # [B, num_classes]
        
        return logits

## Hyperparameter Tuning

In [12]:
class HyperparameterTuner:
    def __init__(self, train_dataset, validation_dataset, vocab_size, num_classes, 
                 n_trials=50, basic_model=True):
        self.train_dataset = train_dataset
        self.validation_dataset = validation_dataset
        self.vocab_size = vocab_size
        self.num_classes = num_classes
        self.n_trials = n_trials
        self.basic_model = basic_model
        
        # Track all trial results
        self.trial_results = []

    def train_single_epoch(self, model, dataloader, criterion, optimizer):
        model.train()
        total_loss = 0.0
        correct = 0
        total = 0
        
        for batch in dataloader:
            images = batch['image'].to(device)
            questions = batch['question'].to(device)
            question_lengths = batch['question_lengths'].to(device)
            answers = batch['answer'].to(device)
            
            # Forward
            logits = model(images, questions, question_lengths)
            loss = criterion(logits, answers)
            
            # Backward
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()
            
            # Metrics
            total_loss += loss.item()
            predictions = torch.argmax(logits, dim=1)
            correct += (predictions == answers).sum().item()
            total += answers.size(0)
        
        return total_loss / len(dataloader), 100 * correct / total

    def validate(self, model, dataloader, criterion):
        model.eval()
        total_loss = 0.0
        correct = 0
        total = 0
        
        with torch.no_grad():
            for batch in dataloader:
                images = batch['image'].to(device)
                questions = batch['question'].to(device)
                question_lengths = batch['question_lengths'].to(device)
                answers = batch['answer'].to(device)
                
                logits = model(images, questions, question_lengths)
                loss = criterion(logits, answers)
                
                total_loss += loss.item()
                predictions = torch.argmax(logits, dim=1)
                correct += (predictions == answers).sum().item()
                total += answers.size(0)
        
        return total_loss / len(dataloader), 100 * correct / total

    def config_BLSTM(self, trial):
        return {
            # Embedding parameters
            'embed_dim': trial.suggest_categorical('embed_dim', [200, 300, 512]),

            # LSTM parameters
            'lstm_hidden': trial.suggest_categorical('lstm_hidden', [256, 512, 768, 1024]),
            'lstm_num_layers': trial.suggest_int('lstm_num_layers', 1, 3),
            'lstm_dropout': trial.suggest_float('lstm_dropout', 0.1, 0.6),
            'pooling_strategy': trial.suggest_categorical('pooling_strategy', ['mean', 'max', 'last']),

            # Attention parameters
            'attention_heads': trial.suggest_categorical('attention_heads', [4, 8, 16]),

            # Fusion parameters
            'fusion_dim': trial.suggest_categorical('fusion_dim', [512, 1024, 2048]),
            'fusion_dropout': trial.suggest_float('fusion_dropout', 0.2, 0.6),

            # Training parameters
            'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64]),
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3),
            'weight_decay': trial.suggest_loguniform('weight_decay', 1e-6, 1e-3),
            'scheduler_step_size': trial.suggest_int('scheduler_step_size', 5, 15),
            'scheduler_gamma': trial.suggest_float('scheduler_gamma', 0.3, 0.7),
        }

    def config_basic(self, trial):
        return {
            # Embedding parameters
            'embed_dim': trial.suggest_categorical('embed_dim', [256, 300, 512]),

            # Fusion parameters
            'fusion_dim': trial.suggest_categorical('fusion_dim', [512, 1024, 2048]),
            'fusion_dropout': trial.suggest_float('fusion_dropout', 0.2, 0.6),

            # LSTM and CNN parameters
            'hidden_dim': trial.suggest_categorical('lstm_hidden', [256, 512, 768, 1024]),

            # Training parameters
            'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64]),
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3),
            'weight_decay': trial.suggest_loguniform('weight_decay', 1e-6, 1e-3),
            'scheduler_step_size': trial.suggest_int('scheduler_step_size', 5, 15),
            'scheduler_gamma': trial.suggest_float('scheduler_gamma', 0.3, 0.7),
        }

    def objective(self, trial):
        print(f"Trial {trial.number + 1}/{self.n_trials}")

        if not self.basic_model:
            config = self.config_BLSTM(trial)
            model = VQA_ResNet_BiLSTM_Attention(
                vocab_size=self.vocab_size,
                num_classes=self.num_classes,
                embed_dim=config['embed_dim'],
                lstm_hidden=config['lstm_hidden'],
                lstm_num_layers=config['lstm_num_layers'],
                attention_heads=config['attention_heads'],
                fusion_dim=config['fusion_dim'],
                lstm_dropout=config['lstm_dropout'],
                fusion_dropout=config['fusion_dropout'],
                pooling_strategy=config['pooling_strategy']
            ).to(device)
            
            for param in model.image_encoder.parameters():
                param.requires_grad = False
        else:
            config = self.config_basic(trial)
            model = MedicalVQABaseline(
                vocab_size=self.vocab_size,
                num_classes=self.num_classes,
                embed_dim=config['embed_dim'],
                fusion_dim=config['fusion_dim'],
                fusion_dropout=config['fusion_dropout'],
                hidden_dim=config['hidden_dim'],
            ).to(device)
            
            for param in model.resnet_features.parameters():
                param.requires_grad = False
                
        print(f"Config: {json.dumps(config, indent=2)}")

        train_loader = DataLoader(
            self.train_dataset,
            batch_size=config['batch_size'],
            shuffle=True,
            collate_fn=slake_collate_fn,
            # pin_memory=True
        )
        
        val_loader = DataLoader(
            self.validation_dataset,
            batch_size=config['batch_size'],
            shuffle=False,
            collate_fn=slake_collate_fn,
            # pin_memory=True
        )

        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=config['learning_rate'],
            weight_decay=config['weight_decay']
        )
        
        scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer,
            step_size=config['scheduler_step_size'],
            gamma=config['scheduler_gamma']
        )

        best_val_acc = 0.0
        threshold = 5
        threshold_count = 0
        max_epochs = 30

        for epoch in range(max_epochs):
            train_loss, train_acc = self.train_single_epoch(
                model, train_loader, criterion, optimizer
            )

            val_loss, val_acc = self.validate(
                model, val_loader, criterion
            )

            scheduler.step()
            print(f"Epoch {epoch+1}: Train Acc={train_acc:.2f}%, Val Acc={val_acc:.2f}%")

            if val_acc > best_val_acc:
                best_val_acc = val_acc
                threshold_count = 0
            else:
                threshold_count += 1
            
            if threshold_count >= threshold:
                print(f"Early stopping at epoch {epoch+1}")
                break

            trial.report(val_acc, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()

        trial_result = {
            'trial_number': trial.number,
            'config': config,
            'best_val_acc': best_val_acc,
            'final_epoch': epoch + 1
        }
        self.trial_results.append(trial_result)
        
        return best_val_acc

    def run(self):
        if self.basic_model:
            print("STARTING HYPERPARAMETER TUNING FOR BASIC MODEL\n")
        else:
            print("STARTING HYPERPARAMETER TUNING FOR BLSTM MODEL\n")
        
        study = optuna.create_study(
            direction='maximize',
            pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
            sampler=optuna.samplers.TPESampler(seed=GLOBAL_SEED)
        )
        study.optimize(self.objective, n_trials=self.n_trials)

        # Print best results
        print("HYPERPARAMETER TUNING COMPLETE")
        print(f"Best Trial: {study.best_trial.number}")
        print(f"Best Validation Accuracy: {study.best_value:.2f}%\n")
        print(f"Best Hyperparameters:")
        for key, value in study.best_params.items():
            print(f"  {key}: {value}")
        
        return study

In [13]:
# Test run
tuner = HyperparameterTuner(
    vocab_size=len(question_vocab),
    num_classes=len(answer_vocab),
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    n_trials=10,
    basic_model=True
)

# Run tuning
study = tuner.run()

[I 2025-12-16 19:16:28,969] A new study created in memory with name: no-name-dc0ce072-4c60-4900-bdd5-85adba9991df


STARTING HYPERPARAMETER TUNING FOR BASIC MODEL

Trial 1/10


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3),
  'weight_decay': trial.suggest_loguniform('weight_decay', 1e-6, 1e-3),
Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /root/.cache/torch/hub/checkpoints/resnet34-b627a593.pth
100%|██████████| 83.3M/83.3M [00:00<00:00, 191MB/s] 


Config: {
  "embed_dim": 300,
  "fusion_dim": 512,
  "fusion_dropout": 0.2232334448672798,
  "hidden_dim": 256,
  "batch_size": 16,
  "learning_rate": 2.3102018878452926e-05,
  "weight_decay": 3.549878832196506e-06,
  "scheduler_step_size": 8,
  "scheduler_gamma": 0.5099025726528951
}
Epoch 1: Train Acc=14.52%, Val Acc=18.52%
Epoch 2: Train Acc=17.26%, Val Acc=20.80%
Epoch 3: Train Acc=21.39%, Val Acc=26.88%
Epoch 4: Train Acc=28.26%, Val Acc=31.72%
Epoch 5: Train Acc=31.43%, Val Acc=34.57%
Epoch 6: Train Acc=34.82%, Val Acc=34.28%
Epoch 7: Train Acc=37.98%, Val Acc=39.13%
Epoch 8: Train Acc=41.84%, Val Acc=41.60%
Epoch 9: Train Acc=43.50%, Val Acc=43.30%
Epoch 10: Train Acc=45.31%, Val Acc=44.35%
Epoch 11: Train Acc=47.35%, Val Acc=45.77%
Epoch 12: Train Acc=47.81%, Val Acc=47.39%
Epoch 13: Train Acc=49.44%, Val Acc=48.81%
Epoch 14: Train Acc=50.29%, Val Acc=48.81%
Epoch 15: Train Acc=51.41%, Val Acc=50.05%
Epoch 16: Train Acc=51.72%, Val Acc=51.09%
Epoch 17: Train Acc=53.71%, Val Acc

[I 2025-12-16 19:22:45,731] Trial 0 finished with value: 55.3656220322887 and parameters: {'embed_dim': 300, 'fusion_dim': 512, 'fusion_dropout': 0.2232334448672798, 'lstm_hidden': 256, 'batch_size': 16, 'learning_rate': 2.3102018878452926e-05, 'weight_decay': 3.549878832196506e-06, 'scheduler_step_size': 8, 'scheduler_gamma': 0.5099025726528951}. Best is trial 0 with value: 55.3656220322887.


Epoch 27: Train Acc=56.68%, Val Acc=55.18%
Early stopping at epoch 27
Trial 2/10
Config: {
  "embed_dim": 512,
  "fusion_dim": 2048,
  "fusion_dropout": 0.38242799368681435,
  "hidden_dim": 256,
  "batch_size": 32,
  "learning_rate": 1.3492834268013232e-05,
  "weight_decay": 0.0007025166339242157,
  "scheduler_step_size": 15,
  "scheduler_gamma": 0.6233589392465844
}
Epoch 1: Train Acc=14.17%, Val Acc=17.57%
Epoch 2: Train Acc=18.11%, Val Acc=17.57%
Epoch 3: Train Acc=17.52%, Val Acc=18.14%
Epoch 4: Train Acc=19.29%, Val Acc=21.84%
Epoch 5: Train Acc=23.26%, Val Acc=25.45%
Epoch 6: Train Acc=27.10%, Val Acc=26.59%
Epoch 7: Train Acc=29.01%, Val Acc=29.25%
Epoch 8: Train Acc=29.17%, Val Acc=29.06%
Epoch 9: Train Acc=30.39%, Val Acc=28.58%
Epoch 10: Train Acc=30.68%, Val Acc=32.29%
Epoch 11: Train Acc=32.65%, Val Acc=30.96%
Epoch 12: Train Acc=32.26%, Val Acc=32.95%
Epoch 13: Train Acc=35.33%, Val Acc=33.71%
Epoch 14: Train Acc=36.47%, Val Acc=35.52%
Epoch 15: Train Acc=36.84%, Val Acc=3

[I 2025-12-16 19:29:15,920] Trial 1 finished with value: 48.812915479582145 and parameters: {'embed_dim': 512, 'fusion_dim': 2048, 'fusion_dropout': 0.38242799368681435, 'lstm_hidden': 256, 'batch_size': 32, 'learning_rate': 1.3492834268013232e-05, 'weight_decay': 0.0007025166339242157, 'scheduler_step_size': 15, 'scheduler_gamma': 0.6233589392465844}. Best is trial 0 with value: 55.3656220322887.


Epoch 30: Train Acc=51.07%, Val Acc=48.05%
Trial 3/10
Config: {
  "embed_dim": 512,
  "fusion_dim": 2048,
  "fusion_dropout": 0.21375540844608737,
  "hidden_dim": 256,
  "batch_size": 32,
  "learning_rate": 0.0008692991511139548,
  "weight_decay": 0.00021154290797261214,
  "scheduler_step_size": 15,
  "scheduler_gamma": 0.6579309401710595
}
Epoch 1: Train Acc=31.35%, Val Acc=44.92%
Epoch 2: Train Acc=55.52%, Val Acc=59.92%
Epoch 3: Train Acc=64.65%, Val Acc=66.48%
Epoch 4: Train Acc=69.81%, Val Acc=68.57%
Epoch 5: Train Acc=74.85%, Val Acc=67.90%
Epoch 6: Train Acc=77.84%, Val Acc=70.75%
Epoch 7: Train Acc=81.38%, Val Acc=72.08%
Epoch 8: Train Acc=84.33%, Val Acc=74.74%
Epoch 9: Train Acc=86.81%, Val Acc=73.50%
Epoch 10: Train Acc=89.06%, Val Acc=75.50%
Epoch 11: Train Acc=90.65%, Val Acc=76.83%
Epoch 12: Train Acc=91.85%, Val Acc=77.59%
Epoch 13: Train Acc=94.04%, Val Acc=74.83%
Epoch 14: Train Acc=93.31%, Val Acc=77.30%
Epoch 15: Train Acc=94.84%, Val Acc=78.16%
Epoch 16: Train Acc=9

[I 2025-12-16 19:34:04,658] Trial 2 finished with value: 79.86704653371321 and parameters: {'embed_dim': 512, 'fusion_dim': 2048, 'fusion_dropout': 0.21375540844608737, 'lstm_hidden': 256, 'batch_size': 32, 'learning_rate': 0.0008692991511139548, 'weight_decay': 0.00021154290797261214, 'scheduler_step_size': 15, 'scheduler_gamma': 0.6579309401710595}. Best is trial 2 with value: 79.86704653371321.


Epoch 22: Train Acc=98.82%, Val Acc=79.11%
Early stopping at epoch 22
Trial 4/10
Config: {
  "embed_dim": 300,
  "fusion_dim": 2048,
  "fusion_dropout": 0.3554709158757928,
  "hidden_dim": 512,
  "batch_size": 64,
  "learning_rate": 1.4096175149815848e-05,
  "weight_decay": 0.0009133995846860973,
  "scheduler_step_size": 13,
  "scheduler_gamma": 0.37948627261366896
}
Epoch 1: Train Acc=12.20%, Val Acc=18.23%
Epoch 2: Train Acc=17.83%, Val Acc=19.37%
Epoch 3: Train Acc=18.60%, Val Acc=18.99%
Epoch 4: Train Acc=19.50%, Val Acc=21.46%
Epoch 5: Train Acc=23.26%, Val Acc=26.40%
Epoch 6: Train Acc=27.77%, Val Acc=26.97%
Epoch 7: Train Acc=29.19%, Val Acc=29.15%
Epoch 8: Train Acc=29.58%, Val Acc=29.44%
Epoch 9: Train Acc=30.74%, Val Acc=30.67%
Epoch 10: Train Acc=30.80%, Val Acc=31.05%
Epoch 11: Train Acc=31.88%, Val Acc=32.67%
Epoch 12: Train Acc=33.83%, Val Acc=31.62%
Epoch 13: Train Acc=34.17%, Val Acc=31.81%
Epoch 14: Train Acc=35.62%, Val Acc=33.24%
Epoch 15: Train Acc=35.39%, Val Acc=3

[I 2025-12-16 19:38:56,092] Trial 3 finished with value: 34.5679012345679 and parameters: {'embed_dim': 300, 'fusion_dim': 2048, 'fusion_dropout': 0.3554709158757928, 'lstm_hidden': 512, 'batch_size': 64, 'learning_rate': 1.4096175149815848e-05, 'weight_decay': 0.0009133995846860973, 'scheduler_step_size': 13, 'scheduler_gamma': 0.37948627261366896}. Best is trial 2 with value: 79.86704653371321.


Epoch 23: Train Acc=37.69%, Val Acc=34.28%
Early stopping at epoch 23
Trial 5/10
Config: {
  "embed_dim": 300,
  "fusion_dim": 1024,
  "fusion_dropout": 0.34338629141770904,
  "hidden_dim": 512,
  "batch_size": 64,
  "learning_rate": 0.0002878805718308924,
  "weight_decay": 8.178476574339548e-05,
  "scheduler_step_size": 14,
  "scheduler_gamma": 0.4888859700647797
}
Epoch 1: Train Acc=19.76%, Val Acc=28.11%
Epoch 2: Train Acc=33.54%, Val Acc=36.94%
Epoch 3: Train Acc=41.92%, Val Acc=48.81%
Epoch 4: Train Acc=53.97%, Val Acc=55.18%
Epoch 5: Train Acc=62.17%, Val Acc=58.88%
Epoch 6: Train Acc=66.94%, Val Acc=63.44%
Epoch 7: Train Acc=71.48%, Val Acc=61.73%
Epoch 8: Train Acc=73.73%, Val Acc=66.10%
Epoch 9: Train Acc=76.54%, Val Acc=66.48%
Epoch 10: Train Acc=78.06%, Val Acc=67.33%
Epoch 11: Train Acc=80.12%, Val Acc=68.85%
Epoch 12: Train Acc=81.72%, Val Acc=67.14%
Epoch 13: Train Acc=82.17%, Val Acc=68.95%
Epoch 14: Train Acc=83.68%, Val Acc=68.76%
Epoch 15: Train Acc=86.07%, Val Acc=68

[I 2025-12-16 19:42:44,540] Trial 4 finished with value: 68.94586894586895 and parameters: {'embed_dim': 300, 'fusion_dim': 1024, 'fusion_dropout': 0.34338629141770904, 'lstm_hidden': 512, 'batch_size': 64, 'learning_rate': 0.0002878805718308924, 'weight_decay': 8.178476574339548e-05, 'scheduler_step_size': 14, 'scheduler_gamma': 0.4888859700647797}. Best is trial 2 with value: 79.86704653371321.


Epoch 18: Train Acc=87.07%, Val Acc=68.38%
Early stopping at epoch 18
Trial 6/10
Config: {
  "embed_dim": 512,
  "fusion_dim": 1024,
  "fusion_dropout": 0.4090931317527976,
  "hidden_dim": 256,
  "batch_size": 16,
  "learning_rate": 0.0006533305220227731,
  "weight_decay": 5.595986878006084e-06,
  "scheduler_step_size": 9,
  "scheduler_gamma": 0.6022204554172195
}
Epoch 1: Train Acc=32.16%, Val Acc=47.67%
Epoch 2: Train Acc=54.71%, Val Acc=61.54%
Epoch 3: Train Acc=64.28%, Val Acc=66.29%
Epoch 4: Train Acc=69.77%, Val Acc=71.98%
Epoch 5: Train Acc=72.98%, Val Acc=72.65%
Epoch 6: Train Acc=75.79%, Val Acc=71.51%
Epoch 7: Train Acc=78.59%, Val Acc=75.88%
Epoch 8: Train Acc=80.44%, Val Acc=75.88%
Epoch 9: Train Acc=82.29%, Val Acc=75.02%
Epoch 10: Train Acc=85.79%, Val Acc=76.73%
Epoch 11: Train Acc=87.88%, Val Acc=76.64%
Epoch 12: Train Acc=88.47%, Val Acc=78.63%
Epoch 13: Train Acc=90.24%, Val Acc=78.44%
Epoch 14: Train Acc=91.20%, Val Acc=78.82%
Epoch 15: Train Acc=91.81%, Val Acc=79.3

[I 2025-12-16 19:48:34,036] Trial 5 finished with value: 81.19658119658119 and parameters: {'embed_dim': 512, 'fusion_dim': 1024, 'fusion_dropout': 0.4090931317527976, 'lstm_hidden': 256, 'batch_size': 16, 'learning_rate': 0.0006533305220227731, 'weight_decay': 5.595986878006084e-06, 'scheduler_step_size': 9, 'scheduler_gamma': 0.6022204554172195}. Best is trial 5 with value: 81.19658119658119.


Epoch 25: Train Acc=97.34%, Val Acc=81.10%
Early stopping at epoch 25
Trial 7/10
Config: {
  "embed_dim": 512,
  "fusion_dim": 1024,
  "fusion_dropout": 0.45336150260416935,
  "hidden_dim": 1024,
  "batch_size": 64,
  "learning_rate": 4.325207525386527e-05,
  "weight_decay": 2.138729075414893e-06,
  "scheduler_step_size": 7,
  "scheduler_gamma": 0.4708431154505025
}
Epoch 1: Train Acc=16.89%, Val Acc=19.18%
Epoch 2: Train Acc=24.80%, Val Acc=29.72%
Epoch 3: Train Acc=31.41%, Val Acc=32.10%
Epoch 4: Train Acc=35.17%, Val Acc=34.95%
Epoch 5: Train Acc=38.56%, Val Acc=38.27%


[I 2025-12-16 19:50:00,424] Trial 6 pruned. 


Epoch 6: Train Acc=41.55%, Val Acc=39.60%
Trial 8/10
Config: {
  "embed_dim": 300,
  "fusion_dim": 512,
  "fusion_dropout": 0.24794614693347314,
  "hidden_dim": 512,
  "batch_size": 64,
  "learning_rate": 0.0008411909465645721,
  "weight_decay": 5.6930747676446135e-06,
  "scheduler_step_size": 10,
  "scheduler_gamma": 0.4203513239267078
}
Epoch 1: Train Acc=22.67%, Val Acc=33.14%
Epoch 2: Train Acc=44.16%, Val Acc=47.96%
Epoch 3: Train Acc=55.84%, Val Acc=56.51%
Epoch 4: Train Acc=63.53%, Val Acc=61.73%
Epoch 5: Train Acc=67.47%, Val Acc=64.10%
Epoch 6: Train Acc=71.23%, Val Acc=65.72%
Epoch 7: Train Acc=73.47%, Val Acc=65.24%
Epoch 8: Train Acc=76.48%, Val Acc=66.29%
Epoch 9: Train Acc=77.84%, Val Acc=68.66%
Epoch 10: Train Acc=80.24%, Val Acc=67.24%
Epoch 11: Train Acc=84.29%, Val Acc=68.66%
Epoch 12: Train Acc=85.34%, Val Acc=68.47%
Epoch 13: Train Acc=85.93%, Val Acc=68.95%
Epoch 14: Train Acc=86.72%, Val Acc=69.80%
Epoch 15: Train Acc=88.49%, Val Acc=71.98%
Epoch 16: Train Acc=88.

[I 2025-12-16 19:56:51,790] Trial 7 finished with value: 76.25830959164293 and parameters: {'embed_dim': 300, 'fusion_dim': 512, 'fusion_dropout': 0.24794614693347314, 'lstm_hidden': 512, 'batch_size': 64, 'learning_rate': 0.0008411909465645721, 'weight_decay': 5.6930747676446135e-06, 'scheduler_step_size': 10, 'scheduler_gamma': 0.4203513239267078}. Best is trial 5 with value: 81.19658119658119.


Epoch 30: Train Acc=95.06%, Val Acc=76.26%
Trial 9/10
Config: {
  "embed_dim": 512,
  "fusion_dim": 512,
  "fusion_dropout": 0.5633063543866614,
  "hidden_dim": 1024,
  "batch_size": 64,
  "learning_rate": 2.9872741995638415e-05,
  "weight_decay": 0.00015298506868937454,
  "scheduler_step_size": 9,
  "scheduler_gamma": 0.5529223322374317
}
Epoch 1: Train Acc=13.50%, Val Acc=16.52%
Epoch 2: Train Acc=17.32%, Val Acc=19.75%
Epoch 3: Train Acc=22.42%, Val Acc=26.31%
Epoch 4: Train Acc=26.81%, Val Acc=27.54%
Epoch 5: Train Acc=27.40%, Val Acc=29.63%


[I 2025-12-16 19:58:16,270] Trial 8 pruned. 


Epoch 6: Train Acc=30.05%, Val Acc=32.76%
Trial 10/10
Config: {
  "embed_dim": 256,
  "fusion_dim": 512,
  "fusion_dropout": 0.21631005662190558,
  "hidden_dim": 512,
  "batch_size": 32,
  "learning_rate": 0.00024092145436664767,
  "weight_decay": 1.4461256820152148e-05,
  "scheduler_step_size": 15,
  "scheduler_gamma": 0.35500837765839727
}
Epoch 1: Train Acc=25.01%, Val Acc=32.95%
Epoch 2: Train Acc=38.75%, Val Acc=41.79%
Epoch 3: Train Acc=49.79%, Val Acc=49.67%
Epoch 4: Train Acc=59.28%, Val Acc=59.92%
Epoch 5: Train Acc=62.63%, Val Acc=61.25%
Epoch 6: Train Acc=68.04%, Val Acc=65.53%
Epoch 7: Train Acc=70.34%, Val Acc=65.34%
Epoch 8: Train Acc=72.78%, Val Acc=66.86%
Epoch 9: Train Acc=75.02%, Val Acc=65.34%


[I 2025-12-16 20:00:24,845] Trial 9 pruned. 


Epoch 10: Train Acc=76.70%, Val Acc=65.81%
HYPERPARAMETER TUNING COMPLETE
Best Trial: 5
Best Validation Accuracy: 81.20%

Best Hyperparameters:
  embed_dim: 512
  fusion_dim: 1024
  fusion_dropout: 0.4090931317527976
  lstm_hidden: 256
  batch_size: 16
  learning_rate: 0.0006533305220227731
  weight_decay: 5.595986878006084e-06
  scheduler_step_size: 9
  scheduler_gamma: 0.6022204554172195


In [14]:
# Test run for BLSTM
tuner = HyperparameterTuner(
    vocab_size=len(question_vocab),
    num_classes=len(answer_vocab),
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    n_trials=10,
    basic_model=False
)

# Run tuning
study = tuner.run()

[I 2025-12-16 20:03:01,106] A new study created in memory with name: no-name-c04660fd-4d82-483f-8d2d-903cc7a7fefa


STARTING HYPERPARAMETER TUNING FOR BLSTM MODEL

Trial 1/10


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3),
  'weight_decay': trial.suggest_loguniform('weight_decay', 1e-6, 1e-3),


Config: {
  "embed_dim": 300,
  "lstm_hidden": 256,
  "lstm_num_layers": 3,
  "lstm_dropout": 0.40055750587160444,
  "pooling_strategy": "last",
  "attention_heads": 4,
  "fusion_dim": 2048,
  "fusion_dropout": 0.3727780074568463,
  "batch_size": 32,
  "learning_rate": 3.8396292998041685e-05,
  "weight_decay": 1.2562773503807034e-05,
  "scheduler_step_size": 10,
  "scheduler_gamma": 0.6140703845572054
}
Epoch 1: Train Acc=12.60%, Val Acc=19.85%
Epoch 2: Train Acc=26.57%, Val Acc=30.86%
Epoch 3: Train Acc=36.61%, Val Acc=38.37%
Epoch 4: Train Acc=46.59%, Val Acc=47.48%
Epoch 5: Train Acc=52.39%, Val Acc=50.43%
Epoch 6: Train Acc=55.05%, Val Acc=51.28%
Epoch 7: Train Acc=58.22%, Val Acc=55.27%
Epoch 8: Train Acc=62.25%, Val Acc=57.83%
Epoch 9: Train Acc=63.12%, Val Acc=58.02%
Epoch 10: Train Acc=64.81%, Val Acc=61.06%
Epoch 11: Train Acc=67.47%, Val Acc=62.11%
Epoch 12: Train Acc=69.18%, Val Acc=63.44%
Epoch 13: Train Acc=69.83%, Val Acc=64.77%
Epoch 14: Train Acc=70.89%, Val Acc=64.01%


[I 2025-12-16 20:10:18,767] Trial 0 finished with value: 70.27540360873694 and parameters: {'embed_dim': 300, 'lstm_hidden': 256, 'lstm_num_layers': 3, 'lstm_dropout': 0.40055750587160444, 'pooling_strategy': 'last', 'attention_heads': 4, 'fusion_dim': 2048, 'fusion_dropout': 0.3727780074568463, 'batch_size': 32, 'learning_rate': 3.8396292998041685e-05, 'weight_decay': 1.2562773503807034e-05, 'scheduler_step_size': 10, 'scheduler_gamma': 0.6140703845572054}. Best is trial 0 with value: 70.27540360873694.


Epoch 30: Train Acc=81.64%, Val Acc=70.28%
Trial 2/10
Config: {
  "embed_dim": 512,
  "lstm_hidden": 512,
  "lstm_num_layers": 3,
  "lstm_dropout": 0.5828160165372797,
  "pooling_strategy": "mean",
  "attention_heads": 4,
  "fusion_dim": 2048,
  "fusion_dropout": 0.3035119926400068,
  "batch_size": 16,
  "learning_rate": 0.00012399967836846095,
  "weight_decay": 3.5856126103453987e-06,
  "scheduler_step_size": 15,
  "scheduler_gamma": 0.6100531293444458
}
Epoch 1: Train Acc=39.58%, Val Acc=52.80%
Epoch 2: Train Acc=57.92%, Val Acc=59.83%
Epoch 3: Train Acc=63.12%, Val Acc=65.72%
Epoch 4: Train Acc=67.49%, Val Acc=66.38%
Epoch 5: Train Acc=70.30%, Val Acc=70.09%
Epoch 6: Train Acc=74.34%, Val Acc=71.04%
Epoch 7: Train Acc=76.50%, Val Acc=72.74%
Epoch 8: Train Acc=77.56%, Val Acc=73.50%
Epoch 9: Train Acc=80.00%, Val Acc=76.07%
Epoch 10: Train Acc=81.15%, Val Acc=75.50%
Epoch 11: Train Acc=82.62%, Val Acc=77.59%
Epoch 12: Train Acc=83.80%, Val Acc=77.40%
Epoch 13: Train Acc=85.04%, Val A

[I 2025-12-16 20:16:32,522] Trial 1 finished with value: 79.58214624881292 and parameters: {'embed_dim': 512, 'lstm_hidden': 512, 'lstm_num_layers': 3, 'lstm_dropout': 0.5828160165372797, 'pooling_strategy': 'mean', 'attention_heads': 4, 'fusion_dim': 2048, 'fusion_dropout': 0.3035119926400068, 'batch_size': 16, 'learning_rate': 0.00012399967836846095, 'weight_decay': 3.5856126103453987e-06, 'scheduler_step_size': 15, 'scheduler_gamma': 0.6100531293444458}. Best is trial 1 with value: 79.58214624881292.


Epoch 19: Train Acc=92.78%, Val Acc=78.63%
Early stopping at epoch 19
Trial 3/10
Config: {
  "embed_dim": 200,
  "lstm_hidden": 256,
  "lstm_num_layers": 1,
  "lstm_dropout": 0.29433864484474104,
  "pooling_strategy": "max",
  "attention_heads": 8,
  "fusion_dim": 2048,
  "fusion_dropout": 0.508897907718663,
  "batch_size": 64,
  "learning_rate": 0.0002592475660475158,
  "weight_decay": 0.00015382308040278996,
  "scheduler_step_size": 13,
  "scheduler_gamma": 0.32961786069363613
}
Epoch 1: Train Acc=38.04%, Val Acc=53.28%
Epoch 2: Train Acc=57.92%, Val Acc=62.30%
Epoch 3: Train Acc=66.17%, Val Acc=67.33%
Epoch 4: Train Acc=69.91%, Val Acc=69.04%
Epoch 5: Train Acc=72.35%, Val Acc=73.31%
Epoch 6: Train Acc=76.05%, Val Acc=73.60%
Epoch 7: Train Acc=77.98%, Val Acc=74.93%
Epoch 8: Train Acc=79.79%, Val Acc=76.16%
Epoch 9: Train Acc=81.48%, Val Acc=76.07%
Epoch 10: Train Acc=83.39%, Val Acc=76.45%
Epoch 11: Train Acc=84.12%, Val Acc=75.88%
Epoch 12: Train Acc=85.93%, Val Acc=76.83%
Epoch 1

[I 2025-12-16 20:21:14,849] Trial 2 finished with value: 77.68281101614436 and parameters: {'embed_dim': 200, 'lstm_hidden': 256, 'lstm_num_layers': 1, 'lstm_dropout': 0.29433864484474104, 'pooling_strategy': 'max', 'attention_heads': 8, 'fusion_dim': 2048, 'fusion_dropout': 0.508897907718663, 'batch_size': 64, 'learning_rate': 0.0002592475660475158, 'weight_decay': 0.00015382308040278996, 'scheduler_step_size': 13, 'scheduler_gamma': 0.32961786069363613}. Best is trial 1 with value: 79.58214624881292.


Epoch 20: Train Acc=93.09%, Val Acc=77.68%
Early stopping at epoch 20
Trial 4/10
Config: {
  "embed_dim": 512,
  "lstm_hidden": 256,
  "lstm_num_layers": 1,
  "lstm_dropout": 0.464803089169032,
  "pooling_strategy": "max",
  "attention_heads": 16,
  "fusion_dim": 1024,
  "fusion_dropout": 0.4090931317527976,
  "batch_size": 16,
  "learning_rate": 1.1557352816269867e-05,
  "weight_decay": 8.11392957263784e-05,
  "scheduler_step_size": 8,
  "scheduler_gamma": 0.5034282764658811
}
Epoch 1: Train Acc=3.39%, Val Acc=17.66%
Epoch 2: Train Acc=15.84%, Val Acc=27.35%
Epoch 3: Train Acc=24.31%, Val Acc=35.23%
Epoch 4: Train Acc=29.64%, Val Acc=37.61%
Epoch 5: Train Acc=33.34%, Val Acc=40.08%
Epoch 6: Train Acc=35.60%, Val Acc=41.12%
Epoch 7: Train Acc=37.51%, Val Acc=42.83%
Epoch 8: Train Acc=40.62%, Val Acc=45.39%
Epoch 9: Train Acc=41.37%, Val Acc=45.77%
Epoch 10: Train Acc=42.63%, Val Acc=45.96%
Epoch 11: Train Acc=43.40%, Val Acc=46.72%
Epoch 12: Train Acc=44.97%, Val Acc=48.34%
Epoch 13: T

[I 2025-12-16 20:29:00,452] Trial 3 finished with value: 52.421652421652425 and parameters: {'embed_dim': 512, 'lstm_hidden': 256, 'lstm_num_layers': 1, 'lstm_dropout': 0.464803089169032, 'pooling_strategy': 'max', 'attention_heads': 16, 'fusion_dim': 1024, 'fusion_dropout': 0.4090931317527976, 'batch_size': 16, 'learning_rate': 1.1557352816269867e-05, 'weight_decay': 8.11392957263784e-05, 'scheduler_step_size': 8, 'scheduler_gamma': 0.5034282764658811}. Best is trial 1 with value: 79.58214624881292.


Epoch 29: Train Acc=49.89%, Val Acc=51.76%
Early stopping at epoch 29
Trial 5/10
Config: {
  "embed_dim": 200,
  "lstm_hidden": 256,
  "lstm_num_layers": 1,
  "lstm_dropout": 0.5648488261712865,
  "pooling_strategy": "last",
  "attention_heads": 16,
  "fusion_dim": 2048,
  "fusion_dropout": 0.32720138998874554,
  "batch_size": 64,
  "learning_rate": 0.0004325432427964555,
  "weight_decay": 0.00038211294416912254,
  "scheduler_step_size": 5,
  "scheduler_gamma": 0.5042989210310262
}
Epoch 1: Train Acc=31.55%, Val Acc=43.21%
Epoch 2: Train Acc=55.76%, Val Acc=60.68%
Epoch 3: Train Acc=66.44%, Val Acc=62.49%
Epoch 4: Train Acc=72.70%, Val Acc=68.57%
Epoch 5: Train Acc=77.60%, Val Acc=70.85%
Epoch 6: Train Acc=83.98%, Val Acc=72.46%
Epoch 7: Train Acc=85.87%, Val Acc=71.70%
Epoch 8: Train Acc=87.86%, Val Acc=72.46%
Epoch 9: Train Acc=89.10%, Val Acc=73.22%
Epoch 10: Train Acc=91.03%, Val Acc=74.55%
Epoch 11: Train Acc=93.68%, Val Acc=73.98%
Epoch 12: Train Acc=94.23%, Val Acc=75.40%
Epoch 

[I 2025-12-16 20:34:57,102] Trial 4 finished with value: 76.54320987654322 and parameters: {'embed_dim': 200, 'lstm_hidden': 256, 'lstm_num_layers': 1, 'lstm_dropout': 0.5648488261712865, 'pooling_strategy': 'last', 'attention_heads': 16, 'fusion_dim': 2048, 'fusion_dropout': 0.32720138998874554, 'batch_size': 64, 'learning_rate': 0.0004325432427964555, 'weight_decay': 0.00038211294416912254, 'scheduler_step_size': 5, 'scheduler_gamma': 0.5042989210310262}. Best is trial 1 with value: 79.58214624881292.


Epoch 26: Train Acc=98.70%, Val Acc=75.97%
Early stopping at epoch 26
Trial 6/10
Config: {
  "embed_dim": 200,
  "lstm_hidden": 512,
  "lstm_num_layers": 3,
  "lstm_dropout": 0.28181480118964697,
  "pooling_strategy": "mean",
  "attention_heads": 4,
  "fusion_dim": 1024,
  "fusion_dropout": 0.22059150049999576,
  "batch_size": 32,
  "learning_rate": 1.9489008462344228e-05,
  "weight_decay": 2.9400741309033083e-05,
  "scheduler_step_size": 15,
  "scheduler_gamma": 0.39682210860460015
}
Epoch 1: Train Acc=19.27%, Val Acc=42.07%
Epoch 2: Train Acc=44.20%, Val Acc=48.34%
Epoch 3: Train Acc=50.72%, Val Acc=52.04%
Epoch 4: Train Acc=54.95%, Val Acc=55.18%
Epoch 5: Train Acc=58.53%, Val Acc=57.45%


[I 2025-12-16 20:36:30,499] Trial 5 pruned. 


Epoch 6: Train Acc=60.74%, Val Acc=59.54%
Trial 7/10
Config: {
  "embed_dim": 300,
  "lstm_hidden": 256,
  "lstm_num_layers": 2,
  "lstm_dropout": 0.14514488502720416,
  "pooling_strategy": "mean",
  "attention_heads": 16,
  "fusion_dim": 1024,
  "fusion_dropout": 0.45806911616377993,
  "batch_size": 32,
  "learning_rate": 0.0007472397689332936,
  "weight_decay": 2.585608890731339e-06,
  "scheduler_step_size": 8,
  "scheduler_gamma": 0.3453894084962356
}
Epoch 1: Train Acc=45.58%, Val Acc=58.50%
Epoch 2: Train Acc=61.17%, Val Acc=67.71%
Epoch 3: Train Acc=68.16%, Val Acc=68.28%
Epoch 4: Train Acc=71.25%, Val Acc=73.22%
Epoch 5: Train Acc=74.14%, Val Acc=75.21%
Epoch 6: Train Acc=76.62%, Val Acc=76.35%
Epoch 7: Train Acc=78.43%, Val Acc=76.92%
Epoch 8: Train Acc=80.57%, Val Acc=75.50%
Epoch 9: Train Acc=83.80%, Val Acc=78.73%
Epoch 10: Train Acc=85.12%, Val Acc=79.77%
Epoch 11: Train Acc=86.07%, Val Acc=78.92%
Epoch 12: Train Acc=87.42%, Val Acc=79.77%
Epoch 13: Train Acc=87.76%, Val Ac

[I 2025-12-16 20:39:53,767] Trial 6 finished with value: 79.77207977207978 and parameters: {'embed_dim': 300, 'lstm_hidden': 256, 'lstm_num_layers': 2, 'lstm_dropout': 0.14514488502720416, 'pooling_strategy': 'mean', 'attention_heads': 16, 'fusion_dim': 1024, 'fusion_dropout': 0.45806911616377993, 'batch_size': 32, 'learning_rate': 0.0007472397689332936, 'weight_decay': 2.585608890731339e-06, 'scheduler_step_size': 8, 'scheduler_gamma': 0.3453894084962356}. Best is trial 6 with value: 79.77207977207978.


Epoch 15: Train Acc=90.06%, Val Acc=79.77%
Early stopping at epoch 15
Trial 8/10
Config: {
  "embed_dim": 200,
  "lstm_hidden": 512,
  "lstm_num_layers": 1,
  "lstm_dropout": 0.1465513839029496,
  "pooling_strategy": "max",
  "attention_heads": 16,
  "fusion_dim": 512,
  "fusion_dropout": 0.4568126584617151,
  "batch_size": 64,
  "learning_rate": 0.00016325185294676912,
  "weight_decay": 1.0655924993232579e-06,
  "scheduler_step_size": 6,
  "scheduler_gamma": 0.5654007076432224
}
Epoch 1: Train Acc=30.41%, Val Acc=46.34%
Epoch 2: Train Acc=49.05%, Val Acc=53.47%
Epoch 3: Train Acc=55.01%, Val Acc=56.89%
Epoch 4: Train Acc=57.88%, Val Acc=58.40%
Epoch 5: Train Acc=61.21%, Val Acc=61.82%


[I 2025-12-16 20:41:15,804] Trial 7 pruned. 


Epoch 6: Train Acc=63.49%, Val Acc=63.91%
Trial 9/10
Config: {
  "embed_dim": 512,
  "lstm_hidden": 1024,
  "lstm_num_layers": 1,
  "lstm_dropout": 0.26269984907963384,
  "pooling_strategy": "last",
  "attention_heads": 4,
  "fusion_dim": 512,
  "fusion_dropout": 0.5892042219009782,
  "batch_size": 32,
  "learning_rate": 0.0003887072196612053,
  "weight_decay": 3.2204108362516767e-05,
  "scheduler_step_size": 11,
  "scheduler_gamma": 0.4970070775275455
}
Epoch 1: Train Acc=33.52%, Val Acc=49.76%
Epoch 2: Train Acc=52.51%, Val Acc=57.74%
Epoch 3: Train Acc=57.49%, Val Acc=59.92%
Epoch 4: Train Acc=61.52%, Val Acc=63.15%
Epoch 5: Train Acc=64.89%, Val Acc=68.09%


[I 2025-12-16 20:42:40,310] Trial 8 pruned. 


Epoch 6: Train Acc=68.53%, Val Acc=69.99%
Trial 10/10
Config: {
  "embed_dim": 300,
  "lstm_hidden": 1024,
  "lstm_num_layers": 3,
  "lstm_dropout": 0.5574321951102242,
  "pooling_strategy": "last",
  "attention_heads": 8,
  "fusion_dim": 512,
  "fusion_dropout": 0.5404546686067427,
  "batch_size": 64,
  "learning_rate": 0.000745262979291264,
  "weight_decay": 0.00012248682856804866,
  "scheduler_step_size": 11,
  "scheduler_gamma": 0.3388705975083074
}
Epoch 1: Train Acc=28.22%, Val Acc=42.64%
Epoch 2: Train Acc=46.11%, Val Acc=51.57%
Epoch 3: Train Acc=52.63%, Val Acc=52.80%
Epoch 4: Train Acc=57.76%, Val Acc=59.26%
Epoch 5: Train Acc=61.01%, Val Acc=63.91%


[I 2025-12-16 20:44:29,600] Trial 9 pruned. 


Epoch 6: Train Acc=62.98%, Val Acc=63.82%
HYPERPARAMETER TUNING COMPLETE
Best Trial: 6
Best Validation Accuracy: 79.77%

Best Hyperparameters:
  embed_dim: 300
  lstm_hidden: 256
  lstm_num_layers: 2
  lstm_dropout: 0.14514488502720416
  pooling_strategy: mean
  attention_heads: 16
  fusion_dim: 1024
  fusion_dropout: 0.45806911616377993
  batch_size: 32
  learning_rate: 0.0007472397689332936
  weight_decay: 2.585608890731339e-06
  scheduler_step_size: 8
  scheduler_gamma: 0.3453894084962356
