In [3]:
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer, BertModel,
    RobertaTokenizer, RobertaModel,
    DistilBertTokenizer, DistilBertModel,
    LongformerTokenizer, LongformerModel,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW # Changed import to torch.optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, hamming_loss
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import itertools
from collections import defaultdict


def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

import gc # Import the garbage collection module

# Memory management
def clear_memory():
    """Clear GPU memory"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# 1. LOAD TRAIN + VALIDATION DATA

train_data = pd.read_pickle('preprocessed_data.pkl')
val_df = pd.read_pickle('validation_preprocessed.pkl')  # Load external validation file

print(f"Loaded {len(train_data)} training instances and {len(val_df)} validation instances")

# Remove test creation (we don't need test_df)
train_data['label_str'] = train_data['label_encoding'].apply(lambda x: ''.join(map(str, x)))

# No test split — only shuffle training data
train_df = train_data.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Train: {len(train_df)}, Val: {len(val_df)}")



# 2. DATASET CLASS

class AERDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        question_text = row['question']
        options_text = f"Option A: {row['option_A']}. Option B: {row['option_B']}. Option C: {row['option_C']}. Option D: {row['option_D']}."
        context_text = row['context'][:1500] if len(row['context']) > 1500 else row['context']  # Reduced context
        full_text = f"Question: {question_text} {options_text} Context: {context_text}"

        encoding = self.tokenizer.encode_plus(
            full_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(row['label_encoding'], dtype=torch.float)
        }


# 3. MODEL ARCHITECTURES

# MODEL 1: Baseline - BERT/RoBERTa + Feedforward
class BaselineTransformer(nn.Module):
    """Baseline transformer with feedforward classifier"""

    def __init__(self, model_name='bert-base-uncased', num_labels=4, dropout=0.3, hidden_size=512):
        super(BaselineTransformer, self).__init__()

        if 'roberta' in model_name:
            self.encoder = RobertaModel.from_pretrained(model_name)
        elif 'distilbert' in model_name:
            self.encoder = DistilBertModel.from_pretrained(model_name)
        else:
            self.encoder = BertModel.from_pretrained(model_name)

        encoder_dim = self.encoder.config.hidden_size

        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(encoder_dim, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout * 0.5),
            nn.Linear(hidden_size, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)

        last_hidden = outputs.last_hidden_state
        attention_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
        sum_embeddings = torch.sum(last_hidden * attention_mask_expanded, 1)
        sum_mask = attention_mask_expanded.sum(1).clamp(min=1e-9)
        pooled = sum_embeddings / sum_mask

        logits = self.classifier(pooled)
        return torch.sigmoid(logits)




# MODEL 2: RoBERTa + BiLSTM + Attention
class BiLSTMAttentionModel(nn.Module):
    """RoBERTa with BiLSTM and attention mechanism"""

    def __init__(self, model_name='roberta-base', num_labels=4, dropout=0.3, lstm_hidden=256):
        super(BiLSTMAttentionModel, self).__init__()

        if 'roberta' in model_name:
            self.encoder = RobertaModel.from_pretrained(model_name)
        else:
            self.encoder = BertModel.from_pretrained(model_name)

        encoder_dim = self.encoder.config.hidden_size

        # Freeze some encoder layers to save memory
        for param in self.encoder.embeddings.parameters():
            param.requires_grad = False

        self.bilstm = nn.LSTM(
            encoder_dim, lstm_hidden,
            num_layers=1,  # Reduced from 2 to save memory
            batch_first=True,
            dropout=0.0,  # Remove dropout in LSTM
            bidirectional=True
        )

        self.attention = nn.Sequential(
            nn.Linear(lstm_hidden * 2, lstm_hidden),
            nn.Tanh(),
            nn.Linear(lstm_hidden, 1)
        )

        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(lstm_hidden * 2, 128),  # Reduced from 256
            nn.ReLU(),
            nn.Dropout(dropout * 0.5),
            nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state

        lstm_output, _ = self.bilstm(sequence_output)

        attention_weights = self.attention(lstm_output)
        attention_weights = torch.softmax(attention_weights, dim=1)

        attended = torch.sum(attention_weights * lstm_output, dim=1)

        logits = self.classifier(attended)
        return torch.sigmoid(logits)



# MODEL 3: Longformer for long context
class LongformerClassifier(nn.Module):
    """Longformer for handling long contexts"""

    def __init__(self, model_name='allenai/longformer-base-4096', num_labels=4, dropout=0.3, hidden_size=512):
        super(LongformerClassifier, self).__init__()

        self.encoder = LongformerModel.from_pretrained(model_name)

        # Freeze embeddings to save memory
        for param in self.encoder.embeddings.parameters():
            param.requires_grad = False

        encoder_dim = self.encoder.config.hidden_size

        # Remove BatchNorm to avoid batch_size=1 issues
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(encoder_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout * 0.5),
            nn.Linear(hidden_size, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1

        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask
        )

        pooled = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(pooled)
        return torch.sigmoid(logits)



# MODEL 4: DistilBERT - Lightweight and fast
class DistilBERTClassifier(nn.Module):
    """Lightweight DistilBERT model"""

    def __init__(self, model_name='distilbert-base-uncased', num_labels=4, dropout=0.3, hidden_size=384):
        super(DistilBERTClassifier, self).__init__()

        self.encoder = DistilBertModel.from_pretrained(model_name)
        encoder_dim = self.encoder.config.hidden_size

        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(encoder_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout * 0.5),
            nn.Linear(hidden_size, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(pooled)
        return torch.sigmoid(logits)



# MODEL 5: Advanced - Hierarchical Attention
class HierarchicalAttentionModel(nn.Module):
    """Advanced model with hierarchical attention - FIXED"""

    def __init__(self, model_name='roberta-base', num_labels=4, dropout=0.3, hidden_size=384):
        super(HierarchicalAttentionModel, self).__init__()

        if 'roberta' in model_name:
            self.encoder = RobertaModel.from_pretrained(model_name)
        else:
            self.encoder = BertModel.from_pretrained(model_name)

        # Freeze embeddings to save memory
        for param in self.encoder.embeddings.parameters():
            param.requires_grad = False

        encoder_dim = self.encoder.config.hidden_size  # 768 for roberta-base

        self.word_attention = nn.Sequential(
            nn.Linear(encoder_dim, 256),
            nn.Tanh(),
            nn.Linear(256, 1)
        )

        self.sentence_lstm = nn.LSTM(encoder_dim, 256, batch_first=True, bidirectional=True)

        # LSTM output is 512 (256*2)
        self.sentence_attention = nn.Sequential(
            nn.Linear(512, 256),
            nn.Tanh(),
            nn.Linear(256, 1)
        )

        # FIX: Combine word (encoder_dim=768) and sentence (512) representations
        # Total input: 768 + 512 = 1280
        self.combine_layer = nn.Linear(encoder_dim + 512, encoder_dim)

        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(encoder_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout * 0.5),
            nn.Linear(hidden_size, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # (batch, seq_len, 768)

        # Word-level attention
        word_attn_weights = self.word_attention(sequence_output)  # (batch, seq_len, 1)
        word_attn_weights = torch.softmax(word_attn_weights, dim=1)
        word_attended = torch.sum(word_attn_weights * sequence_output, dim=1)  # (batch, 768)

        # Sentence-level processing
        sentence_output, _ = self.sentence_lstm(sequence_output)  # (batch, seq_len, 512)

        # Sentence-level attention
        sent_attn_weights = self.sentence_attention(sentence_output)  # (batch, seq_len, 1)
        sent_attn_weights = torch.softmax(sent_attn_weights, dim=1)
        sent_attended = torch.sum(sent_attn_weights * sentence_output, dim=1)  # (batch, 512)

        # Combine: concatenate then project back to encoder_dim
        combined = torch.cat([word_attended, sent_attended], dim=1)  # (batch, 768+512=1280)
        combined = self.combine_layer(combined)  # (batch, 768)

        logits = self.classifier(combined)  # (batch, num_labels)
        return torch.sigmoid(logits)


# 4. FOCAL LOSS

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = nn.functional.binary_cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return focal_loss.mean()


# 5. TRAINING UTILITIES

def compute_metrics(predictions, labels, threshold=0.5):
    preds = (predictions > threshold).astype(int)
    labels = labels.astype(int)

    exact_match = np.all(preds == labels, axis=1).mean()
    hamming = 1 - hamming_loss(labels, preds)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )

    return {
        'exact_match': exact_match,
        'hamming_accuracy': hamming,
        'macro_precision': precision,
        'macro_recall': recall,
        'macro_f1': f1
    }

def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

        # Clear cache periodically
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return total_loss / len(dataloader)

def evaluate(model, dataloader, device):
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)

            all_predictions.append(outputs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    predictions = np.vstack(all_predictions)
    labels = np.vstack(all_labels)

    return compute_metrics(predictions, labels)

def find_optimal_threshold(model, dataloader, device):
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            all_predictions.append(outputs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    predictions = np.vstack(all_predictions)
    labels = np.vstack(all_labels)

    best_threshold = 0.5
    best_f1 = 0

    for threshold in np.arange(0.3, 0.7, 0.05):
        metrics = compute_metrics(predictions, labels, threshold)
        if metrics['macro_f1'] > best_f1:
            best_f1 = metrics['macro_f1']
            best_threshold = threshold

    return best_threshold, best_f1



# 6. MODEL CONFIGURATIONS

MODEL_CONFIGS = {
    'bert_baseline': {
        'name': 'BERT Baseline',
        'model_class': BaselineTransformer,
        'model_name': 'bert-base-uncased',
        'tokenizer': BertTokenizer,
        'max_length': 512,
        'default_hyperparams': {
            'learning_rate': 2e-5,
            'dropout': 0.3,
            'hidden_size': 384,  # Reduced
            'batch_size': 4  # Reduced for memory
        },
        'tuning_grid': {
            'learning_rate': [1e-5, 2e-5],
            'dropout': [0.2, 0.3],
            'hidden_size': [256, 384],
            'batch_size': [4, 8]
        }
    },
    'roberta_baseline': {
        'name': 'RoBERTa Baseline',
        'model_class': BaselineTransformer,
        'model_name': 'roberta-base',
        'tokenizer': RobertaTokenizer,
        'max_length': 512,
        'default_hyperparams': {
            'learning_rate': 2e-5,
            'dropout': 0.3,
            'hidden_size': 384,  # Changed from 512
            'batch_size': 4
        },
        'tuning_grid': {
            'learning_rate': [1e-5, 2e-5],
            'dropout': [0.2, 0.3],
            'hidden_size': [256, 384],  # Changed from [384, 512]
            'batch_size': [4, 8]
        }
    },
    'roberta_bilstm': {
        'name': 'RoBERTa + BiLSTM + Attention',
        'model_class': BiLSTMAttentionModel,
        'model_name': 'roberta-base',
        'tokenizer': RobertaTokenizer,
        'max_length': 512,
        'default_hyperparams': {
            'learning_rate': 2e-5,
            'dropout': 0.3,
            'lstm_hidden': 256,
            'batch_size': 4
        },
        'tuning_grid': {
            'learning_rate': [1e-5, 2e-5],
            'dropout': [0.2, 0.3],
            'lstm_hidden': [128, 256],
            'batch_size': [4, 8]
        }
    },
    'longformer': {
        'name': 'Longformer',
        'model_class': LongformerClassifier,
        'model_name': 'allenai/longformer-base-4096',
        'tokenizer': LongformerTokenizer,
        'max_length': 1024,
        'default_hyperparams': {
            'learning_rate': 2e-5,
            'dropout': 0.3,
            'hidden_size': 384,
            'batch_size': 2  # Very small batch
        },
        'tuning_grid': {
            'learning_rate': [1e-5, 2e-5],
            'dropout': [0.2, 0.3],
            'hidden_size': [256, 384],
            'batch_size': [2, 4]
        }
    },
    'distilbert': {
        'name': 'DistilBERT (Fast)',
        'model_class': DistilBERTClassifier,
        'model_name': 'distilbert-base-uncased',
        'tokenizer': DistilBertTokenizer,
        'max_length': 512,
        'default_hyperparams': {
            'learning_rate': 3e-5,
            'dropout': 0.3,
            'hidden_size': 256,
            'batch_size': 8
        },
        'tuning_grid': {
            'learning_rate': [2e-5, 3e-5],
            'dropout': [0.2, 0.3],
            'hidden_size': [256, 384],
            'batch_size': [8, 16]
        }
    },
    'hierarchical': {
        'name': 'Hierarchical Attention',
        'model_class': HierarchicalAttentionModel,
        'model_name': 'roberta-base',
        'tokenizer': RobertaTokenizer,
        'max_length': 512,
        'default_hyperparams': {
            'learning_rate': 2e-5,
            'dropout': 0.3,
            'hidden_size': 384,
            'batch_size': 4
        },
        'tuning_grid': {
            'learning_rate': [1e-5, 2e-5],
            'dropout': [0.2, 0.3],
            'hidden_size': [256, 384],
            'batch_size': [4, 8]
        }
    }
}



Using device: cuda
Loaded 1819 training instances and 400 validation instances
Train: 1819, Val: 400


In [4]:
def train_model(config_name, config, train_df, val_df, hyperparams, epochs=4):
    """Train a single model with given hyperparameters"""

    print(f"\n{'='*70}")
    print(f"Training: {config['name']}")
    print(f"Hyperparameters: {hyperparams}")
    print(f"{'='*70}")

    # Clear memory before starting
    clear_memory()

    try:
        tokenizer = config['tokenizer'].from_pretrained(config['model_name'])

        train_dataset = AERDataset(train_df, tokenizer, config['max_length'])
        val_dataset = AERDataset(val_df, tokenizer, config['max_length'])

        train_loader = DataLoader(train_dataset, batch_size=hyperparams['batch_size'], shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=hyperparams['batch_size'])

        model_kwargs = {k: v for k, v in hyperparams.items() if k not in ['learning_rate', 'batch_size']}
        model_kwargs['model_name'] = config['model_name']
        model_kwargs['num_labels'] = 4

        model = config['model_class'](**model_kwargs).to(device)

        criterion = FocalLoss(alpha=0.25, gamma=2.0)
        optimizer = AdamW(model.parameters(), lr=hyperparams['learning_rate'], weight_decay=0.01)

        total_steps = len(train_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
        )

        best_val_f1 = 0
        history = []

        for epoch in range(epochs):
            print(f"\nEpoch {epoch + 1}/{epochs}")

            train_loss = train_epoch(model, train_loader, optimizer, scheduler, criterion, device)
            val_metrics = evaluate(model, val_loader, device)

            print(f"Train Loss: {train_loss:.4f}")
            print(f"Val Macro F1: {val_metrics['macro_f1']:.4f}")
            print(f"Val Exact Match: {val_metrics['exact_match']:.4f}")

            history.append({
                'epoch': epoch + 1,
                'train_loss': train_loss,
                'val_metrics': val_metrics
            })

            if val_metrics['macro_f1'] > best_val_f1:
                best_val_f1 = val_metrics['macro_f1']
                best_model_state = model.state_dict().copy()

        model.load_state_dict(best_model_state)

        optimal_threshold, threshold_f1 = find_optimal_threshold(model, val_loader, device)

        print(f"\n✅ Best Val Macro F1: {best_val_f1:.4f}")
        print(f"✅ Optimal Threshold: {optimal_threshold:.3f} (F1: {threshold_f1:.4f})")

        return {
            'model': model,
            'tokenizer': tokenizer,
            'best_val_f1': best_val_f1,
            'optimal_threshold': optimal_threshold,
            'hyperparams': hyperparams,
            'history': history,
            'config_name': config_name,
            'success': True
        }

    except Exception as e:
        print(f"\n❌ Training failed: {e}")
        import traceback
        traceback.print_exc()
        clear_memory()
        return {
            'success': False,
            'error': str(e),
            'config_name': config_name
        }

In [5]:
print("\n" + "="*70)
print("STEP 1: TRAINING BASELINE MODELS")
print("="*70)

baseline_results = {}

for config_name, config in MODEL_CONFIGS.items():
    print(f"\n{'#'*70}")
    print(f"# BASELINE: {config['name']}")
    print(f"{'#'*70}")

    result = train_model(
        config_name,
        config,
        train_df,
        val_df,
        config['default_hyperparams'],
        epochs=3
    )

    if result.get('success', False):
        baseline_results[config_name] = result
        print(f"\n✅ Baseline {config['name']} - Val Macro F1: {result['best_val_f1']:.4f}")
    else:
        print(f"\n⚠️  Baseline {config['name']} failed, skipping")

    # Clear memory between models
    clear_memory()

# Display baseline results
print("\n" + "="*70)
print("BASELINE RESULTS SUMMARY")
print("="*70)

if baseline_results:
    baseline_comparison = []
    for config_name, result in baseline_results.items():
        baseline_comparison.append({
            'Model': MODEL_CONFIGS[config_name]['name'],
            'Val Macro F1': f"{result['best_val_f1']*100:.2f}%",
            'Threshold': f"{result['optimal_threshold']:.3f}"
        })

    baseline_df = pd.DataFrame(baseline_comparison).sort_values('Val Macro F1', ascending=False)
    print("\n" + baseline_df.to_string(index=False))
else:
    print("\n⚠️  No baseline models trained successfully")
    print("Try reducing batch sizes further or use CPU")


STEP 1: TRAINING BASELINE MODELS

######################################################################
# BASELINE: BERT Baseline
######################################################################

Training: BERT Baseline
Hyperparameters: {'learning_rate': 2e-05, 'dropout': 0.3, 'hidden_size': 384, 'batch_size': 4}


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


Epoch 1/3


Training: 100%|██████████| 455/455 [03:24<00:00,  2.23it/s]


Train Loss: 0.0507
Val Macro F1: 0.4277
Val Exact Match: 0.0775

Epoch 2/3


Training: 100%|██████████| 455/455 [03:24<00:00,  2.22it/s]


Train Loss: 0.0475
Val Macro F1: 0.4064
Val Exact Match: 0.0700

Epoch 3/3


Training: 100%|██████████| 455/455 [03:24<00:00,  2.23it/s]


Train Loss: 0.0466
Val Macro F1: 0.4648
Val Exact Match: 0.0575

✅ Best Val Macro F1: 0.4648
✅ Optimal Threshold: 0.400 (F1: 0.5720)

✅ Baseline BERT Baseline - Val Macro F1: 0.4648

######################################################################
# BASELINE: RoBERTa Baseline
######################################################################

Training: RoBERTa Baseline
Hyperparameters: {'learning_rate': 2e-05, 'dropout': 0.3, 'hidden_size': 384, 'batch_size': 4}


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 455/455 [03:14<00:00,  2.34it/s]


Train Loss: 0.0495
Val Macro F1: 0.1363
Val Exact Match: 0.1250

Epoch 2/3


Training: 100%|██████████| 455/455 [03:14<00:00,  2.34it/s]


Train Loss: 0.0464
Val Macro F1: 0.2740
Val Exact Match: 0.0450

Epoch 3/3


Training: 100%|██████████| 455/455 [03:12<00:00,  2.36it/s]


Train Loss: 0.0450
Val Macro F1: 0.2472
Val Exact Match: 0.0875

✅ Best Val Macro F1: 0.2740
✅ Optimal Threshold: 0.300 (F1: 0.5709)

✅ Baseline RoBERTa Baseline - Val Macro F1: 0.2740

######################################################################
# BASELINE: RoBERTa + BiLSTM + Attention
######################################################################

Training: RoBERTa + BiLSTM + Attention
Hyperparameters: {'learning_rate': 2e-05, 'dropout': 0.3, 'lstm_hidden': 256, 'batch_size': 4}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 455/455 [03:25<00:00,  2.21it/s]


Train Loss: 0.0422
Val Macro F1: 0.0804
Val Exact Match: 0.0050

Epoch 2/3


Training: 100%|██████████| 455/455 [03:25<00:00,  2.21it/s]


Train Loss: 0.0408
Val Macro F1: 0.4260
Val Exact Match: 0.0150

Epoch 3/3


Training: 100%|██████████| 455/455 [03:24<00:00,  2.22it/s]


Train Loss: 0.0395
Val Macro F1: 0.4165
Val Exact Match: 0.0475

✅ Best Val Macro F1: 0.4260
✅ Optimal Threshold: 0.400 (F1: 0.5722)

✅ Baseline RoBERTa + BiLSTM + Attention - Val Macro F1: 0.4260

######################################################################
# BASELINE: Longformer
######################################################################

Training: Longformer
Hyperparameters: {'learning_rate': 2e-05, 'dropout': 0.3, 'hidden_size': 384, 'batch_size': 2}


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/597M [00:00<?, ?B/s]


Epoch 1/3



Training:   0%|          | 0/910 [00:00<?, ?it/s][A
Training:   0%|          | 1/910 [00:01<27:46,  1.83s/it][A
Training:   0%|          | 2/910 [00:03<21:53,  1.45s/it][A
Training:   0%|          | 3/910 [00:04<19:15,  1.27s/it][A
Training:   0%|          | 4/910 [00:05<17:30,  1.16s/it][A
Training:   1%|          | 5/910 [00:06<16:26,  1.09s/it][A
Training:   1%|          | 6/910 [00:07<15:49,  1.05s/it][A
Training:   1%|          | 7/910 [00:07<15:28,  1.03s/it][A
Training:   1%|          | 8/910 [00:09<16:54,  1.13s/it][A
Training:   1%|          | 9/910 [00:10<17:09,  1.14s/it][A
Training:   1%|          | 10/910 [00:11<16:58,  1.13s/it][A
Training:   1%|          | 11/910 [00:12<17:20,  1.16s/it][A
Training:   1%|▏         | 12/910 [00:14<19:51,  1.33s/it][A
Training:   1%|▏         | 13/910 [00:15<19:52,  1.33s/it][A
Training:   2%|▏         | 14/910 [00:17<19:10,  1.28s/it][A
Training:   2%|▏         | 15/910 [00:18<17:48,  1.19s/it][A
Training:   2%|▏         

Train Loss: 0.0426
Val Macro F1: 0.3213
Val Exact Match: 0.0325

Epoch 2/3


Training: 100%|██████████| 910/910 [14:46<00:00,  1.03it/s]


Train Loss: 0.0407
Val Macro F1: 0.4510
Val Exact Match: 0.0425

Epoch 3/3


Training: 100%|██████████| 910/910 [14:46<00:00,  1.03it/s]


Train Loss: 0.0365
Val Macro F1: 0.6152
Val Exact Match: 0.2500

✅ Best Val Macro F1: 0.6152
✅ Optimal Threshold: 0.450 (F1: 0.6474)

✅ Baseline Longformer - Val Macro F1: 0.6152

######################################################################
# BASELINE: DistilBERT (Fast)
######################################################################

Training: DistilBERT (Fast)
Hyperparameters: {'learning_rate': 3e-05, 'dropout': 0.3, 'hidden_size': 256, 'batch_size': 8}


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]


Epoch 1/3


Training: 100%|██████████| 228/228 [01:40<00:00,  2.27it/s]


Train Loss: 0.0423
Val Macro F1: 0.1121
Val Exact Match: 0.0125

Epoch 2/3


Training: 100%|██████████| 228/228 [01:40<00:00,  2.26it/s]


Train Loss: 0.0415
Val Macro F1: 0.1736
Val Exact Match: 0.0125

Epoch 3/3


Training: 100%|██████████| 228/228 [01:40<00:00,  2.27it/s]


Train Loss: 0.0400
Val Macro F1: 0.3399
Val Exact Match: 0.0300

✅ Best Val Macro F1: 0.3399
✅ Optimal Threshold: 0.400 (F1: 0.5810)

✅ Baseline DistilBERT (Fast) - Val Macro F1: 0.3399

######################################################################
# BASELINE: Hierarchical Attention
######################################################################

Training: Hierarchical Attention
Hyperparameters: {'learning_rate': 2e-05, 'dropout': 0.3, 'hidden_size': 384, 'batch_size': 4}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 455/455 [03:30<00:00,  2.16it/s]


Train Loss: 0.0421
Val Macro F1: 0.0000
Val Exact Match: 0.0000

Epoch 2/3


Training: 100%|██████████| 455/455 [03:30<00:00,  2.16it/s]


Train Loss: 0.0412
Val Macro F1: 0.3501
Val Exact Match: 0.0275

Epoch 3/3


Training: 100%|██████████| 455/455 [03:30<00:00,  2.17it/s]


Train Loss: 0.0398
Val Macro F1: 0.4968
Val Exact Match: 0.0025

✅ Best Val Macro F1: 0.4968
✅ Optimal Threshold: 0.400 (F1: 0.5836)

✅ Baseline Hierarchical Attention - Val Macro F1: 0.4968

BASELINE RESULTS SUMMARY

                       Model Val Macro F1 Threshold
                  Longformer       61.52%     0.450
      Hierarchical Attention       49.68%     0.400
               BERT Baseline       46.48%     0.400
RoBERTa + BiLSTM + Attention       42.60%     0.400
           DistilBERT (Fast)       33.99%     0.400
            RoBERTa Baseline       27.40%     0.300


In [6]:

# SAVE BEST BASELINE MODEL

print("\n" + "="*70)
print("SAVING BEST BASELINE MODEL")
print("="*70)

# Find best baseline model
best_baseline_name = max(baseline_results.items(), key=lambda x: x[1]['best_val_f1'])[0]
best_baseline_result = baseline_results[best_baseline_name]
best_baseline_config = MODEL_CONFIGS[best_baseline_name]

print(f"\n🏆 Best Baseline Model: {best_baseline_config['name']}")
print(f"   Val Macro F1: {best_baseline_result['best_val_f1']*100:.2f}%")
print(f"   Optimal Threshold: {best_baseline_result['optimal_threshold']:.3f}")

# Save baseline model package
baseline_model_package = {
    'model_state_dict': best_baseline_result['model'].state_dict(),
    'model_config': {
        'model_name': best_baseline_config['model_name'],
        'model_class': best_baseline_name,
        'num_labels': 4,
        **best_baseline_result['hyperparams']
    },
    'tokenizer': best_baseline_result['tokenizer'],
    'optimal_threshold': best_baseline_result['optimal_threshold'],
    'val_metrics': {
        'macro_f1': best_baseline_result['best_val_f1'],
        'exact_match': best_baseline_result['history'][-1]['val_metrics']['exact_match'],
        'hamming_accuracy': best_baseline_result['history'][-1]['val_metrics']['hamming_accuracy'],
        'macro_precision': best_baseline_result['history'][-1]['val_metrics']['macro_precision'],
        'macro_recall': best_baseline_result['history'][-1]['val_metrics']['macro_recall']
    },
    'model_type': best_baseline_config['name'],
    'training_stage': 'baseline',
    'all_baseline_results': baseline_df.to_dict('records'),
    'training_history': best_baseline_result['history']
}

# Save to file
with open('/content/models/best_baseline_model.pkl', 'wb') as f:
    pickle.dump(baseline_model_package, f)

print("\n✅ Best baseline model saved to 'best_baseline_model.pkl'")
print(f"   Model: {best_baseline_config['name']}")
print(f"   Config: {best_baseline_name}")
print(f"   Hyperparameters: {best_baseline_result['hyperparams']}")

# Also save a summary CSV
baseline_summary = []
for config_name, result in baseline_results.items():
    baseline_summary.append({
        'Model': MODEL_CONFIGS[config_name]['name'],
        'Config': config_name,
        'Val Macro F1': f"{result['best_val_f1']*100:.2f}%",
        'Val Exact Match': f"{result['history'][-1]['val_metrics']['exact_match']*100:.2f}%",
        'Threshold': f"{result['optimal_threshold']:.3f}",
        'Learning Rate': result['hyperparams']['learning_rate'],
        'Dropout': result['hyperparams']['dropout'],
        'Batch Size': result['hyperparams']['batch_size']
    })

baseline_summary_df = pd.DataFrame(baseline_summary).sort_values('Val Macro F1', ascending=False)
baseline_summary_df.to_csv('/content/models/baseline_models_summary.csv', index=False)

print("✅ Baseline summary saved to '/content/models/baseline_models_summary.csv'")

# Save individual baseline models (optional - saves all baselines)
print("\n" + "="*70)
print("SAVING ALL BASELINE MODELS (Optional)")
print("="*70)

for config_name, result in baseline_results.items():
    config = MODEL_CONFIGS[config_name]

    model_package = {
        'model_state_dict': result['model'].state_dict(),
        'model_config': {
            'model_name': config['model_name'],
            'model_class': config_name,
            'num_labels': 4,
            **result['hyperparams']
        },
        'tokenizer': result['tokenizer'],
        'optimal_threshold': result['optimal_threshold'],
        'val_metrics': {
            'macro_f1': result['best_val_f1'],
            'exact_match': result['history'][-1]['val_metrics']['exact_match'],
            'hamming_accuracy': result['history'][-1]['val_metrics']['hamming_accuracy']
        },
        'model_type': config['name'],
        'training_stage': 'baseline'
    }

    filename = f"models/baseline_{config_name}.pkl"
    with open(filename, 'wb') as f:
        pickle.dump(model_package, f)

    print(f"✅ Saved {config['name']} to '{filename}'")

print("\n" + "="*70)
print("✨ BASELINE TRAINING COMPLETE!")
print("="*70)
print(f"\nBest Model: {best_baseline_config['name']}")
print(f"Val Macro F1: {best_baseline_result['best_val_f1']*100:.2f}%")
print(f"\nYou can now:")
print("  1. Continue with hyperparameter tuning (run next cells)")
print("  2. Use the best baseline model for predictions")
print("  3. Load it with: pickle.load(open('models/best_baseline_model.pkl', 'rb'))")
print("="*70)


🏆 BEST BASELINE MODEL
Model: Longformer
Val Macro F1: 0.6152
Optimal Threshold: 0.450

✅ Best model saved successfully → saved_models/longformer_best_model.pkl


#### Further we will try hyperparameter tuning using GridSearchCV and RandomizedSearchCV to improve model performance. And testing will be done after the testing datsaset released by the competition host.

In [None]:
# 9. STEP 2: HYPERPARAMETER TUNING

print("\n" + "="*70)
print("STEP 2: HYPERPARAMETER TUNING FOR EACH MODEL")
print("="*70)

tuned_results = {}
all_tuning_history = {}

for config_name, config in MODEL_CONFIGS.items():
    if config_name not in baseline_results:
        print(f"\n⚠️  Skipping {config['name']} - baseline training failed")
        continue

    print(f"\n{'#'*70}")
    print(f"# HYPERPARAMETER TUNING: {config['name']}")
    print(f"{'#'*70}")

    tuning_grid = config['tuning_grid']

    keys = list(tuning_grid.keys())
    values = list(tuning_grid.values())
    combinations = list(itertools.product(*values))

    n_trials = min(4, len(combinations))
    if len(combinations) > n_trials:
        combinations = random.sample(combinations, n_trials)

    print(f"\nTesting {len(combinations)} hyperparameter combinations...")

    trial_results = []

    for i, combination in enumerate(combinations):
        hyperparams = dict(zip(keys, combination))

        print(f"\n--- Trial {i+1}/{len(combinations)} ---")
        print(f"Hyperparameters: {hyperparams}")

        result = train_model(
            config_name,
            config,
            train_df,
            val_df,
            hyperparams,
            epochs=3
        )

        if result.get('success', False):
            trial_results.append(result)
            print(f"Val Macro F1: {result['best_val_f1']:.4f}")
        else:
            print(f"⚠️  Trial failed, skipping")

        # Clear memory between trials
        clear_memory()

    if trial_results:
        # Select best configuration
        best_result = max(trial_results, key=lambda x: x['best_val_f1'])

        tuned_results[config_name] = best_result
        all_tuning_history[config_name] = trial_results

        print(f"\n{'='*70}")
        print(f"✨ BEST CONFIGURATION FOR {config['name']}")
        print(f"{'='*70}")
        print(f"Hyperparameters: {best_result['hyperparams']}")
        print(f"Val Macro F1: {best_result['best_val_f1']:.4f}")

        baseline_f1 = baseline_results[config_name]['best_val_f1']
        improvement = (best_result['best_val_f1'] - baseline_f1) * 100
        print(f"Improvement over baseline: {improvement:+.2f}%")
    else:
        print(f"\n⚠️  All tuning trials failed for {config['name']}")


STEP 2: HYPERPARAMETER TUNING FOR EACH MODEL

######################################################################
# HYPERPARAMETER TUNING: BERT Baseline
######################################################################

Testing 4 hyperparameter combinations...

--- Trial 1/4 ---
Hyperparameters: {'learning_rate': 1e-05, 'dropout': 0.2, 'hidden_size': 384, 'batch_size': 8}

Training: BERT Baseline
Hyperparameters: {'learning_rate': 1e-05, 'dropout': 0.2, 'hidden_size': 384, 'batch_size': 8}

Epoch 1/3


Training:  12%|█▏        | 27/228 [00:23<02:52,  1.16it/s]

In [None]:

# 10. SELECT BEST MODEL OVERALL

print("\n" + "="*70)
print("COMPARING ALL MODELS")
print("="*70)

if tuned_results:
    comparison = []
    for config_name, result in tuned_results.items():
        comparison.append({
            'Model': MODEL_CONFIGS[config_name]['name'],
            'Val Macro F1': result['best_val_f1'],
            'Threshold': result['optimal_threshold'],
            'Config': config_name
        })

    comparison_df = pd.DataFrame(comparison).sort_values('Val Macro F1', ascending=False)
    print("\n" + comparison_df.to_string(index=False))

    # Select best model
    best_config_name = comparison_df.iloc[0]['Config']
    best_model_result = tuned_results[best_config_name]

    print(f"\n{'='*70}")
    print(f"🏆 BEST MODEL: {MODEL_CONFIGS[best_config_name]['name']}")
    print(f"🏆 Val Macro F1: {best_model_result['best_val_f1']:.4f}")
    print(f"{'='*70}")

else:
    print("\n⚠️  No models successfully tuned")
    print("Using best baseline model instead...")

    if baseline_results:
        best_baseline = max(baseline_results.items(), key=lambda x: x[1]['best_val_f1'])
        best_config_name = best_baseline[0]
        best_model_result = best_baseline[1]

        comparison_df = pd.DataFrame([{
            'Model': MODEL_CONFIGS[best_config_name]['name'],
            'Val Macro F1': best_model_result['best_val_f1'],
            'Threshold': best_model_result['optimal_threshold'],
            'Config': best_config_name
        }])

        print(f"\n🏆 BEST BASELINE MODEL: {MODEL_CONFIGS[best_config_name]['name']}")
        print(f"🏆 Val Macro F1: {best_model_result['best_val_f1']:.4f}")
    else:
        print("\n❌ No models trained successfully. Exiting.")
        import sys
        sys.exit(1)

In [None]:
# 11. FINAL TRAINING AND EVALUATION

print("\n" + "="*70)
print("FINAL TRAINING ON FULL TRAINING DATA")
print("="*70)

best_config = MODEL_CONFIGS[best_config_name]

print("\nRetraining best model...")
final_result = train_model(
    best_config_name,
    best_config,
    train_df,
    val_df,
    best_model_result['hyperparams'],
    epochs=6
)

if final_result.get('success', False):
    # Evaluate on test set
    print("\nEvaluating on test set...")
    test_tokenizer = final_result['tokenizer']
    test_dataset = AERDataset(test_df, test_tokenizer, best_config['max_length'])
    test_loader = DataLoader(test_dataset, batch_size=best_model_result['hyperparams']['batch_size'])

    test_metrics = evaluate(final_result['model'], test_loader, device)

    print(f"\n{'='*70}")
    print("📊 FINAL TEST RESULTS")
    print(f"{'='*70}")
    print(f"Exact Match Accuracy: {test_metrics['exact_match']*100:.2f}%")
    print(f"Hamming Accuracy: {test_metrics['hamming_accuracy']*100:.2f}%")
    print(f"Macro Precision: {test_metrics['macro_precision']*100:.2f}%")
    print(f"Macro Recall: {test_metrics['macro_recall']*100:.2f}%")
    print(f"⭐ Macro F1: {test_metrics['macro_f1']*100:.2f}%")
    print(f"{'='*70}")


In [None]:

# 12. SAVE BEST MODEL

print("\n" + "="*70)
print("SAVING BEST MODEL")
print("="*70)
model_package = {
    'model_state_dict': final_result['model'].state_dict(),
    'model_config': {
        'model_name': best_config['model_name'],
        'model_class': best_config_name,
        'num_labels': 4,
        **best_model_result['hyperparams']
    },
    'tokenizer': final_result['tokenizer'],
    'optimal_threshold': final_result['optimal_threshold'],
    'test_metrics': test_metrics,
    'model_type': best_config['name'],
    'all_results': comparison_df.to_dict('records')
}
with open('models/random_forest_new2.pkl', 'wb') as f:
    pickle.dump(model_package, f)
print("✅ Best model saved to '/content/aer_project/models/best_aductive_models.pkl'")
print(f"   Model: {best_config['name']}")
print(f"   Test Macro F1: {test_metrics['macro_f1']*100:.2f}%")

In [None]:
# 13. VISUALIZATIONS

print("\n" + "="*70)
print("GENERATING VISUALIZATIONS")
print("="*70)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
# 1. Model comparison bar chart
ax1 = axes[0, 0]
models = comparison_df['Model'].values
f1_scores = comparison_df['Val Macro F1'].values
colors = ['#10b981' if i == 0 else '#3b82f6' for i in range(len(models))]
bars = ax1.barh(models, f1_scores, color=colors, alpha=0.8, edgecolor='black')
ax1.set_xlabel('Validation Macro F1', fontsize=12, fontweight='bold')
ax1.set_title('Model Comparison', fontsize=14, fontweight='bold')
ax1.set_xlim([0, 1.0])
ax1.grid(True, alpha=0.3, axis='x')
for bar, score in zip(bars, f1_scores):
    width = bar.get_width()
    ax1.text(width + 0.01, bar.get_y() + bar.get_height()/2,
            f'{score:.3f}', ha='left', va='center', fontweight='bold')
# 2. Training history of best model
ax2 = axes[0, 1]
if best_model_result.get('history'):
    epochs_range = [h['epoch'] for h in best_model_result['history']]
    train_losses = [h['train_loss'] for h in best_model_result['history']]
    val_f1s = [h['val_metrics']['macro_f1'] for h in best_model_result['history']]
    ax2_twin = ax2.twinx()
    line1 = ax2.plot(epochs_range, train_losses, marker='o', color='#ef4444',
                     linewidth=2, markersize=8, label='Train Loss')
    line2 = ax2_twin.plot(epochs_range, val_f1s, marker='s', color='#10b981',
                          linewidth=2, markersize=8, label='Val F1')
    ax2.set_xlabel('Epoch', fontsize=12, fontweight='bold')
    ax2.set_ylabel('Loss', color='#ef4444', fontsize=12, fontweight='bold')
    ax2_twin.set_ylabel('F1 Score', color='#10b981', fontsize=12, fontweight='bold')
    ax2.set_title(f'Best Model Training: {best_config["name"]}', fontsize=14, fontweight='bold')
    ax2.tick_params(axis='y', labelcolor='#ef4444')
    ax2_twin.tick_params(axis='y', labelcolor='#10b981')
    ax2.grid(True, alpha=0.3)
    lines = line1 + line2
    labels = [l.get_label() for l in lines]
    ax2.legend(lines, labels, loc='upper right')
# 3. Baseline vs Tuned comparison
ax3 = axes[1, 0]
if baseline_results and tuned_results:
    model_names = []
    baseline_f1s = []
    tuned_f1s = []
    for config_name in tuned_results.keys():
        if config_name in baseline_results:
            model_names.append(MODEL_CONFIGS[config_name]['name'][:15])
            baseline_f1s.append(baseline_results[config_name]['best_val_f1'])
            tuned_f1s.append(tuned_results[config_name]['best_val_f1'])
    x = np.arange(len(model_names))
    width = 0.35
    bars1 = ax3.bar(x - width/2, baseline_f1s, width, label='Baseline', color='#94a3b8', alpha=0.8)
    bars2 = ax3.bar(x + width/2, tuned_f1s, width, label='Tuned', color='#10b981', alpha=0.8)
    ax3.set_ylabel('Macro F1 Score', fontsize=12, fontweight='bold')
    ax3.set_title('Baseline vs Tuned Performance', fontsize=14, fontweight='bold')
    ax3.set_xticks(x)
    ax3.set_xticklabels(model_names, rotation=45, ha='right')
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')
# 4. Final test metrics
ax4 = axes[1, 1]
metrics_names = ['Exact\nMatch', 'Hamming\nAccuracy', 'Macro\nPrecision',
                 'Macro\nRecall', 'Macro\nF1']
metrics_values = [
    test_metrics['exact_match'],
    test_metrics['hamming_accuracy'],
    test_metrics['macro_precision'],
    test_metrics['macro_recall'],
    test_metrics['macro_f1']
]
bars = ax4.bar(metrics_names, metrics_values,
               color=['#3b82f6', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6'],
               alpha=0.8, edgecolor='black', linewidth=2)
ax4.set_ylabel('Score', fontsize=12, fontweight='bold')
ax4.set_title('Final Test Metrics', fontsize=14, fontweight='bold')
ax4.set_ylim([0, 1.0])
ax4.grid(True, alpha=0.3, axis='y')
for bar, value in zip(bars, metrics_values):
    height = bar.get_height()
    ax4.text(bar.get_x() + bar.get_width()/2., height + 0.02,
            f'{value*100:.1f}%', ha='center', va='bottom', fontweight='bold', fontsize=10)
plt.tight_layout()
plt.savefig('/content/aer_project/static/images/multi_model_comparison.png', dpi=300,bbox_inches='tight')
print("✅ Visualizations saved to '/content/aer_project/static/images/multi_model_comparison.png'")

In [None]:
# 14. DETAILED RESULTS TABLE

print("\n" + "="*70)
print("DETAILED RESULTS TABLE")
print("="*70)
results_table = []
for config_name in tuned_results.keys():
    config = MODEL_CONFIGS[config_name]
    result = tuned_results[config_name]
    baseline_f1 = baseline_results.get(config_name, {}).get('best_val_f1', 0)
    improvement = (result['best_val_f1'] - baseline_f1) * 100 if baseline_f1 > 0 else 0
    results_table.append({
        'Model': config['name'],
        'Baseline F1': f"{baseline_f1*100:.2f}%",
        'Tuned F1': f"{result['best_val_f1']*100:.2f}%",
        'Improvement': f"{improvement:+.2f}%",
        'Threshold': f"{result['optimal_threshold']:.3f}",
        'Learning Rate': result['hyperparams']['learning_rate'],
        'Batch Size': result['hyperparams']['batch_size']
    })
results_df = pd.DataFrame(results_table).sort_values('Tuned F1', ascending=False)
print("\n" + results_df.to_string(index=False))
# Save to CSV
results_df.to_csv('/content/aer_project/models/model_comparison_results.csv', index=False)
print("\n✅ Results saved to '/content/aer_project/models/model_comparison_results.csv'")

In [None]:
# 15. SUMMARY REPORT

print("\n" + "="*70)
print("📊 TRAINING SUMMARY REPORT")
print("="*70)
print(f"\n🔢 Total Models Trained:")
print(f"   - Baseline models: {len(baseline_results)}")
print(f"   - Successfully tuned: {len(tuned_results)}")
print(f"\n🏆 Winner: {best_config['name']}")
print(f"   - Config: {best_config_name}")
print(f"   - Model Type: {best_config['model_name']}")
print(f"   - Hyperparameters:")
for key, value in best_model_result['hyperparams'].items():
    print(f"     • {key}: {value}")
print(f"\n📈 Performance:")
print(f"   - Validation Macro F1: {best_model_result['best_val_f1']*100:.2f}%")
print(f"   - Test Macro F1: {test_metrics['macro_f1']*100:.2f}%")
print(f"   - Test Exact Match: {test_metrics['exact_match']*100:.2f}%")
print(f"   - Optimal Threshold: {final_result['optimal_threshold']:.3f}")
if len(comparison_df) > 1:
    print(f"\n🎯 Top 3 Models:")
    for i, (_, row) in enumerate(comparison_df.head(3).iterrows(), 1):
        print(f"   {i}. {row['Model']}: {row['Val Macro F1']*100:.2f}%")
print("\n" + "="*70)
print("🎉 MULTI-ALGORITHM TRAINING COMPLETE!")
print("="*70)
# Save complete training log
training_log = {
    'timestamp': pd.Timestamp.now().isoformat(),
    'best_model': best_config_name,
    'best_model_name': best_config['name'],
    'test_metrics': test_metrics,
    'baseline_results': {k: {
        'val_f1': v['best_val_f1'],
        'threshold': v['optimal_threshold']
    } for k, v in baseline_results.items()},
    'tuned_results': {k: {
        'val_f1': v['best_val_f1'],
        'threshold': v['optimal_threshold'],
        'hyperparams': v['hyperparams']
    } for k, v in tuned_results.items()},
    'all_models_comparison': comparison_df.to_dict('records'),
    'hyperparameters': best_model_result['hyperparams'],
    'optimal_threshold': final_result['optimal_threshold']
}
with open('models/training_log.pkl', 'wb') as f:
    pickle.dump(training_log, f)
print("\n✅ Complete training log saved to '/content/aer_project/models/training_log.pkl'")

In [2]:
# 16. RECOMMENDATIONS

print("\n" + "="*70)
print("💡 RECOMMENDATIONS FOR FURTHER IMPROVEMENT")
print("="*70)
print("\n1. Memory Optimization:")
print("   - Current batch sizes are reduced for memory")
print("   - Consider using gradient accumulation for effective larger batches")
print("   - Use mixed precision training (torch.cuda.amp)")
print("\n2. Model Improvements:")
print("   - Ensemble top 3 models: +2-3% F1")
print("   - Try roberta-large with smaller batch: +3-5% F1")
print("   - Implement data augmentation: +1-2% F1")
print("\n3. Training Strategies:")
print("   - Use gradient accumulation: effective_batch = batch * accumulation_steps")
print("   - Implement K-fold cross-validation for robustness")
print("   - Try different optimizers (AdaFactor for memory efficiency)")
print("\n4. Context Enhancement:")
print("   - Better document retrieval and ranking")
print("   - Use sentence transformers for semantic search")
print("   - Expected improvement: +2-4% F1")
print("\n" + "="*70)
print("✨ Final Model Package Includes:")
print("   ✓ Best performing model weights")
print("   ✓ Tokenizer")
print("   ✓ Optimal threshold")
print("   ✓ Test metrics")
print("   ✓ Baseline and tuned hyperparameters")
print("   ✓ Comparison of all models")
print("   ✓ Complete training history")
print("="*70 + "\n")


💡 RECOMMENDATIONS FOR FURTHER IMPROVEMENT

1. Memory Optimization:
   - Current batch sizes are reduced for memory
   - Consider using gradient accumulation for effective larger batches
   - Use mixed precision training (torch.cuda.amp)

2. Model Improvements:
   - Ensemble top 3 models: +2-3% F1
   - Try roberta-large with smaller batch: +3-5% F1
   - Implement data augmentation: +1-2% F1

3. Training Strategies:
   - Use gradient accumulation: effective_batch = batch * accumulation_steps
   - Implement K-fold cross-validation for robustness
   - Try different optimizers (AdaFactor for memory efficiency)

4. Context Enhancement:
   - Better document retrieval and ranking
   - Use sentence transformers for semantic search
   - Expected improvement: +2-4% F1

✨ Final Model Package Includes:
   ✓ Best performing model weights
   ✓ Tokenizer
   ✓ Optimal threshold
   ✓ Test metrics
   ✓ Baseline and tuned hyperparameters
   ✓ Comparison of all models
   ✓ Complete training history

