# Hyperparameter Search (Optuna)

TPE sampler + MedianPruner over the multimodal RCA model.

In [4]:
import sys
sys.path.insert(0, '/root/lemm')

import os
import json
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
import optuna
from optuna.trial import TrialState

torch.manual_seed(42)
np.random.seed(42)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {DEVICE}')

Device: cuda


In [2]:
# Fixed config (not tuned) - Same structure as train_multimodal_rca.ipynb
BASE_DIR = Path('/root/lemm')
MULTIMODAL_DIR = BASE_DIR / 'core_multimodal_tmp'
METRICS_DIR = BASE_DIR / 'core_metrics_tmp'
LOGS_DIR = BASE_DIR / 'core_logs_tmp'

FIXED_CONFIG = {
    'window_size': 22,
    'n_metrics': 7,
    'bin_seconds': 30,
    'log_embed_dim': 384,
    'epochs': 30,
    'early_stop_patience': 10
}

# Metric file names - Same as train_multimodal_rca.ipynb
METRIC_FILES = [
    'pod_cpu_usage_total.parquet',
    'pod_memory_working_set.parquet',
    'pod_network_rx_bytes.parquet',
    'pod_network_tx_bytes.parquet',
    'pod_latency_server_p95.parquet',
    'pod_latency_client_p95.parquet',
    'pod_workload_ops.parquet',
]

# Split by DAY to guarantee zero temporal overlap (same as train_multimodal_rca.ipynb)
# Nezha scenarios have ~8 min gaps but 10 min windows = 85% overlap within same day
# By splitting by day, we eliminate ALL data leaks between train/val
TRAIN_DAYS = {'20220822', '20230129'}  # Online Boutique day 1 + Train-Ticket day 1
VAL_DAYS = {'20220823', '20230130'}    # Online Boutique day 2 + Train-Ticket day 2

# Excluded scenarios - Same as train_multimodal_rca.ipynb
EXCLUDED = {
    '20220822_nezha_14', '20220822_nezha_22', '20220822_nezha_23',
    '20220823_nezha_21', '20220823_nezha_24',
    '20230130_nezha_15', '20230130_nezha_16',
}

def get_split(scenario_id):
    """Determine if scenario belongs to train or val set."""
    if '_nezha_' in scenario_id:
        date_prefix = scenario_id.split('_nezha_')[0]
        if date_prefix in VAL_DAYS:
            return 'val'
        elif date_prefix in TRAIN_DAYS:
            return 'train'
    # LEMMA scenarios always go to train
    return 'train'

In [3]:
def discover_scenarios():
    """Discover all valid scenarios - same as train_multimodal_rca.ipynb"""
    scenarios = []
    for d in sorted(MULTIMODAL_DIR.iterdir()):
        if not d.is_dir():
            continue
        if d.name in EXCLUDED:
            continue
        if not (d / 'manifest.json').exists():
            continue
        if not (d / 'ground_truth.json').exists():
            continue
        scenarios.append(d.name)
    return scenarios

def load_scenario_data(scenario_id, window_size, n_metrics):
    """Load scenario data - same structure as train_multimodal_rca.ipynb"""
    multimodal_path = MULTIMODAL_DIR / scenario_id
    metrics_path = METRICS_DIR / scenario_id
    logs_path = LOGS_DIR / scenario_id
    
    # Load manifest and ground truth
    with open(multimodal_path / 'manifest.json') as f:
        manifest = json.load(f)
    with open(multimodal_path / 'ground_truth.json') as f:
        gt = json.load(f)
    
    pods = manifest['pods']
    services = manifest['services']
    rc_service = gt['root_cause_service']
    fault_idx = gt['fault_time_idx']
    label = services.index(rc_service) if rc_service in services else 0
    
    # Pod to service mapping (from manifest)
    pod_to_service_idx = manifest.get('pod_to_service_idx', [])
    if not pod_to_service_idx:
        # Fallback: build from pod_to_service dict
        pod_to_svc = manifest.get('pod_to_service', {})
        pod_to_service_idx = []
        for pod in pods:
            svc = pod_to_svc.get(pod, None)
            if svc and svc in services:
                pod_to_service_idx.append(services.index(svc))
            else:
                pod_to_service_idx.append(-1)
    
    # Load metrics - same as train_multimodal_rca.ipynb
    metrics_dict = {}
    for metric_file in METRIC_FILES:
        metric_name = metric_file.replace('pod_', '').replace('.parquet', '')
        path = metrics_path / metric_file
        if path.exists():
            metrics_dict[metric_name] = pd.read_parquet(path)
    
    # Build metrics tensor: (T, n_pods, n_metrics)
    metrics_list = []
    for metric_name in ['cpu_usage_total', 'memory_working_set', 'network_rx_bytes', 
                        'network_tx_bytes', 'latency_server_p95', 'latency_client_p95', 'workload_ops']:
        if metric_name in metrics_dict:
            df = metrics_dict[metric_name]
            metrics_list.append(df.values)  # (T, n_pods)
        else:
            # Fill with NaN if metric missing
            first_metric = list(metrics_dict.values())[0] if metrics_dict else None
            n_timesteps = len(first_metric) if first_metric is not None else window_size
            metrics_list.append(np.full((n_timesteps, len(pods)), np.nan))
    
    metrics = np.stack(metrics_list, axis=-1).astype(np.float32)  # (T, n_pods, n_metrics)
    n_bins = metrics.shape[0]
    
    # Load logs - same as train_multimodal_rca.ipynb
    logs_file = logs_path / 'logs_service_texts.parquet'
    log_texts = {svc: '[N-LGS-DST-TKN-LEZHSA]' for svc in services}
    if logs_file.exists():
        logs_df = pd.read_parquet(logs_file)
        for service in services:
            if service in logs_df.columns:
                service_logs = logs_df[service].fillna('').tolist()
                combined = ' | '.join([l for l in service_logs if l.strip()])
                if combined.strip():
                    log_texts[service] = combined[:512]  # Truncate for MiniLM
    
    return {
        'metrics': metrics, 'n_bins': n_bins, 'fault_idx': fault_idx,
        'pods': pods, 'services': services, 'log_texts': log_texts,
        'label': label, 'rc_service': rc_service,
        'pod_to_service_idx': pod_to_service_idx
    }

In [4]:
class RCADataset(Dataset):
    def __init__(self, scenario_ids, config, mode='train'):
        self.scenario_ids = scenario_ids
        self.config = config
        self.mode = mode
        
        # PRE-LOAD ALL DATA INTO MEMORY (major speedup)
        self.data_cache = {}
        all_services = set()
        for sid in scenario_ids:
            data = load_scenario_data(sid, FIXED_CONFIG['window_size'], FIXED_CONFIG['n_metrics'])
            self.data_cache[sid] = data
            all_services.update(data['services'])
        self.all_services = sorted(all_services)
        print(f"[Dataset] Cached {len(scenario_ids)} scenarios in memory")
    
    def __len__(self):
        return len(self.scenario_ids)
    
    def __getitem__(self, idx):
        sid = self.scenario_ids[idx]
        data = self.data_cache[sid]  # Use cached data instead of loading
        
        metrics = data['metrics'].copy()  # Copy to avoid modifying cache
        window = FIXED_CONFIG['window_size']
        n_bins = data['n_bins']
        fault_idx = data['fault_idx']
        
        if n_bins <= window:
            if n_bins < window:
                pad = window - n_bins
                metrics = np.pad(metrics, ((0, pad), (0, 0), (0, 0)), constant_values=np.nan)
        else:
            jitter = self.config.get('jitter', 0) if self.mode == 'train' else 0
            offset = np.random.randint(-jitter, jitter + 1) if jitter > 0 else 0
            ideal_start = fault_idx - window // 2 + offset
            start = max(0, min(ideal_start, n_bins - window))
            metrics = metrics[start:start + window]
        
        valid_mask = ~np.isnan(metrics).any(axis=-1)
        
        for m in range(metrics.shape[-1]):
            vals = metrics[:, :, m]
            valid = ~np.isnan(vals)
            if valid.sum() > 1:
                mean = np.nanmean(vals)
                std = np.nanstd(vals)
                if std > 1e-8:
                    metrics[:, :, m] = (vals - mean) / std
        
        if self.mode == 'train':
            noise_std = self.config.get('noise_std', 0.0)
            mask_prob = self.config.get('mask_prob', 0.0)
            if noise_std > 0:
                noise = np.random.normal(0, noise_std, metrics.shape).astype(np.float32)
                metrics = metrics + noise * valid_mask[:, :, np.newaxis]
            if mask_prob > 0:
                mask_random = np.random.random(valid_mask.shape) > mask_prob
                valid_mask = valid_mask & mask_random
        
        metrics = np.nan_to_num(metrics, nan=0.0)
        
        return {
            'scenario_id': sid,
            'metrics': torch.tensor(metrics, dtype=torch.float32),
            'metrics_mask': torch.tensor(valid_mask, dtype=torch.bool),
            'log_texts': data['log_texts'],
            'pods': data['pods'],
            'services': data['services'],
            'pod_to_service_idx': data['pod_to_service_idx'],
            'label': data['label'],
            'rc_service': data['rc_service']
        }

In [5]:
class MetricEncoder(nn.Module):
    def __init__(self, n_metrics, d_model, n_heads, n_layers, dropout):
        super().__init__()
        self.input_proj = nn.Linear(n_metrics, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_heads,
            dim_feedforward=d_model * 4, dropout=dropout,
            activation='gelu', batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.output_proj = nn.Linear(d_model, d_model)
    
    def forward(self, x, mask=None):
        B, W, P, M = x.shape
        x = x.permute(0, 2, 1, 3).reshape(B * P, W, M)
        if mask is not None:
            mask = mask.permute(0, 2, 1).reshape(B * P, W)
            src_key_padding_mask = ~mask
        else:
            src_key_padding_mask = None
        x = self.input_proj(x)
        x = self.encoder(x, src_key_padding_mask=src_key_padding_mask)
        x = x.mean(dim=1)
        x = self.output_proj(x)
        x = x.view(B, P, -1)
        return x

class LogEncoder(nn.Module):
    def __init__(self, output_dim, dropout):
        super().__init__()
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.sentence_model.eval()
        for p in self.sentence_model.parameters():
            p.requires_grad = False
        self.proj = nn.Sequential(
            nn.Linear(384, output_dim), nn.GELU(),
            nn.Dropout(dropout), nn.Linear(output_dim, output_dim)
        )
    
    def forward(self, log_texts_batch, services_batch):
        all_texts = []
        structure = []
        for log_texts, services in zip(log_texts_batch, services_batch):
            texts = [log_texts.get(s, '[N-LGS-DST-TKN-LEZHSA]') for s in services]
            all_texts.extend(texts)
            structure.append(len(services))
        if not all_texts:
            return []
        with torch.no_grad():
            embeddings = self.sentence_model.encode(all_texts, convert_to_tensor=True)
        # Clone to allow gradients through proj (sentence_model is frozen)
        embeddings = embeddings.clone().detach().requires_grad_(True)
        embeddings = self.proj(embeddings)
        result = []
        idx = 0
        for n in structure:
            result.append(embeddings[idx:idx + n])
            idx += n
        return result

class FusionLayer(nn.Module):
    def __init__(self, metric_dim, log_dim, output_dim, dropout):
        super().__init__()
        self.metric_dim = metric_dim
        self.fusion = nn.Sequential(
            nn.Linear(metric_dim * 2 + log_dim, output_dim), nn.GELU(),
            nn.Dropout(dropout), nn.Linear(output_dim, output_dim)
        )
    
    def forward(self, metric_emb, log_emb, pod_to_service_idx, services_batch):
        B = metric_emb.size(0)
        device = metric_emb.device
        fused = []
        for b in range(B):
            services = services_batch[b]
            n_svc = len(services)
            p2s = pod_to_service_idx[b]
            svc_features = []
            for s_idx in range(n_svc):
                pod_indices = [i for i, si in enumerate(p2s) if si == s_idx]
                if pod_indices:
                    pod_metrics = metric_emb[b, pod_indices]
                    svc_max = pod_metrics.max(dim=0).values
                    svc_mean = pod_metrics.mean(dim=0)
                    svc_metric = torch.cat([svc_max, svc_mean], dim=-1)
                else:
                    svc_metric = torch.zeros(2 * self.metric_dim, device=device)
                svc_log = log_emb[b][s_idx]
                combined = torch.cat([svc_metric, svc_log], dim=-1)
                svc_features.append(combined)
            svc_tensor = torch.stack(svc_features)
            fused.append(self.fusion(svc_tensor))
        return fused

class SimilarityClassifier(nn.Module):
    def __init__(self, embed_dim, temperature):
        super().__init__()
        self.temperature = temperature
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.sentence_model.eval()
        for p in self.sentence_model.parameters():
            p.requires_grad = False
        self.proj = nn.Linear(384, embed_dim)
    
    def forward(self, fused_batch, services_batch):
        B = len(fused_batch)
        device = fused_batch[0].device
        max_services = max(len(s) for s in services_batch)
        logits = []
        for b in range(B):
            services = services_batch[b]
            fused = fused_batch[b]
            with torch.no_grad():
                svc_emb = self.sentence_model.encode(services, convert_to_tensor=True)
            # Clone to allow gradients through proj (sentence_model is frozen)
            svc_emb = svc_emb.clone().detach().requires_grad_(True).to(device)
            svc_emb = self.proj(svc_emb)
            fused_norm = F.normalize(fused, dim=-1)
            svc_norm = F.normalize(svc_emb, dim=-1)
            sim = (fused_norm * svc_norm).sum(dim=-1) / self.temperature
            n_svc = len(services)
            if n_svc < max_services:
                pad = torch.full((max_services - n_svc,), -100.0, device=device)
                sim = torch.cat([sim, pad])
            logits.append(sim)
        return torch.stack(logits)

class MultimodalRCA(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.metric_encoder = MetricEncoder(
            n_metrics=FIXED_CONFIG['n_metrics'],
            d_model=config['d_model'], n_heads=config['n_heads'],
            n_layers=config['n_layers'], dropout=config['dropout']
        )
        self.log_encoder = LogEncoder(output_dim=config['d_model'], dropout=config['dropout'])
        self.fusion = FusionLayer(
            metric_dim=config['d_model'], log_dim=config['d_model'],
            output_dim=config['fusion_dim'], dropout=config['dropout']
        )
        self.classifier = SimilarityClassifier(
            embed_dim=config['fusion_dim'], temperature=config['temperature']
        )
    
    def forward(self, metrics, metrics_mask, log_texts, pod_to_service_idx, services):
        metric_emb = self.metric_encoder(metrics, metrics_mask)
        log_emb = self.log_encoder(log_texts, services)
        fused = self.fusion(metric_emb, log_emb, pod_to_service_idx, services)
        logits = self.classifier(fused, services)
        return logits

In [6]:
def collate_fn(batch):
    max_pods = max(s['metrics'].shape[1] for s in batch)
    window = FIXED_CONFIG['window_size']
    n_metrics = FIXED_CONFIG['n_metrics']
    
    metrics_padded, masks_padded, pod_to_service_padded = [], [], []
    for s in batch:
        m = s['metrics']
        mask = s['metrics_mask']
        n_pods = m.shape[1]
        if n_pods < max_pods:
            pad_m = torch.zeros(window, max_pods - n_pods, n_metrics)
            pad_mask = torch.zeros(window, max_pods - n_pods, dtype=torch.bool)
            m = torch.cat([m, pad_m], dim=1)
            mask = torch.cat([mask, pad_mask], dim=1)
        metrics_padded.append(m)
        masks_padded.append(mask)
        p2s = s['pod_to_service_idx'] + [-1] * (max_pods - n_pods)
        pod_to_service_padded.append(p2s)
    
    return {
        'scenario_ids': [s['scenario_id'] for s in batch],
        'metrics': torch.stack(metrics_padded),
        'metrics_mask': torch.stack(masks_padded),
        'log_texts': [s['log_texts'] for s in batch],
        'pods': [s['pods'] for s in batch],
        'services': [s['services'] for s in batch],
        'pod_to_service_idx': pod_to_service_padded,
        'labels': torch.tensor([s['label'] for s in batch], dtype=torch.long),
        'rc_services': [s['rc_service'] for s in batch]
    }

In [7]:
def train_epoch(model, dataloader, optimizer, config):
    model.train()
    total_loss, n_batches = 0, 0
    for batch in dataloader:
        metrics = batch['metrics'].to(DEVICE)
        metrics_mask = batch['metrics_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        logits = model(metrics, metrics_mask, batch['log_texts'],
                      batch['pod_to_service_idx'], batch['services'])
        loss = F.cross_entropy(logits, labels, label_smoothing=config.get('label_smoothing', 0.0))
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), config.get('grad_clip_norm', 1.0))
        optimizer.step()
        total_loss += loss.item()
        n_batches += 1
    return total_loss / n_batches

def evaluate(model, dataloader):
    model.eval()
    all_preds, all_labels, all_ranks = [], [], []
    with torch.no_grad():
        for batch in dataloader:
            metrics = batch['metrics'].to(DEVICE)
            metrics_mask = batch['metrics_mask'].to(DEVICE)
            logits = model(metrics, metrics_mask, batch['log_texts'],
                          batch['pod_to_service_idx'], batch['services'])
            preds = logits.argmax(dim=-1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(batch['labels'].tolist())
            for i, label in enumerate(batch['labels']):
                sorted_idx = logits[i].cpu().argsort(descending=True)
                rank = (sorted_idx == label).nonzero(as_tuple=True)[0].item() + 1
                all_ranks.append(rank)
    acc1 = sum(p == l for p, l in zip(all_preds, all_labels)) / len(all_labels)
    mrr = sum(1.0/r for r in all_ranks) / len(all_ranks)
    return {'acc1': acc1, 'mrr': mrr}

In [8]:
def objective(trial):
    config = {
        'd_model': trial.suggest_categorical('d_model', [64, 128, 256]),
        'n_heads': trial.suggest_categorical('n_heads', [2, 4, 8]),
        'n_layers': trial.suggest_int('n_layers', 1, 3),
        'fusion_dim': trial.suggest_categorical('fusion_dim', [128, 256]),
        'dropout': trial.suggest_float('dropout', 0.1, 0.5),
        'weight_decay': trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True),
        'label_smoothing': trial.suggest_float('label_smoothing', 0.0, 0.1),
        'grad_clip_norm': trial.suggest_float('grad_clip_norm', 0.5, 2.0),
        'batch_size': trial.suggest_categorical('batch_size', [8, 16, 32]),  # Increased for better GPU utilization
        'learning_rate': trial.suggest_float('lr', 1e-5, 5e-4, log=True),
        'warmup_epochs': trial.suggest_int('warmup_epochs', 3, 15),
        'temperature': trial.suggest_float('temperature', 0.05, 0.2),
        'noise_std': trial.suggest_float('noise_std', 0.0, 0.2),
        'mask_prob': trial.suggest_float('mask_prob', 0.0, 0.2),
        'jitter': trial.suggest_int('jitter', 0, 5)
    }
    
    if config['d_model'] % config['n_heads'] != 0:
        return 0.0
    
    all_scenarios = discover_scenarios()
    train_scenarios = [s for s in all_scenarios if get_split(s) == 'train']
    val_scenarios = [s for s in all_scenarios if get_split(s) == 'val']
    
    train_dataset = RCADataset(train_scenarios, config, mode='train')
    val_dataset = RCADataset(val_scenarios, config, mode='val')
    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False, collate_fn=collate_fn)
    
    model = MultimodalRCA(config).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
    
    best_mrr = 0.0
    patience_counter = 0
    
    for epoch in range(FIXED_CONFIG['epochs']):
        if epoch < config['warmup_epochs']:
            lr = config['learning_rate'] * (epoch + 1) / config['warmup_epochs']
            for pg in optimizer.param_groups:
                pg['lr'] = lr
        
        train_epoch(model, train_loader, optimizer, config)
        results = evaluate(model, val_loader)
        mrr = results['mrr']
        
        trial.report(mrr, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()
        
        if mrr > best_mrr:
            best_mrr = mrr
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= FIXED_CONFIG['early_stop_patience']:
                break
    
    return best_mrr

print('Objective function ready')

Objective function ready


In [9]:
# Optimized: ~8 hours with cache speedup, more random exploration
study = optuna.create_study(
    direction='maximize',
    study_name='multimodal_rca_hpo',
    # More random exploration to cover discrete space well
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=15,       # 15 random trials (up from 10)
        n_warmup_steps=12,         # 12 epochs before pruning each trial
        interval_steps=1           # Check for pruning every epoch
    ),
    sampler=optuna.samplers.TPESampler(
        n_startup_trials=15,       # Match pruner startup
        seed=42                    # Reproducibility
    )
)

print('='*70)
print('OPTUNA HYPERPARAMETER SEARCH')
print('='*70)
print(f'Objective: Maximize MRR')
print(f'Epochs per trial: {FIXED_CONFIG["epochs"]}')
print('='*70)

[I 2025-12-14 21:41:47,809] A new study created in memory with name: multimodal_rca_hpo


OPTUNA HYPERPARAMETER SEARCH
Objective: Maximize MRR
Epochs per trial: 30


In [10]:
# Balanced: 100 trials for ~8 hours total
N_TRIALS = 100
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True, gc_after_trial=True)
print(f'\nCompleted {len(study.trials)} trials')

  0%|          | 0/100 [00:00<?, ?it/s]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 0. Best value: 0.495667:   1%|          | 1/100 [02:51<4:43:12, 171.64s/it]

[I 2025-12-14 21:44:39,265] Trial 0 finished with value: 0.4956665864999198 and parameters: {'d_model': 128, 'n_heads': 2, 'n_layers': 1, 'fusion_dim': 128, 'dropout': 0.3832290311184182, 'weight_decay': 1.1527987128232396e-05, 'label_smoothing': 0.09699098521619944, 'grad_clip_norm': 1.7486639612006325, 'batch_size': 8, 'lr': 3.2877474139911175e-05, 'warmup_epochs': 9, 'temperature': 0.11479175279631737, 'noise_std': 0.058245828039608386, 'mask_prob': 0.1223705789444759, 'jitter': 0}. Best is trial 0 with value: 0.4956665864999198.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 0. Best value: 0.495667:   2%|▏         | 2/100 [04:39<3:39:18, 134.28s/it]

[I 2025-12-14 21:46:27,392] Trial 1 finished with value: 0.47889194139194136 and parameters: {'d_model': 256, 'n_heads': 2, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.16820964947491662, 'weight_decay': 1.5673095467235405e-05, 'label_smoothing': 0.09488855372533334, 'grad_clip_norm': 1.948448049611839, 'batch_size': 8, 'lr': 0.00014537555576161912, 'warmup_epochs': 8, 'temperature': 0.06830573522671683, 'noise_std': 0.09903538202225404, 'mask_prob': 0.006877704223043679, 'jitter': 5}. Best is trial 0 with value: 0.4956665864999198.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 0. Best value: 0.495667:   3%|▎         | 3/100 [07:30<4:04:00, 150.93s/it]

[I 2025-12-14 21:49:18,141] Trial 2 finished with value: 0.46569872393401807 and parameters: {'d_model': 128, 'n_heads': 4, 'n_layers': 3, 'fusion_dim': 256, 'dropout': 0.4579309401710595, 'weight_decay': 0.0006218704727769079, 'label_smoothing': 0.09218742350231168, 'grad_clip_norm': 0.6327387530778792, 'batch_size': 32, 'lr': 4.574578205475403e-05, 'warmup_epochs': 6, 'temperature': 0.17431062637278943, 'noise_std': 0.07135066533871785, 'mask_prob': 0.05618690193747616, 'jitter': 3}. Best is trial 0 with value: 0.4956665864999198.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 3. Best value: 0.518283:   3%|▎         | 3/100 [10:11<4:04:00, 150.93s/it]

[I 2025-12-14 21:51:58,871] Trial 3 finished with value: 0.5182829207829208 and parameters: {'d_model': 128, 'n_heads': 2, 'n_layers': 1, 'fusion_dim': 128, 'dropout': 0.3916028672163949, 'weight_decay': 0.0020597335357437196, 'label_smoothing': 0.007404465173409036, 'grad_clip_norm': 1.0376985928164089, 'batch_size': 16, 'lr': 3.649100451857357e-05, 'warmup_epochs': 3, 'temperature': 0.09664734825734933, 'noise_std': 0.06503666440534941, 'mask_prob': 0.1459212356676128, 'jitter': 3}. Best is trial 3 with value: 0.5182829207829208.


Best trial: 3. Best value: 0.518283:   4%|▍         | 4/100 [10:11<4:07:41, 154.80s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:   5%|▌         | 5/100 [11:55<3:36:23, 136.67s/it]

[I 2025-12-14 21:53:43,402] Trial 4 finished with value: 0.5811022927689594 and parameters: {'d_model': 64, 'n_heads': 4, 'n_layers': 3, 'fusion_dim': 256, 'dropout': 0.27101640734341986, 'weight_decay': 1.1919481947918725e-05, 'label_smoothing': 0.010789142699330446, 'grad_clip_norm': 0.5471437785301014, 'batch_size': 8, 'lr': 0.00034827974366176894, 'warmup_epochs': 6, 'temperature': 0.11155743845534447, 'noise_std': 0.15111022770860974, 'mask_prob': 0.045759633098324495, 'jitter': 0}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:   6%|▌         | 6/100 [14:47<3:52:52, 148.64s/it]

[I 2025-12-14 21:56:35,286] Trial 5 finished with value: 0.5514734648067982 and parameters: {'d_model': 256, 'n_heads': 8, 'n_layers': 3, 'fusion_dim': 256, 'dropout': 0.31573689676626027, 'weight_decay': 0.0026443593078398627, 'label_smoothing': 0.08960912999234932, 'grad_clip_norm': 0.9770052124577958, 'batch_size': 32, 'lr': 0.0002453480159918335, 'warmup_epochs': 14, 'temperature': 0.05104281957967861, 'noise_std': 0.10214946051551316, 'mask_prob': 0.08348220062975581, 'jitter': 1}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:   7%|▋         | 7/100 [19:00<4:43:26, 182.87s/it]

[I 2025-12-14 22:00:48,624] Trial 6 finished with value: 0.51303907917943 and parameters: {'d_model': 256, 'n_heads': 8, 'n_layers': 2, 'fusion_dim': 128, 'dropout': 0.20071291833014568, 'weight_decay': 0.0003102740950912838, 'label_smoothing': 0.03008783098167697, 'grad_clip_norm': 0.9272607415662014, 'batch_size': 16, 'lr': 1.223096868463762e-05, 'warmup_epochs': 6, 'temperature': 0.1862398828949981, 'noise_std': 0.04791237813339449, 'mask_prob': 0.02897897441824462, 'jitter': 2}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:   8%|▊         | 8/100 [22:53<5:04:48, 198.79s/it]

[I 2025-12-14 22:04:41,496] Trial 7 finished with value: 0.5423124805477747 and parameters: {'d_model': 64, 'n_heads': 2, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.31430987362990337, 'weight_decay': 1.865818136012483e-05, 'label_smoothing': 0.0835302495589238, 'grad_clip_norm': 0.9811700974576038, 'batch_size': 32, 'lr': 0.0001416320452869398, 'warmup_epochs': 3, 'temperature': 0.12681395874489215, 'noise_std': 0.04529915503958759, 'mask_prob': 0.12903455808189, 'jitter': 1}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:   9%|▉         | 9/100 [27:13<5:30:22, 217.83s/it]

[I 2025-12-14 22:09:01,206] Trial 8 finished with value: 0.5544254909941184 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 3, 'fusion_dim': 128, 'dropout': 0.3639936184136716, 'weight_decay': 0.0028292192255361887, 'label_smoothing': 0.055520081159946236, 'grad_clip_norm': 1.2944758675340098, 'batch_size': 32, 'lr': 0.00033867510216238227, 'warmup_epochs': 11, 'temperature': 0.1008544686573051, 'noise_std': 0.06984191492253218, 'mask_prob': 0.14519113577404788, 'jitter': 5}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  10%|█         | 10/100 [31:21<5:40:29, 226.99s/it]

[I 2025-12-14 22:13:08,700] Trial 9 finished with value: 0.5202098765432098 and parameters: {'d_model': 64, 'n_heads': 8, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.3654007076432223, 'weight_decay': 1.0355826161899173e-05, 'label_smoothing': 0.016080805141749865, 'grad_clip_norm': 1.3231006840498791, 'batch_size': 8, 'lr': 0.0001621702307943825, 'warmup_epochs': 6, 'temperature': 0.09880995472389018, 'noise_std': 0.14929828102360485, 'mask_prob': 0.12992657980944294, 'jitter': 5}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  11%|█         | 11/100 [35:14<5:39:37, 228.96s/it]

[I 2025-12-14 22:17:02,119] Trial 10 finished with value: 0.4903812636165577 and parameters: {'d_model': 64, 'n_heads': 2, 'n_layers': 3, 'fusion_dim': 256, 'dropout': 0.35245545039890513, 'weight_decay': 0.0024234491447023164, 'label_smoothing': 0.05026370931051921, 'grad_clip_norm': 1.3653558269395387, 'batch_size': 32, 'lr': 2.9993270329588456e-05, 'warmup_epochs': 3, 'temperature': 0.14682084438607518, 'noise_std': 0.03542213588140979, 'mask_prob': 0.18809171687058288, 'jitter': 5}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  12%|█▏        | 12/100 [39:28<5:46:54, 236.53s/it]

[I 2025-12-14 22:21:15,955] Trial 11 finished with value: 0.5424780774780774 and parameters: {'d_model': 64, 'n_heads': 8, 'n_layers': 3, 'fusion_dim': 128, 'dropout': 0.2540390914407701, 'weight_decay': 0.0035761029634855065, 'label_smoothing': 0.031692200515627766, 'grad_clip_norm': 0.7542391200291387, 'batch_size': 16, 'lr': 9.300725537404403e-05, 'warmup_epochs': 4, 'temperature': 0.14225108400487546, 'noise_std': 0.19801077002085266, 'mask_prob': 0.028016803047304806, 'jitter': 3}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  13%|█▎        | 13/100 [43:23<5:42:20, 236.10s/it]

[I 2025-12-14 22:25:11,070] Trial 12 finished with value: 0.5419715046512432 and parameters: {'d_model': 64, 'n_heads': 2, 'n_layers': 3, 'fusion_dim': 256, 'dropout': 0.4652962210225885, 'weight_decay': 0.00034200085877350014, 'label_smoothing': 0.05015162946871996, 'grad_clip_norm': 1.6974427684501627, 'batch_size': 32, 'lr': 0.0003251564545592096, 'warmup_epochs': 7, 'temperature': 0.10633744289599162, 'noise_std': 0.018796387968173803, 'mask_prob': 0.1156560281992348, 'jitter': 0}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  14%|█▍        | 14/100 [47:22<5:39:43, 237.02s/it]

[I 2025-12-14 22:29:10,203] Trial 13 finished with value: 0.5239791472144414 and parameters: {'d_model': 128, 'n_heads': 2, 'n_layers': 3, 'fusion_dim': 128, 'dropout': 0.3088973040219217, 'weight_decay': 0.002041647020718277, 'label_smoothing': 0.02158210274968432, 'grad_clip_norm': 1.4343357137285004, 'batch_size': 32, 'lr': 8.289395383720038e-05, 'warmup_epochs': 11, 'temperature': 0.15891370005839925, 'noise_std': 0.19517041589250694, 'mask_prob': 0.10326006966023907, 'jitter': 1}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  15%|█▌        | 15/100 [51:03<5:29:01, 232.26s/it]

[I 2025-12-14 22:32:51,432] Trial 14 finished with value: 0.5739798021911843 and parameters: {'d_model': 64, 'n_heads': 8, 'n_layers': 3, 'fusion_dim': 128, 'dropout': 0.16931772802833833, 'weight_decay': 2.946531769463084e-05, 'label_smoothing': 0.025024289816459534, 'grad_clip_norm': 1.3238399970591808, 'batch_size': 8, 'lr': 0.00041907086648071184, 'warmup_epochs': 12, 'temperature': 0.13315310787671011, 'noise_std': 0.12234414924687045, 'mask_prob': 0.08392001248555798, 'jitter': 1}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  16%|█▌        | 16/100 [55:05<5:29:08, 235.10s/it]

[I 2025-12-14 22:36:53,139] Trial 15 finished with value: 0.5470705220705221 and parameters: {'d_model': 64, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 128, 'dropout': 0.12033465363120699, 'weight_decay': 5.455418424847078e-05, 'label_smoothing': 0.001151555641127901, 'grad_clip_norm': 0.516313811933251, 'batch_size': 8, 'lr': 0.0004950196683805461, 'warmup_epochs': 15, 'temperature': 0.13377870961689356, 'noise_std': 0.14929030093805118, 'mask_prob': 0.07328442640807727, 'jitter': 0}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  17%|█▋        | 17/100 [57:29<4:47:18, 207.69s/it]

[I 2025-12-14 22:39:17,092] Trial 16 finished with value: 0.5202748885101827 and parameters: {'d_model': 64, 'n_heads': 4, 'n_layers': 3, 'fusion_dim': 128, 'dropout': 0.23715935832352925, 'weight_decay': 5.902063280496525e-05, 'label_smoothing': 0.03456937452482666, 'grad_clip_norm': 1.5426764423517132, 'batch_size': 8, 'lr': 0.0004815317874598218, 'warmup_epochs': 12, 'temperature': 0.08121441685918773, 'noise_std': 0.1423955604426823, 'mask_prob': 0.053384309188877745, 'jitter': 2}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  18%|█▊        | 18/100 [1:01:04<4:46:47, 209.85s/it]

[I 2025-12-14 22:42:51,954] Trial 17 finished with value: 0.5416931216931217 and parameters: {'d_model': 64, 'n_heads': 8, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.1447300967032409, 'weight_decay': 4.698088329030802e-05, 'label_smoothing': 0.07021924859446878, 'grad_clip_norm': 1.1215314993639969, 'batch_size': 8, 'lr': 0.0001994345485449423, 'warmup_epochs': 13, 'temperature': 0.16274201372236835, 'noise_std': 0.1241126030825291, 'mask_prob': 0.07999613987674298, 'jitter': 1}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  19%|█▉        | 19/100 [1:03:51<4:26:00, 197.05s/it]

[I 2025-12-14 22:45:39,189] Trial 18 finished with value: 0.5255796763149705 and parameters: {'d_model': 64, 'n_heads': 4, 'n_layers': 1, 'fusion_dim': 256, 'dropout': 0.24720494901309636, 'weight_decay': 0.00012394010600890351, 'label_smoothing': 0.016047427643090934, 'grad_clip_norm': 1.1574425690213608, 'batch_size': 8, 'lr': 0.00030640123446682343, 'warmup_epochs': 10, 'temperature': 0.12033230466846362, 'noise_std': 0.17217870695549853, 'mask_prob': 0.05101141384277922, 'jitter': 0}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  20%|██        | 20/100 [1:05:44<3:49:14, 171.93s/it]

[I 2025-12-14 22:47:32,552] Trial 19 pruned. 
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  21%|██        | 21/100 [1:08:32<3:44:27, 170.47s/it]

[I 2025-12-14 22:50:19,648] Trial 20 pruned. 
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  22%|██▏       | 22/100 [1:11:59<3:56:07, 181.63s/it]

[I 2025-12-14 22:53:47,306] Trial 21 finished with value: 0.5300176366843034 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 3, 'fusion_dim': 128, 'dropout': 0.4213156930028955, 'weight_decay': 0.00855801059063899, 'label_smoothing': 0.06980966400054013, 'grad_clip_norm': 1.240195914304413, 'batch_size': 8, 'lr': 0.00029952831147398145, 'warmup_epochs': 11, 'temperature': 0.11311156273997826, 'noise_std': 0.08535306106831572, 'mask_prob': 0.16215487498064857, 'jitter': 4}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  23%|██▎       | 23/100 [1:14:30<3:41:06, 172.29s/it]

[I 2025-12-14 22:56:17,810] Trial 22 finished with value: 0.5203181785534726 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 3, 'fusion_dim': 128, 'dropout': 0.28338267639206216, 'weight_decay': 0.0008612298485045519, 'label_smoothing': 0.06040346341022424, 'grad_clip_norm': 1.2349206593346485, 'batch_size': 32, 'lr': 0.00039987550927044515, 'warmup_epochs': 12, 'temperature': 0.09686895424240177, 'noise_std': 0.13138112699679086, 'mask_prob': 0.09404130334943556, 'jitter': 4}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  24%|██▍       | 24/100 [1:17:15<3:35:41, 170.29s/it]

[I 2025-12-14 22:59:03,415] Trial 23 finished with value: 0.5315855424295323 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 3, 'fusion_dim': 128, 'dropout': 0.19379187341470347, 'weight_decay': 2.7573806623740686e-05, 'label_smoothing': 0.04154580980764175, 'grad_clip_norm': 1.5422549282682705, 'batch_size': 16, 'lr': 0.00022184361901306492, 'warmup_epochs': 10, 'temperature': 0.1355939943910166, 'noise_std': 0.08816344599512897, 'mask_prob': 0.16143837975714065, 'jitter': 1}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  25%|██▌       | 25/100 [1:20:36<3:44:07, 179.31s/it]

[I 2025-12-14 23:02:23,763] Trial 24 finished with value: 0.5358649991983325 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 3, 'fusion_dim': 128, 'dropout': 0.276097592064039, 'weight_decay': 0.00011281032518470341, 'label_smoothing': 0.022731375258208154, 'grad_clip_norm': 0.8625940993764027, 'batch_size': 8, 'lr': 0.00036191215476585653, 'warmup_epochs': 13, 'temperature': 0.08494399814016797, 'noise_std': 0.16985683732703188, 'mask_prob': 0.06373600275400489, 'jitter': 0}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  26%|██▌       | 26/100 [1:24:59<4:12:07, 204.42s/it]

[I 2025-12-14 23:06:46,782] Trial 25 finished with value: 0.5318593135259801 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 3, 'fusion_dim': 128, 'dropout': 0.3298887557239407, 'weight_decay': 0.008399166209714733, 'label_smoothing': 0.06083718726421279, 'grad_clip_norm': 1.4173557085982544, 'batch_size': 32, 'lr': 0.00024507916059991064, 'warmup_epochs': 8, 'temperature': 0.14979912323106043, 'noise_std': 0.007803022053485063, 'mask_prob': 0.03794101726549515, 'jitter': 2}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  27%|██▋       | 27/100 [1:28:41<4:15:22, 209.90s/it]

[I 2025-12-14 23:10:29,464] Trial 26 finished with value: 0.5331657848324515 and parameters: {'d_model': 64, 'n_heads': 8, 'n_layers': 3, 'fusion_dim': 256, 'dropout': 0.42434383302252376, 'weight_decay': 2.084736203740847e-05, 'label_smoothing': 0.010946293944052993, 'grad_clip_norm': 1.1058440969135757, 'batch_size': 8, 'lr': 0.00012385122382375339, 'warmup_epochs': 11, 'temperature': 0.12699786798163518, 'noise_std': 0.11364835194553259, 'mask_prob': 0.10211659004377964, 'jitter': 1}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  28%|██▊       | 28/100 [1:30:29<3:35:11, 179.32s/it]

[I 2025-12-14 23:12:17,439] Trial 27 pruned. 
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  29%|██▉       | 29/100 [1:32:22<3:08:39, 159.43s/it]

[I 2025-12-14 23:14:10,472] Trial 28 pruned. 
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  30%|███       | 30/100 [1:34:59<3:05:02, 158.61s/it]

[I 2025-12-14 23:16:47,166] Trial 29 finished with value: 0.5534740545163661 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 1, 'fusion_dim': 128, 'dropout': 0.49833587255360284, 'weight_decay': 1.0041227522604653e-05, 'label_smoothing': 0.058706238447490254, 'grad_clip_norm': 1.8531492729425925, 'batch_size': 32, 'lr': 0.0004094646829497481, 'warmup_epochs': 9, 'temperature': 0.11423568963366451, 'noise_std': 0.1539108146803626, 'mask_prob': 0.11211045858431785, 'jitter': 0}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  30%|███       | 30/100 [1:37:29<3:05:02, 158.61s/it]

[I 2025-12-14 23:19:16,889] Trial 30 pruned. 


Best trial: 4. Best value: 0.581102:  31%|███       | 31/100 [1:37:29<2:59:20, 155.95s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  32%|███▏      | 32/100 [1:40:18<3:01:12, 159.88s/it]

[I 2025-12-14 23:22:05,953] Trial 31 finished with value: 0.5102388969055636 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 1, 'fusion_dim': 128, 'dropout': 0.493232518487022, 'weight_decay': 1.0092884918568377e-05, 'label_smoothing': 0.05937978347128285, 'grad_clip_norm': 1.9409873572727059, 'batch_size': 32, 'lr': 0.0003990319792113139, 'warmup_epochs': 9, 'temperature': 0.11737777040544278, 'noise_std': 0.15913296288730389, 'mask_prob': 0.11792924553820872, 'jitter': 0}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  33%|███▎      | 33/100 [1:42:26<2:47:58, 150.43s/it]

[I 2025-12-14 23:24:14,334] Trial 32 finished with value: 0.5320391119410728 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 1, 'fusion_dim': 128, 'dropout': 0.4155564191589497, 'weight_decay': 1.342527063944103e-05, 'label_smoothing': 0.05886262985250757, 'grad_clip_norm': 1.833875491661162, 'batch_size': 32, 'lr': 0.00028506386399570317, 'warmup_epochs': 8, 'temperature': 0.10896472965634718, 'noise_std': 0.10250057642577144, 'mask_prob': 0.09159392433917136, 'jitter': 0}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  33%|███▎      | 33/100 [1:45:58<2:47:58, 150.43s/it]

[I 2025-12-14 23:27:46,586] Trial 33 finished with value: 0.5316049382716049 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 1, 'fusion_dim': 128, 'dropout': 0.49816551282808275, 'weight_decay': 2.9320139610702495e-05, 'label_smoothing': 0.05456866058439854, 'grad_clip_norm': 1.4760013539074772, 'batch_size': 32, 'lr': 0.0004282963479786794, 'warmup_epochs': 12, 'temperature': 0.13059024761926039, 'noise_std': 0.18578289321603517, 'mask_prob': 0.14224471656749488, 'jitter': 1}. Best is trial 4 with value: 0.5811022927689594.


Best trial: 4. Best value: 0.581102:  34%|███▍      | 34/100 [1:45:58<3:05:52, 168.98s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  34%|███▍      | 34/100 [1:48:48<3:05:52, 168.98s/it]

[I 2025-12-14 23:30:35,657] Trial 34 finished with value: 0.5638007054673722 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 128, 'dropout': 0.3915938084449787, 'weight_decay': 0.001261361149236636, 'label_smoothing': 0.008578713140243416, 'grad_clip_norm': 1.8180946231714583, 'batch_size': 32, 'lr': 0.00035266829557304785, 'warmup_epochs': 7, 'temperature': 0.1021106050993984, 'noise_std': 0.16129770813404312, 'mask_prob': 0.10904534289991677, 'jitter': 0}. Best is trial 4 with value: 0.5811022927689594.


Best trial: 4. Best value: 0.581102:  35%|███▌      | 35/100 [1:48:48<3:03:05, 169.01s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  35%|███▌      | 35/100 [1:51:32<3:03:05, 169.01s/it]

[I 2025-12-14 23:33:20,009] Trial 35 finished with value: 0.5700232061635571 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.37919606714469456, 'weight_decay': 0.001271755232289605, 'label_smoothing': 0.007671905644722428, 'grad_clip_norm': 0.6789785884026921, 'batch_size': 32, 'lr': 0.0002606381617442105, 'warmup_epochs': 7, 'temperature': 0.07278733425136487, 'noise_std': 0.07154082865207743, 'mask_prob': 0.004693231081162277, 'jitter': 3}. Best is trial 4 with value: 0.5811022927689594.


Best trial: 4. Best value: 0.581102:  36%|███▌      | 36/100 [1:51:32<2:58:47, 167.61s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  36%|███▌      | 36/100 [1:55:44<2:58:47, 167.61s/it]

[I 2025-12-14 23:37:31,895] Trial 36 finished with value: 0.5658473625140292 and parameters: {'d_model': 128, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.40278064790559054, 'weight_decay': 0.0009840075716925666, 'label_smoothing': 0.008064702216195386, 'grad_clip_norm': 0.6430703214903457, 'batch_size': 32, 'lr': 0.0001753025563502927, 'warmup_epochs': 7, 'temperature': 0.06961864071816912, 'noise_std': 0.13509585740857605, 'mask_prob': 0.005566635241909174, 'jitter': 3}. Best is trial 4 with value: 0.5811022927689594.


Best trial: 4. Best value: 0.581102:  37%|███▋      | 37/100 [1:55:44<3:22:32, 192.89s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  37%|███▋      | 37/100 [2:00:08<3:22:32, 192.89s/it]

[I 2025-12-14 23:41:56,360] Trial 37 finished with value: 0.5477384960718295 and parameters: {'d_model': 128, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.46184551690512965, 'weight_decay': 0.00045838266169208347, 'label_smoothing': 0.01592663008842142, 'grad_clip_norm': 0.6313844410954206, 'batch_size': 8, 'lr': 0.00016828690470899253, 'warmup_epochs': 7, 'temperature': 0.06516099644952655, 'noise_std': 0.09822214504479954, 'mask_prob': 0.0014442997151568851, 'jitter': 3}. Best is trial 4 with value: 0.5811022927689594.


Best trial: 4. Best value: 0.581102:  38%|███▊      | 38/100 [2:00:08<3:41:30, 214.37s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  38%|███▊      | 38/100 [2:04:44<3:41:30, 214.37s/it]

[I 2025-12-14 23:46:32,154] Trial 38 finished with value: 0.5427777777777778 and parameters: {'d_model': 128, 'n_heads': 8, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.3835543791122773, 'weight_decay': 0.0012437502576649446, 'label_smoothing': 0.006814848447030723, 'grad_clip_norm': 0.6493051768674112, 'batch_size': 16, 'lr': 0.00012994342659856503, 'warmup_epochs': 5, 'temperature': 0.05874530775846595, 'noise_std': 0.13087797425198847, 'mask_prob': 0.016752127147749353, 'jitter': 3}. Best is trial 4 with value: 0.5811022927689594.


Best trial: 4. Best value: 0.581102:  39%|███▉      | 39/100 [2:04:44<3:56:40, 232.80s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  39%|███▉      | 39/100 [2:06:37<3:56:40, 232.80s/it]

[I 2025-12-14 23:48:24,884] Trial 39 pruned. 


Best trial: 4. Best value: 0.581102:  40%|████      | 40/100 [2:06:37<3:16:46, 196.77s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  40%|████      | 40/100 [2:10:59<3:16:46, 196.77s/it]

[I 2025-12-14 23:52:46,856] Trial 40 finished with value: 0.5626063952150908 and parameters: {'d_model': 128, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.4064349059123773, 'weight_decay': 0.0002039156271163522, 'label_smoothing': 0.0048761450742007394, 'grad_clip_norm': 0.762897691790038, 'batch_size': 32, 'lr': 6.0122816512101934e-05, 'warmup_epochs': 6, 'temperature': 0.058282320281537084, 'noise_std': 0.14049946907752583, 'mask_prob': 0.04373202612926256, 'jitter': 4}. Best is trial 4 with value: 0.5811022927689594.


Best trial: 4. Best value: 0.581102:  41%|████      | 41/100 [2:10:59<3:32:43, 216.33s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  41%|████      | 41/100 [2:13:55<3:32:43, 216.33s/it]

[I 2025-12-14 23:55:43,597] Trial 41 finished with value: 0.5311201761201761 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.4465293130426559, 'weight_decay': 0.0013174521842633004, 'label_smoothing': 0.011129129672022592, 'grad_clip_norm': 0.7008809882243399, 'batch_size': 32, 'lr': 0.00024564249253955034, 'warmup_epochs': 7, 'temperature': 0.07379369714397188, 'noise_std': 0.16314121003315157, 'mask_prob': 0.002759123421777754, 'jitter': 3}. Best is trial 4 with value: 0.5811022927689594.


Best trial: 4. Best value: 0.581102:  42%|████▏     | 42/100 [2:13:55<3:17:38, 204.46s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  43%|████▎     | 43/100 [2:16:59<3:08:18, 198.21s/it]

[I 2025-12-14 23:58:47,256] Trial 42 finished with value: 0.539304152637486 and parameters: {'d_model': 128, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.3762754016325732, 'weight_decay': 0.0004946007441005523, 'label_smoothing': 0.011361739655777608, 'grad_clip_norm': 0.8694452824063542, 'batch_size': 32, 'lr': 0.00020021905314460193, 'warmup_epochs': 8, 'temperature': 0.05210697586703822, 'noise_std': 0.18372139386274167, 'mask_prob': 0.019175063826506866, 'jitter': 1}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  43%|████▎     | 43/100 [2:19:47<3:08:18, 198.21s/it]

[I 2025-12-15 00:01:35,724] Trial 43 finished with value: 0.5285470484490092 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.39590197092547924, 'weight_decay': 0.0009927264122581078, 'label_smoothing': 0.0177328294027517, 'grad_clip_norm': 0.6006192409907012, 'batch_size': 32, 'lr': 0.00035191865018520944, 'warmup_epochs': 7, 'temperature': 0.09090922746763068, 'noise_std': 0.14175534695029662, 'mask_prob': 0.027830533245986683, 'jitter': 2}. Best is trial 4 with value: 0.5811022927689594.


Best trial: 4. Best value: 0.581102:  44%|████▍     | 44/100 [2:19:48<2:56:40, 189.29s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  45%|████▌     | 45/100 [2:23:01<2:54:40, 190.56s/it]

[I 2025-12-15 00:04:49,258] Trial 44 finished with value: 0.5630836139169472 and parameters: {'d_model': 64, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.3381693154177482, 'weight_decay': 0.0016068177969678905, 'label_smoothing': 0.0051406464359029375, 'grad_clip_norm': 1.022944203495627, 'batch_size': 32, 'lr': 0.00016257240839069544, 'warmup_epochs': 6, 'temperature': 0.07432963905605441, 'noise_std': 0.10830982456259061, 'mask_prob': 0.08590341596849055, 'jitter': 3}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  46%|████▌     | 46/100 [2:25:02<2:32:46, 169.75s/it]

[I 2025-12-15 00:06:50,447] Trial 45 pruned. 
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  47%|████▋     | 47/100 [2:27:19<2:21:07, 159.77s/it]

[I 2025-12-15 00:09:06,945] Trial 46 finished with value: 0.5104579987913321 and parameters: {'d_model': 256, 'n_heads': 2, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.43970594105850236, 'weight_decay': 0.000607304965100166, 'label_smoothing': 0.019959550743934205, 'grad_clip_norm': 1.9954437042351767, 'batch_size': 16, 'lr': 0.00044713437366810743, 'warmup_epochs': 6, 'temperature': 0.10288960960721331, 'noise_std': 0.06516033626831375, 'mask_prob': 0.010272616217958294, 'jitter': 0}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  47%|████▋     | 47/100 [2:30:04<2:21:07, 159.77s/it]

[I 2025-12-15 00:11:51,928] Trial 47 finished with value: 0.5457002766423056 and parameters: {'d_model': 128, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.32435259279332623, 'weight_decay': 1.8073405750029056e-05, 'label_smoothing': 0.09965344069268112, 'grad_clip_norm': 0.7052823116879536, 'batch_size': 8, 'lr': 0.00033805193102360074, 'warmup_epochs': 8, 'temperature': 0.08905079747917713, 'noise_std': 0.03783085031647454, 'mask_prob': 0.04726975169894775, 'jitter': 3}. Best is trial 4 with value: 0.5811022927689594.


Best trial: 4. Best value: 0.581102:  48%|████▊     | 48/100 [2:30:04<2:19:49, 161.34s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  49%|████▉     | 49/100 [2:32:03<2:06:29, 148.82s/it]

[I 2025-12-15 00:13:51,556] Trial 48 pruned. 
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  49%|████▉     | 49/100 [2:33:57<2:06:29, 148.82s/it]

[I 2025-12-15 00:15:45,145] Trial 49 pruned. 


Best trial: 4. Best value: 0.581102:  50%|█████     | 50/100 [2:33:57<1:55:12, 138.26s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  51%|█████     | 51/100 [2:37:26<2:10:15, 159.51s/it]

[I 2025-12-15 00:19:14,249] Trial 50 finished with value: 0.5314373475776984 and parameters: {'d_model': 256, 'n_heads': 2, 'n_layers': 3, 'fusion_dim': 256, 'dropout': 0.35684001940197224, 'weight_decay': 0.003143727247355676, 'label_smoothing': 0.01378062341867476, 'grad_clip_norm': 0.9088114479303832, 'batch_size': 32, 'lr': 0.0004965747094868234, 'warmup_epochs': 5, 'temperature': 0.1441805377841077, 'noise_std': 0.15101943174216279, 'mask_prob': 0.03753322647075045, 'jitter': 2}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  52%|█████▏    | 52/100 [2:39:11<1:54:26, 143.05s/it]

[I 2025-12-15 00:20:58,897] Trial 51 pruned. 
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  53%|█████▎    | 53/100 [2:40:56<1:43:11, 131.73s/it]

[I 2025-12-15 00:22:44,217] Trial 52 pruned. 
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  54%|█████▍    | 54/100 [2:44:51<2:04:41, 162.65s/it]

[I 2025-12-15 00:26:38,998] Trial 53 finished with value: 0.5723418121457337 and parameters: {'d_model': 64, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.3360198781796106, 'weight_decay': 0.0009622849755364431, 'label_smoothing': 0.009002983229825314, 'grad_clip_norm': 0.9364847296183094, 'batch_size': 32, 'lr': 0.00015025547502463198, 'warmup_epochs': 7, 'temperature': 0.07633055027302116, 'noise_std': 0.1090217308922487, 'mask_prob': 0.1256854550824144, 'jitter': 3}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  55%|█████▌    | 55/100 [2:46:31<1:47:57, 143.94s/it]

[I 2025-12-15 00:28:19,309] Trial 54 pruned. 
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  56%|█████▌    | 56/100 [2:48:17<1:37:08, 132.46s/it]

[I 2025-12-15 00:30:04,963] Trial 55 pruned. 
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  57%|█████▋    | 57/100 [2:51:20<1:45:45, 147.56s/it]

[I 2025-12-15 00:33:07,773] Trial 56 finished with value: 0.5238007054673721 and parameters: {'d_model': 64, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.31893320730058144, 'weight_decay': 0.0006604523683589959, 'label_smoothing': 0.008857187964720263, 'grad_clip_norm': 0.6610521711562374, 'batch_size': 8, 'lr': 0.00021920869587313402, 'warmup_epochs': 8, 'temperature': 0.09584533774938046, 'noise_std': 0.1806226658552806, 'mask_prob': 0.12329438803788263, 'jitter': 2}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  58%|█████▊    | 58/100 [2:53:06<1:34:42, 135.30s/it]

[I 2025-12-15 00:34:54,442] Trial 57 pruned. 
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  59%|█████▉    | 59/100 [2:55:40<1:36:13, 140.81s/it]

[I 2025-12-15 00:37:28,122] Trial 58 finished with value: 0.5234617744038034 and parameters: {'d_model': 256, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.3887040974840062, 'weight_decay': 1.559343861246598e-05, 'label_smoothing': 0.014135179450181307, 'grad_clip_norm': 1.6696235540761706, 'batch_size': 8, 'lr': 0.0003697710619715144, 'warmup_epochs': 14, 'temperature': 0.1727813906213071, 'noise_std': 0.1323662517064758, 'mask_prob': 0.07820383897973904, 'jitter': 0}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  60%|██████    | 60/100 [2:59:05<1:46:39, 160.00s/it]

[I 2025-12-15 00:40:52,884] Trial 59 finished with value: 0.5458221807241415 and parameters: {'d_model': 128, 'n_heads': 4, 'n_layers': 3, 'fusion_dim': 128, 'dropout': 0.43987212893136246, 'weight_decay': 0.00043971351763282283, 'label_smoothing': 0.0031563495795968533, 'grad_clip_norm': 1.079510292699057, 'batch_size': 16, 'lr': 0.0001491542480924514, 'warmup_epochs': 9, 'temperature': 0.06767265343259518, 'noise_std': 0.09936680422859259, 'mask_prob': 0.1027886400175175, 'jitter': 2}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  61%|██████    | 61/100 [3:00:48<1:32:58, 143.04s/it]

[I 2025-12-15 00:42:36,369] Trial 60 finished with value: 0.5294781852134793 and parameters: {'d_model': 64, 'n_heads': 2, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.13466741093197393, 'weight_decay': 2.330853721911466e-05, 'label_smoothing': 0.02443388811946955, 'grad_clip_norm': 1.1983467507303482, 'batch_size': 8, 'lr': 0.0002687653726717407, 'warmup_epochs': 6, 'temperature': 0.1021627593946521, 'noise_std': 0.10618898279946647, 'mask_prob': 0.0970527031137983, 'jitter': 3}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  62%|██████▏   | 62/100 [3:04:29<1:45:24, 166.44s/it]

[I 2025-12-15 00:46:17,395] Trial 61 finished with value: 0.5661956870290203 and parameters: {'d_model': 64, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.33614391562651713, 'weight_decay': 0.0015470102880145453, 'label_smoothing': 0.005921821234436757, 'grad_clip_norm': 0.9451852923102726, 'batch_size': 32, 'lr': 0.00018206217372601608, 'warmup_epochs': 6, 'temperature': 0.07639890223182975, 'noise_std': 0.11211908381386287, 'mask_prob': 0.08544492060579362, 'jitter': 3}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  63%|██████▎   | 63/100 [3:06:15<1:31:27, 148.31s/it]

[I 2025-12-15 00:48:03,386] Trial 62 pruned. 
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  64%|██████▍   | 64/100 [3:08:43<1:28:52, 148.13s/it]

[I 2025-12-15 00:50:31,128] Trial 63 finished with value: 0.5371172037838704 and parameters: {'d_model': 64, 'n_heads': 4, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.29617932975795586, 'weight_decay': 0.0026080520456896648, 'label_smoothing': 0.01459976412888812, 'grad_clip_norm': 0.7052990452535293, 'batch_size': 32, 'lr': 0.00033177348766200257, 'warmup_epochs': 7, 'temperature': 0.06387104248172408, 'noise_std': 0.09129864101394453, 'mask_prob': 0.11069356412398262, 'jitter': 3}. Best is trial 4 with value: 0.5811022927689594.
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  65%|██████▌   | 65/100 [3:10:24<1:18:13, 134.11s/it]

[I 2025-12-15 00:52:12,503] Trial 64 pruned. 
[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  65%|██████▌   | 65/100 [3:12:26<1:18:13, 134.11s/it]

[I 2025-12-15 00:54:14,085] Trial 65 pruned. 


Best trial: 4. Best value: 0.581102:  66%|██████▌   | 66/100 [3:12:26<1:13:52, 130.35s/it]

[Dataset] Cached 53 scenarios in memory
[Dataset] Cached 45 scenarios in memory


Best trial: 4. Best value: 0.581102:  66%|██████▌   | 66/100 [3:16:03<1:13:52, 130.35s/it]

[W 2025-12-15 00:57:50,921] Trial 66 failed with parameters: {'d_model': 256, 'n_heads': 8, 'n_layers': 2, 'fusion_dim': 256, 'dropout': 0.409307297811371, 'weight_decay': 0.0017725276229324008, 'label_smoothing': 0.01781246010867884, 'grad_clip_norm': 0.7975406800204813, 'batch_size': 32, 'lr': 0.0003032311333761149, 'warmup_epochs': 7, 'temperature': 0.12563714828155803, 'noise_std': 0.14722279099708685, 'mask_prob': 0.08132007347742223, 'jitter': 0} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/root/lemm/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_14370/1721322844.py", line 44, in objective
    train_epoch(model, train_loader, optimizer, config)
  File "/tmp/ipykernel_14370/3145803616.py", line 8, in train_epoch
    logits = model(metrics, metrics_mask, batch['log_texts'],
             ^^^^^^^^^^^^^^^

Best trial: 4. Best value: 0.581102:  66%|██████▌   | 66/100 [3:16:03<1:40:59, 178.23s/it]


KeyboardInterrupt: 

In [11]:
# Celda para analizar importancia
import optuna.importance
import json

print("="*70)
print("ANÁLISIS DE IMPORTANCIA DE PARÁMETROS")
print("="*70)

# Obtener importancias
importances = optuna.importance.get_param_importances(study)

print("\nImportancia de cada parámetro (mayor = más impacto en MRR):\n")
for param, importance in sorted(importances.items(), key=lambda x: x[1], reverse=True):
    bar = "█" * int(importance * 50)
    print(f"{param:20s} {importance:.4f} {bar}")

# Ver parámetros del mejor trial
print("\n" + "="*70)
print("MEJOR TRIAL (#4)")
print("="*70)
best_trial = study.best_trial
print(f"\nMRR: {best_trial.value:.4f}")
print("\nParámetros:")
for k, v in best_trial.params.items():
    print(f"  {k}: {v}")

# Guardar para LODOCV
with open('/root/lemm/best_hyperparams.json', 'w') as f:
    json.dump(best_trial.params, f, indent=2)
print("\n✓ Guardado en /root/lemm/best_hyperparams.json")

# Ver distribución de los mejores trials
print("\n" + "="*70)
print("TOP 10 TRIALS")
print("="*70)
trials_df = study.trials_dataframe()
top10 = trials_df.nsmallest(10, 'value') if 'value' in trials_df else trials_df.head(10)
print(top10[['number', 'value', 'params_d_model', 'params_n_layers', 'params_dropout']].to_string())

ANÁLISIS DE IMPORTANCIA DE PARÁMETROS

Importancia de cada parámetro (mayor = más impacto en MRR):

lr                   0.3165 ███████████████
label_smoothing      0.2782 █████████████
grad_clip_norm       0.1610 ████████
n_heads              0.0544 ██
temperature          0.0512 ██
noise_std            0.0297 █
dropout              0.0293 █
jitter               0.0186 
warmup_epochs        0.0179 
mask_prob            0.0161 
batch_size           0.0108 
fusion_dim           0.0061 
d_model              0.0048 
n_layers             0.0033 
weight_decay         0.0021 

MEJOR TRIAL (#4)

MRR: 0.5811

Parámetros:
  d_model: 64
  n_heads: 4
  n_layers: 3
  fusion_dim: 256
  dropout: 0.27101640734341986
  weight_decay: 1.1919481947918725e-05
  label_smoothing: 0.010789142699330446
  grad_clip_norm: 0.5471437785301014
  batch_size: 8
  lr: 0.00034827974366176894
  warmup_epochs: 6
  temperature: 0.11155743845534447
  noise_std: 0.15111022770860974
  mask_prob: 0.045759633098324495
  jitte

In [None]:
print('='*70)
print('BEST TRIAL')
print('='*70)

best_trial = study.best_trial
print(f'\nBest MRR: {best_trial.value:.4f}')
print('\nBest hyperparameters:')
for key, value in best_trial.params.items():
    print(f'  {key}: {value}')

pruned = len([t for t in study.trials if t.state == TrialState.PRUNED])
complete = len([t for t in study.trials if t.state == TrialState.COMPLETE])
print(f'\nTrial statistics: Complete={complete}, Pruned={pruned}')

In [None]:
best_config = {**FIXED_CONFIG, **best_trial.params}
with open('/root/lemm/best_hyperparams.json', 'w') as f:
    json.dump(best_config, f, indent=2)
print('Saved to /root/lemm/best_hyperparams.json')

# Second Round: Focused Search

Based on importance analysis:
- **Explore deeply**: lr, label_smoothing, grad_clip_norm, d_model, n_layers, n_heads, temperature
- **Fix**: dropout, noise_std, jitter, warmup_epochs, mask_prob, batch_size, fusion_dim, weight_decay


In [5]:
# Second round objective with fixed low-importance params
def objective_v2(trial):
    """Focused objective: fix low-importance params, explore important ones."""
    
    # FIXED PARAMS (from trial #4, <3% importance)
    dropout = 0.27
    noise_std = 0.15
    jitter = 0
    warmup_epochs = 6
    mask_prob = 0.05
    batch_size = 8
    fusion_dim = 256
    weight_decay = 1e-5
    
    # EXPLORE: High importance params (fine-grained)
    lr = trial.suggest_float('lr', 1e-4, 5e-4, step=2e-5)  # 21 values
    label_smoothing = trial.suggest_float('label_smoothing', 0.0, 0.05, step=0.005)  # 11 values
    grad_clip_norm = trial.suggest_float('grad_clip_norm', 0.3, 1.0, step=0.1)  # 8 values
    
    # EXPLORE: Architecture (we don't know if low importance is real)
    d_model = trial.suggest_categorical('d_model', [32, 64])
    n_layers = trial.suggest_int('n_layers', 3)
    n_heads = trial.suggest_categorical('n_heads', [4])
    
    # EXPLORE: Medium importance
    temperature = trial.suggest_float('temperature', 0.05, 0.20, step=0.025)  # 7 values
    
    config = {
        **FIXED_CONFIG,
        'd_model': d_model,
        'n_heads': n_heads,
        'n_layers': n_layers,
        'fusion_dim': fusion_dim,
        'dropout': dropout,
        'weight_decay': weight_decay,
        'label_smoothing': label_smoothing,
        'grad_clip_norm': grad_clip_norm,
        'batch_size': batch_size,
        'learning_rate': lr,
        'warmup_epochs': warmup_epochs,
        'temperature': temperature,
        'noise_std': noise_std,
        'mask_prob': mask_prob,
        'jitter': jitter,
    }
    
    # Create datasets
    train_scenarios, val_scenarios = [], []
    for s in discover_scenarios():
        if s not in EXCLUDED:
            if get_split(s) == 'train':
                train_scenarios.append(s)
            else:
                val_scenarios.append(s)
    
    train_ds = RCADataset(train_scenarios, config, mode='train')
    val_ds = RCADataset(val_scenarios, config, mode='val')
    
    train_loader = DataLoader(train_ds, batch_size=config['batch_size'], 
                             shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_ds, batch_size=config['batch_size'],
                           shuffle=False, collate_fn=collate_fn)
    
    # Create model
    model = MultimodalRCA(config).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), 
                                  lr=config['learning_rate'],
                                  weight_decay=config['weight_decay'])
    
    # Training with pruning
    best_mrr = 0.0
    for epoch in range(config['epochs']):
        train_epoch(model, train_loader, optimizer, config)
        metrics = evaluate(model, val_loader)
        mrr = metrics['mrr']
        
        if mrr > best_mrr:
            best_mrr = mrr
        
        # Report for pruning
        trial.report(mrr, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()
    
    return best_mrr

print("Second round objective defined")


Second round objective defined


In [6]:
# Create new study for second round
study_v2 = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(
        seed=42,
        n_startup_trials=10,  # 10 random trials first
    ),
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=10,
        n_warmup_steps=12,
        interval_steps=1
    )
)

# Seed with best trial from round 1
study_v2.enqueue_trial({
    'lr': 0.00035,  # Close to trial #4
    'label_smoothing': 0.01,
    'grad_clip_norm': 0.55,
    'd_model': 64,
    'n_layers': 3,
    'n_heads': 4,
    'temperature': 0.11,
})

print("Study v2 created, seeded with trial #4 values")
print("Starting second round: 50 trials (~2-3 hours)")


[I 2025-12-15 13:38:56,368] A new study created in memory with name: no-name-bdfcaa8e-5eda-4835-8764-0c375b2f19a7


Study v2 created, seeded with trial #4 values
Starting second round: 50 trials (~2-3 hours)


In [7]:
# Run second round - 50 trials focused search
N_TRIALS_V2 = 50
study_v2.optimize(objective_v2, n_trials=N_TRIALS_V2, show_progress_bar=True, gc_after_trial=True)
print(f'\nCompleted {len(study_v2.trials)} trials')


  0%|          | 0/50 [00:00<?, ?it/s]


[W 2025-12-15 13:39:00,118] Trial 0 failed with parameters: {'lr': 0.00035, 'label_smoothing': 0.01, 'grad_clip_norm': 0.55, 'd_model': 64} because of the following error: TypeError("Trial.suggest_int() missing 1 required positional argument: 'high'").
Traceback (most recent call last):
  File "/root/lemm/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_18622/4235438134.py", line 22, in objective_v2
    n_layers = trial.suggest_int('n_layers', 3)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/lemm/.venv/lib/python3.12/site-packages/optuna/_convert_positional_args.py", line 135, in converter_wrapper
    return func(**kwargs)  # type: ignore[call-arg]
           ^^^^^^^^^^^^^^
TypeError: Trial.suggest_int() missing 1 required positional argument: 'high'
[W 2025-12-15 13:39:00,124] Trial 0 failed with value None.


TypeError: Trial.suggest_int() missing 1 required positional argument: 'high'

In [None]:
# Analyze second round results
print("="*70)
print("SECOND ROUND RESULTS")
print("="*70)

# Importance analysis
importances_v2 = optuna.importance.get_param_importances(study_v2)

print("\nImportancia de parámetros (segunda ronda):\n")
for param, importance in sorted(importances_v2.items(), key=lambda x: x[1], reverse=True):
    bar = "█" * int(importance * 50)
    print(f"{param:20s} {importance:.4f} {bar}")

# Best trial
print("\n" + "="*70)
print("MEJOR TRIAL - SEGUNDA RONDA")
print("="*70)
best_v2 = study_v2.best_trial
print(f"\nMRR: {best_v2.value:.4f}")
print(f"Mejor que ronda 1? {best_v2.value:.4f} vs 0.5811")
print("\nParámetros:")
for k, v in best_v2.params.items():
    print(f"  {k}: {v}")

# Compare with round 1
print("\n" + "="*70)
print("COMPARACIÓN")
print("="*70)
print(f"Ronda 1 mejor: 0.5811 (trial #4)")
print(f"Ronda 2 mejor: {best_v2.value:.4f}")
if best_v2.value > 0.5811:
    print("✓ MEJORAMOS!")
else:
    print("✗ No mejoramos, usar trial #4 de ronda 1")


SECOND ROUND RESULTS

Importancia de parámetros (segunda ronda):

temperature          0.3299 ████████████████
label_smoothing      0.1622 ████████
grad_clip_norm       0.1514 ███████
lr                   0.1449 ███████
d_model              0.1016 █████
n_heads              0.0605 ███
n_layers             0.0496 ██

MEJOR TRIAL - SEGUNDA RONDA

MRR: 0.5788
Mejor que ronda 1? 0.5788 vs 0.5811

Parámetros:
  lr: 0.00036
  label_smoothing: 0.0
  grad_clip_norm: 0.7
  d_model: 64
  n_layers: 3
  n_heads: 2
  temperature: 0.175

COMPARACIÓN
Ronda 1 mejor: 0.5811 (trial #4)
Ronda 2 mejor: 0.5788
✗ No mejoramos, usar trial #4 de ronda 1


In [1]:
# Save best hyperparameters (from whichever round was better)
if best_v2.value > 0.5811:
    # Use round 2 params + fixed params
    final_params = {
        **best_v2.params,
        # Add fixed params back
        'dropout': 0.27,
        'noise_std': 0.15,
        'jitter': 0,
        'warmup_epochs': 6,
        'mask_prob': 0.05,
        'batch_size': 8,
        'fusion_dim': 256,
        'weight_decay': 1e-5,
    }
    source = "round 2"
else:
    # Use round 1 params (trial #4)
    final_params = study.best_trial.params
    source = "round 1 (trial #4)"

# Add fixed config
final_config = {**FIXED_CONFIG, **final_params}

with open('/root/lemm/best_hyperparams.json', 'w') as f:
    json.dump(final_config, f, indent=2)

print(f"✓ Saved best hyperparameters from {source} to /root/lemm/best_hyperparams.json")
print("\nFinal config for LODOCV:")
for k, v in final_config.items():
    print(f"  {k}: {v}")


NameError: name 'best_v2' is not defined