In [None]:
!pip install torchmetrics mlflow databricks-cli requests optuna joblib optuna-integration[mlflow]

In [None]:
import mlflow.pytorch
import os
import mlflow
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get credentials from environment variables (None when a variable is unset)
DATABRICKS_HOST = os.getenv("DATABRICKS_HOST")
DATABRICKS_TOKEN = os.getenv("DATABRICKS_TOKEN")

# Set environment variables for MLflow.
# Only values that are actually present are written back: assigning None into
# os.environ raises TypeError, which previously aborted this cell before the
# connection test below could report anything useful.
for _cred_name, _cred_value in (
    ("DATABRICKS_HOST", DATABRICKS_HOST),
    ("DATABRICKS_TOKEN", DATABRICKS_TOKEN),
):
    if _cred_value:
        os.environ[_cred_name] = _cred_value
    else:
        print(f"‚ùå {_cred_name} is not set; add it to your .env file or environment")

# Configure MLflow to use Databricks
mlflow.set_tracking_uri("databricks")

# Test connection
try:
    # Set or create experiment
    experiment_name = "/Users/colizu2020@gmail.com/cifar-100-vision-transformer"  # Replace with your email
    mlflow.set_experiment(experiment_name)
    print("‚úÖ Successfully connected to Databricks MLflow!")
except Exception as e:
    # Top-level notebook boundary: report the failure instead of raising so
    # the remaining cells can still be defined.
    print(f"‚ùå Connection failed: {e}")

In [None]:
import mlflow
import mlflow.pytorch
from pathlib import Path
import os
from datetime import datetime

class ExperimentTracker:
    """Thin convenience wrapper around the MLflow fluent API.

    Constructing an instance points MLflow at the "databricks" tracking URI
    and selects (or creates) the given experiment. All other methods forward
    to the currently active MLflow run.
    """

    def __init__(self, experiment_name: str = "/Users/colizu2020@gmail.com/cifar100"):
        # Set Databricks as tracking URI
        mlflow.set_tracking_uri("databricks")

        # Set or create experiment
        self.experiment_name = experiment_name
        mlflow.set_experiment(experiment_name)

    def start_run(self, run_name: str = None):
        """Start a new MLflow run and return it.

        Args:
            run_name: Display name for the run. When omitted, a timestamped
                ``cifar100_run_YYYYmmdd_HHMMSS`` name is generated.
        """
        # Ensure any active run is ended before starting a new one,
        # which can happen in interactive environments if a previous run was interrupted.
        if mlflow.active_run():
            mlflow.end_run()

        if run_name is None:
            run_name = f"cifar100_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        return mlflow.start_run(run_name=run_name)

    def log_params(self, params: dict):
        """Log a dictionary of hyperparameters to the active run."""
        mlflow.log_params(params)

    def log_metrics(self, metrics: dict, step: int = None):
        """Log a dictionary of metrics to the active run.

        Args:
            metrics: Mapping of metric name to numeric value.
            step: Optional step (e.g. epoch) to attach to every metric.
        """
        # One batched call instead of one request per metric.
        mlflow.log_metrics(metrics, step=step)

    def log_model(self, model, artifact_path: str = "model", registered_model_name: str = None):
        """
        Log a PyTorch model, optionally registering it.

        Failures are reported with ``print`` and never raised (best effort).
        If registration was requested and fails, the model is logged once
        more as a plain artifact without registration.

        Args:
            model: The trained PyTorch model
            artifact_path: Where to store model files in the run (simple path)
            registered_model_name: Name for Unity Catalog registration (catalog.schema.model_name)
        """
        if registered_model_name:
            try:
                # Register model in Unity Catalog
                mlflow.pytorch.log_model(
                    pytorch_model=model,
                    artifact_path=artifact_path,  # Simple path like "model"
                    registered_model_name=registered_model_name,  # Unity Catalog name
                )
                print(f"‚úÖ Model logged and registered as: {registered_model_name}")
                return
            except Exception as e:
                print(f"‚ùå Model logging failed: {e}")

            # Fallback: basic artifact logging without registration
            try:
                mlflow.pytorch.log_model(pytorch_model=model, artifact_path=artifact_path)
                print("‚úÖ Fallback: Model logged as artifact only")
            except Exception as fallback_error:
                print(f"‚ùå All model logging failed: {fallback_error}")
            return

        # No registration requested: a retry would repeat the exact same call,
        # so a failure here is reported once.
        try:
            mlflow.pytorch.log_model(pytorch_model=model, artifact_path=artifact_path)
            print(f"‚úÖ Model logged as artifact at: {artifact_path}")
        except Exception as e:
            print(f"‚ùå Model logging failed: {e}")

    def log_artifact(self, local_path: str, artifact_path: str = None):
        """Log a local file as an artifact, optionally under ``artifact_path``."""
        mlflow.log_artifact(local_path, artifact_path)

    def log_figure(self, figure, filename: str):
        """Save a matplotlib figure as ``plots/<filename>`` in the active run.

        The figure is written into a temporary directory under the requested
        file name, so the uploaded artifact keeps that name and no temporary
        file is left behind. The figure is closed afterwards, even when saving
        or uploading fails.

        Args:
            figure: Matplotlib figure to save.
            filename: Artifact file name; ``.png`` is appended when it has no
                extension. Any directory part becomes a sub-folder of ``plots``.
        """
        import tempfile
        from pathlib import Path

        import matplotlib.pyplot as plt

        relative = Path(filename)
        if not relative.suffix:
            relative = relative.with_suffix(".png")
        # log_artifact's second argument is a *directory* inside the run.
        artifact_dir = (Path("plots") / relative.parent).as_posix()

        try:
            with tempfile.TemporaryDirectory() as tmp_dir:
                local_path = str(Path(tmp_dir) / relative.name)
                figure.savefig(local_path, dpi=150, bbox_inches='tight')
                mlflow.log_artifact(local_path, artifact_dir)
        finally:
            plt.close(figure)  # Clean up


In [None]:
import torch
import math
from torch import nn

class NewGELUActivation(nn.Module):
    """
    Tanh-approximated GELU activation, as used in the Google BERT repo and
    OpenAI GPT. See the Gaussian Error Linear Units paper:
    https://arxiv.org/abs/1606.08415

    Adapted from
    https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py
    """

    def forward(self, input):
        # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
        cubic_term = 0.044715 * torch.pow(input, 3.0)
        tanh_arg = math.sqrt(2.0 / math.pi) * (input + cubic_term)
        return 0.5 * input * (1.0 + torch.tanh(tanh_arg))

# Default ViT hyperparameters: 32x32 RGB inputs split into 4x4 patches,
# 100 output classes, 8 encoder blocks of width 384 with 6 attention heads.
config = dict(
    patch_size=4,
    num_classes=100,
    num_channels=3,
    num_hidden_layers=8,
    hidden_size=384,
    image_size=32,
    dropout_rate=0.2,
    num_attent_heads=6,
    intermediate_size=4 * 384,  # MLP width is 4x the hidden size
    qkv_bias=True,
    initializer_range=0.02,
)

class PatchEmbeddings(nn.Module):
    """Split an image into non-overlapping patches and project each to a vector.

    Reads ``patch_size``, ``num_channels``, ``hidden_size`` and ``image_size``
    from ``config``. The projection is a convolution whose kernel size and
    stride both equal the patch size.
    """

    def __init__(self, config):
        super().__init__()
        self.patch_size = config["patch_size"]
        self.num_channels = config["num_channels"]
        self.hidden_size = config["hidden_size"]
        self.image_size = config["image_size"]

        patches_per_side = self.image_size // self.patch_size
        self.num_patches = patches_per_side ** 2

        self.projection = nn.Conv2d(
            self.num_channels,
            self.hidden_size,
            kernel_size=self.patch_size,
            stride=self.patch_size,
        )

    def forward(self, x):
        """
        x: (batch_size, num_channels, height, width)
        returns: (batch_size, num_patches, hidden_size)
        """
        # Convolve, collapse the spatial grid into one axis, then move that
        # axis in front of the channel axis.
        return self.projection(x).flatten(2).transpose(1, 2)

class Embeddings(nn.Module):
    """Patch embeddings plus a learnable [CLS] token and position embeddings.

    The output sequence has ``num_patches + 1`` tokens: the [CLS] token is
    prepended, position embeddings are added, then dropout is applied.
    """

    def __init__(self, config):
        super().__init__()

        hidden = config["hidden_size"]
        self.patch_embeddings = PatchEmbeddings(config)
        sequence_length = self.patch_embeddings.num_patches + 1  # +1 for [CLS]

        self.cls_token = nn.Parameter(torch.randn(1, 1, hidden))
        self.position_embeddings = nn.Parameter(torch.randn(1, sequence_length, hidden))
        self.dropout = nn.Dropout(config["dropout_rate"])

    def forward(self, x):
        patch_tokens = self.patch_embeddings(x)
        batch_size, _, _ = patch_tokens.size()

        # Broadcast the single [CLS] token across the batch and prepend it.
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        tokens = torch.cat((cls_tokens, patch_tokens), dim=1)

        tokens = tokens + self.position_embeddings
        return self.dropout(tokens)

class AttentionHead(nn.Module):
    """Single scaled dot-product self-attention head.

    Args:
        hidden_size: Size of the last dimension of the input.
        attention_head_size: Size of the query/key/value projections.
        dropout: Dropout probability applied to the attention probabilities.
        bias: Whether the query/key/value projections use a bias term.
    """

    def __init__(self, hidden_size, attention_head_size, dropout, bias=True):
        super().__init__()
        self.hidden_size = hidden_size
        self.attention_head_size = attention_head_size
        self.dropout = nn.Dropout(dropout)

        self.query = nn.Linear(hidden_size, attention_head_size, bias=bias)
        self.key = nn.Linear(hidden_size, attention_head_size, bias=bias)
        self.value = nn.Linear(hidden_size, attention_head_size, bias=bias)

    def forward(self, x):
        """Return ``(attention_output, attention_probs)`` for input ``x``.

        ``attention_probs`` is the softmax output before dropout, so each row
        still sums to one; dropout only affects the weights used to mix the
        values into ``attention_output``.
        """
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)

        attention_scores = torch.matmul(query, key.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # Dropout on the attention probabilities (after softmax). The result
        # must be the tensor used for mixing the values; previously it was
        # stored in an unused variable, so dropout never took effect.
        dropped_probs = self.dropout(attention_probs)
        attention_output = torch.matmul(dropped_probs, value)

        return (attention_output, attention_probs)

class MultiHeadedAttention(nn.Module):
    """Multi-head self-attention built from independent ``AttentionHead`` modules.

    Each head attends over the full input; the per-head outputs are
    concatenated along the last dimension, linearly projected back to
    ``hidden_size`` and passed through dropout.
    """

    def __init__(self, config):
        super().__init__()

        self.hidden_size = config["hidden_size"]
        self.num_attent_heads = config["num_attent_heads"]
        self.attent_head_size = self.hidden_size // self.num_attent_heads
        self.qkv_bias = config["qkv_bias"]

        self.heads = nn.ModuleList(
            [
                AttentionHead(
                    self.hidden_size,
                    self.attent_head_size,
                    config["dropout_rate"],
                    self.qkv_bias,
                )
                for _ in range(self.num_attent_heads)
            ]
        )

        # Width of the concatenated head outputs.
        self.all_head_size = self.num_attent_heads * self.attent_head_size

        self.output_projection = nn.Linear(self.all_head_size, self.hidden_size)
        self.output_dropout = nn.Dropout(config["dropout_rate"])

    def forward(self, x):
        # Each head returns (output, probabilities); only the output is kept.
        per_head_outputs = [head(x)[0] for head in self.heads]
        merged = torch.cat(per_head_outputs, dim=-1)

        projected = self.output_projection(merged)
        return self.output_dropout(projected)

class MLP(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Linear -> Dropout.

    Expands from ``hidden_size`` to ``intermediate_size`` and back.
    """

    def __init__(self, config):
        super().__init__()

        hidden, inner = config["hidden_size"], config["intermediate_size"]
        self.dense_1 = nn.Linear(hidden, inner)
        self.activation = NewGELUActivation()
        self.dense_2 = nn.Linear(inner, hidden)
        self.dropout = nn.Dropout(config["dropout_rate"])

    def forward(self, x):
        expanded = self.activation(self.dense_1(x))
        return self.dropout(self.dense_2(expanded))
    
class DropPath(nn.Module):
    """Stochastic depth: randomly zero whole samples of a residual branch.

    During training each sample (first dimension) is dropped with
    probability ``dropout_prob``; surviving samples are scaled by
    ``1 / (1 - dropout_prob)``. In eval mode, or when the probability is
    zero, the input is returned unchanged.

    Args:
        dropout_prob: Probability of dropping a sample, in ``[0, 1]``.

    Raises:
        ValueError: If ``dropout_prob`` is outside ``[0, 1]``.
    """

    def __init__(self, dropout_prob=0.0):
        super().__init__()
        if not 0.0 <= dropout_prob <= 1.0:
            raise ValueError(
                f"dropout_prob must be between 0 and 1, got {dropout_prob}"
            )
        self.dropout_prob = dropout_prob

    def forward(self, x):
        if self.dropout_prob == 0.0 or not self.training:
            return x

        keep_prob = 1 - self.dropout_prob
        # One Bernoulli(keep_prob) draw per sample, broadcast over the rest.
        shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()  # 1 where kept, 0 where dropped

        if keep_prob <= 0.0:
            # Everything is dropped; skip the 1/keep_prob rescale, which
            # would divide by zero and turn the output into NaN.
            return x * random_tensor
        return x.div(keep_prob) * random_tensor

class Block(nn.Module):
    """Pre-norm transformer encoder block.

    Two residual branches, each with its own LayerNorm and DropPath:
    self-attention first, then the MLP.
    """

    def __init__(self, config, drop_path_rate=0.0):
        super().__init__()

        hidden = config["hidden_size"]
        self.layer_norm1 = nn.LayerNorm(hidden)
        self.mlp = MLP(config)
        self.layer_norm2 = nn.LayerNorm(hidden)
        self.attention = MultiHeadedAttention(config)
        self.drop_path1 = DropPath(drop_path_rate)
        self.drop_path2 = DropPath(drop_path_rate)

    def forward(self, x):
        """Return ``(x, attention_output)``.

        ``attention_output`` is the attention branch after DropPath, i.e. the
        tensor that was added to the residual stream, not attention
        probabilities.
        """
        # Attention branch with skip connection.
        attention_output = self.drop_path1(self.attention(self.layer_norm1(x)))
        x = x + attention_output

        # MLP branch with skip connection.
        x = x + self.drop_path2(self.mlp(self.layer_norm2(x)))

        return x, attention_output
    
class Encoder(nn.Module):
    """Stack of ``config["num_hidden_layers"]`` transformer ``Block`` modules.

    An optional ``config["drop_path_rate"]`` (default ``0.0``) enables
    stochastic depth. The per-block rate grows linearly from 0 at the first
    block to the configured value at the last one. Without that key every
    block is built with a rate of 0, which matches the previous behaviour
    where the rate was never passed to ``Block``.
    """

    def __init__(self, config):
        super().__init__()

        depth = config["num_hidden_layers"]
        max_drop_path = config.get("drop_path_rate", 0.0)
        # Avoid dividing by zero for a single-block encoder.
        last_index = max(depth - 1, 1)

        self.blocks = nn.ModuleList(
            [
                Block(config, drop_path_rate=max_drop_path * index / last_index)
                for index in range(depth)
            ]
        )

    def forward(self, x, output_attentions=False):
        """Run all blocks and return ``(x, all_attention)``.

        Args:
            x: Input token sequence passed to the first block.
            output_attentions: When truthy, collect each block's attention
                branch output (not attention probabilities) in the returned
                list; otherwise the list is empty.
        """
        all_attention = []

        for block in self.blocks:
            x, block_attention_output = block(x)
            if output_attentions:
                all_attention.append(block_attention_output)

        return (x, all_attention)

class ViT(nn.Module):
    """Vision Transformer classifier.

    Embeds the image into a token sequence, runs it through the encoder and
    classifies from the first ([CLS]) token. All sub-modules are initialised
    through ``_init_weights``.
    """

    def __init__(self, config):
        super().__init__()

        self.config = config
        self.image_size = config["image_size"]
        self.hidden_size = config["hidden_size"]
        self.num_classes = config["num_classes"]

        self.embedding = Embeddings(config)
        self.encoder = Encoder(config)
        self.classifier = nn.Linear(self.hidden_size, self.num_classes)

        # Visit every sub-module once and apply the custom initialisation.
        self.apply(self._init_weights)

    def forward(self, x, output_attentions=False):
        """Return logits, or ``(logits, all_attentions)`` when requested."""
        tokens = self.embedding(x)
        encoded, all_attentions = self.encoder(tokens, output_attentions)

        # Classify from the [CLS] token at sequence position 0.
        cls_state = encoded[:, 0, :]
        logits = self.classifier(cls_state)

        return (logits, all_attentions) if output_attentions else logits

    def _init_weights(self, module):
        """Initialise one sub-module according to its type."""
        init_std = self.config["initializer_range"]

        if isinstance(module, Embeddings):
            # Truncated-normal init is done in float32 and cast back to the
            # parameter's own dtype.
            for parameter in (module.position_embeddings, module.cls_token):
                parameter.data = nn.init.trunc_normal_(
                    parameter.data.to(torch.float32),
                    mean=0.0,
                    std=init_std,
                ).to(parameter.dtype)
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return

        if isinstance(module, (nn.Linear, nn.Conv2d)):
            torch.nn.init.normal_(module.weight, mean=0.0, std=init_std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)


In [None]:
"""
Vision Transformer Hyperparameter Search - Colab-Friendly Version
Run ONE trial at a time, resume later
"""

import torch
from torch import nn
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms
import torchmetrics
from torch.amp import GradScaler, autocast
import optuna
from optuna.integration.mlflow import MLflowCallback
import mlflow
import math
from datetime import datetime
import joblib
import numpy as np
import gc 

def mixup_cutmix(inputs, targets, alpha=0.8, cutmix_prob=0.5):
    """Apply either CutMix or MixUp to a batch, chosen at random.

    The caller's ``inputs`` tensor is never modified; a new mixed tensor is
    returned in both branches.

    Args:
        inputs: Image batch; unpacked as ``(batch, channels, height, width)``
            in the CutMix branch.
        targets: Labels, indexable along the batch dimension.
        alpha: Both shape parameters of the Beta distribution ``lam`` is
            drawn from.
        cutmix_prob: Probability of choosing CutMix over MixUp.

    Returns:
        ``(mixed_inputs, targets_a, targets_b, lam)`` where ``targets_a`` is
        ``targets``, ``targets_b`` is ``targets`` under the sampled
        permutation, and ``lam`` is a Python float weighting ``targets_a``.
    """
    batch_size = inputs.size(0)
    lam = float(np.random.beta(alpha, alpha))

    # Permutation is drawn on the CPU and then moved, so seeded runs keep
    # using the same random stream regardless of the input device.
    rand_index = torch.randperm(batch_size).to(inputs.device)

    if np.random.rand() < cutmix_prob:
        # CutMix: paste a random box from the permuted batch.
        _, _, H, W = inputs.shape
        cut_rat = np.sqrt(1. - lam)
        cut_w, cut_h = int(W * cut_rat), int(H * cut_rat)
        cx, cy = np.random.randint(W), np.random.randint(H)

        x1 = int(np.clip(cx - cut_w // 2, 0, W))
        x2 = int(np.clip(cx + cut_w // 2, 0, W))
        y1 = int(np.clip(cy - cut_h // 2, 0, H))
        y2 = int(np.clip(cy + cut_h // 2, 0, H))

        # Work on a copy: writing into `inputs` would silently change the
        # caller's batch, unlike the MixUp branch below.
        mixed = inputs.clone()
        mixed[:, :, y1:y2, x1:x2] = inputs[rand_index, :, y1:y2, x1:x2]

        # Clipping can shrink the box, so recompute lam from its real area.
        lam = 1.0 - ((x2 - x1) * (y2 - y1) / (W * H))
    else:
        # MixUp: convex combination with the permuted batch.
        mixed = lam * inputs + (1 - lam) * inputs[rand_index]

    return mixed, targets, targets[rand_index], lam

def get_data_loaders(train_batch_size=1024, val_batch_size=512, num_workers=2, data_root="./data"):
    """Build CIFAR-100 train/validation loaders from the official train split.

    The first 90% of the training set is used for training (with
    augmentation) and the remaining 10% for validation (no augmentation).
    Both splits are normalised with the same statistics.

    Args:
        train_batch_size: Batch size of the training loader.
        val_batch_size: Batch size of the validation loader.
        num_workers: Worker processes per loader. ``0`` loads in the main
            process; worker-only options are then omitted because
            ``DataLoader`` rejects them.
        data_root: Directory the dataset is downloaded to / read from.

    Returns:
        ``(train_loader, val_loader)``.
    """
    # One shared Normalize so training and validation inputs are on the same
    # scale (the two transforms previously used slightly different numbers).
    normalize = transforms.Normalize(
        mean=[0.5071, 0.4865, 0.4409],
        std=[0.2673, 0.2564, 0.2761],
    )

    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.TrivialAugmentWide(),  # Faster than AutoAugment
        transforms.ToTensor(),
        normalize,
        transforms.RandomErasing(p=0.25),
    ])
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])

    # Two views of the same training split that differ only in transform.
    augmented_view = datasets.CIFAR100(root=data_root, train=True, download=True, transform=train_transform)
    plain_view = datasets.CIFAR100(root=data_root, train=True, transform=test_transform)

    total = len(augmented_view)
    train_size = int(0.9 * total)

    cifar_train = Subset(augmented_view, list(range(train_size)))
    cifar_val = Subset(plain_view, list(range(train_size, total)))

    shared_kwargs = {
        "num_workers": num_workers,
        # Pinned memory only helps host-to-GPU copies.
        "pin_memory": torch.cuda.is_available(),
    }
    if num_workers > 0:
        # These options raise ValueError when there are no worker processes.
        shared_kwargs["persistent_workers"] = True
        shared_kwargs["prefetch_factor"] = 6

    train_loader = DataLoader(
        cifar_train,
        batch_size=train_batch_size,
        shuffle=True,
        drop_last=True,  # Faster - avoids small final batch
        **shared_kwargs,
    )
    val_loader = DataLoader(
        cifar_val,
        batch_size=val_batch_size,
        shuffle=False,
        **shared_kwargs,
    )

    return train_loader, val_loader


def train_single_config(hp_config, num_epochs=30):
    """Train one ViT with the given hyperparameters and return summary metrics.

    Metrics are logged to the currently active MLflow run every third epoch
    and on the final epoch. MixUp/CutMix is applied in every epoch except the
    last ten.

    Args:
        hp_config: Dict providing ``dropout_rate``, ``label_smoothing``,
            ``learning_rate``, ``weight_decay`` and ``warmup_epochs``.
        num_epochs: Number of training epochs.

    Returns:
        Dict with ``best_val_accuracy``, ``best_val_loss``,
        ``final_train_accuracy`` and ``final_val_accuracy``. The accuracies
        are ``0.0`` if no epoch was run.
    """
    gc.collect()
    torch.cuda.empty_cache()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    use_cuda = device.type == 'cuda'

    # Create model config
    model_config = {
        "patch_size": 4,
        "num_classes": 100,
        "num_channels": 3,
        "num_hidden_layers": 4,
        "hidden_size": 192,
        "image_size": 32,
        "dropout_rate": hp_config["dropout_rate"],
        "num_attent_heads": 6,
        "intermediate_size": 768,
        "qkv_bias": True,
        "initializer_range": 0.02,
    }

    model = ViT(model_config).to(device)
    print(f"‚úÖ Model created with {sum(p.numel() for p in model.parameters()):,} parameters")

    if hasattr(torch, 'compile'):
        model = torch.compile(model, mode="reduce-overhead")
        print("‚úÖ Model compiled with torch.compile")

    print("üì¶ Loading data...")
    train_loader, val_loader = get_data_loaders()
    print(f"‚úÖ Data loaded: {len(train_loader)} train batches, {len(val_loader)} val batches")

    # Training setup
    loss_function = nn.CrossEntropyLoss(label_smoothing=hp_config["label_smoothing"])
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=hp_config["learning_rate"],
        weight_decay=hp_config["weight_decay"]
    )

    # Warmup + Cosine schedule
    warmup_epochs = hp_config["warmup_epochs"]
    # At least one decay epoch, so the schedule never divides by zero when
    # warmup covers the whole run (the scheduler also evaluates the lambda
    # once after the final epoch).
    decay_epochs = max(1, num_epochs - warmup_epochs)
    min_lr_factor = 1e-6 / hp_config["learning_rate"]  # floor at an absolute LR of 1e-6

    def get_lr_lambda(epoch):
        if epoch < warmup_epochs:
            return (epoch + 1) / warmup_epochs
        progress = min(1.0, (epoch - warmup_epochs) / decay_epochs)
        return max(min_lr_factor, 0.5 * (1 + math.cos(math.pi * progress)))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=get_lr_lambda)
    # Loss scaling is only meaningful for CUDA mixed precision.
    scaler = GradScaler(enabled=use_cuda)

    train_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=100).to(device)
    val_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=100).to(device)

    best_val_acc = 0.0
    best_val_loss = float('inf')
    # Defined up front so the return statement is valid even with zero epochs.
    train_acc = 0.0
    val_acc = 0.0

    # Training loop
    for epoch in range(num_epochs):
        # Train
        model.train()
        train_loss = 0.0
        train_accuracy.reset()
        use_mixing = epoch < num_epochs - 10  # last 10 epochs use clean inputs

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            if use_mixing:
                inputs, targets_a, targets_b, lam = mixup_cutmix(inputs, targets)
                # Same autocast device selection as every other forward pass.
                with autocast(device_type=device.type):
                    outputs = model(inputs)
                    loss = lam * loss_function(outputs, targets_a) + \
                           (1 - lam) * loss_function(outputs, targets_b)
            else:
                with autocast(device_type=device.type):
                    outputs = model(inputs)
                    loss = loss_function(outputs, targets)

            scaler.scale(loss).backward()
            # Unscale before clipping so the norm is measured on true gradients.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

            train_loss += loss.item()
            train_accuracy.update(outputs.detach(), targets)

        avg_train_loss = train_loss / len(train_loader)
        train_acc = train_accuracy.compute().item()
        # Capture the LR used during this epoch before the scheduler advances.
        epoch_lr = optimizer.param_groups[0]['lr']
        scheduler.step()
        print(f"Epoch {epoch+1}/{num_epochs} - Train Acc: {train_acc:.4f}")

        if (epoch + 1) % 3 == 0 or epoch == num_epochs - 1:
            # Validate
            model.eval()
            val_loss = 0.0
            val_accuracy.reset()

            with torch.no_grad():
                for inputs, targets in val_loader:
                    inputs, targets = inputs.to(device), targets.to(device)

                    with autocast(device_type=device.type):
                        outputs = model(inputs)
                        loss = loss_function(outputs, targets)

                    val_loss += loss.item()
                    val_accuracy.update(outputs, targets)

            avg_val_loss = val_loss / len(val_loader)
            val_acc = val_accuracy.compute().item()

            # Track best
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_val_loss = avg_val_loss

            # Log to MLflow
            mlflow.log_metrics({
                "train_loss": avg_train_loss,
                "train_accuracy": train_acc,
                "val_loss": avg_val_loss,
                "val_accuracy": val_acc,
                "learning_rate": epoch_lr,
            }, step=epoch)

            print(f"Epoch {epoch+1}/{num_epochs} - Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Best: {best_val_acc:.4f}")

    # Clean up
    del model
    torch.cuda.empty_cache()

    return {
        "best_val_accuracy": best_val_acc,
        "best_val_loss": best_val_loss,
        "final_train_accuracy": train_acc,
        "final_val_accuracy": val_acc,
    }


# Hyperparameter sets to try, one trial at a time. "name" labels the MLflow
# run; every other key is consumed by train_single_config.
PRIORITY_CONFIGS = [
    dict(
        name="improved_vit",
        learning_rate=1e-3,
        weight_decay=0.05,
        warmup_epochs=5,
        dropout_rate=0.1,
        label_smoothing=0.1,
    ),
]


def run_single_trial(trial_index=0, num_epochs=30, experiment_name="/Users/colizu2020@gmail.com/cifar-100-vit-manual"):
    """
    Run a SINGLE trial from PRIORITY_CONFIGS inside its own MLflow run.

    Hyperparameters and final metrics are logged to MLflow, and a short
    summary is appended to ``search_progress.txt``.

    Args:
        trial_index: Index into PRIORITY_CONFIGS (0 to len(PRIORITY_CONFIGS) - 1)
        num_epochs: How many epochs to train
        experiment_name: MLflow experiment name

    Returns:
        The metrics dict from train_single_config, or None when
        ``trial_index`` is out of range.
    """

    # Reject negative indices too: they would silently wrap around to the
    # end of the list and make the "trial N/M" messages misleading.
    if not 0 <= trial_index < len(PRIORITY_CONFIGS):
        print(f"‚ùå Invalid trial_index {trial_index}. Must be 0-{len(PRIORITY_CONFIGS)-1}")
        return

    hp_config = PRIORITY_CONFIGS[trial_index]
    hyperparameters = {k: v for k, v in hp_config.items() if k != 'name'}

    print("="*80)
    print(f"RUNNING TRIAL {trial_index + 1}/{len(PRIORITY_CONFIGS)}")
    print("="*80)
    print(f"Config: {hp_config['name']}")
    print("Hyperparameters:")
    for key, value in hyperparameters.items():
        print(f"  {key}: {value}")
    print("="*80)
    print()

    # Set MLflow experiment
    mlflow.set_experiment(experiment_name)

    # Start MLflow run
    run_name = f"{hp_config['name']}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    with mlflow.start_run(run_name=run_name) as run:
        run_id = run.info.run_id

        # Log hyperparameters
        mlflow.log_params(hyperparameters)
        mlflow.log_param("num_epochs", num_epochs)
        mlflow.log_param("trial_index", trial_index)

        # Train model
        print(f"Starting training for {num_epochs} epochs...")
        metrics = train_single_config(hp_config, num_epochs=num_epochs)

        # Log final metrics
        mlflow.log_metrics({
            "best_val_accuracy": metrics["best_val_accuracy"],
            "best_val_loss": metrics["best_val_loss"],
            "final_train_accuracy": metrics["final_train_accuracy"],
            "final_val_accuracy": metrics["final_val_accuracy"],
        })

    print()
    print("="*80)
    print("TRIAL COMPLETE")
    print("="*80)
    print(f"‚úÖ Best Val Accuracy: {metrics['best_val_accuracy']:.4f}")
    print(f"‚úÖ Best Val Loss: {metrics['best_val_loss']:.4f}")
    print(f"‚úÖ MLflow Run ID: {run_id}")
    print("="*80)

    # Save progress (appended, so earlier trials are kept)
    progress_file = "search_progress.txt"
    with open(progress_file, "a", encoding="utf-8") as f:
        f.write(f"\nTrial {trial_index}: {hp_config['name']}\n")
        f.write(f"Val Acc: {metrics['best_val_accuracy']:.4f}\n")
        f.write(f"Run ID: {run_id}\n")
        f.write("-"*40 + "\n")

    remaining = len(PRIORITY_CONFIGS) - trial_index - 1

    print(f"\nüìù Progress saved to {progress_file}")
    print("\nüí° Next steps:")
    if remaining > 0:
        print(f"   - Run trial {trial_index + 1} next: run_single_trial(trial_index={trial_index + 1})")
    else:
        # Do not point at an index that does not exist.
        print("   - All configured trials have been run")
    print("   - Check MLflow UI to compare results")
    print(f"   - Total trials remaining: {remaining}")

    return metrics


def show_all_configs():
    """Print every entry of PRIORITY_CONFIGS with its index and settings."""
    heavy_rule = "="*80
    light_rule = "-"*40

    print("\nAVAILABLE CONFIGURATIONS:")
    print(heavy_rule)

    for index, entry in enumerate(PRIORITY_CONFIGS):
        print(f"\nTrial {index}: {entry['name']}")
        print(light_rule)
        # Everything except the display name is a hyperparameter.
        settings = ((key, value) for key, value in entry.items() if key != 'name')
        for key, value in settings:
            print(f"  {key:20s}: {value}")

    print("\n" + heavy_rule)
    print(f"\nTotal configs: {len(PRIORITY_CONFIGS)}")

    for usage_line in (
        "\nTo run a specific config:",
        "  run_single_trial(trial_index=0)  # Run first config",
        "  run_single_trial(trial_index=1)  # Run second config",
        "  etc...",
    ):
        print(usage_line)


# Run the first configuration (index 0) for 30 epochs as soon as this cell
# executes; this starts an MLflow run and trains the model.
run_single_trial(trial_index=0, num_epochs=30)
