# Model Training and Evaluation Script

## Import Required Libraries

In [None]:
import os
import gc
import json
import warnings
import numpy as np
import pandas as pd
from datetime import datetime
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix
)
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EvalPrediction,
    set_seed
)
import joblib
import itertools
import random

# Suppress warnings and set reproducibility
# NOTE: the 'ignore' filter silences *every* Python warning for the rest of
# the process, not only noisy library warnings.
warnings.filterwarnings('ignore')
# Fixed seed for the transformers `set_seed` helper
# (presumably seeds the Python/NumPy/torch RNGs — confirm in transformers docs).
set_seed(42)
# Turn off tokenizers' internal parallelism via its environment switch.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# When a CUDA device is present, start from a clean allocator state:
# drop cached blocks and reset the peak-memory statistics.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

## Configuration

In [None]:
# Central configuration for the whole script. Sections:
#   input_file / output_dir  - spreadsheet to read and root folder for results
#   models                   - checkpoint names iterated over by main()
#   training                 - base hyperparameters used when no per-run
#                              hyperparameter dict is supplied, plus step intervals
#   hyperparameter_tuning    - search-space values and run limits
#   evaluation               - split fractions and reporting settings
CONFIG = {
    # Read with pandas.read_excel in load_and_clean_data().
    "input_file": "rel_typ.xlsx",
    # Root of the metrics/predictions/label_encoders/config sub-folders.
    "output_dir": "model_training_results",
    
    # Checkpoint identifiers passed to AutoTokenizer / AutoModelForSequenceClassification.
    "models": [
        "distilbert-base-uncased",
        "bert-base-uncased",
        "roberta-base",
        "roberta-large",
        "microsoft/DialoGPT-medium",
        "facebook/bart-base",
        "facebook/bart-large-mnli",
        "google/electra-base-discriminator",
        "microsoft/deberta-v3-base",
        "microsoft/deberta-v3-large",
        "meta-llama/Llama-3.2-1B-Instruct",
    ],
    
    "training": {
        # base_* values form the default run configuration.
        "base_epochs": 2,
        "base_max_length": 1024,
        "base_batch_size": 2,
        "base_learning_rate": 5e-5,
        "base_weight_decay": 0.01,
        # Used as warmup_steps for every run.
        "base_warmup_steps": 50,
        # Evaluation / checkpoint intervals in optimizer steps; save_steps is a
        # multiple of eval_steps (presumably required because the best model is
        # reloaded at the end of training — confirm against TrainingArguments docs).
        "eval_steps": 25,
        "save_steps": 50,
    },
    
    "hyperparameter_tuning": {
        "enabled": True,
        # Hard cap on the number of training runs across all models.
        "max_total_runs": 30,
        # Number of sampled configurations per model.
        "configs_per_model": 3,
        # Default learning-rate grid; some model families use their own grid
        # inside generate_hyperparameter_configs().
        "learning_rates": [3e-5, 5e-5, 1e-4],
        "batch_sizes": [1, 2],
        "epochs": [2, 3],
        "max_lengths": [1024, 512, 384, 256],
        "weight_decays": [0.01]
    },
    
    "evaluation": {
        "confidence_bins": 5,
        # Fractions of the full dataset; split_data() rescales val_size so it is
        # still 10% of the total after the test set has been removed.
        "test_size": 0.2,
        "val_size": 0.1,
    }
}

## Directory Structure Setup

In [None]:
def create_directory_structure(base_dir):
    """Ensure the result sub-folders exist under ``base_dir``.

    Creates (if missing) the ``metrics``, ``predictions``, ``label_encoders``
    and ``config`` folders and returns their paths in that order.
    """
    subfolders = ("metrics", "predictions", "label_encoders", "config")
    dirs = [f"{base_dir}/{name}" for name in subfolders]
    for target in dirs:
        # exist_ok makes repeated runs harmless.
        os.makedirs(target, exist_ok=True)
    return dirs

# Create the output folders up front so later steps can write results into them.
print("Creating directory structure...")
create_directory_structure(CONFIG["output_dir"])

## Data Loading and Preprocessing

In [None]:
def load_and_clean_data(file_path):
    """Load the labelled spreadsheet and return a cleaned DataFrame.

    Steps:
      1. Read ``file_path`` with ``pandas.read_excel``.
      2. Verify the ``tweet_text``, ``note_text`` and ``label`` columns exist.
      3. Drop rows with a missing/blank label, normalise labels to stripped
         lower-case strings and remove the excluded label classes.
      4. Replace missing text cells with empty strings and build the combined
         ``text`` column (``tweet_text + " [SEP] " + note_text``).

    Args:
        file_path: Path of the Excel file to read.

    Returns:
        The cleaned DataFrame, or ``None`` when the file cannot be read,
        required columns are missing, no rows remain, or fewer than two
        distinct labels remain.
    """
    print(f"Loading data from {file_path}...")

    try:
        df = pd.read_excel(file_path)
    except FileNotFoundError:
        print(f"❌ Error: File {file_path} not found!")
        return None
    except Exception as e:
        print(f"❌ Error loading file: {e}")
        return None

    print(f"Original dataset shape: {df.shape}")

    required_columns = ['tweet_text', 'note_text', 'label']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"❌ Error: Missing required columns: {missing_columns}")
        print(f"Available columns: {list(df.columns)}")
        return None

    print(f"Original labels:\n{df['label'].value_counts()}")

    # Remove rows with missing labels BEFORE converting to str: astype(str)
    # turns NaN into the literal "nan", which would otherwise survive as a
    # bogus class. copy() gives an independent frame so the assignments below
    # never write into a view of the original.
    rows_before = len(df)
    df = df.dropna(subset=["label"]).copy()

    # Clean and standardize labels
    df["label"] = df["label"].astype(str).str.strip().str.lower()

    # A whitespace-only label carries no class information either.
    df = df[df["label"] != ""].copy()
    print(f"Removed {rows_before - len(df)} rows with missing labels")

    # Clean text columns. fillna must come first for the same reason as
    # above: after astype(str) there is nothing left for fillna to replace.
    for column in ("tweet_text", "note_text"):
        df[column] = df[column].fillna("").astype(str).str.strip()

    print(f"After label cleaning:\n{df['label'].value_counts()}")

    # Remove irrelevant and probative labels
    excluded_labels = ["irrelevant", "probative"]
    initial_count = len(df)
    df = df[~df["label"].isin(excluded_labels)].copy()
    print(f"Removed {initial_count - len(df)} rows with excluded labels {excluded_labels}")

    if len(df) == 0:
        print("❌ Error: No data remaining after cleaning!")
        return None

    if df['label'].nunique() < 2:
        print("❌ Error: Need at least 2 different labels for classification!")
        return None

    # Create combined text
    df["text"] = df["tweet_text"] + " [SEP] " + df["note_text"]

    print(f"Final dataset shape: {df.shape}")
    print(f"Final label distribution:\n{df['label'].value_counts()}")

    return df

def prepare_label_encoder(df):
    """Fit a label encoder on ``df["label"]`` and persist it.

    Adds an integer ``label_id`` column to ``df`` (in place), pickles the
    fitted encoder with joblib and writes an ``id -> label`` JSON mapping,
    both under ``<output_dir>/label_encoders``.

    Returns:
        ``(df, label_encoder)``.
    """
    label_encoder = LabelEncoder()
    df["label_id"] = label_encoder.fit_transform(df["label"])

    encoder_dir = f"{CONFIG['output_dir']}/label_encoders"

    # Binary encoder for later inverse_transform calls.
    encoder_path = f"{encoder_dir}/label_encoder.pkl"
    joblib.dump(label_encoder, encoder_path)

    # Human-readable companion of the pickle.
    label_mapping = dict(enumerate(label_encoder.classes_))
    mapping_path = f"{encoder_dir}/label_mapping.json"
    with open(mapping_path, 'w') as handle:
        json.dump(label_mapping, handle, indent=2)

    print(f"Label encoder saved to {encoder_path}")
    print(f"Label mapping: {label_mapping}")

    return df, label_encoder

def split_data(df):
    """Produce stratified train / validation / test frames.

    The dataset must have at least 10 rows and every class at least 3 rows.
    Split fractions come from ``CONFIG["evaluation"]``; the three frames are
    also written as CSV files under ``<output_dir>/config``.

    Returns:
        ``(train_df, val_df, test_df)``, or ``(None, None, None)`` when the
        data is too small or the split itself raises ``ValueError``.
    """
    failure = (None, None, None)

    min_samples = 10
    if len(df) < min_samples:
        print(f"❌ Error: Dataset too small ({len(df)} samples). Need at least {min_samples} samples.")
        return failure

    class_counts = df['label_id'].value_counts()
    min_class_size = class_counts.min()
    if min_class_size < 3:
        print(f"❌ Error: Some classes have too few samples (minimum: {min_class_size})")
        print("Class distribution:", class_counts)
        return failure

    try:
        eval_cfg = CONFIG["evaluation"]
        test_fraction = eval_cfg["test_size"]

        # First cut: hold out the test set.
        train_val_df, test_df = train_test_split(
            df,
            test_size=test_fraction,
            stratify=df["label_id"],
            random_state=42,
        )

        # Second cut: rescale so validation is val_size of the FULL dataset.
        val_size_adjusted = eval_cfg["val_size"] / (1 - test_fraction)
        train_df, val_df = train_test_split(
            train_val_df,
            test_size=val_size_adjusted,
            stratify=train_val_df["label_id"],
            random_state=42,
        )

        print(f"Data split - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

        # Save splits
        config_dir = f"{CONFIG['output_dir']}/config"
        train_df.to_csv(f"{config_dir}/train_split.csv", index=False)
        val_df.to_csv(f"{config_dir}/val_split.csv", index=False)
        test_df.to_csv(f"{config_dir}/test_split.csv", index=False)

        return train_df, val_df, test_df

    except ValueError as e:
        print(f"❌ Error splitting data: {e}")
        return failure

## Hyperparameter Configuration Generation

In [None]:
def generate_hyperparameter_configs(model_name, num_configs=3):
    """Generate hyperparameter configurations for a model.

    A search grid (learning rate x batch size x epochs x max length x weight
    decay) is built from per-model-family defaults plus the shared values in
    ``CONFIG["hyperparameter_tuning"]``. If the grid holds more than
    ``num_configs`` combinations, ``num_configs`` of them are drawn with
    ``random.sample``; otherwise the whole grid is returned.

    Args:
        model_name: Checkpoint identifier; matched case-insensitively by
            substring to select the family defaults.
        num_configs: Maximum number of configurations to return.

    Returns:
        A list of dicts with keys ``config_id`` (1-based), ``learning_rate``,
        ``batch_size``, ``epochs``, ``max_length``, ``weight_decay`` and
        ``gradient_accumulation_steps``.
    """
    hp_config = CONFIG["hyperparameter_tuning"]
    name = model_name.lower()

    long_lengths = [1024, 512, 384, 256]
    short_lengths = [512, 384, 256]

    # Model-specific adjustments based on known characteristics.
    # Order matters: "distilbert-base-..." contains the substring "bert-base",
    # so the distilbert test MUST precede the bert-base test; otherwise the
    # distilbert settings can never be selected.
    if "llama" in name:
        batch_sizes = [1]
        learning_rates = [1e-5, 3e-5, 5e-5]
        max_lengths = short_lengths
    elif "deberta" in name:
        batch_sizes = [1, 2]
        learning_rates = [1e-5, 2e-5, 3e-5]
        max_lengths = long_lengths
    elif "dialogpt" in name:
        batch_sizes = [1, 2]
        learning_rates = [1e-5, 2e-5, 3e-5]
        max_lengths = short_lengths
    elif "bart" in name:
        batch_sizes = [1, 2]
        # The MNLI-finetuned BART variant gets a narrower learning-rate grid.
        learning_rates = [1e-5, 2e-5] if "mnli" in name else [1e-5, 2e-5, 3e-5]
        max_lengths = long_lengths
    elif "electra" in name:
        batch_sizes = [1, 2]
        learning_rates = [2e-5, 3e-5, 5e-5]
        max_lengths = long_lengths
    elif "roberta" in name:
        batch_sizes = [1, 2]
        learning_rates = hp_config["learning_rates"]
        max_lengths = long_lengths
    elif "distilbert" in name:
        batch_sizes = [1, 2]
        learning_rates = hp_config["learning_rates"]
        max_lengths = short_lengths
    elif "bert-base" in name:
        batch_sizes = [1, 2]
        learning_rates = hp_config["learning_rates"]
        max_lengths = long_lengths
    else:
        batch_sizes = [1, 2]
        learning_rates = hp_config["learning_rates"]
        max_lengths = short_lengths

    all_combinations = list(itertools.product(
        learning_rates,
        batch_sizes,
        hp_config["epochs"],
        max_lengths,
        hp_config["weight_decays"],
    ))

    if len(all_combinations) > num_configs:
        selected_combinations = random.sample(all_combinations, num_configs)
    else:
        selected_combinations = all_combinations

    return [
        {
            "config_id": config_id,
            "learning_rate": lr,
            "batch_size": bs,
            "epochs": epochs,
            "max_length": max_len,
            "weight_decay": wd,
            # Accumulate so that batch_size * accumulation steps is 4
            # for the batch sizes used here.
            "gradient_accumulation_steps": max(1, 4 // bs),
        }
        for config_id, (lr, bs, epochs, max_len, wd) in enumerate(selected_combinations, start=1)
    ]

## Model Training Functions

In [None]:
def tokenize_data(df, tokenizer, max_length):
    """Turn a DataFrame into a tokenized, torch-formatted ``Dataset``.

    Only the ``text`` and ``label_id`` columns are used; ``label_id`` is
    exposed as ``label``. Every example is truncated and padded to exactly
    ``max_length`` tokens.
    """
    frame = df[["text", "label_id"]].rename(columns={"label_id": "label"})
    raw_dataset = Dataset.from_pandas(frame)

    def encode_batch(batch):
        # Fixed-length padding keeps every row the same size.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt",
        )

    encoded = raw_dataset.map(encode_batch, batched=True, batch_size=50)
    # Expose only these columns, as torch tensors.
    encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    return encoded

def compute_class_weights(y):
    """Return sklearn 'balanced' class weights for ``y`` as a float tensor.

    The weights are ordered like ``np.unique(y)``, i.e. by ascending class id.
    """
    balanced = compute_class_weight('balanced', classes=np.unique(y), y=y)
    return torch.tensor(balanced, dtype=torch.float)

def compute_metrics(eval_pred: EvalPrediction):
    """Compute evaluation metrics for the Trainer.

    Args:
        eval_pred: ``(predictions, labels)`` pair supplied by the Trainer.
            ``predictions`` is either the logits array or a tuple/list whose
            first element is the logits (some architectures return extra
            outputs alongside the logits).

    Returns:
        Dict with ``accuracy``, ``f1_macro``, ``f1_weighted``,
        ``precision_macro`` and ``recall_macro``.
    """
    predictions, labels = eval_pred

    # Models that return additional tensors hand back a tuple here;
    # argmax over that tuple would fail, so keep only the logits.
    if isinstance(predictions, (tuple, list)):
        predictions = predictions[0]

    predicted_ids = np.argmax(predictions, axis=1)

    # zero_division=0 yields the same value sklearn falls back to by default,
    # but states it explicitly instead of relying on a warning.
    return {
        'accuracy': accuracy_score(labels, predicted_ids),
        'f1_macro': f1_score(labels, predicted_ids, average='macro', zero_division=0),
        'f1_weighted': f1_score(labels, predicted_ids, average='weighted', zero_division=0),
        'precision_macro': precision_score(labels, predicted_ids, average='macro', zero_division=0),
        'recall_macro': recall_score(labels, predicted_ids, average='macro', zero_division=0),
    }

class WeightedTrainer(Trainer):
    """Trainer that applies optional per-class weights to the cross-entropy loss.

    Args:
        class_weights: 1-D tensor with one weight per class, or ``None`` for
            an unweighted loss. All remaining arguments are forwarded to
            ``Trainer``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the (optionally class-weighted) cross-entropy loss.

        ``num_items_in_batch`` is accepted because newer Trainer versions pass
        it to ``compute_loss``; without the parameter the override raises
        ``TypeError``. It is not used: the loss is the plain batch mean.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')

        weight = None
        if self.class_weights is not None:
            # The weights must live on the same device as the logits.
            weight = self.class_weights.to(logits.device)

        loss_fn = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

## Model Training with Fallback

In [None]:
def train_and_evaluate_model(model_name, train_df, val_df, test_df, class_weights, label_encoder, hp_config=None):
    """Train and evaluate a single model with specific hyperparameters.

    Fine-tunes ``model_name`` for sequence classification on ``train_df``,
    selects the best checkpoint on ``val_df`` by macro-F1, evaluates on
    ``test_df`` and writes per-example predictions plus a metrics JSON under
    ``CONFIG["output_dir"]``.

    Args:
        model_name: Checkpoint identifier to load.
        train_df, val_df, test_df: Frames with ``text`` and ``label_id`` columns.
        class_weights: Per-class loss weights (tensor) or ``None``.
        label_encoder: Fitted encoder; its ``classes_`` define the number and
            names of the output classes.
        hp_config: Dict with ``config_id``, ``learning_rate``, ``batch_size``,
            ``epochs``, ``max_length``, ``weight_decay`` and
            ``gradient_accumulation_steps``. ``None`` selects the base values
            from ``CONFIG["training"]``.

    Returns:
        ``(metrics, results_df, run_name, success)``. On failure ``metrics``
        is an error record (also written to ``<run_name>_error.json``),
        ``results_df`` is ``None`` and ``success`` is ``False``. Exceptions
        raised during training are caught, not propagated.
    """
    import shutil  # local import: only needed for temporary-checkpoint cleanup

    model_short_name = model_name.split("/")[-1]

    if hp_config is None:
        hp_config = {
            "config_id": 0,
            "learning_rate": CONFIG["training"]["base_learning_rate"],
            "batch_size": CONFIG["training"]["base_batch_size"],
            "epochs": CONFIG["training"]["base_epochs"],
            "max_length": CONFIG["training"]["base_max_length"],
            "weight_decay": CONFIG["training"]["base_weight_decay"],
            "gradient_accumulation_steps": 1
        }

    config_id = hp_config["config_id"]
    run_name = f"{model_short_name}_config_{config_id}"
    temp_dir = f"./temp_training_{run_name}"

    # Derive the class count from the encoder that was passed in rather than
    # from a module-level global that only exists after main() has run.
    num_labels = len(label_encoder.classes_)

    # Pre-bind so the finally block can always release them, even when
    # loading fails before they are assigned.
    model = None
    trainer = None
    tokenizer = None

    print(f"\n{'='*80}")
    print(f"🚀 Training: {model_name}")
    print(f"📋 Config {config_id}: LR={hp_config['learning_rate']}, BS={hp_config['batch_size']}, "
          f"Epochs={hp_config['epochs']}, MaxLen={hp_config['max_length']}")
    print(f"{'='*80}")

    try:
        # Clear memory before each run
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print("Loading tokenizer and model...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

        # Handle missing pad token
        if tokenizer.pad_token is None:
            if tokenizer.eos_token:
                tokenizer.pad_token = tokenizer.eos_token
            else:
                tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        # Load model
        model_kwargs = {
            "num_labels": num_labels,
            "low_cpu_mem_usage": False,
        }

        if torch.cuda.is_available():
            model_kwargs["torch_dtype"] = torch.float32
            if torch.cuda.device_count() > 1:
                model_kwargs["device_map"] = None

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            **model_kwargs
        )

        # BART-specific model configuration
        if "bart" in model_name.lower():
            if hasattr(model.config, 'forced_bos_token_id'):
                model.config.forced_bos_token_id = None
                print("🔧 Disabled forced_bos_token_id for BART classification")

            if hasattr(model.config, 'decoder_start_token_id') and model.config.decoder_start_token_id is None:
                model.config.decoder_start_token_id = tokenizer.pad_token_id
                print(f"🔧 Set decoder_start_token_id to {tokenizer.pad_token_id}")

        # Keep the embedding matrix and pad id in sync with the tokenizer
        # (a pad token may have been added above).
        model.resize_token_embeddings(len(tokenizer))
        model.config.pad_token_id = tokenizer.pad_token_id

        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)

        print(f"Model loaded. Parameters: {model.num_parameters():,}")

        # Tokenize datasets
        print("Tokenizing datasets...")
        train_dataset = tokenize_data(train_df, tokenizer, hp_config["max_length"])
        val_dataset = tokenize_data(val_df, tokenizer, hp_config["max_length"])
        test_dataset = tokenize_data(test_df, tokenizer, hp_config["max_length"])

        # Training arguments
        training_args_dict = {
            "output_dir": temp_dir,
            "eval_strategy": "steps",
            "eval_steps": CONFIG["training"]["eval_steps"],
            "save_steps": CONFIG["training"]["save_steps"],
            "num_train_epochs": hp_config["epochs"],
            "per_device_train_batch_size": hp_config["batch_size"],
            "per_device_eval_batch_size": hp_config["batch_size"],
            "gradient_accumulation_steps": hp_config["gradient_accumulation_steps"],
            "learning_rate": hp_config["learning_rate"],
            "weight_decay": hp_config["weight_decay"],
            "warmup_steps": CONFIG["training"]["base_warmup_steps"],
            "logging_steps": 10,
            "save_strategy": "steps",
            "load_best_model_at_end": True,
            "metric_for_best_model": "eval_f1_macro",
            "greater_is_better": True,
            "report_to": "none",
            "dataloader_num_workers": 0,
            "remove_unused_columns": True,
            "dataloader_pin_memory": False,
            "max_grad_norm": 1.0,
            "save_total_limit": 1,
            "prediction_loss_only": False,
            "ddp_find_unused_parameters": False,
            "optim": "adamw_torch",
        }

        if torch.cuda.is_available() and device == "cuda":
            training_args_dict["fp16"] = True
            training_args_dict["fp16_opt_level"] = "O1"
            print("🚀 Using FP16 for GPU training")
        else:
            training_args_dict["fp16"] = False
            training_args_dict["bf16"] = False
            print("🔧 Using FP32 for CPU training")

        training_args = TrainingArguments(**training_args_dict)

        # Create trainer
        # NOTE(review): recent transformers releases appear to prefer
        # `processing_class=` over `tokenizer=` — confirm for the pinned version.
        trainer = WeightedTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            class_weights=class_weights
        )

        # Train the model
        print("Starting training...")
        train_result = trainer.train()

        # Evaluate on test set
        print("Evaluating on test set...")
        test_results = trainer.predict(test_dataset)

        # Extract predictions and probabilities
        logits = test_results.predictions
        # Some architectures return extra tensors next to the logits; in that
        # case predictions is a tuple and the logits are its first element.
        if isinstance(logits, (tuple, list)):
            logits = logits[0]
        probabilities = torch.softmax(torch.tensor(logits), dim=1).numpy()
        predicted_labels = np.argmax(probabilities, axis=1)
        true_labels = test_results.label_ids
        confidence_scores = np.max(probabilities, axis=1)

        # Calculate metrics
        metrics = {
            'model_name': model_name,
            'run_name': run_name,
            'config_id': config_id,
            'hyperparameters': hp_config,
            'accuracy': accuracy_score(true_labels, predicted_labels),
            'f1_macro': f1_score(true_labels, predicted_labels, average='macro'),
            'f1_weighted': f1_score(true_labels, predicted_labels, average='weighted'),
            'precision_macro': precision_score(true_labels, predicted_labels, average='macro'),
            'recall_macro': recall_score(true_labels, predicted_labels, average='macro'),
            'train_time': train_result.metrics.get('train_runtime', 0),
            'train_loss': train_result.metrics.get('train_loss', 0),
            'timestamp': datetime.now().isoformat(),
        }

        # Add per-class metrics. Passing labels= pins one score per encoder
        # class in encoder order; without it sklearn only scores classes that
        # occur in the test labels or predictions, so position i would not
        # necessarily be class i.
        per_class_f1 = f1_score(
            true_labels,
            predicted_labels,
            labels=np.arange(num_labels),
            average=None,
            zero_division=0,
        )
        for class_name, f1 in zip(label_encoder.classes_, per_class_f1):
            metrics[f'f1_{class_name}'] = float(f1)

        print(f"✅ Results - Accuracy: {metrics['accuracy']:.4f}, F1-Macro: {metrics['f1_macro']:.4f}")

        # Create detailed results DataFrame
        results_df = pd.DataFrame({
            'text': test_df['text'].values,
            'true_label': label_encoder.inverse_transform(true_labels),
            'predicted_label': label_encoder.inverse_transform(predicted_labels),
            'confidence': confidence_scores,
            'correct': true_labels == predicted_labels
        })

        # Add probability columns for each class
        for i, class_name in enumerate(label_encoder.classes_):
            results_df[f'prob_{class_name}'] = probabilities[:, i]

        # Save predictions and metrics
        predictions_path = f"{CONFIG['output_dir']}/predictions/{run_name}_predictions.csv"
        results_df.to_csv(predictions_path, index=False)

        metrics_path = f"{CONFIG['output_dir']}/metrics/{run_name}_metrics.json"
        with open(metrics_path, 'w') as f:
            json.dump(metrics, f, indent=2, default=str)

        return metrics, results_df, run_name, True

    except Exception as e:
        error_msg = f"❌ Error training {model_name} (config {config_id}): {str(e)}"
        print(error_msg)
        print(f"Error type: {type(e).__name__}")

        error_metrics = {
            'model_name': model_name,
            'run_name': run_name,
            'config_id': config_id,
            'hyperparameters': hp_config,
            'error': str(e),
            'error_type': type(e).__name__,
            'accuracy': None,
            'f1_macro': None,
            'timestamp': datetime.now().isoformat(),
        }

        error_path = f"{CONFIG['output_dir']}/metrics/{run_name}_error.json"
        with open(error_path, 'w') as f:
            json.dump(error_metrics, f, indent=2, default=str)

        return error_metrics, None, run_name, False

    finally:
        # Aggressive cleanup: drop the references held by this frame so the
        # collector and the CUDA allocator can reclaim the memory.
        model = None
        trainer = None
        tokenizer = None

        # Remove temporary checkpoints on success AND failure; leaving them
        # behind after failed runs accumulates checkpoints on disk.
        shutil.rmtree(temp_dir, ignore_errors=True)

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

def train_and_evaluate_model_with_fallback(model_name, train_df, val_df, test_df, class_weights, label_encoder, hp_config=None):
    """Train a model, retrying with progressively shorter sequence lengths.

    The first attempt uses the requested ``max_length`` (from ``hp_config``
    or the base config). Each time ``train_and_evaluate_model`` reports a
    failure, the next strictly smaller length from the ladder
    ``1024, 512, 384, 256, 128`` is tried.

    Args:
        model_name, train_df, val_df, test_df, class_weights, label_encoder:
            Forwarded unchanged to ``train_and_evaluate_model``.
        hp_config: Hyperparameter dict, or ``None`` for the base values from
            ``CONFIG["training"]``. It is copied per attempt, never mutated.

    Returns:
        ``(metrics, results_df, run_name, success)`` of the first successful
        attempt, or an error record with ``error_type='FallbackExhausted'``,
        ``None``, the run name and ``False`` when every attempt failed.
    """
    fallback_lengths = [1024, 512, 384, 256, 128]
    base_training = CONFIG["training"]

    if hp_config:
        original_max_length = hp_config.get("max_length", base_training["base_max_length"])
    else:
        original_max_length = base_training["base_max_length"]

    # Requested length first, then only SHORTER ladder values. For a length
    # that is on the ladder this equals the tail of the ladder; for any other
    # length it avoids "falling back" to longer sequences.
    lengths_to_try = [original_max_length] + [
        length for length in fallback_lengths if length < original_max_length
    ]

    last_error = None

    for attempt, max_length in enumerate(lengths_to_try):
        print(f"\n🔄 Attempt {attempt + 1}: Trying max_length={max_length}")

        current_config = hp_config.copy() if hp_config else {
            "config_id": 0,
            "learning_rate": base_training["base_learning_rate"],
            "batch_size": base_training["base_batch_size"],
            "epochs": base_training["base_epochs"],
            "weight_decay": base_training["base_weight_decay"],
            "gradient_accumulation_steps": 1
        }
        current_config["max_length"] = max_length

        try:
            metrics, results_df, run_name, success = train_and_evaluate_model(
                model_name, train_df, val_df, test_df, class_weights, label_encoder, current_config
            )

            if success:
                print(f"✅ Success with max_length={max_length}")
                return metrics, results_df, run_name, success
            else:
                # Reported failure: remember the reason and try the next length.
                last_error = metrics.get('error', 'Unknown error')
                print(f"❌ Failed with max_length={max_length}: {last_error}")

        except Exception as e:
            # Only reached when the call itself raises instead of returning an
            # error record (e.g. while writing its error file).
            last_error = str(e)
            print(f"❌ Exception with max_length={max_length}: {last_error}")

            if any(keyword in str(e).lower() for keyword in ['memory', 'cuda', 'out of memory', 'oom']):
                print(f"🔄 Memory error detected, trying smaller sequence length...")
                continue
            else:
                # A shorter sequence cannot help with a non-memory exception.
                break

    model_short_name = model_name.split("/")[-1]
    config_id = hp_config["config_id"] if hp_config else 0
    run_name = f"{model_short_name}_config_{config_id}"

    error_metrics = {
        'model_name': model_name,
        'run_name': run_name,
        'config_id': config_id,
        'hyperparameters': hp_config,
        'error': f"All fallback attempts failed. Last error: {last_error}",
        'error_type': 'FallbackExhausted',
        'accuracy': None,
        'f1_macro': None,
        'timestamp': datetime.now().isoformat(),
    }

    return error_metrics, None, run_name, False

## Main Training Execution

In [None]:
def main():
    """Main execution function.

    Loads and splits the data, then trains every model in ``CONFIG["models"]``
    (with sequence-length fallback) until ``max_total_runs`` is reached.
    When ``CONFIG["hyperparameter_tuning"]["enabled"]`` is true, several
    sampled hyperparameter configurations are run per model; when it is
    false, each model is trained once with the base configuration.

    Writes a summary CSV of all runs and a ``best_model.json`` (best test
    macro-F1) when at least one run succeeds. Returns ``None``; data-loading
    or splitting failures are reported on stdout and end the function early.
    """
    print(f"\n{'='*80}")
    print("🎯 STARTING MULTI-MODEL TRAINING WITH HYPERPARAMETER TUNING")
    print(f"{'='*80}")

    # Load and prepare data
    print("\n📊 Loading and preparing data...")
    df = load_and_clean_data(CONFIG["input_file"])
    if df is None:
        print("❌ Failed to load data. Exiting.")
        return

    df, label_encoder = prepare_label_encoder(df)
    train_df, val_df, test_df = split_data(df)

    if train_df is None or val_df is None or test_df is None:
        print("❌ Failed to split data. Exiting.")
        return

    # Published as a module-level global because the training code reads it.
    global NUM_LABELS
    NUM_LABELS = len(label_encoder.classes_)
    print(f"Number of classes: {NUM_LABELS}")

    # Calculate class weights
    class_weights = compute_class_weights(train_df['label_id'].values)
    print(f"Class weights: {class_weights}")

    # Store results
    all_metrics = []
    successful_runs = 0
    failed_runs = 0
    total_runs = 0

    tuning = CONFIG["hyperparameter_tuning"]
    tuning_enabled = tuning.get("enabled", True)
    configs_per_model = tuning["configs_per_model"]
    max_total_runs = tuning["max_total_runs"]

    if tuning_enabled:
        print(f"\n📊 Hyperparameter tuning: {configs_per_model} configs per model")
    else:
        print("\n📊 Hyperparameter tuning disabled: one base-configuration run per model")
    print(f"🎯 Maximum total runs: {max_total_runs}")

    # Train each model
    for model_name in CONFIG["models"]:
        if total_runs >= max_total_runs:
            print(f"\n⚠️ Reached maximum total runs, stopping...")
            break

        model_short_name = model_name.split("/")[-1]
        print(f"\n{'='*100}")
        print(f"🔬 HYPERPARAMETER TUNING FOR: {model_name}")
        print(f"{'='*100}")

        if tuning_enabled:
            # Generate hyperparameter configurations
            hp_configs = generate_hyperparameter_configs(model_name, configs_per_model)
            print(f"Generated {len(hp_configs)} hyperparameter configurations")
        else:
            # None makes the training functions use CONFIG["training"] values.
            hp_configs = [None]
            print("Using base training configuration")

        for hp_config in hp_configs:
            if total_runs >= max_total_runs:
                break

            total_runs += 1

            metrics, results_df, run_name, success = train_and_evaluate_model_with_fallback(
                model_name, train_df, val_df, test_df, class_weights, label_encoder, hp_config
            )

            all_metrics.append(metrics)

            if success:
                successful_runs += 1
                print(f"✅ Successful run: {run_name}")
            else:
                failed_runs += 1
                print(f"❌ Failed run: {run_name}")

        # Print model summary (error records carry accuracy=None and are skipped)
        successful_model_runs = [
            m for m in all_metrics
            if m['model_name'] == model_name and m.get('accuracy') is not None
        ]
        if successful_model_runs:
            best_run = max(successful_model_runs, key=lambda x: x['f1_macro'])
            print(f"\n🏆 Best configuration for {model_short_name}:")
            print(f"   Run: {best_run['run_name']}")
            print(f"   F1-Macro: {best_run['f1_macro']:.4f}")
            print(f"   Accuracy: {best_run['accuracy']:.4f}")

    print(f"\n{'='*80}")
    print("📊 TRAINING SUMMARY")
    print(f"{'='*80}")
    print(f"Total runs attempted: {total_runs}")
    print(f"Successful runs: {successful_runs}")
    print(f"Failed runs: {failed_runs}")
    print(f"Success rate: {successful_runs/total_runs*100:.1f}%" if total_runs > 0 else "No runs completed")

    if successful_runs > 0:
        # Save results summary
        metrics_summary_path = f"{CONFIG['output_dir']}/metrics/all_models_summary.csv"
        df_all_metrics = pd.DataFrame(all_metrics)
        df_all_metrics.to_csv(metrics_summary_path, index=False)

        # Find best model
        successful_models = df_all_metrics[df_all_metrics['accuracy'].notna()]
        if len(successful_models) > 0:
            best_model_idx = successful_models['f1_macro'].idxmax()
            best_model = successful_models.loc[best_model_idx]
            print(f"\n🏆 BEST MODEL: {best_model['model_name']}")
            print(f"   Run: {best_model['run_name']}")
            print(f"   Accuracy: {best_model['accuracy']:.4f}")
            print(f"   F1-Macro: {best_model['f1_macro']:.4f}")

            best_model_info = {
                'model_name': best_model['model_name'],
                'run_name': best_model['run_name'],
                'metrics': best_model.to_dict()
            }

            with open(f"{CONFIG['output_dir']}/config/best_model.json", 'w') as f:
                json.dump(best_model_info, f, indent=2, default=str)

        print(f"\n{'='*60}")
        print("✅ TRAINING COMPLETE!")
        print(f"{'='*60}")
        print(f"📁 All results saved to: {CONFIG['output_dir']}")
        print(f"🏆 Best model info saved to: {CONFIG['output_dir']}/config/best_model.json")
        print(f"📊 Summary metrics: {metrics_summary_path}")
    else:
        print("\n❌ No successful model training runs. Please check your data and configuration.")

# Run the full pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()