# Comprehensive BART Model Experiments

Systematic evaluation of all model configurations to determine the optimal approach for tweet-note topic classification.

## Experimental Design
- **Model Types**: Tweet-only, Note-only, Combined input
- **Training Strategies**: With/without "other" class  
- **Confidence Thresholds**: 0.5 to 0.99 (8 levels)
- **Total Configurations**: 6 models × 8 thresholds = 48 experiments

Uses optimal hyperparameters from previous optimization (LR=2e-5, BS=4, Epochs=5).

## Dependencies and Configuration
Setting up imports, optimal hyperparameters, and experimental configurations.

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from transformers import (
    BartForSequenceClassification, 
    BartTokenizer, 
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset
from scipy.special import softmax
from torch.utils.data import DataLoader
import random
import gc
import json
import pickle
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------------
# Global configuration shared by every cell below
# ---------------------------------------------------------------------------

# Labeled spreadsheets and the pretrained checkpoint each run starts from.
TRAIN_FILE = "Training set labeled.xlsx"
TEST_FILE = "Test set labeled.xlsx"
MODEL_NAME = "facebook/bart-large-mnli"
RANDOM_STATE = 42

# Hyperparameters carried over from the earlier optimization run.
OPTIMAL_CONFIG = dict(
    num_epochs=5,
    batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.0,
    max_token_length=1024,
)

# Training strategies: each entry names the labels removed from the training set.
EXPERIMENT_CONFIGS = [
    dict(
        name='with_other',
        exclude_classes=[],
        description='Train on all 7 classes including "other"',
    ),
    dict(
        name='without_other',
        exclude_classes=['other'],
        description='Train on 6 classes excluding "other"',
    ),
]

# Input variants and the confidence cut-offs swept during evaluation.
MODEL_MODES = ['tweet', 'note', 'combined']
CONFIDENCE_THRESHOLDS = [0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 0.99]

# Prefer the GPU when one is visible to torch.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")

# Seed every random number generator used by the pipeline.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_STATE)

# Output locations: metrics/logs and the final fine-tuned models.
os.makedirs('trained_models', exist_ok=True)  # For final model storage
os.makedirs('experiment_results', exist_ok=True)

## Experiment Logging
Utility functions for tracking experimental progress and results across all 48 configurations.

In [None]:
def log_experiment(message, log_file='experiment_results/experiment_log.txt'):
    """Print a timestamped message and append it to the experiment log.

    Args:
        message: Text to record. May contain non-ASCII characters (the
            pipeline logs emoji and arrows).
        log_file: Path of the log file to append to. Its parent directory is
            created when missing.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_message = f"[{timestamp}] {message}"
    print(log_message)

    # Make the helper usable even if the output directory was not created yet.
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Explicit UTF-8: the platform default encoding (e.g. cp1252) cannot
    # represent the emoji used in log messages and would raise on write.
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(log_message + '\n')

## Data Loading and Preprocessing
Functions for consistent data preparation across all experimental conditions.  
Handles text cleaning, label encoding, dataset balancing, and excluded class management.

In [None]:
def load_and_preprocess_data(train_file, test_file, excluded_classes=None):
    """Load the train/test sets and apply identical cleaning to both.

    Cleaning steps, in order: drop rows with a missing label, tweet or note;
    normalise labels (string, lower-case, stripped); drop empty labels; strip
    the texts and remove control characters other than tab/newline/carriage
    return; drop rows whose tweet or note is empty after cleaning.

    Args:
        train_file: Path to the training Excel file, or an already-loaded
            DataFrame with 'label', 'tweet_text' and 'note_text' columns.
        test_file: Same as ``train_file`` for the test set.
        excluded_classes: Labels to remove from the TRAINING set only; the
            test set keeps them. ``None`` means no exclusion.

    Returns:
        Tuple ``(df_train, df_test)`` of cleaned DataFrames. Inputs passed as
        DataFrames are not modified.
    """
    if excluded_classes is None:
        excluded_classes = []

    def clean_text(text):
        if pd.isna(text):
            return ""
        text = str(text).strip()
        # Keep printable characters plus the usual whitespace controls.
        return ''.join(char for char in text if ord(char) >= 32 or char in '\t\n\r')

    def clean_dataframe(df):
        # Drop missing values BEFORE casting to str: astype(str) turns NaN/None
        # into the literal strings 'nan'/'None', which would otherwise survive
        # as a bogus class.
        df = df.dropna(subset=['label', 'tweet_text', 'note_text']).copy()
        df['label'] = df['label'].astype(str).str.lower().str.strip()
        df = df[df['label'] != ''].copy()

        df['tweet_text'] = df['tweet_text'].apply(clean_text)
        df['note_text'] = df['note_text'].apply(clean_text)
        df = df[(df['tweet_text'] != "") & (df['note_text'] != "")]

        return df.reset_index(drop=True)

    def load(source):
        # Accept a ready DataFrame as well as a path to an Excel file.
        if isinstance(source, pd.DataFrame):
            return source
        return pd.read_excel(source)

    df_train = clean_dataframe(load(train_file))
    df_test = clean_dataframe(load(test_file))

    # Filter training data if excluding classes
    if excluded_classes:
        original_size = len(df_train)
        df_train = df_train[~df_train['label'].isin(excluded_classes)]
        log_experiment(f"Excluded {original_size - len(df_train)} training samples with classes {excluded_classes}")

    return df_train, df_test

def oversample_minority_classes(df, random_state=42):
    """Balance the dataset by duplicating samples of under-represented classes.

    Every original labelled row is kept exactly once; each class smaller than
    the largest one is topped up with rows drawn (with replacement) from that
    class until all classes have the same size. The result is shuffled.

    Args:
        df: DataFrame with a 'label' column.
        random_state: Seed used for the top-up sampling and the final shuffle.

    Returns:
        A new, shuffled DataFrame with a fresh 0..n-1 index in which every
        class has as many rows as the largest class of ``df``.
    """
    class_counts = df['label'].value_counts()

    # Nothing to balance (empty frame or no usable labels).
    if class_counts.empty:
        balanced_df = df.iloc[0:0].reset_index(drop=True)
        log_experiment(f"Balanced dataset: {len(df)} → {len(balanced_df)} samples")
        return balanced_df

    max_count = class_counts.max()

    # Start from all labelled rows so no original example is lost; resampling
    # every class with replacement (as a plain bootstrap would) randomly drops
    # a share of the unique rows, including those of the majority class.
    parts = [df[df['label'].notna()]]
    for label, count in class_counts.items():
        deficit = int(max_count - count)
        if deficit > 0:
            class_df = df[df['label'] == label]
            parts.append(class_df.sample(deficit, replace=True, random_state=random_state))

    balanced_df = pd.concat(parts).sample(frac=1, random_state=random_state).reset_index(drop=True)

    log_experiment(f"Balanced dataset: {len(df)} → {len(balanced_df)} samples")
    return balanced_df

def encode_labels_with_exclusions(df_train, df_test, excluded_classes=None):
    """Attach integer ``label_id`` columns, coping with classes absent from training.

    The encoder is fitted on the labels present in ``df_train``; these receive
    IDs ``0 .. num_classes - 1``. In the full test set, every label that is
    excluded or was never seen during training receives the single
    out-of-scope ID ``num_classes``.

    Args:
        df_train: Training frame with a 'label' column. A 'label_id' column is
            added in place.
        df_test: Test frame with a 'label' column. A 'label_id' column is added
            in place.
        excluded_classes: Labels left out of training. ``None`` means none.

    Returns:
        Tuple ``(df_train, df_test_training, df_test, label_encoder,
        extended_label_encoder, num_classes)`` where ``df_test_training`` is a
        copy of the test rows whose label the model was trained on.
    """
    if excluded_classes is None:
        excluded_classes = []

    # Create label encoder using only training classes
    label_encoder = LabelEncoder()
    label_encoder.fit(df_train['label'].unique())
    num_classes = len(label_encoder.classes_)
    label_to_id = {label: idx for idx, label in enumerate(label_encoder.classes_)}

    # Encode training labels
    df_train['label_id'] = label_encoder.transform(df_train['label'])

    # Test subset used while training: only labels the encoder knows. A test
    # label that never occurs in training cannot be encoded, so it is left out
    # here (it is still present, as out-of-scope, in the full test set below).
    excluded_mask = df_test['label'].isin(excluded_classes)
    known_mask = df_test['label'].isin(list(label_to_id))
    unseen_count = int((~excluded_mask & ~known_mask).sum())
    if unseen_count:
        log_experiment(f"Label encoding: {unseen_count} test samples have labels unseen in training")

    df_test_training = df_test[known_mask & ~excluded_mask].copy()
    df_test_training['label_id'] = df_test_training['label'].map(label_to_id).astype('int64')

    # Full test set: excluded and unseen labels share the out-of-scope ID.
    def encode_test_label_safe(label):
        if label in excluded_classes:
            return num_classes
        return label_to_id.get(label, num_classes)

    df_test['label_id'] = df_test['label'].map(encode_test_label_safe).astype('int64')

    # Extended encoder for reporting. NOTE: LabelEncoder stores its classes
    # sorted, so IDs produced by this encoder need not match the label_id
    # values assigned above.
    all_unique_labels = list(label_encoder.classes_) + excluded_classes
    extended_label_encoder = LabelEncoder()
    extended_label_encoder.fit(all_unique_labels)

    log_experiment(f"Label encoding: {num_classes} training classes, {len(excluded_classes)} excluded")

    return df_train, df_test_training, df_test, label_encoder, extended_label_encoder, num_classes

## Model Training Framework
Core training functions using the optimal hyperparameters.  
No intermediate checkpoints are written; only the final model of each run is saved so that the validation notebook can reload it.

In [None]:
def create_datasets(df_train, df_test, mode, tokenizer):
    """Build tokenized train/test datasets for one input mode.

    Args:
        df_train: Frame with 'tweet_text', 'note_text' and 'label_id' columns.
        df_test: Frame with the same columns.
        mode: 'combined' encodes tweet and note as a text pair, 'tweet' encodes
            the tweet only; any other value encodes the note only.
        tokenizer: Tokenizer applied to the text columns.

    Returns:
        Tuple ``(train_dataset, test_dataset)`` formatted as torch tensors with
        the columns 'input_ids', 'attention_mask' and 'labels'. Every sequence
        is padded/truncated to ``OPTIMAL_CONFIG['max_token_length']``.
    """
    max_length = OPTIMAL_CONFIG['max_token_length']

    def tokenize_function(batch):
        if mode == 'combined':
            return tokenizer(
                batch['tweet_text'],
                batch['note_text'],
                truncation="longest_first",
                padding="max_length",
                max_length=max_length
            )
        text_column = 'tweet_text' if mode == 'tweet' else 'note_text'
        return tokenizer(
            batch[text_column],
            padding='max_length',
            truncation=True,
            max_length=max_length
        )

    def build_dataset(df):
        dataset = Dataset.from_pandas(
            df[['tweet_text', 'note_text', 'label_id']].rename(columns={'label_id': 'labels'})
        )
        dataset = dataset.map(tokenize_function, batched=True, batch_size=16)

        # Decide per dataset which helper columns exist: the pandas index
        # column is only present for frames without a default index, so the
        # train and test splits can differ.
        columns_to_remove = [
            column
            for column in ('tweet_text', 'note_text', '__index_level_0__')
            if column in dataset.column_names
        ]
        dataset = dataset.remove_columns(columns_to_remove)
        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        return dataset

    return build_dataset(df_train), build_dataset(df_test)

def train_model(train_dataset, test_dataset, experiment_name, mode, num_classes):
    """Fine-tune a BART sequence classifier with the OPTIMAL_CONFIG settings.

    Args:
        train_dataset: Tokenized training dataset.
        test_dataset: Tokenized dataset evaluated after every epoch and once
            more after training.
        experiment_name: Experiment identifier, used in log messages and paths.
        mode: Input mode identifier, used in log messages and paths.
        num_classes: Number of output labels of the classification head.

    Returns:
        Tuple ``(trainer, eval_results)``: the Trainer holding the fine-tuned
        model and the metrics dict of the final evaluation (contains
        'eval_accuracy').

    Side effects:
        Saves model and tokenizer to
        ``trained_models/final_{experiment_name}_{mode}`` and removes the
        temporary Trainer output directory, also when training fails.
    """
    import shutil

    # Clear GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # ignore_mismatched_sizes lets the checkpoint load even when its
    # classification head has a different number of labels than num_classes.
    model = BartForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_classes,
        ignore_mismatched_sizes=True
    )

    model.config.num_labels = num_classes
    model = model.to(DEVICE)

    # Tokenizer
    tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)

    # Training arguments (no intermediate saving, but save final model)
    output_dir = f"temp_{experiment_name}_{mode}"

    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="no",  # Don't save intermediate checkpoints
        num_train_epochs=OPTIMAL_CONFIG['num_epochs'],
        per_device_train_batch_size=OPTIMAL_CONFIG['batch_size'],
        per_device_eval_batch_size=OPTIMAL_CONFIG['batch_size'] * 2,
        learning_rate=OPTIMAL_CONFIG['learning_rate'],
        weight_decay=OPTIMAL_CONFIG['weight_decay'],
        seed=RANDOM_STATE,
        load_best_model_at_end=False,  # No intermediate saving
        save_total_limit=0,
        report_to="none",
        logging_strategy="epoch",
        dataloader_num_workers=0,
        remove_unused_columns=False,
        disable_tqdm=True
    )

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        predictions = np.argmax(predictions, axis=1)
        accuracy = (predictions == labels).astype(np.float32).mean().item()
        return {"accuracy": accuracy}

    # NOTE: no EarlyStoppingCallback is registered. That callback requires
    # metric_for_best_model to be set (and relies on load_best_model_at_end),
    # which conflicts with the no-checkpoint policy above; with these arguments
    # it aborts trainer.train() at start-up. Training therefore always runs for
    # the configured number of epochs, which also avoids using the evaluation
    # split to decide when to stop.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    log_experiment(f"Training {experiment_name} - {mode} mode...")

    try:
        trainer.train()
        eval_results = trainer.evaluate()

        log_experiment(f"Completed {experiment_name} - {mode}: Accuracy={eval_results['eval_accuracy']:.4f}")

        # Save FINAL model only
        final_model_path = f"trained_models/final_{experiment_name}_{mode}"
        trainer.save_model(final_model_path)
        tokenizer.save_pretrained(final_model_path)

        log_experiment(f"Final model saved: {final_model_path}")
    finally:
        # Clean up temporary directory (but keep final model), even on failure.
        shutil.rmtree(output_dir, ignore_errors=True)

    return trainer, eval_results

## Confidence Threshold Evaluation
Evaluates each trained model across all confidence thresholds (0.5-0.99) to analyze accuracy-coverage trade-offs and "other" class detection performance.

In [None]:
def evaluate_with_confidence_thresholds(model, test_dataset_full, df_test_full, 
                                       extended_label_encoder, excluded_classes, 
                                       experiment_name, mode):
    """Evaluate a trained model at every threshold in CONFIDENCE_THRESHOLDS.

    A prediction is kept when its top softmax probability is strictly greater
    than the threshold; otherwise the sample is assigned the "unknown" ID
    (``len(extended_label_encoder.classes_)``).

    True labels with an ID of at least the number of training classes
    (excluded classes and labels unseen in training) are treated as
    out-of-scope: they are left out of the accuracy/coverage figures and are
    the basis of ``other_capture_rate``.

    Args:
        model: Trained classifier whose output exposes ``.logits``.
        test_dataset_full: Dataset yielding 'input_ids', 'attention_mask' and
            'labels' tensors.
        df_test_full: Unused; kept for interface compatibility.
        extended_label_encoder: Encoder fitted on training plus excluded classes.
        excluded_classes: Labels that were left out of training.
        experiment_name: Experiment identifier stored in each result row.
        mode: Input mode identifier stored in each result row.

    Returns:
        Tuple ``(results, predictions_data)``. ``results`` holds one dict per
        threshold: 'overall_accuracy' (in-scope samples, abstentions count as
        wrong), 'committed_accuracy' (in-scope samples with a kept
        prediction), 'coverage' (percent of in-scope samples with a kept
        prediction), 'other_capture_rate' (percent of out-of-scope samples
        assigned "unknown"), plus counts and the label lists.
        ``predictions_data`` holds the raw probabilities, labels and the
        per-threshold predictions.

    Raises:
        ValueError: If ``test_dataset_full`` yields no batches.
    """
    log_experiment(f"Evaluating confidence thresholds for {experiment_name} - {mode}...")

    # Get predictions
    test_dataloader = DataLoader(test_dataset_full, batch_size=8, shuffle=False)

    all_logits = []
    all_true_labels = []

    model.eval()
    with torch.no_grad():
        for batch in test_dataloader:
            outputs = model(
                input_ids=batch['input_ids'].to(DEVICE),
                attention_mask=batch['attention_mask'].to(DEVICE)
            )
            all_logits.append(outputs.logits.cpu().numpy())
            all_true_labels.append(batch['labels'].numpy())

    if not all_logits:
        raise ValueError("test_dataset_full is empty; nothing to evaluate")

    # Process predictions
    logits = np.vstack(all_logits)
    true_labels = np.concatenate(all_true_labels)

    probabilities = softmax(logits, axis=1)
    predicted_labels = np.argmax(logits, axis=1)
    max_probabilities = np.max(probabilities, axis=1)

    num_training_classes = len(extended_label_encoder.classes_) - len(excluded_classes)
    unknown_label_id = len(extended_label_encoder.classes_)

    # Threshold-independent masks, computed once. Using ">=" also catches
    # labels unseen in training when no class is excluded; their ID would
    # otherwise coincide with unknown_label_id and make an abstention look
    # like a correct prediction.
    out_of_scope_mask = true_labels >= num_training_classes
    in_scope_mask = ~out_of_scope_mask
    in_scope_count = int(in_scope_mask.sum())
    out_of_scope_count = int(out_of_scope_mask.sum())
    true_labels_list = true_labels.tolist()

    results = []
    threshold_predictions = {}  # Store predictions for each threshold

    for threshold in CONFIDENCE_THRESHOLDS:
        # Apply confidence threshold
        final_predictions = np.where(
            max_probabilities > threshold, predicted_labels, unknown_label_id
        )
        abstained_mask = final_predictions == unknown_label_id
        correct_mask = final_predictions == true_labels
        committed_mask = ~abstained_mask & in_scope_mask
        committed_count = int(committed_mask.sum())

        # Plain Python ints so the lists serialise cleanly to CSV/JSON.
        final_predictions_list = final_predictions.tolist()
        threshold_predictions[threshold] = final_predictions_list

        # Overall accuracy (excluding out-of-scope samples)
        overall_accuracy = float(correct_mask[in_scope_mask].mean()) if in_scope_count else 0.0

        # Committed accuracy
        committed_accuracy = float(correct_mask[committed_mask].mean()) if committed_count else 0.0

        # Coverage
        coverage = committed_count / in_scope_count * 100 if in_scope_count else 0.0

        # Other capture rate
        if out_of_scope_count:
            captured = int((abstained_mask & out_of_scope_mask).sum())
            other_capture_rate = captured / out_of_scope_count * 100
        else:
            other_capture_rate = 0.0

        results.append({
            'experiment': experiment_name,
            'mode': mode,
            'confidence_threshold': threshold,
            'overall_accuracy': overall_accuracy,
            'committed_accuracy': committed_accuracy,
            'coverage': coverage,
            'other_capture_rate': other_capture_rate,
            'unknown_predictions': int(abstained_mask.sum()),
            'total_samples': len(final_predictions_list),
            'true_labels': true_labels_list,  # For CSV saving
            'predicted_labels': final_predictions_list  # For CSV saving
        })

    log_experiment(f"Completed threshold evaluation for {experiment_name} - {mode}")

    # Return both results and raw prediction data
    predictions_data = {
        'probabilities': probabilities,
        'max_probabilities': max_probabilities,
        'true_labels': true_labels,
        'predicted_labels': predicted_labels,
        'threshold_predictions': threshold_predictions
    }

    return results, predictions_data

## Comprehensive Experiments Pipeline
Main experimental pipeline that systematically trains and evaluates all 6 model configurations, then tests each across 8 confidence thresholds.

In [None]:
def _tokenize_full_test_set(df_test_full, mode, tokenizer):
    """Tokenize the complete test set (excluded classes included) for one input mode."""
    max_length = OPTIMAL_CONFIG['max_token_length']

    def tokenize_function(batch):
        if mode == 'combined':
            return tokenizer(
                batch['tweet_text'],
                batch['note_text'],
                truncation="longest_first",
                padding="max_length",
                max_length=max_length
            )
        text_column = 'tweet_text' if mode == 'tweet' else 'note_text'
        return tokenizer(
            batch[text_column],
            padding='max_length',
            truncation=True,
            max_length=max_length
        )

    dataset = Dataset.from_pandas(
        df_test_full[['tweet_text', 'note_text', 'label_id']].rename(columns={'label_id': 'labels'})
    )
    dataset = dataset.map(tokenize_function, batched=True, batch_size=16)

    columns_to_remove = ['tweet_text', 'note_text']
    if '__index_level_0__' in dataset.column_names:
        columns_to_remove.append('__index_level_0__')

    dataset = dataset.remove_columns(columns_to_remove)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return dataset


def run_comprehensive_experiments():
    """Train and evaluate every experiment/mode combination and save the results.

    For each entry of EXPERIMENT_CONFIGS the data is loaded, balanced and
    encoded once; for each entry of MODEL_MODES a model is then trained and
    evaluated at every confidence threshold. A failure of one model is logged
    (with traceback) and the remaining models still run.

    Returns:
        DataFrame with one row per (experiment, mode, threshold), including the
        'true_labels' and 'predicted_labels' list columns. Empty when no model
        completed.

    Side effects:
        Writes per-model ``*_detailed.csv`` and ``*_probabilities.pkl`` files
        and the aggregated ``complete_results.csv/.xlsx`` and
        ``experiment_summary.csv`` into ``experiment_results/``; final models
        are saved by ``train_model``.
    """
    import traceback

    log_experiment("Starting comprehensive BART experiments...")
    log_experiment(f"Total configurations: {len(EXPERIMENT_CONFIGS)} × {len(MODEL_MODES)} × {len(CONFIDENCE_THRESHOLDS)} = {len(EXPERIMENT_CONFIGS) * len(MODEL_MODES) * len(CONFIDENCE_THRESHOLDS)}")

    all_results = []

    for config in EXPERIMENT_CONFIGS:
        log_experiment(f"\n🔬 EXPERIMENT: {config['name'].upper()}")

        # Prepare data
        df_train, df_test = load_and_preprocess_data(
            TRAIN_FILE, TEST_FILE, config['exclude_classes']
        )

        # Balance training data
        df_train = oversample_minority_classes(df_train, RANDOM_STATE)

        # Encode labels
        df_train, df_test_training, df_test_full, label_encoder, extended_label_encoder, num_classes = encode_labels_with_exclusions(
            df_train, df_test, config['exclude_classes']
        )

        for mode in MODEL_MODES:
            log_experiment(f"\n📊 Training {mode.upper()} model...")

            trainer = None
            try:
                # Create tokenizer
                tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)

                # Create datasets
                train_dataset, test_dataset = create_datasets(
                    df_train, df_test_training, mode, tokenizer
                )

                # Train model
                trainer, eval_results = train_model(
                    train_dataset, test_dataset, config['name'], mode, num_classes
                )

                # Full test set (including excluded classes) for threshold evaluation
                test_dataset_full = _tokenize_full_test_set(df_test_full, mode, tokenizer)

                # Evaluate confidence thresholds
                threshold_results, predictions_data = evaluate_with_confidence_thresholds(
                    trainer.model, test_dataset_full, df_test_full,
                    extended_label_encoder, config['exclude_classes'],
                    config['name'], mode
                )

                # Add base model performance
                for result in threshold_results:
                    result['base_model_accuracy'] = eval_results['eval_accuracy']

                all_results.extend(threshold_results)

                # Save detailed results for key configurations (needed for later analysis)
                model_results_df = pd.DataFrame(threshold_results)
                model_results_df.to_csv(f'experiment_results/{config["name"]}_{mode}_detailed.csv', index=False)

                # Save prediction probabilities for key configurations
                with open(f'experiment_results/{config["name"]}_{mode}_probabilities.pkl', 'wb') as f:
                    pickle.dump({
                        'probabilities': predictions_data['probabilities'],
                        'max_probabilities': predictions_data['max_probabilities'],
                        'true_labels': predictions_data['true_labels'],
                        'predicted_labels': predictions_data['predicted_labels'],
                        'label_encoder': label_encoder,
                        'extended_label_encoder': extended_label_encoder
                    }, f)

                log_experiment(f"✅ Completed {mode} model for {config['name']}")

            except Exception as e:
                # Deliberate best-effort boundary: one failing model must not
                # abort the remaining runs, but keep the traceback for diagnosis.
                log_experiment(f"❌ Error with {mode} model in {config['name']}: {str(e)}")
                log_experiment(traceback.format_exc())
            finally:
                # Release the model after success AND failure so a crashed run
                # does not keep GPU memory allocated for the next one.
                trainer = None
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

    # Save results
    log_experiment("\n💾 Saving comprehensive results...")

    results_df = pd.DataFrame(all_results)

    if not all_results:
        # Without rows the summary column selection below would raise KeyError.
        log_experiment("⚠️ No experiment completed successfully; no result files were written.")
        return results_df

    # Remove array fields that can't be saved to CSV/Excel cleanly
    array_fields = ('true_labels', 'predicted_labels')
    clean_results = [
        {key: value for key, value in result.items() if key not in array_fields}
        for result in all_results
    ]

    clean_results_df = pd.DataFrame(clean_results)
    clean_results_df.to_csv('experiment_results/complete_results.csv', index=False)
    clean_results_df.to_excel('experiment_results/complete_results.xlsx', index=False)

    # Create summary
    summary_cols = ['experiment', 'mode', 'confidence_threshold', 'base_model_accuracy',
                   'committed_accuracy', 'coverage', 'other_capture_rate']
    summary_df = clean_results_df[summary_cols].copy()
    summary_df.to_csv('experiment_results/experiment_summary.csv', index=False)

    log_experiment(f"✅ Completed all experiments! Results: {len(all_results)} configurations")
    log_experiment("📁 Generated core files:")
    log_experiment("   - complete_results.xlsx: All configuration results")
    log_experiment("   - complete_results.csv: All configuration results")
    log_experiment("   - experiment_summary.csv: Key metrics summary")
    log_experiment("📁 Generated final models:")
    log_experiment("   - trained_models/final_with_other_tweet")
    log_experiment("   - trained_models/final_with_other_note") 
    log_experiment("   - trained_models/final_with_other_combined")
    log_experiment("   - trained_models/final_without_other_tweet")
    log_experiment("   - trained_models/final_without_other_note")
    log_experiment("   - trained_models/final_without_other_combined ← Used by validation")
    log_experiment("📁 Generated detailed model files:")
    log_experiment("   - [experiment]_[mode]_detailed.csv: Detailed results for each model")
    log_experiment("   - [experiment]_[mode]_probabilities.pkl: Prediction data for each model")

    return results_df

## Results Analysis and Visualization
Functions for analyzing experimental results, creating comparison plots, and identifying optimal configurations.

In [None]:
def create_comparison_plots(results_df):
    """Plot accuracy and coverage against the confidence threshold.

    Produces two figures and saves them under ``experiment_results/``:
    ``comparison_plots_by_experiment.png`` (one row per experiment, columns
    for committed accuracy and coverage, one line per mode) and
    ``tradeoff_analysis.png`` (committed accuracy vs. coverage scatter, one
    panel per experiment).

    Args:
        results_df: Result table with the columns 'experiment', 'mode',
            'confidence_threshold', 'committed_accuracy' and 'coverage'.
            Nothing is drawn when it is empty.
    """
    log_experiment("Creating comparison plots...")

    if results_df.empty:
        log_experiment("No results available; skipping comparison plots.")
        return

    # Set up plotting
    plt.style.use('default')
    sns.set_palette("husl")

    experiments = results_df['experiment'].unique()
    modes = results_df['mode'].unique()
    metrics = ['committed_accuracy', 'coverage']

    # Main comparison plot. The grid follows the number of experiments
    # actually present (a fixed 2x2 grid breaks for any other count);
    # squeeze=False keeps `axes` two-dimensional even with a single row.
    fig, axes = plt.subplots(
        len(experiments), len(metrics),
        figsize=(15, 6 * len(experiments)),
        squeeze=False
    )
    fig.suptitle('BART Model Comparison Across All Configurations', fontsize=16)

    for i, experiment in enumerate(experiments):
        exp_data = results_df[results_df['experiment'] == experiment]

        for j, metric in enumerate(metrics):
            ax = axes[i, j]

            for mode in modes:
                mode_data = exp_data[exp_data['mode'] == mode]
                ax.plot(mode_data['confidence_threshold'], mode_data[metric], 
                       'o-', label=mode, linewidth=2, markersize=6)

            ax.set_xlabel('Confidence Threshold')
            ax.set_ylabel(metric.replace('_', ' ').title())
            ax.set_title(f'{experiment.replace("_", " ").title()} - {metric.replace("_", " ").title()}')
            ax.grid(True, alpha=0.3)
            ax.legend()

    plt.tight_layout()
    plt.savefig('experiment_results/comparison_plots_by_experiment.png', dpi=300, bbox_inches='tight')
    plt.show()
    plt.close(fig)

    # Trade-off analysis
    tradeoff_fig = plt.figure(figsize=(15, 5))

    for i, experiment in enumerate(experiments):
        plt.subplot(1, len(experiments), i+1)

        exp_data = results_df[results_df['experiment'] == experiment]

        for mode in modes:
            mode_data = exp_data[exp_data['mode'] == mode]
            plt.scatter(mode_data['coverage'], mode_data['committed_accuracy'], 
                       s=60, alpha=0.7, label=mode)

        plt.xlabel('Coverage (%)')
        plt.ylabel('Committed Accuracy')
        plt.title(f'{experiment.replace("_", " ").title()}\nAccuracy vs Coverage Trade-off')
        plt.grid(True, alpha=0.3)
        plt.legend()

    plt.tight_layout()
    plt.savefig('experiment_results/tradeoff_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()
    plt.close(tradeoff_fig)

    log_experiment("Comparison plots completed!")
    log_experiment("   - comparison_plots_by_experiment.png: 2×2 performance comparison")
    log_experiment("   - tradeoff_analysis.png: Accuracy vs coverage analysis")

def generate_final_report(results_df):
    """Write the final report as markdown and as a LaTeX section.

    "Best" configurations are those with the highest committed accuracy among
    rows reaching at least 80% coverage.

    Args:
        results_df: Result table with the columns 'experiment', 'mode',
            'confidence_threshold', 'committed_accuracy', 'coverage' and
            'other_capture_rate'.

    Returns:
        The markdown report as a single string.

    Side effects:
        Writes ``experiment_results/final_report.md`` and
        ``experiment_results/final_report.tex`` (UTF-8).
    """
    log_experiment("📋 Generating final report...")

    min_coverage = 80  # Minimum coverage (%) for a row to qualify as "best"

    def best_row(frame):
        """Return the qualifying row with the highest committed accuracy, or None."""
        qualifying = frame[frame['coverage'] >= min_coverage]
        if len(qualifying) == 0:
            return None
        return qualifying.loc[qualifying['committed_accuracy'].idxmax()]

    report = []
    report.append("# Comprehensive BART Model Experiments - Final Report\n")
    report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    report.append("="*80 + "\n")

    # Overall summary
    total_configs = len(results_df)
    experiments = results_df['experiment'].unique()
    modes = results_df['mode'].unique()

    report.append(f"## Experiment Summary\n")
    report.append(f"- Total configurations tested: {total_configs}\n")
    report.append(f"- Experiments: {', '.join(experiments)}\n")
    report.append(f"- Model modes: {', '.join(modes)}\n")
    report.append(f"- Confidence thresholds: {', '.join(map(str, sorted(results_df['confidence_threshold'].unique())))}\n\n")

    # Best overall performance
    report.append("## Best Overall Performance\n")

    best_config = best_row(results_df)
    if best_config is not None:
        report.append(f"**Best Configuration:**\n")
        report.append(f"- Experiment: {best_config['experiment']}\n")
        report.append(f"- Mode: {best_config['mode']}\n")
        report.append(f"- Confidence Threshold: {best_config['confidence_threshold']}\n")
        report.append(f"- Committed Accuracy: {best_config['committed_accuracy']:.3f}\n")
        report.append(f"- Coverage: {best_config['coverage']:.1f}%\n")
        report.append(f"- Other Capture Rate: {best_config['other_capture_rate']:.1f}%\n\n")

    # Performance by experiment
    report.append("## Performance by Experiment\n")

    for experiment in experiments:
        exp_data = results_df[results_df['experiment'] == experiment]
        report.append(f"### {experiment.replace('_', ' ').title()}\n")

        # Best for each mode
        for mode in modes:
            best_mode = best_row(exp_data[exp_data['mode'] == mode])
            if best_mode is not None:
                report.append(f"**{mode.title()} Model:**\n")
                report.append(f"- Best Accuracy: {best_mode['committed_accuracy']:.3f} (threshold: {best_mode['confidence_threshold']})\n")
                report.append(f"- Coverage: {best_mode['coverage']:.1f}%\n")
                report.append(f"- Other Capture: {best_mode['other_capture_rate']:.1f}%\n\n")

    # Comparison table
    report.append("## Model Comparison Table\n")

    # Create comparison for key thresholds
    key_thresholds = [0.5, 0.8, 0.9]

    for threshold in key_thresholds:
        threshold_data = results_df[results_df['confidence_threshold'] == threshold]
        if len(threshold_data) > 0:
            report.append(f"### At Confidence Threshold {threshold}\n")
            report.append("| Experiment | Mode | Committed Acc | Coverage | Other Capture |\n")
            report.append("|------------|------|---------------|----------|---------------|\n")

            for _, row in threshold_data.iterrows():
                report.append(f"| {row['experiment']} | {row['mode']} | {row['committed_accuracy']:.3f} | {row['coverage']:.1f}% | {row['other_capture_rate']:.1f}% |\n")
            report.append("\n")

    # Key insights
    report.append("## Key Insights\n")

    # Compare with vs without other
    if 'with_other' in experiments and 'without_other' in experiments:
        with_other = results_df[results_df['experiment'] == 'with_other']
        without_other = results_df[results_df['experiment'] == 'without_other']

        report.append("### Impact of Excluding 'Other' Class\n")

        # Both setups are compared at the same threshold
        baseline_threshold = 0.8

        for mode in modes:
            with_mode = with_other[with_other['mode'] == mode]
            without_mode = without_other[without_other['mode'] == mode]

            with_baseline = with_mode[with_mode['confidence_threshold'] == baseline_threshold]
            without_baseline = without_mode[without_mode['confidence_threshold'] == baseline_threshold]

            if len(with_baseline) > 0 and len(without_baseline) > 0:
                acc_improvement = without_baseline.iloc[0]['committed_accuracy'] - with_baseline.iloc[0]['committed_accuracy']
                report.append(f"- {mode.title()}: {acc_improvement:+.3f} accuracy improvement when excluding 'other'\n")

        report.append("\n")

    # Optimal thresholds by mode
    report.append("### Optimal Confidence Thresholds\n")

    for mode in modes:
        optimal = best_row(results_df[results_df['mode'] == mode])
        if optimal is not None:
            report.append(f"- {mode.title()}: {optimal['confidence_threshold']} (Accuracy: {optimal['committed_accuracy']:.3f}, Coverage: {optimal['coverage']:.1f}%)\n")

    report.append("\n")

    # Save markdown report
    with open('experiment_results/final_report.md', 'w', encoding='utf-8') as f:
        f.write(''.join(report))

    # Create LaTeX version
    latex_report = []
    latex_report.append("\\section{Comprehensive BART Model Experiments}\n\n")
    latex_report.append("\\subsection{Experiment Overview}\n")
    latex_report.append(f"We conducted comprehensive experiments testing {total_configs} different configurations across ")
    latex_report.append(f"{len(experiments)} experimental setups and {len(modes)} model architectures. ")
    latex_report.append("The experiments evaluated the impact of including versus excluding the problematic 'other' class ")
    latex_report.append("from training, along with confidence threshold optimization.\n\n")

    if best_config is not None:
        latex_report.append("\\subsection{Optimal Configuration}\n")
        latex_report.append("The best performing configuration achieved ")
        # The percent sign must be escaped: a bare '%' starts a LaTeX comment
        # and would swallow the remainder of the sentence.
        latex_report.append(f"{best_config['committed_accuracy'] * 100:.1f}\\% committed accuracy with ")
        latex_report.append(f"{best_config['coverage']:.1f}\\% coverage using the {best_config['mode']} model ")
        latex_report.append(f"in the {best_config['experiment'].replace('_', ' ')} experiment ")
        latex_report.append(f"with a confidence threshold of {best_config['confidence_threshold']}.\n\n")

    with open('experiment_results/final_report.tex', 'w', encoding='utf-8') as f:
        f.write(''.join(latex_report))

    log_experiment("📋 Final report generated!")
    log_experiment("   - final_report.md: Comprehensive markdown report")
    log_experiment("   - final_report.tex: LaTeX section for thesis")

    return ''.join(report)

def generate_experiment_summary(results_df, min_coverage=80):
    """Generate final experiment summary JSON.

    Selects the row of ``results_df`` with the highest ``committed_accuracy``
    among the rows whose ``coverage`` is at least ``min_coverage``, writes a
    summary of it to ``experiment_results/final_summary.json`` and logs the
    winning configuration.

    Args:
        results_df: DataFrame with one row per tested configuration. The
            columns read here are ``experiment``, ``mode``,
            ``confidence_threshold``, ``committed_accuracy``, ``coverage``
            and ``other_capture_rate``.
        min_coverage: Minimum ``coverage`` value a configuration must reach
            to be eligible (same units as the ``coverage`` column, which is
            logged with a ``%`` suffix). Defaults to 80, the cutoff that was
            previously hard-coded.

    Returns:
        The summary dict that was written to disk, or ``None`` when no
        configuration reaches ``min_coverage`` (no file is written then).
    """
    log_experiment("Generating experiment summary...")

    # Only configurations that commit on enough samples may win; otherwise a
    # very strict threshold with tiny coverage could look artificially good.
    eligible = results_df[results_df['coverage'] >= min_coverage]

    if eligible.empty:
        # Make the "no winner" outcome visible in the log instead of
        # returning None without any trace.
        log_experiment(
            f"No configuration reaches {min_coverage}% coverage; "
            "final_summary.json was not written"
        )
        return None

    best_config = eligible.loc[eligible['committed_accuracy'].idxmax()]

    summary = {
        'total_configurations': len(results_df),
        'best_experiment': best_config['experiment'],
        'best_mode': best_config['mode'],
        'best_threshold': best_config['confidence_threshold'],
        'best_committed_accuracy': best_config['committed_accuracy'],
        'best_coverage': best_config['coverage'],
        'best_other_capture': best_config['other_capture_rate'],
        'experiments_tested': results_df['experiment'].unique().tolist(),
        'modes_tested': results_df['mode'].unique().tolist(),
        'thresholds_tested': sorted(results_df['confidence_threshold'].unique().tolist())
    }

    # Save summary. The directory is created on demand so the function also
    # works when called on its own, before any other output has been written.
    os.makedirs('experiment_results', exist_ok=True)
    with open('experiment_results/final_summary.json', 'w', encoding='utf-8') as f:
        # default=str is a fallback for values json cannot encode natively
        # (e.g. non-float numpy scalars); they are stored as strings.
        json.dump(summary, f, indent=2, default=str)

    log_experiment("🏆 BEST OVERALL CONFIGURATION:")
    log_experiment(f"   {best_config['experiment']} - {best_config['mode']} - threshold {best_config['confidence_threshold']}")
    log_experiment(f"   Accuracy: {best_config['committed_accuracy']:.3f}, Coverage: {best_config['coverage']:.1f}%")

    return summary

## Main Execution
Complete experimental pipeline: train all models → evaluate thresholds → analyze results → identify optimal configuration.

**Output Files Generated**:
- `complete_results.csv`: All 48 configuration results
- `complete_results.xlsx`: All 48 configuration results (Excel format)
- `experiment_summary.csv`: Key metrics summary  
- `experiment_log.txt`: Complete execution log with timestamps
- `comparison_plots_by_experiment.png`: Performance visualizations (2×2 grid)
- `tradeoff_analysis.png`: Accuracy vs coverage trade-offs
- `final_report.md`: Comprehensive analysis report
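- `final_report.tex`: LaTeX section for thesis
- `final_summary.json`: Best overall configuration summary (written only when at least one configuration reaches ≥ 80% coverage)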
- `[experiment]_[mode]_detailed.csv`: Detailed results for each of 6 models
- `[experiment]_[mode]_probabilities.pkl`: Prediction data for each of 6 models
- `trained_models/final_[experiment]_[mode]/`: 6 final trained models for validation

**Total: 27 items** (8 core files + 12 model files + 1 log + 6 trained models)

In [None]:
def main():
    """Run the full experiment pipeline and print a closing summary.

    Steps, in order: run every experiment, draw the comparison plots, write
    the final report, build the best-configuration summary, then print an
    overview to stdout.

    Returns:
        The results object produced by ``run_comprehensive_experiments()``
        (indexed by column name and passed to the plotting/report helpers).
    """
    rule = "=" * 80

    # Opening banner in the experiment log
    for banner_line in (rule, "COMPREHENSIVE BART MODEL EXPERIMENTS", rule):
        log_experiment(banner_line)

    # Pipeline: experiments -> plots -> report -> summary
    all_results = run_comprehensive_experiments()
    create_comparison_plots(all_results)
    generate_final_report(all_results)  # report text is written to disk; return value not needed here
    best = generate_experiment_summary(all_results)

    log_experiment("🎉 All experiments completed successfully!")
    log_experiment("📁 Check experiment_results/ folder for detailed results")

    # Console overview
    print("\n" + rule)
    print("EXPERIMENT SUMMARY")
    print(rule)
    print(f"Total configurations tested: {len(all_results)}")
    experiment_names = all_results['experiment'].unique()
    print(f"Experiments: {', '.join(experiment_names)}")
    mode_names = all_results['mode'].unique()
    print(f"Modes: {', '.join(mode_names)}")

    # The summary is falsy (None) when no configuration qualified
    if best:
        headline = f"   {best['best_experiment']} - {best['best_mode']} - threshold {best['best_threshold']}"
        print("\n🏆 BEST CONFIGURATION:")
        print(headline)
        print(f"   Accuracy: {best['best_committed_accuracy']:.3f}")
        print(f"   Coverage: {best['best_coverage']:.1f}%")

    print(rule)

    return all_results

# Entry point: run the whole pipeline when this file is executed directly.
# The result is bound to a module-level name so it remains available for
# inspection after the run finishes.
if __name__ == "__main__":
    results_df = main()