# Specialized Fine-tuning for Unreliable Source Classification

## Import Required Libraries

In [None]:
import os
import gc
import json
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix
)
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EvalPrediction,
    set_seed
)
import joblib
import itertools
import random
from collections import Counter

# Setup
# Process-wide configuration executed once when this cell/module runs.
warnings.filterwarnings('ignore')  # silences every Python warning from here on
plt.style.use('default')
set_seed(42)  # fixed seed handed to transformers' set_seed helper
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# When a GPU is visible, release cached allocator blocks and reset the
# peak-memory counters before any model is loaded.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

## Configuration

In [None]:
# Central experiment configuration.
# - "training_strategies": one dict per run; each is passed unchanged to
#   train_roberta_with_strategy and is also written into that run's metrics JSON.
# - "evaluation": step intervals; the training function in this file reads
#   "eval_steps" and "logging_steps" ("save_steps" is not read there).
CONFIG = {
    "input_data_dir": "model_training_results/config",  # Where splits are saved
    "output_dir": "unreliable_source_tuning_results",
    "target_model": "roberta-large",
    "problem_class": "unreliable source",
    
    # Five class imbalance strategies
    "training_strategies": [
        {
            "name": "heavy_class_weights",
            "description": "Heavy weighting for unreliable source class",
            "epochs": 4,
            "learning_rate": 2e-5,
            "batch_size": 1,
            "max_length": 512,
            "weight_decay": 0.01,
            "class_weight_multiplier": 10.0,
            "use_focal_loss": False,
            "oversample": False
        },
        {
            "name": "focal_loss",
            "description": "Focal loss to handle hard examples",
            "epochs": 4,
            "learning_rate": 2e-5,
            "batch_size": 1,
            "max_length": 512,
            "weight_decay": 0.01,
            "class_weight_multiplier": 5.0,
            "use_focal_loss": True,
            "focal_alpha": 0.25,
            "focal_gamma": 2.0,
            "oversample": False
        },
        {
            "name": "longer_training",
            "description": "More epochs with lower learning rate",
            "epochs": 6,
            "learning_rate": 1e-5,
            "batch_size": 1,
            "max_length": 512,
            "weight_decay": 0.005,
            "class_weight_multiplier": 8.0,
            "use_focal_loss": False,
            "oversample": False
        },
        {
            "name": "oversampling",
            "description": "Oversample minority class to balance dataset",
            "epochs": 3,
            "learning_rate": 3e-5,
            "batch_size": 2,
            "max_length": 512,
            "weight_decay": 0.01,
            "class_weight_multiplier": 3.0,
            "use_focal_loss": False,
            "oversample": True,
            "oversample_ratio": 3
        },
        {
            "name": "combined_approach",
            "description": "Oversampling + focal loss + class weights",
            "epochs": 4,
            "learning_rate": 2e-5,
            "batch_size": 1,
            "max_length": 384,
            "weight_decay": 0.01,
            "class_weight_multiplier": 6.0,
            "use_focal_loss": True,
            "focal_alpha": 0.25,
            "focal_gamma": 1.5,
            "oversample": True,
            "oversample_ratio": 2
        }
    ],
    
    "evaluation": {
        "eval_steps": 20,
        "save_steps": 40,
        "logging_steps": 10
    }
}

# Make sure every results sub-folder exists before anything is written
_OUTPUT_SUBDIRS = ('predictions', 'plots', 'reports', 'metrics')
for subdir in _OUTPUT_SUBDIRS:
    target_dir = f"{CONFIG['output_dir']}/{subdir}"
    os.makedirs(target_dir, exist_ok=True)

# Banner describing the experiment about to run
print("🎯 Unreliable Source Classification Improvement")
print(f"Target class: {CONFIG['problem_class']}")
print(f"Strategies to test: {len(CONFIG['training_strategies'])}")

## Load Existing Data Splits

In [None]:
def load_existing_splits(data_dir):
    """Read the pre-computed train/val/test CSV splits from ``data_dir``.

    Returns a ``(train_df, val_df, test_df)`` tuple of DataFrames, or
    ``(None, None, None)`` when any of the three CSV files is missing.
    """
    print(f"Loading existing data splits from {data_dir}...")

    # Read all three files first so that a missing one aborts before any
    # summary output is produced.
    try:
        frames = [
            pd.read_csv(f"{data_dir}/{split}_split.csv")
            for split in ("train", "val", "test")
        ]
    except FileNotFoundError:
        print(f"❌ Error: Could not find splits in {data_dir}")
        print("Run the main training script first.")
        return None, None, None

    train_df, val_df, test_df = frames

    print(f"✅ Loaded splits:")
    print(f"   Train: {len(train_df)} samples")
    print(f"   Validation: {len(val_df)} samples")
    print(f"   Test: {len(test_df)} samples")

    # Per-label counts of the training split
    print(f"\n📊 Class distribution:")
    print("Training set:")
    print(train_df['label'].value_counts())

    return train_df, val_df, test_df

def prepare_label_encoder(train_df):
    """Fit a ``LabelEncoder`` on the distinct labels of the training split.

    Prints the resulting id -> label mapping and returns the fitted encoder.
    """
    distinct_labels = train_df['label'].unique()
    encoder = LabelEncoder().fit(distinct_labels)

    print(f"Label mapping: {dict(enumerate(encoder.classes_))}")
    return encoder

# Load data
train_df, val_df, test_df = load_existing_splits(CONFIG["input_data_dir"])

if train_df is not None:
    label_encoder = prepare_label_encoder(train_df)

    # How many examples of the problem class does each split contain?
    train_problem_count, val_problem_count, test_problem_count = (
        (split_df['label'] == CONFIG['problem_class']).sum()
        for split_df in (train_df, val_df, test_df)
    )

    print(f"\n🎯 Problem class '{CONFIG['problem_class']}' distribution:")
    print(f"   Training: {train_problem_count} examples")
    print(f"   Validation: {val_problem_count} examples")
    print(f"   Test: {test_problem_count} examples")

    if train_problem_count < 5:
        print("⚠️ WARNING: Very few training examples for the problem class!")
else:
    print("❌ Failed to load data splits. Exiting.")

## Data Augmentation Functions

In [None]:
def oversample_minority_class(train_df, target_class, ratio=2):
    """Return a copy of ``train_df`` in which ``target_class`` rows appear ``ratio`` times.

    The first copy of each target row is left untouched. The k-th extra copy
    gets k trailing spaces appended to its ``text`` so that no two copies of
    the same row are byte-identical (a single shared suffix would leave the
    extra copies identical to each other once ``ratio`` exceeds 2).

    Args:
        train_df: DataFrame with at least ``text`` and ``label`` columns.
            It is not modified.
        target_class: Label value whose rows are replicated.
        ratio: Total number of copies of each target row (must be >= 1).

    Returns:
        A new DataFrame: all non-target rows followed by the replicated
        target rows, with a fresh integer index.

    Raises:
        ValueError: If ``ratio`` is smaller than 1.
    """
    if ratio < 1:
        raise ValueError(f"ratio must be >= 1, got {ratio}")

    print(f"\n🔄 Oversampling '{target_class}' by {ratio}x...")

    # Split into the class to replicate and everything else
    target_samples = train_df[train_df['label'] == target_class]
    other_samples = train_df[train_df['label'] != target_class]

    print(f"Original {target_class} samples: {len(target_samples)}")

    # Build the copies; copy 0 is the original, copy k carries k trailing spaces
    copies = [target_samples]
    for copy_index in range(1, ratio):
        duplicate = target_samples.copy()
        duplicate['text'] = duplicate['text'] + " " * copy_index
        copies.append(duplicate)
    oversampled_target = pd.concat(copies, ignore_index=True)

    # Combine with other samples
    balanced_df = pd.concat([other_samples, oversampled_target], ignore_index=True)

    print(f"After oversampling {target_class} samples: {len(oversampled_target)}")
    print(f"Total training samples: {len(balanced_df)}")
    print(f"New class distribution:")
    print(balanced_df['label'].value_counts())

    return balanced_df

def calculate_class_weights(train_df, target_class, multiplier=5.0):
    """Compute 'balanced' class weights and boost the weight of ``target_class``.

    Returns ``(weight_tensor, class_weights)`` where ``class_weights`` maps the
    encoded class id to its weight and ``weight_tensor`` holds the same values
    as a float tensor ordered by class id.
    """
    # Encode labels locally; ids follow the sorted order of the label values
    encoder = LabelEncoder().fit(train_df['label'].unique())
    encoded = encoder.transform(train_df['label'])
    balanced = compute_class_weight('balanced', classes=np.unique(encoded), y=encoded)

    # Same weights as 'balanced', except the target class is scaled up
    class_weights = {
        class_id: (balanced[class_id] * multiplier if name == target_class else balanced[class_id])
        for class_id, name in enumerate(encoder.classes_)
    }

    print(f"\n⚖️ Class weights (with {multiplier}x multiplier for '{target_class}'):")
    for class_id, name in enumerate(encoder.classes_):
        print(f"   {name}: {class_weights[class_id]:.3f}")

    ordered_weights = [class_weights[class_id] for class_id in range(len(encoder.classes_))]
    weight_tensor = torch.tensor(ordered_weights, dtype=torch.float)

    return weight_tensor, class_weights

## Custom Loss Functions and Trainer

In [None]:
class FocalLoss(torch.nn.Module):
    """Multi-class focal loss built on top of cross-entropy.

    Each sample's loss is ``alpha * (1 - p_t) ** gamma * CE`` where ``p_t`` is
    the softmax probability assigned to the true class. ``reduction`` selects
    ``'mean'``, ``'sum'``, or (any other value) the unreduced per-sample loss.
    """

    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        # Unreduced cross-entropy gives -log(p_t) per sample
        per_sample_ce = torch.nn.functional.cross_entropy(inputs, targets, reduction='none')
        prob_true_class = torch.exp(-per_sample_ce)

        # Down-weight samples the model already classifies confidently
        per_sample_loss = self.alpha * (1 - prob_true_class) ** self.gamma * per_sample_ce

        if self.reduction == 'mean':
            return per_sample_loss.mean()
        if self.reduction == 'sum':
            return per_sample_loss.sum()
        return per_sample_loss

class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy or focal loss, scaled by class weights.

    Args:
        class_weights: Optional 1-D float tensor indexed by class id. When
            given, each sample's loss is multiplied by the weight of its true
            class, for both the cross-entropy and the focal-loss path.
        use_focal_loss: Use ``FocalLoss`` instead of plain cross-entropy.
        focal_alpha: ``alpha`` passed to ``FocalLoss``.
        focal_gamma: ``gamma`` passed to ``FocalLoss``.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, class_weights=None, use_focal_loss=False, focal_alpha=0.25, focal_gamma=2.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        self.use_focal_loss = use_focal_loss

        if use_focal_loss:
            # Keep the loss per-sample so class weights can be applied before
            # the batch is averaged in compute_loss.
            self.focal_loss = FocalLoss(alpha=focal_alpha, gamma=focal_gamma, reduction='none')

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the (optionally class-weighted) batch loss.

        ``num_items_in_batch`` is accepted because newer ``Trainer`` versions
        pass it as a keyword; it is not used here.
        """
        labels = inputs.get("labels")
        # The loss is computed below, so the model does not need the labels.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get('logits')

        if self.use_focal_loss:
            per_sample_loss = self.focal_loss(logits, labels)
        else:
            per_sample_loss = torch.nn.functional.cross_entropy(logits, labels, reduction='none')

        if self.class_weights is not None:
            # Scale each sample by the weight of its true class, then average
            # over the batch size. A weighted 'mean' reduction divides by the
            # sum of the target weights instead, which cancels the weights
            # entirely whenever a batch holds a single sample.
            weights = self.class_weights.to(device=logits.device, dtype=per_sample_loss.dtype)
            per_sample_loss = per_sample_loss * weights[labels]

        loss = per_sample_loss.mean()

        return (loss, outputs) if return_outputs else loss

def tokenize_data(df, tokenizer, max_length):
    """Turn a DataFrame with ``text`` and ``label_id`` columns into a tokenized Dataset.

    ``label_id`` is exposed as ``label``; every text is truncated/padded to
    ``max_length`` and the result is formatted as torch tensors.
    """
    frame = df[["text", "label_id"]].rename(columns={"label_id": "label"})
    dataset = Dataset.from_pandas(frame)

    def encode_batch(batch):
        # Fixed-length encoding so every example has the same shape
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt",
        )

    encoded = dataset.map(encode_batch, batched=True, batch_size=50)
    encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    return encoded

def compute_metrics(eval_pred: EvalPrediction):
    """Compute overall and per-class metrics from logits and gold labels.

    Returns a dict with ``accuracy``, macro/weighted F1, macro precision and
    recall, plus ``f1_class_{i}``, ``precision_class_{i}`` and
    ``recall_class_{i}`` for every class id ``i`` the model can output.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    predictions = np.argmax(logits, axis=1)

    # Overall metrics (zero_division=0 yields the same values as the default,
    # without emitting a warning for classes that are never predicted)
    overall_metrics = {
        'accuracy': accuracy_score(labels, predictions),
        'f1_macro': f1_score(labels, predictions, average='macro', zero_division=0),
        'f1_weighted': f1_score(labels, predictions, average='weighted', zero_division=0),
        'precision_macro': precision_score(labels, predictions, average='macro', zero_division=0),
        'recall_macro': recall_score(labels, predictions, average='macro', zero_division=0),
    }

    # Per-class metrics. Passing the full list of class ids keeps position i
    # tied to class id i even when a class is absent from both the labels and
    # the predictions of this evaluation set.
    class_ids = np.arange(logits.shape[1])
    per_class_f1 = f1_score(labels, predictions, labels=class_ids, average=None, zero_division=0)
    per_class_precision = precision_score(labels, predictions, labels=class_ids, average=None, zero_division=0)
    per_class_recall = recall_score(labels, predictions, labels=class_ids, average=None, zero_division=0)

    for class_id in class_ids:
        overall_metrics[f'f1_class_{class_id}'] = per_class_f1[class_id]
        overall_metrics[f'precision_class_{class_id}'] = per_class_precision[class_id]
        overall_metrics[f'recall_class_{class_id}'] = per_class_recall[class_id]

    return overall_metrics

## Training Function

In [None]:
def train_roberta_with_strategy(train_df, val_df, test_df, label_encoder, strategy_config):
    """Fine-tune ``CONFIG["target_model"]`` with one imbalance strategy and evaluate it.

    Steps: optionally oversample the problem class, encode labels, compute
    class weights, train with ``CustomTrainer``, predict on the test split,
    and write the predictions CSV and metrics JSON under ``CONFIG["output_dir"]``.

    Args:
        train_df, val_df, test_df: DataFrames with ``text`` and ``label``
            columns. They are copied, not modified.
        label_encoder: Fitted encoder mapping label names to class ids.
        strategy_config: One entry of ``CONFIG["training_strategies"]``.

    Returns:
        ``(metrics, results_df, True)`` on success, or
        ``(error_metrics, None, False)`` if loading, training or evaluation
        raises. Errors during data preparation (oversampling, label encoding,
        class weights) are not caught and propagate to the caller.
    """
    import shutil

    strategy_name = strategy_config["name"]
    temp_dir = f"./temp_training_{strategy_name}"  # scratch dir required by TrainingArguments

    print(f"\n{'='*80}")
    print(f"🚀 Training Strategy: {strategy_name}")
    print(f"📋 Description: {strategy_config['description']}")
    print(f"{'='*80}")

    # Prepare training data (with potential oversampling)
    current_train_df = train_df.copy()
    if strategy_config.get("oversample", False):
        current_train_df = oversample_minority_class(
            current_train_df,
            CONFIG["problem_class"],
            strategy_config.get("oversample_ratio", 2)
        )

    # Encode labels
    current_train_df["label_id"] = label_encoder.transform(current_train_df["label"])
    val_df_copy = val_df.copy()
    val_df_copy["label_id"] = label_encoder.transform(val_df_copy["label"])
    test_df_copy = test_df.copy()
    test_df_copy["label_id"] = label_encoder.transform(test_df_copy["label"])

    # Calculate class weights
    class_weights, class_weights_dict = calculate_class_weights(
        current_train_df,
        CONFIG["problem_class"],
        strategy_config["class_weight_multiplier"]
    )

    # Pre-bind so the cleanup in `finally` works no matter where a failure occurs
    model = trainer = tokenizer = None

    try:
        # Clear memory
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print("Loading tokenizer and model...")
        tokenizer = AutoTokenizer.from_pretrained(CONFIG["target_model"], use_fast=True)

        # Handle missing pad token
        if tokenizer.pad_token is None:
            if tokenizer.eos_token:
                tokenizer.pad_token = tokenizer.eos_token
            else:
                tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        # Load model
        num_labels = len(label_encoder.classes_)
        model = AutoModelForSequenceClassification.from_pretrained(
            CONFIG["target_model"],
            num_labels=num_labels,
            torch_dtype=torch.float32
        )

        model.resize_token_embeddings(len(tokenizer))
        model.config.pad_token_id = tokenizer.pad_token_id

        # Move to device
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)

        print(f"Model loaded on {device}. Parameters: {model.num_parameters():,}")

        # Tokenize datasets
        print("Tokenizing datasets...")
        train_dataset = tokenize_data(current_train_df, tokenizer, strategy_config["max_length"])
        val_dataset = tokenize_data(val_df_copy, tokenizer, strategy_config["max_length"])
        test_dataset = tokenize_data(test_df_copy, tokenizer, strategy_config["max_length"])

        # Training arguments - temporary output, no model saving.
        # load_best_model_at_end must stay off: it requires checkpoints, and
        # TrainingArguments rejects it when the save strategy ("no") does not
        # match the eval strategy ("steps"). The final weights are evaluated.
        training_args = TrainingArguments(
            output_dir=temp_dir,
            eval_strategy="steps",
            eval_steps=CONFIG["evaluation"]["eval_steps"],
            save_strategy="no",  # Don't save checkpoints
            load_best_model_at_end=False,
            num_train_epochs=strategy_config["epochs"],
            per_device_train_batch_size=strategy_config["batch_size"],
            per_device_eval_batch_size=strategy_config["batch_size"],
            learning_rate=strategy_config["learning_rate"],
            weight_decay=strategy_config["weight_decay"],
            warmup_steps=20,
            logging_steps=CONFIG["evaluation"]["logging_steps"],
            report_to="none",
            dataloader_num_workers=0,
            remove_unused_columns=True,
            dataloader_pin_memory=False,
            max_grad_norm=1.0,
            fp16=torch.cuda.is_available(),
            optim="adamw_torch",
        )

        # Create trainer
        # NOTE(review): newer transformers releases rename `tokenizer` to
        # `processing_class` - confirm against the installed version.
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            class_weights=class_weights,
            use_focal_loss=strategy_config.get("use_focal_loss", False),
            focal_alpha=strategy_config.get("focal_alpha", 0.25),
            focal_gamma=strategy_config.get("focal_gamma", 2.0)
        )

        # Train the model
        print("Starting training...")
        train_result = trainer.train()

        # Note: Model checkpoints are not saved to reduce storage requirements
        # Models can be recreated using the same configuration if needed

        # Evaluate on test set
        print("Evaluating on test set...")
        test_results = trainer.predict(test_dataset)

        # Extract predictions and probabilities
        logits = test_results.predictions
        probabilities = torch.softmax(torch.tensor(logits), dim=1).numpy()
        predicted_labels = np.argmax(probabilities, axis=1)
        true_labels = test_results.label_ids
        confidence_scores = np.max(probabilities, axis=1)

        # Convert back to label names
        true_label_names = label_encoder.inverse_transform(true_labels)
        predicted_label_names = label_encoder.inverse_transform(predicted_labels)

        # Calculate detailed metrics
        accuracy = accuracy_score(true_labels, predicted_labels)
        f1_macro = f1_score(true_labels, predicted_labels, average='macro', zero_division=0)
        f1_weighted = f1_score(true_labels, predicted_labels, average='weighted', zero_division=0)

        # Per-class metrics. The explicit id list keeps array position i tied
        # to class id i even if a class never occurs in the test labels or
        # predictions; without it the indexing below can hit the wrong class.
        all_class_ids = np.arange(num_labels)
        per_class_f1 = f1_score(
            true_labels, predicted_labels, labels=all_class_ids, average=None, zero_division=0
        )
        per_class_precision = precision_score(
            true_labels, predicted_labels, labels=all_class_ids, average=None, zero_division=0
        )
        per_class_recall = recall_score(
            true_labels, predicted_labels, labels=all_class_ids, average=None, zero_division=0
        )

        # Focus on problem class
        problem_class_id = label_encoder.transform([CONFIG["problem_class"]])[0]
        problem_class_f1 = per_class_f1[problem_class_id]
        problem_class_precision = per_class_precision[problem_class_id]
        problem_class_recall = per_class_recall[problem_class_id]

        # Count correct predictions for problem class
        problem_class_mask = true_labels == problem_class_id
        problem_class_correct = (predicted_labels[problem_class_mask] == problem_class_id).sum()
        problem_class_total = problem_class_mask.sum()
        problem_class_accuracy = problem_class_correct / problem_class_total if problem_class_total > 0 else 0

        metrics = {
            'strategy_name': strategy_name,
            'strategy_config': strategy_config,
            'accuracy': accuracy,
            'f1_macro': f1_macro,
            'f1_weighted': f1_weighted,
            'problem_class_f1': problem_class_f1,
            'problem_class_precision': problem_class_precision,
            'problem_class_recall': problem_class_recall,
            'problem_class_accuracy': problem_class_accuracy,
            'problem_class_correct': int(problem_class_correct),
            'problem_class_total': int(problem_class_total),
            'train_time': train_result.metrics.get('train_runtime', 0),
            'train_loss': train_result.metrics.get('train_loss', 0),
        }

        # Add per-class metrics for all classes
        for i, class_name in enumerate(label_encoder.classes_):
            metrics[f'f1_{class_name}'] = per_class_f1[i]
            metrics[f'precision_{class_name}'] = per_class_precision[i]
            metrics[f'recall_{class_name}'] = per_class_recall[i]

        print(f"\n✅ Strategy Results:")
        print(f"   Overall Accuracy: {accuracy:.4f}")
        print(f"   Overall F1-Macro: {f1_macro:.4f}")
        print(f"   🎯 {CONFIG['problem_class']} F1: {problem_class_f1:.4f}")
        print(f"   🎯 {CONFIG['problem_class']} Accuracy: {problem_class_accuracy:.4f} ({problem_class_correct}/{problem_class_total})")

        # Create detailed results DataFrame
        results_df = pd.DataFrame({
            'text': test_df_copy['text'].values,
            'true_label': true_label_names,
            'predicted_label': predicted_label_names,
            'confidence': confidence_scores,
            'correct': true_labels == predicted_labels
        })

        # Add probability columns for each class
        for i, class_name in enumerate(label_encoder.classes_):
            results_df[f'prob_{class_name}'] = probabilities[:, i]

        # Save results
        predictions_path = f"{CONFIG['output_dir']}/predictions/{strategy_name}_predictions.csv"
        results_df.to_csv(predictions_path, index=False)

        metrics_path = f"{CONFIG['output_dir']}/metrics/{strategy_name}_metrics.json"
        with open(metrics_path, 'w') as f:
            json.dump(metrics, f, indent=2, default=str)

        return metrics, results_df, True

    except Exception as e:
        # Boundary handler: one failing strategy must not stop the others
        print(f"❌ Error in strategy {strategy_name}: {str(e)}")

        error_metrics = {
            'strategy_name': strategy_name,
            'strategy_config': strategy_config,
            'error': str(e),
            'error_type': type(e).__name__,
            'accuracy': None,
            'f1_macro': None,
            'problem_class_f1': None
        }

        return error_metrics, None, False

    finally:
        # Cleanup: drop the large objects, remove the scratch directory
        # (also after a failure), then free host and GPU memory.
        model = trainer = tokenizer = None
        shutil.rmtree(temp_dir, ignore_errors=True)
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

## Execute All Strategies

In [None]:
def run_all_strategies():
    """Run every strategy in ``CONFIG["training_strategies"]`` in order.

    Returns:
        ``(all_metrics, successful_runs, failed_runs)``. ``all_metrics`` is
        the list of per-strategy metric dicts (error dicts for failed runs).
        When no training data was loaded the result is ``(None, 0, 0)``, so
        callers can always unpack three values.
    """
    if train_df is None:
        print("❌ No training data available")
        return None, 0, 0

    all_metrics = []
    successful_runs = 0
    failed_runs = 0
    strategies = CONFIG["training_strategies"]

    print(f"\n🚀 Testing {len(CONFIG['training_strategies'])} class imbalance strategies...")

    for i, strategy_config in enumerate(strategies, 1):
        print(f"\n📋 Strategy {i}/{len(CONFIG['training_strategies'])}: {strategy_config['name']}")

        # results_df is already persisted by the training function
        metrics, _results_df, success = train_roberta_with_strategy(
            train_df, val_df, test_df, label_encoder, strategy_config
        )

        all_metrics.append(metrics)

        if success:
            successful_runs += 1
            print(f"✅ Strategy '{strategy_config['name']}' completed successfully")
        else:
            failed_runs += 1
            print(f"❌ Strategy '{strategy_config['name']}' failed")

    return all_metrics, successful_runs, failed_runs

# Kick off every configured strategy and report how many succeeded
all_metrics, successful_runs, failed_runs = run_all_strategies()

if not all_metrics:
    print("❌ No strategies executed")
else:
    print(f"\n📊 Strategy execution complete:")
    print(f"   Successful: {successful_runs}")
    print(f"   Failed: {failed_runs}")

## Create Comparison Visualization

In [None]:
def create_strategy_comparison_plot(all_metrics):
    """Draw a 2x2 bar-chart comparison of all successful strategies.

    Panels: overall F1-macro, problem-class F1, problem-class accuracy
    (annotated with correct/total counts) and training time. The figure is
    saved under the plots folder and shown. Returns the saved path, or None
    when no strategy has a non-None ``accuracy``.
    """
    completed = [entry for entry in all_metrics if entry.get('accuracy') is not None]

    if not completed:
        print("❌ No successful strategies to compare")
        return None

    df_metrics = pd.DataFrame(completed)
    strategies = df_metrics['strategy_name'].tolist()

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    (ax1, ax2), (ax3, ax4) = axes

    # One row per panel:
    # (axes, column, colour, title, y-label, annotation offset, annotation texts)
    panels = [
        (
            ax1, 'f1_macro', 'skyblue',
            'Overall F1-Macro Score by Strategy', 'F1-Macro Score', 0.01,
            [f'{value:.3f}' for value in df_metrics['f1_macro']],
        ),
        (
            ax2, 'problem_class_f1', 'lightcoral',
            f'{CONFIG["problem_class"]} F1-Score by Strategy', 'F1-Score', 0.01,
            [f'{value:.3f}' for value in df_metrics['problem_class_f1']],
        ),
        (
            ax3, 'problem_class_accuracy', 'lightgreen',
            f'{CONFIG["problem_class"]} Accuracy by Strategy', 'Accuracy', 0.01,
            [
                f'{correct}/{total}'
                for correct, total in zip(
                    df_metrics['problem_class_correct'].tolist(),
                    df_metrics['problem_class_total'].tolist(),
                )
            ],
        ),
        (
            ax4, 'train_time', 'orange',
            'Training Time by Strategy', 'Training Time (seconds)', 1,
            [f'{value:.0f}s' for value in df_metrics['train_time']],
        ),
    ]

    for ax, column, colour, title, y_label, offset, annotations in panels:
        bars = ax.bar(strategies, df_metrics[column], color=colour, alpha=0.7)
        ax.set_title(title)
        ax.set_ylabel(y_label)
        ax.tick_params(axis='x', rotation=45)

        # Value label centred just above each bar
        for bar, text in zip(bars, annotations):
            ax.text(
                bar.get_x() + bar.get_width()/2,
                bar.get_height() + offset,
                text,
                ha='center', va='bottom', fontsize=9,
            )

    plt.tight_layout()

    plot_path = f"{CONFIG['output_dir']}/plots/strategy_comparison.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    return plot_path

# Create comparison plot
# Skipped entirely when the strategy run produced no metrics. The plotting
# function returns None when no strategy succeeded, so the confirmation line
# is printed only when a file was actually written.
if all_metrics:
    plot_path = create_strategy_comparison_plot(all_metrics)
    if plot_path:
        print(f"📊 Strategy comparison plot saved: {plot_path}")

## Generate Analysis Report

In [None]:
def create_analysis_report(all_metrics):
    """Write a Markdown analysis of all strategy runs and return its path.

    Args:
        all_metrics: List of per-strategy metric dicts. Entries whose
            ``accuracy`` is None are counted as failed strategies.

    Returns:
        Path of the written report
        (``<output_dir>/reports/class_imbalance_analysis.md``).
    """
    successful_strategies = [m for m in all_metrics if m.get('accuracy') is not None]
    failed_strategies = [m for m in all_metrics if m.get('accuracy') is None]

    # Rank once, best problem-class F1 first. The sort is stable, so among
    # ties the strategy that ran first stays first.
    ranked = sorted(successful_strategies, key=lambda m: m['problem_class_f1'], reverse=True)
    best_strategy = ranked[0] if ranked else None

    report = []
    report.append("# 🎯 Unreliable Source Classification Improvement Report")
    report.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("")

    # Executive Summary
    report.append("## 🎯 Executive Summary")
    report.append(f"- **Problem**: '{CONFIG['problem_class']}' class had 0% F1-score")
    report.append(f"- **Strategies Tested**: {len(CONFIG['training_strategies'])}")
    report.append(f"- **Successful Strategies**: {len(successful_strategies)}")
    report.append(f"- **Failed Strategies**: {len(failed_strategies)}")

    if best_strategy is not None:
        report.append(f"- **Best Strategy**: {best_strategy['strategy_name']}")
        report.append(f"- **Best {CONFIG['problem_class']} F1-Score**: {best_strategy['problem_class_f1']:.4f}")
        report.append(f"- **Improvement**: From 0.00 to {best_strategy['problem_class_f1']:.2f}")

    report.append("")

    # Strategy Results
    report.append("## 📊 Class Imbalance Strategy Results")

    for i, strategy in enumerate(ranked, 1):
        report.append(f"### {i}. {strategy['strategy_name']}")
        report.append(f"**Description**: {strategy['strategy_config']['description']}")
        report.append(f"- **Overall F1-Macro**: {strategy['f1_macro']:.4f}")
        report.append(f"- **🎯 {CONFIG['problem_class']} F1**: {strategy['problem_class_f1']:.4f}")
        report.append(f"- **🎯 {CONFIG['problem_class']} Accuracy**: {strategy['problem_class_accuracy']:.4f}")
        report.append(f"- **🎯 Correct Predictions**: {strategy['problem_class_correct']}/{strategy['problem_class_total']}")

        # Configuration details
        config = strategy['strategy_config']
        report.append(f"- **Key Parameters**:")
        report.append(f"  - Class Weight Multiplier: {config['class_weight_multiplier']}")
        report.append(f"  - Focal Loss: {config.get('use_focal_loss', False)}")
        report.append(f"  - Oversampling: {config.get('oversample', False)}")
        if config.get('oversample', False):
            report.append(f"  - Oversample Ratio: {config.get('oversample_ratio', 'N/A')}")

        # Performance assessment
        if strategy['problem_class_f1'] >= 0.4:
            report.append("- ✅ **Significant improvement achieved**")
        elif strategy['problem_class_f1'] >= 0.2:
            report.append("- ⚠️ **Moderate improvement**")
        elif strategy['problem_class_f1'] > 0:
            report.append("- 🔄 **Limited improvement**")
        else:
            report.append("- ❌ **No improvement**")

        report.append("")

    # Technical Analysis
    if best_strategy is not None:
        report.append("## 🔬 Technical Analysis")

        # Analyze what worked
        oversampling_strategies = [s for s in ranked if s['strategy_config'].get('oversample', False)]
        focal_loss_strategies = [s for s in ranked if s['strategy_config'].get('use_focal_loss', False)]

        if oversampling_strategies:
            avg_f1_oversampling = np.mean([s['problem_class_f1'] for s in oversampling_strategies])
            report.append(f"- **Oversampling strategies**: Average F1-score {avg_f1_oversampling:.3f}")

        if focal_loss_strategies:
            avg_f1_focal = np.mean([s['problem_class_f1'] for s in focal_loss_strategies])
            report.append(f"- **Focal loss strategies**: Average F1-score {avg_f1_focal:.3f}")

        # Best performing technique
        report.append(
            f"- **Best performing technique**: {best_strategy['strategy_name']} "
            f"(F1={best_strategy['problem_class_f1']:.3f})"
        )
        report.append("")

    # Conclusions
    report.append("## 💡 Key Findings")

    if best_strategy is not None:
        if best_strategy['problem_class_f1'] >= 0.4:
            report.append("1. **Problem partially solved** - achieved meaningful improvement")
            report.append(f"2. **{best_strategy['strategy_name']} most effective** - achieved {best_strategy['problem_class_f1']:.2f} F1-score")
        elif best_strategy['problem_class_f1'] >= 0.2:
            report.append("1. **Limited success** - some improvement but still challenging")
            report.append("2. **Class imbalance techniques help but insufficient**")
        else:
            report.append("1. **Minimal improvement** - fundamental classification difficulty remains")
            report.append("2. **Consider class consolidation** - merge with similar categories")

        # Check if oversampling was best
        if best_strategy['strategy_config'].get('oversample', False):
            report.append("3. **Oversampling most effective** - data augmentation helped most")
        else:
            report.append("3. **Class weighting approaches preferred** - loss function modifications effective")
    else:
        report.append("1. **All strategies failed** - fundamental semantic similarity issue")
        report.append("2. **Recommend class consolidation** - merge unreliable source with slanted")

    report.append("")
    report.append("---")
    report.append("*Report generated by Unreliable Source Classification Improvement*")

    # Save report. The text contains emoji, so the encoding is fixed to UTF-8
    # instead of relying on the platform default.
    report_path = f"{CONFIG['output_dir']}/reports/class_imbalance_analysis.md"
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

    return report_path

# Generate analysis report
# Runs only when the strategy run produced metrics; the report function
# returns the path of the Markdown file it wrote.
if all_metrics:
    report_path = create_analysis_report(all_metrics)
    if report_path:
        print(f"📝 Analysis report saved: {report_path}")

## Save Results and Summary

In [None]:
# Save summary results
if all_metrics:
    # One CSV row per strategy (failed strategies carry their error fields)
    summary_path = f"{CONFIG['output_dir']}/strategy_comparison_summary.csv"
    df_all_metrics = pd.DataFrame(all_metrics)
    df_all_metrics.to_csv(summary_path, index=False)
    print(f"💾 Strategy summary saved: {summary_path}")

    # Print final summary
    print(f"\n{'='*80}")
    print("✅ CLASS IMBALANCE ANALYSIS COMPLETE!")
    print(f"{'='*80}")
    print(f"📊 Strategies tested: {len(CONFIG['training_strategies'])}")

    # Find best strategy (highest problem-class F1 among successful runs)
    successful_metrics = [m for m in all_metrics if m.get('accuracy') is not None]
    if successful_metrics:
        best_strategy = max(successful_metrics, key=lambda x: x['problem_class_f1'])

        print(f"\n🏆 BEST STRATEGY FOR '{CONFIG['problem_class']}':")
        print(f"   Strategy: {best_strategy['strategy_name']}")
        print(f"   F1-Score: {best_strategy['problem_class_f1']:.4f}")
        print(f"   Improvement: From 0.00 to {best_strategy['problem_class_f1']:.2f}")
        print(f"   Correct Predictions: {best_strategy['problem_class_correct']}/{best_strategy['problem_class_total']}")

        if best_strategy['problem_class_f1'] > 0.3:
            print(f"   🎯 Significant improvement achieved!")
        elif best_strategy['problem_class_f1'] > 0:
            print(f"   ⚠️ Limited improvement - consider alternative approaches")
        else:
            print(f"   ❌ No improvement - fundamental classification difficulty")

        # Check if this matches thesis claim about oversampling.
        # The strategy is named 'oversampling' in CONFIG; comparing against
        # any other name would never match.
        if best_strategy['strategy_name'] == 'oversampling':
            print(f"   ✅ Confirms thesis finding: oversampling was most effective")

    print(f"\n📁 Results saved to: {CONFIG['output_dir']}")
else:
    print("❌ No results to save")