# Metadata Enhanced Classification Experiment

## Import Required Libraries

In [None]:
import os
import gc
import json
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix
)
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EvalPrediction,
    set_seed
)
import joblib
import itertools
import random
from collections import Counter

# Setup
# Suppress every Python warning for the rest of the session; note that this
# also hides deprecation notices emitted by the imported libraries.
warnings.filterwarnings('ignore')
plt.style.use('default')
# Fixed seed (42) passed to transformers' set_seed helper.
set_seed(42)
# Disable the tokenizers library's parallelism via its environment switch.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

if torch.cuda.is_available():
    # Start from a clean GPU state: drop cached allocations and reset the
    # peak-memory counters before reporting the device name.
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    print(f"GPU Available: {torch.cuda.get_device_name()}")
else:
    print("Using CPU")

## Configuration

In [None]:

# Central experiment configuration. The functions in this file read it as a
# module-level global rather than receiving it as an argument.
CONFIG = {
    # Data paths
    # enhanced_data_path: read with pd.read_excel when it ends in '.xlsx',
    # otherwise with pd.read_csv.
    "enhanced_data_path": "enhanced_dataset/training_ready_dataset.csv",
    # splits_data_dir: must contain train_split.csv, val_split.csv and
    # test_split.csv.
    "splits_data_dir": "model_training_results/config",
    
    # Output
    "output_dir": "metadata_classification_results",
    "experiment_name": "oversampling_plus_metadata",
    
    # Model configuration
    "target_model": "roberta-large",
    # Label value that is oversampled and whose class weight is multiplied.
    "problem_class": "unreliable source",
    
    # Best strategy from previous experiments
    "training_config": {
        "epochs": 3,
        "learning_rate": 3e-5,
        # Used for both the train and the eval per-device batch size.
        "batch_size": 2,
        "max_length": 512,
        "weight_decay": 0.01,
        # Extra factor applied on top of the 'balanced' weight of problem_class.
        "class_weight_multiplier": 3.0,
        # Number of copies of each problem_class training row after oversampling.
        "oversample_ratio": 3,
        # NOTE(review): not read anywhere in the visible code.
        "use_focal_loss": False,
        "warmup_steps": 20,
    },
    
    # Metadata configuration
    "metadata_config": {
        "use_metadata": True,
        # NOTE(review): "method" is not read anywhere in the visible code.
        "method": "concatenation",
        # Column names looked up in the enhanced dataset, either as written or
        # with a 'meta_' prefix.
        "key_features": [
            "misleadingunverifiedclaimasfact",
            "misleadingmissingimportantcontext",
            "misleadingfactualerror",
            "misleadingoutdatedinformation",
            "misleadingmanipulatedmedia",
            "misleadingsatire"
        ]
    },
    
    # Baseline results for comparison
    # Keys follow "<class name with spaces replaced by underscores>_f1"; the
    # report falls back to 0.0 for classes without an entry.
    "baseline_results": {
        "unreliable_source_f1": 0.4000,
        "false_f1": 0.3500,  # Approximate baseline
        "repurposed_f1": 0.3000,  # Approximate baseline
    }
}

# Create output directories
# Only 'predictions' and 'reports' are created here; exist_ok makes re-runs safe.
for subdir in ['predictions', 'reports']:
    os.makedirs(f"{CONFIG['output_dir']}/{subdir}", exist_ok=True)

print("Metadata Enhanced Classification Experiment")
print(f"Output directory: {CONFIG['output_dir']}")
print(f"Target problem: {CONFIG['problem_class']} classification")

## Load and Prepare Data

In [None]:
def load_enhanced_dataset():
    """Read the metadata-enriched dataset and the pre-computed split files.

    Paths are taken from the module-level ``CONFIG``
    (``enhanced_data_path`` and ``splits_data_dir``).

    Returns:
        A tuple ``(enhanced_df, train_df, val_df, test_df)``. If the enhanced
        dataset or any split file is missing, an error is printed and a tuple
        of four ``None`` values is returned instead.
    """
    nothing_loaded = (None, None, None, None)

    print("Loading enhanced dataset with metadata...")

    # Choose the reader from the file extension: Excel for .xlsx, CSV otherwise.
    data_path = CONFIG["enhanced_data_path"]
    read_table = pd.read_excel if data_path.endswith('.xlsx') else pd.read_csv

    try:
        enhanced_df = read_table(data_path)
        print(f"Enhanced dataset loaded: {enhanced_df.shape}")
    except FileNotFoundError:
        print(f"Error: Enhanced dataset not found at {CONFIG['enhanced_data_path']}")
        return nothing_loaded

    # The three split files share one naming scheme; a single missing file
    # aborts the whole load.
    try:
        train_df, val_df, test_df = [
            pd.read_csv(f"{CONFIG['splits_data_dir']}/{split_name}_split.csv")
            for split_name in ("train", "val", "test")
        ]
        print(f"Splits loaded - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")
    except FileNotFoundError:
        print(f"Error: Could not find splits in {CONFIG['splits_data_dir']}")
        return nothing_loaded

    return enhanced_df, train_df, val_df, test_df

def analyze_metadata_coverage(enhanced_df, metadata_features):
    """Resolve requested metadata features to real columns and report coverage.

    Args:
        enhanced_df: DataFrame that may contain the metadata columns.
        metadata_features: Feature names to look for. Each one is matched
            against the column of the same name first, then against the
            ``meta_``-prefixed variant.

    Returns:
        The list of matching column names, in request order. An empty list is
        returned (after printing a warning) when nothing matches.
    """
    print(f"\nAnalyzing metadata coverage for {len(metadata_features)} features...")

    resolved_columns = []
    for requested in metadata_features:
        # The bare name takes precedence over the 'meta_' variant.
        column = next(
            (
                candidate
                for candidate in (requested, f'meta_{requested}')
                if candidate in enhanced_df.columns
            ),
            None,
        )

        if column is None:
            print(f"   {requested}: Not found in dataset")
            continue

        resolved_columns.append(column)
        share_present = enhanced_df[column].notna().mean()
        distinct_count = enhanced_df[column].nunique()
        print(f"   {column}: {share_present:.1%} coverage, {distinct_count} unique values")

    if len(resolved_columns) == 0:
        print("WARNING: No metadata features found! Running without metadata enhancement.")
        return []

    print(f"\nUsing {len(resolved_columns)} metadata features for enhancement")
    return resolved_columns

def merge_splits_with_metadata(train_df, val_df, test_df, enhanced_df, metadata_features):
    """Attach metadata columns from ``enhanced_df`` to each data split.

    Rows are matched on the ``text`` column with a left join, so every split
    keeps exactly its original rows; rows without a match get missing values
    in the metadata columns.

    The metadata lookup is de-duplicated on ``text`` (first occurrence wins)
    before joining. Without that step a text appearing several times in
    ``enhanced_df`` would multiply the matching split rows and silently change
    the size of the train/validation/test sets.

    Args:
        train_df, val_df, test_df: The original splits.
        enhanced_df: DataFrame holding ``text`` plus the metadata columns.
        metadata_features: Names of the metadata columns to bring over.

    Returns:
        ``(enhanced_train, enhanced_val, enhanced_test)``. A split that cannot
        be joined (no ``text`` column on either side) is returned as a copy
        with every metadata column set to 0.
    """
    print(f"\nMerging splits with metadata features...")

    metadata_features = list(metadata_features)

    metadata_lookup = None
    if 'text' in enhanced_df.columns:
        metadata_lookup = enhanced_df[['text'] + metadata_features]
        duplicate_count = int(metadata_lookup.duplicated(subset='text').sum())
        if duplicate_count:
            print(f"   Dropping {duplicate_count} duplicate text rows from enhanced dataset before merging")
            metadata_lookup = metadata_lookup.drop_duplicates(subset='text', keep='first')

    def safe_merge(split_df, split_name):
        """Join one split with the metadata lookup, or fall back to zeros."""
        if metadata_lookup is not None and 'text' in split_df.columns:
            print(f"   Merging {split_name} on text column")

            merged_df = split_df.merge(metadata_lookup, on='text', how='left')

            # Share of rows for which every metadata column is populated.
            coverage = merged_df[metadata_features].notna().all(axis=1).mean()
            print(f"   {split_name} metadata coverage: {coverage:.1%}")
        else:
            print(f"   Could not merge {split_name}, using original split with default metadata")
            merged_df = split_df.copy()
            for feature in metadata_features:
                merged_df[feature] = 0

        return merged_df

    enhanced_train = safe_merge(train_df, "Training")
    enhanced_val = safe_merge(val_df, "Validation")
    enhanced_test = safe_merge(test_df, "Test")

    return enhanced_train, enhanced_val, enhanced_test

def create_enhanced_text_with_metadata(df, metadata_features):
    """Append a textual summary of the active metadata flags to every text.

    For each row, every feature whose value equals 1 contributes an
    upper-cased token (``misleading`` becomes ``MISLEADING_`` and a ``meta_``
    prefix is dropped). The tokens are appended after a ``[METADATA]`` marker;
    rows with no active flag get ``NO_SIGNALS`` instead.

    Args:
        df: DataFrame with a ``text`` column and, optionally, the metadata
            columns. Features missing from ``df`` are treated as inactive.
        metadata_features: Column names to inspect, in output order.

    Returns:
        A list with one enhanced string per row of ``df``, in row order.
    """
    print(f"Creating enhanced text with metadata signals...")

    # The token for a feature depends only on its name, so derive it once
    # instead of once per row.
    feature_signals = [
        (feature, feature.replace('misleading', 'MISLEADING_').replace('meta_', ''))
        for feature in metadata_features
    ]

    enhanced_texts = []
    signal_stats = Counter()

    # Iterate plain per-row dicts; this avoids building a Series through
    # df.iloc[i] for every single field access.
    for record in df.to_dict(orient='records'):
        active_signals = []
        for feature, signal in feature_signals:
            value = record.get(feature, 0)
            if pd.notna(value) and value == 1:
                active_signals.append(signal.upper())
                # Statistics are keyed by the token before upper-casing.
                signal_stats[signal] += 1

        suffix = ' '.join(active_signals) if active_signals else 'NO_SIGNALS'
        enhanced_texts.append(f"{record['text']} [METADATA] {suffix}")

    print(f"   Enhanced {len(enhanced_texts)} text samples")
    print(f"   Most common signals: {dict(signal_stats.most_common(3))}")

    return enhanced_texts

def oversample_minority_class(train_df, target_class, ratio=3):
    """Replicate the rows of one class so it appears ``ratio`` times.

    The first replica is an unchanged copy; replica ``k`` (k >= 1) has
    `` [AUG_k]`` appended to its ``text`` so the copies are not byte-identical.

    Args:
        train_df: Training DataFrame with ``label`` and ``text`` columns.
        target_class: Label value to oversample.
        ratio: Total number of replicas of the target rows in the output.

    Returns:
        A new DataFrame with all non-target rows first, followed by the
        replicas, with a fresh integer index. If ``train_df`` holds no row of
        ``target_class`` it is returned unchanged.
    """
    print(f"\nApplying oversampling strategy...")
    print(f"   Target class: '{target_class}'")
    print(f"   Oversampling ratio: {ratio}x")

    is_target = train_df['label'] == target_class
    minority_rows = train_df[is_target]
    remaining_rows = train_df[~is_target]

    print(f"   Original '{target_class}' samples: {len(minority_rows)}")
    print(f"   Other samples: {len(remaining_rows)}")

    if minority_rows.empty:
        print(f"   WARNING: No samples found for '{target_class}'!")
        return train_df

    def build_replica(replica_no):
        """Return one copy of the minority rows, tagged unless it is the first."""
        replica = minority_rows.copy()
        if replica_no > 0:
            replica['text'] = replica['text'] + f" [AUG_{replica_no}]"
        return replica

    oversampled_target = pd.concat(
        [build_replica(replica_no) for replica_no in range(ratio)],
        ignore_index=True,
    )

    # Non-target rows first, then the replicas.
    balanced_df = pd.concat([remaining_rows, oversampled_target], ignore_index=True)

    print(f"   After oversampling '{target_class}' samples: {len(oversampled_target)}")
    print(f"   Total training samples: {len(balanced_df)}")
    print(f"   New class distribution:")
    print(balanced_df['label'].value_counts())

    return balanced_df

# Load and prepare data
enhanced_df, train_df, val_df, test_df = load_enhanced_dataset()

if enhanced_df is None:
    print("Data preparation failed")
else:
    print(f"Ready to enhance {len(enhanced_df)} samples")

    # Resolve the configured feature names to columns that really exist.
    metadata_features = analyze_metadata_coverage(
        enhanced_df,
        CONFIG["metadata_config"]["key_features"],
    )

    # Bring the metadata columns into each split.
    enhanced_train, enhanced_val, enhanced_test = merge_splits_with_metadata(
        train_df, val_df, test_df, enhanced_df, metadata_features
    )

    metadata_enabled = CONFIG["metadata_config"]["use_metadata"] and metadata_features
    if not metadata_enabled:
        print(f"   Using original text without metadata enhancement")
    else:
        print(f"\nEnhancing text with metadata...")
        # Keep the enriched string in 'enhanced_text' and also make it the
        # model input by overwriting 'text'.
        for split_frame in (enhanced_train, enhanced_val, enhanced_test):
            split_frame['enhanced_text'] = create_enhanced_text_with_metadata(
                split_frame, metadata_features
            )
            split_frame['text'] = split_frame['enhanced_text']

        print(f"   Text enhanced with {len(metadata_features)} metadata features")

    # Oversampling is applied to the training split only.
    enhanced_train = oversample_minority_class(
        enhanced_train,
        CONFIG["problem_class"],
        CONFIG["training_config"]["oversample_ratio"],
    )

    print(f"Data preparation complete")

## Model Training Functions

In [None]:
def calculate_class_weights(train_df, target_class, multiplier=3.0):
    """Compute per-class loss weights, boosting one class.

    Weights start from scikit-learn's ``'balanced'`` heuristic over the
    ``label`` column; the weight of ``target_class`` is then multiplied by
    ``multiplier``. Class indices follow ``LabelEncoder.classes_`` as fitted
    on the labels of ``train_df``.

    Args:
        train_df: Training DataFrame with a ``label`` column.
        target_class: Label whose weight receives the extra multiplier.
        multiplier: Factor applied to the target class weight.

    Returns:
        ``(weight_tensor, class_weights)``: a float tensor ordered by class
        index, and the same values as an ``{index: weight}`` dict.
    """
    print(f"\nCalculating class weights...")
    print(f"   Target class: '{target_class}'")
    print(f"   Weight multiplier: {multiplier}x")

    distribution = train_df['label'].value_counts()
    print(f"   Class distribution: {dict(distribution)}")

    # Encode labels to integer ids and derive the balanced weights from them.
    encoder = LabelEncoder()
    encoder.fit(train_df['label'].unique())
    encoded = encoder.transform(train_df['label'])
    balanced = compute_class_weight('balanced', classes=np.unique(encoded), y=encoded)

    # Only the target class gets the extra multiplier.
    class_weights = {
        class_id: (balanced[class_id] * multiplier if name == target_class else balanced[class_id])
        for class_id, name in enumerate(encoder.classes_)
    }

    print(f"   Final class weights:")
    for class_id, name in enumerate(encoder.classes_):
        print(f"     {name}: {class_weights[class_id]:.3f}")

    ordered_weights = [class_weights[class_id] for class_id in range(len(encoder.classes_))]
    weight_tensor = torch.tensor(ordered_weights, dtype=torch.float)

    return weight_tensor, class_weights

class CustomTrainer(Trainer):
    """Trainer that replaces the model loss with class-weighted cross-entropy.

    Args:
        class_weights: Optional 1-D tensor with one weight per class id. When
            ``None`` an unweighted cross-entropy is used.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model and return weighted cross-entropy over its logits.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict; must contain ``labels``.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted so that Trainer versions which pass
                this keyword do not fail with a ``TypeError``. It is not used:
                the loss is the (weighted) mean over the batch.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # The weights must live on the same device as the logits.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)

        loss_fn = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred: EvalPrediction):
    """Compute overall and per-class classification metrics.

    Args:
        eval_pred: Pair of ``(logits, labels)``; logits are expected to be a
            2-D array with one column per class id.

    Returns:
        A dict with ``accuracy``, ``f1_macro``, ``f1_weighted``,
        ``precision_macro``, ``recall_macro`` and, for every class id ``i``,
        ``f1_class_i``, ``precision_class_i`` and ``recall_class_i``.
    """
    logits, labels = eval_pred
    # One id per logit column, so per-class results always line up with the
    # class ids even when a class is absent from this evaluation batch.
    class_ids = np.arange(logits.shape[1])
    predictions = np.argmax(logits, axis=1)

    # Overall metrics (zero_division=0 matches the default value while
    # stating explicitly what undefined scores evaluate to).
    overall_metrics = {
        'accuracy': accuracy_score(labels, predictions),
        'f1_macro': f1_score(labels, predictions, average='macro', zero_division=0),
        'f1_weighted': f1_score(labels, predictions, average='weighted', zero_division=0),
        'precision_macro': precision_score(labels, predictions, average='macro', zero_division=0),
        'recall_macro': recall_score(labels, predictions, average='macro', zero_division=0),
    }

    # Per-class metrics, pinned to the full list of class ids.
    per_class = {
        'f1': f1_score(labels, predictions, labels=class_ids, average=None, zero_division=0),
        'precision': precision_score(labels, predictions, labels=class_ids, average=None, zero_division=0),
        'recall': recall_score(labels, predictions, labels=class_ids, average=None, zero_division=0),
    }

    for class_id in class_ids:
        for metric_name, values in per_class.items():
            overall_metrics[f'{metric_name}_class_{class_id}'] = values[class_id]

    return overall_metrics

def tokenize_data(df, tokenizer, max_length):
    """Convert a DataFrame into a tokenized dataset in torch format.

    Args:
        df: DataFrame with ``text`` and ``label_id`` columns.
        tokenizer: Callable tokenizer applied to batches of texts.
        max_length: Length every sequence is truncated and padded to.

    Returns:
        The mapped dataset, formatted to expose ``input_ids``,
        ``attention_mask`` and ``label`` as torch tensors.
    """
    print(f"🔧 Tokenizing {len(df)} samples...")

    # Keep only the two needed columns; 'label_id' is exposed as 'label'.
    model_frame = df[["text", "label_id"]].rename(columns={"label_id": "label"})
    raw_dataset = Dataset.from_pandas(model_frame)

    # Every sequence is truncated and padded to exactly max_length tokens.
    tokenizer_options = dict(
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )

    def encode_batch(batch):
        """Tokenize the 'text' field of one batch."""
        return tokenizer(batch["text"], **tokenizer_options)

    tokenized_dataset = raw_dataset.map(encode_batch, batched=True, batch_size=50)
    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    print(f"   ✅ Tokenization complete")
    return tokenized_dataset

## Execute Training and Evaluation

In [None]:
def train_and_evaluate_metadata_enhanced_model():
    """Fine-tune the configured model and evaluate it on the test split.

    Works on the module-level ``enhanced_train``, ``enhanced_val`` and
    ``enhanced_test`` frames (a ``label_id`` column is added to each of them)
    and reads hyper-parameters from ``CONFIG``.

    Per-class test metrics are computed with an explicit list of all class
    ids, so the returned arrays always have one entry per class in
    ``label_encoder.classes_`` order, including classes that never occur in
    the test labels or predictions (their scores are 0).

    Returns:
        ``None`` when the data was not loaded; otherwise a dict with
        ``accuracy``, ``f1_macro``, ``f1_weighted``, the ``per_class_f1`` /
        ``per_class_precision`` / ``per_class_recall`` arrays, the fitted
        ``label_encoder``, and the ``true_labels``, ``predicted_labels``
        (label names) and ``confidence_scores`` of the test set.
    """
    if enhanced_df is None:
        print("❌ Cannot proceed - data not loaded")
        return None

    print(f"\n{'='*60}")
    print("🚀 METADATA ENHANCED CLASSIFICATION")
    print(f"{'='*60}")

    training_config = CONFIG["training_config"]

    # Prepare label encoder (fitted on the training labels only, so labels
    # that occur solely in validation/test make transform() raise).
    label_encoder = LabelEncoder()
    unique_labels = enhanced_train['label'].unique()
    label_encoder.fit(unique_labels)

    print(f"\n🏷️ Label mapping:")
    for i, label in enumerate(label_encoder.classes_):
        print(f"   {i}: {label}")

    # Encode labels
    enhanced_train['label_id'] = label_encoder.transform(enhanced_train['label'])
    enhanced_val['label_id'] = label_encoder.transform(enhanced_val['label'])
    enhanced_test['label_id'] = label_encoder.transform(enhanced_test['label'])

    # Calculate class weights
    class_weights, class_weights_dict = calculate_class_weights(
        enhanced_train,
        CONFIG["problem_class"],
        training_config["class_weight_multiplier"]
    )

    # Prepare model and tokenizer
    print(f"\n🤖 Loading {CONFIG['target_model']}...")

    tokenizer = AutoTokenizer.from_pretrained(CONFIG["target_model"], use_fast=True)

    # Make sure a padding token exists: reuse EOS if there is one, otherwise
    # register a new '[PAD]' token.
    if tokenizer.pad_token is None:
        if tokenizer.eos_token:
            tokenizer.pad_token = tokenizer.eos_token
        else:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    num_labels = len(label_encoder.classes_)
    model = AutoModelForSequenceClassification.from_pretrained(
        CONFIG["target_model"],
        num_labels=num_labels,
        torch_dtype=torch.float32
    )

    # Keep the embedding matrix in sync with the (possibly extended) vocabulary.
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    print(f"   ✅ Model loaded on {device}")
    print(f"   📊 Parameters: {model.num_parameters():,}")

    # Tokenize datasets
    print(f"\n🔧 Preparing datasets...")
    train_dataset = tokenize_data(enhanced_train, tokenizer, training_config["max_length"])
    val_dataset = tokenize_data(enhanced_val, tokenizer, training_config["max_length"])
    test_dataset = tokenize_data(enhanced_test, tokenizer, training_config["max_length"])

    # No checkpoints are written (save_strategy="no"); the final weights are
    # the ones evaluated on the test set.
    training_args = TrainingArguments(
        output_dir=f"{CONFIG['output_dir']}/temp",
        evaluation_strategy="steps",
        eval_steps=20,
        num_train_epochs=training_config["epochs"],
        per_device_train_batch_size=training_config["batch_size"],
        per_device_eval_batch_size=training_config["batch_size"],
        learning_rate=training_config["learning_rate"],
        weight_decay=training_config["weight_decay"],
        warmup_steps=training_config["warmup_steps"],
        logging_steps=10,
        save_strategy="no",
        load_best_model_at_end=False,
        metric_for_best_model="eval_f1_macro",
        greater_is_better=True,
        report_to="none",
        dataloader_num_workers=0,
        remove_unused_columns=True,
        dataloader_pin_memory=False,
        max_grad_norm=1.0,
        save_total_limit=0,
        fp16=torch.cuda.is_available(),
        optim="adamw_torch",
    )

    # Create trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        class_weights=class_weights
    )

    print(f"\n🚀 Starting training...")
    print(f"   Strategy: {CONFIG['experiment_name']}")
    print(f"   Enhanced with metadata: {CONFIG['metadata_config']['use_metadata']}")
    print(f"   Metadata features: {len(metadata_features)}")
    print(f"   Training samples: {len(train_dataset)}")

    # Train the model
    train_result = trainer.train()

    print(f"✅ Training completed!")
    print(f"   Training time: {train_result.metrics.get('train_runtime', 0):.0f} seconds")
    print(f"   Final training loss: {train_result.metrics.get('train_loss', 0):.4f}")

    # Evaluate on test set
    print(f"\n📊 EVALUATING MODEL ON TEST SET")
    print(f"{'='*40}")

    test_results = trainer.predict(test_dataset)

    # Extract predictions and probabilities
    logits = test_results.predictions
    probabilities = torch.softmax(torch.tensor(logits), dim=1).numpy()
    predicted_labels = np.argmax(probabilities, axis=1)
    true_labels = test_results.label_ids
    confidence_scores = np.max(probabilities, axis=1)

    # Convert back to label names
    true_label_names = label_encoder.inverse_transform(true_labels)
    predicted_label_names = label_encoder.inverse_transform(predicted_labels)

    # Calculate overall metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1_macro = f1_score(true_labels, predicted_labels, average='macro')
    f1_weighted = f1_score(true_labels, predicted_labels, average='weighted')

    print(f"📈 Overall Results:")
    print(f"   Accuracy: {accuracy:.4f}")
    print(f"   F1-Macro: {f1_macro:.4f}")
    print(f"   F1-Weighted: {f1_weighted:.4f}")

    # Per-class metrics. Passing every class id keeps index i aligned with
    # label_encoder.classes_[i] even if a class is missing from both the test
    # labels and the predictions.
    class_ids = np.arange(num_labels)
    per_class_f1 = f1_score(
        true_labels, predicted_labels, labels=class_ids, average=None, zero_division=0
    )
    per_class_precision = precision_score(
        true_labels, predicted_labels, labels=class_ids, average=None, zero_division=0
    )
    per_class_recall = recall_score(
        true_labels, predicted_labels, labels=class_ids, average=None, zero_division=0
    )

    print(f"\n🎯 Per-Class Results:")
    for i, class_name in enumerate(label_encoder.classes_):
        print(f"   {class_name}:")
        print(f"      F1: {per_class_f1[i]:.4f}")
        print(f"      Precision: {per_class_precision[i]:.4f}")
        print(f"      Recall: {per_class_recall[i]:.4f}")

    return {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'per_class_f1': per_class_f1,
        'per_class_precision': per_class_precision,
        'per_class_recall': per_class_recall,
        'label_encoder': label_encoder,
        'true_labels': true_label_names,
        'predicted_labels': predicted_label_names,
        'confidence_scores': confidence_scores
    }

# Execute the training and evaluation
# Training only runs when the data-loading step produced a dataset.
if enhanced_df is None:
    results = None
    print("❌ Cannot execute training - data not available")
else:
    results = train_and_evaluate_metadata_enhanced_model()

## Save Results and Generate Report

In [None]:
def save_results_and_generate_report(results):
    """Write the text report, per-sample predictions and JSON metrics.

    Files are written below ``CONFIG['output_dir']``:
    ``reports/metadata_classification_results.txt``,
    ``predictions/detailed_predictions.csv`` and
    ``reports/classification_metrics.json``. The report also reads the
    module-level ``metadata_features`` list.

    Args:
        results: Dict as returned by the training/evaluation step (overall
            scores, per-class arrays, ``label_encoder``, true/predicted label
            names and confidence scores), or ``None``.

    Returns:
        Path of the text report, or ``None`` when ``results`` is ``None``.
    """
    if results is None:
        print("❌ No results to save")
        return

    print(f"\n💾 Saving results and generating report...")

    # Extract results
    accuracy = results['accuracy']
    f1_macro = results['f1_macro']
    f1_weighted = results['f1_weighted']
    per_class_f1 = results['per_class_f1']
    per_class_precision = results['per_class_precision']
    per_class_recall = results['per_class_recall']
    label_encoder = results['label_encoder']

    # Create detailed results text
    results_text = []
    results_text.append("# Metadata Enhanced Classification Results")
    results_text.append("")

    # Overall results
    results_text.append("## Overall Results")
    results_text.append(f"Accuracy: {accuracy:.4f}")
    results_text.append(f"F1-Macro: {f1_macro:.4f}")
    results_text.append(f"F1-Weighted: {f1_weighted:.4f}")
    results_text.append("")

    # Per-class results
    results_text.append("## Per-Class Results")
    for i, class_name in enumerate(label_encoder.classes_):
        results_text.append(f"{class_name}:")
        results_text.append(f"  F1: {per_class_f1[i]:.4f}")
        results_text.append(f"  Precision: {per_class_precision[i]:.4f}")
        # Report recall from the recall array (not the precision array).
        results_text.append(f"  Recall: {per_class_recall[i]:.4f}")
        results_text.append("")

    # Comparison with baseline
    baseline = CONFIG["baseline_results"]
    results_text.append("## Comparison with Baseline")
    results_text.append("| Category | Baseline F1 | Enhanced F1 | Change |")
    results_text.append("|----------|-------------|-------------|--------|")

    for i, class_name in enumerate(label_encoder.classes_):
        # Baseline keys look like "<class name with underscores>_f1";
        # classes without a baseline entry are compared against 0.0.
        baseline_key = f"{class_name.replace(' ', '_')}_f1"
        baseline_f1 = baseline.get(baseline_key, 0.0)
        enhanced_f1 = per_class_f1[i]
        change = enhanced_f1 - baseline_f1

        results_text.append(f"| {class_name} | {baseline_f1:.4f} | {enhanced_f1:.4f} | {change:+.4f} |")

    results_text.append("")

    # Key findings
    results_text.append("## Key Findings")

    # One line per class of interest, in this fixed order, skipping classes
    # the label encoder does not know.
    class_index = {class_name: i for i, class_name in enumerate(label_encoder.classes_)}
    key_findings = [
        ("false", "false_f1", "False content classification", "improvement"),
        ("repurposed", "repurposed_f1", "Repurposed content classification", "improvement"),
        ("unreliable source", "unreliable_source_f1", "Unreliable source classification", "change"),
    ]
    for class_name, baseline_key, title, delta_word in key_findings:
        idx = class_index.get(class_name)
        if idx is None:
            continue
        delta = per_class_f1[idx] - baseline.get(baseline_key, 0.0)
        results_text.append(f"- {title}: F1={per_class_f1[idx]:.4f} ({delta_word}: {delta:+.4f})")

    results_text.append("")
    results_text.append("## Configuration")
    results_text.append(f"Model: {CONFIG['target_model']}")
    results_text.append(f"Metadata features used: {len(metadata_features)}")
    results_text.append(f"Oversampling ratio: {CONFIG['training_config']['oversample_ratio']}x")
    results_text.append(f"Learning rate: {CONFIG['training_config']['learning_rate']}")
    results_text.append(f"Epochs: {CONFIG['training_config']['epochs']}")
    results_text.append("")

    # Save results text (explicit UTF-8 so non-ASCII class names are written
    # the same way on every platform).
    results_file_path = f"{CONFIG['output_dir']}/reports/metadata_classification_results.txt"
    with open(results_file_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(results_text))

    # Save detailed predictions
    predictions_df = pd.DataFrame({
        'true_label': results['true_labels'],
        'predicted_label': results['predicted_labels'],
        'confidence': results['confidence_scores'],
        'correct': results['true_labels'] == results['predicted_labels']
    })

    predictions_path = f"{CONFIG['output_dir']}/predictions/detailed_predictions.csv"
    predictions_df.to_csv(predictions_path, index=False)

    # Save metrics as JSON (values converted to plain floats first).
    metrics = {
        'experiment_name': CONFIG['experiment_name'],
        'accuracy': float(accuracy),
        'f1_macro': float(f1_macro),
        'f1_weighted': float(f1_weighted),
        'metadata_enhanced': CONFIG['metadata_config']['use_metadata'],
        'metadata_features_count': len(metadata_features),
        'per_class_results': {}
    }

    for i, class_name in enumerate(label_encoder.classes_):
        metrics['per_class_results'][class_name] = {
            'f1': float(per_class_f1[i]),
            'precision': float(per_class_precision[i]),
            'recall': float(per_class_recall[i])
        }

    metrics_path = f"{CONFIG['output_dir']}/reports/classification_metrics.json"
    with open(metrics_path, 'w', encoding='utf-8') as f:
        json.dump(metrics, f, indent=2)

    print(f"✅ Results saved:")
    print(f"   - Detailed results: {results_file_path}")
    print(f"   - Predictions: {predictions_path}")
    print(f"   - Metrics: {metrics_path}")

    return results_file_path

# Save results and generate report
if results is None:
    print("❌ Experiment failed - no results to report")
else:
    report_path = save_results_and_generate_report(results)

    banner = '=' * 60
    print(f"\n{banner}")
    print("✅ METADATA ENHANCED CLASSIFICATION COMPLETE!")
    print(f"{banner}")
    print(f"📊 Overall F1-Macro: {results['f1_macro']:.4f}")
    print(f"📊 Overall Accuracy: {results['accuracy']:.4f}")

    # One summary line per class, in label-encoder order.
    class_names = results['label_encoder'].classes_
    for position, class_name in enumerate(class_names):
        class_f1 = results['per_class_f1'][position]
        print(f"📊 {class_name}: F1={class_f1:.4f}")

    print(f"\n📁 All results saved to: {CONFIG['output_dir']}")
    print(f"📄 Detailed report: {report_path}")

## Cleanup

In [None]:
# Cleanup memory
# Run a garbage-collection pass first, then (if a GPU is present) ask PyTorch
# to release its cached CUDA memory.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("\n🧹 Memory cleanup complete")