# Fresh Model Validation - Final Performance Assessment

This notebook evaluates the optimal model configuration on a completely fresh validation set in order to measure true generalization performance and to check whether the reported test results were affected by overfitting.

**Optimal Configuration (from comprehensive experiments):**
- Training: Excluded "other" class
- Input: Combined (tweet + note)  
- Confidence Threshold: 0.90
- Test Performance: 90.4% committed accuracy, 91.2% coverage

## Methodology
The notebook evaluates the model across multiple confidence thresholds on the fresh validation data, creates performance visualizations for the thesis, and analyzes classification patterns.

## Dependencies and Configuration

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from transformers import BartForSequenceClassification, BartTokenizer
from datasets import Dataset
from scipy.special import softmax
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Configuration
# Excel workbook with the validation samples; it is read with pd.read_excel.
VALIDATION_FILE = "val_set.xlsx"
# Path handed to from_pretrained() for both the BART classifier and its tokenizer.
MODEL_PATH = "trained_models/final_without_other_combined"
# Confidence cut-offs swept during evaluation. A prediction is kept only when its
# top softmax probability is strictly greater than the cut-off.
CONFIDENCE_THRESHOLDS = [0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 0.99]
# Threshold highlighted in the plots and report. Later cells look up its results
# entry by exact equality, so it has to be one of CONFIDENCE_THRESHOLDS.
OPTIMAL_THRESHOLD = 0.90
# max_length passed to the tokenizer (inputs are truncated/padded to this size).
MAX_TOKEN_LENGTH = 1024
# Classes the label encoder is fitted on; any other validation label is treated
# as "unknown" and receives the id len(EXPECTED_CLASSES).
EXPECTED_CLASSES = ['economics', 'health', 'lifestyle', 'politics', 'science', 'sports']

# Device setup
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Create results directory
# All figures, spreadsheets, the log and the report are written below this folder.
os.makedirs('validation_results', exist_ok=True)

# Set plotting style for thesis
plt.style.use('default')
sns.set_palette("husl")

## Data Loading and Model Setup
Load validation dataset and trained model for fresh evaluation.

In [None]:
def log_validation(message):
    """Print a timestamped progress message and append it to the validation log.

    The message is echoed to stdout and appended to
    ``validation_results/validation_log.txt``.

    Args:
        message: Text to log. May contain non-ASCII characters such as emoji.
    """
    timestamp = datetime.now().strftime("%H:%M:%S")
    line = f"[{timestamp}] {message}"
    print(line)

    # Create the directory on demand so logging does not depend on another
    # cell having been executed first.
    os.makedirs('validation_results', exist_ok=True)

    # Explicit UTF-8: callers log emoji markers, which the platform default
    # encoding (e.g. cp1252 on Windows) cannot represent.
    with open('validation_results/validation_log.txt', 'a', encoding='utf-8') as f:
        f.write(line + "\n")

def load_validation_data(validation_file):
    """Load the validation workbook and return a cleaned DataFrame.

    Cleaning steps:
      * rows without a label are removed *before* the label column is cast to
        ``str`` (casting first would turn missing values into the literal
        string ``"nan"`` and keep them as a bogus class);
      * labels are lower-cased and stripped;
      * tweet/note texts have control characters removed and are stripped;
      * rows with an empty label, tweet or note are dropped.

    Args:
        validation_file: Path to an Excel file with the columns ``label``,
            ``tweet_text`` and ``note_text``.

    Returns:
        The cleaned DataFrame with a fresh 0..n-1 index.

    Raises:
        KeyError: If one of the required columns is missing.
    """

    log_validation(f"Loading validation data from {validation_file}")

    # Load data
    df_val = pd.read_excel(validation_file)

    required_columns = ['label', 'tweet_text', 'note_text']
    missing_columns = [col for col in required_columns if col not in df_val.columns]
    if missing_columns:
        raise KeyError(f"Validation file is missing required columns: {missing_columns}")

    def clean_text(text):
        """Return *text* without control characters and surrounding whitespace."""
        if pd.isna(text):
            return ""
        text = ''.join(char for char in str(text) if ord(char) >= 32 or char in '\t\n\r')
        # Strip last so whitespace exposed by the removal above is trimmed too.
        return text.strip()

    # Drop unlabeled rows first: astype(str) would otherwise hide NaN as "nan".
    df_val = df_val.dropna(subset=['label']).copy()

    # Preprocess
    df_val['label'] = df_val['label'].astype(str).str.lower().str.strip()
    df_val['tweet_text'] = df_val['tweet_text'].apply(clean_text)
    df_val['note_text'] = df_val['note_text'].apply(clean_text)

    # Remove invalid entries (clean_text maps missing text to "", so an
    # emptiness check covers missing values as well)
    df_val = df_val[df_val['label'] != '']
    df_val = df_val[(df_val['tweet_text'] != "") & (df_val['note_text'] != "")]
    df_val = df_val.reset_index(drop=True)

    log_validation(f"Validation set: {len(df_val)} samples")

    # Analyze label distribution
    label_counts = df_val['label'].value_counts()
    log_validation("Label distribution:")
    for label, count in label_counts.items():
        log_validation(f"  {label}: {count} ({count/len(df_val)*100:.1f}%)")

    return df_val

def setup_label_encoding(df_val):
    """Attach numeric label ids to the validation frame.

    Two encoders are fitted: one on EXPECTED_CLASSES only (the classes the
    label ids of the trained model are based on) and an extended one that
    additionally knows every other label found in ``df_val``.

    Two columns are added to ``df_val`` in place:
      * ``label_id``: id from the training encoder, or
        ``len(EXPECTED_CLASSES)`` for any label outside EXPECTED_CLASSES;
      * ``extended_label_id``: id from the extended encoder.

    Returns:
        Tuple ``(df_val, label_encoder, extended_label_encoder,
        num_training_classes)``.
    """
    n_known = len(EXPECTED_CLASSES)

    # Encoder restricted to the training classes
    label_encoder = LabelEncoder()
    label_encoder.fit(EXPECTED_CLASSES)

    # Encoder that also covers labels that only occur in the validation data
    extra_labels = [lab for lab in df_val['label'].unique() if lab not in EXPECTED_CLASSES]
    extended_label_encoder = LabelEncoder()
    extended_label_encoder.fit(EXPECTED_CLASSES + extra_labels)

    def to_training_id(lab):
        # Everything outside the training classes shares one "unknown" id
        if lab not in EXPECTED_CLASSES:
            return n_known
        return label_encoder.transform([lab])[0]

    df_val['label_id'] = df_val['label'].apply(to_training_id)
    df_val['extended_label_id'] = extended_label_encoder.transform(df_val['label'])

    # Report how much of the set belongs to classes seen during training
    total = len(df_val)
    known_count = df_val['label'].isin(EXPECTED_CLASSES).sum()
    unknown_count = total - known_count

    log_validation(f"Known classes: {known_count} ({known_count/total*100:.1f}%)")
    log_validation(f"Unknown/Other: {unknown_count} ({unknown_count/total*100:.1f}%)")

    return df_val, label_encoder, extended_label_encoder, n_known

def load_model_and_tokenizer(model_path):
    """Restore the BART sequence classifier and its tokenizer from ``model_path``.

    The model is moved to DEVICE and switched to evaluation mode before it is
    returned.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    log_validation(f"Loading model from {model_path}")

    # Load the weights, place them on the target device, disable train-mode layers
    model = BartForSequenceClassification.from_pretrained(model_path).to(DEVICE)
    model.eval()

    tokenizer = BartTokenizer.from_pretrained(model_path)

    n_outputs = model.config.num_labels
    log_validation(f"Model loaded: {n_outputs} output classes")

    return model, tokenizer

## Model Evaluation Framework
Evaluate model performance across confidence thresholds to find optimal balance and assess generalization.

In [None]:
def _apply_confidence_threshold(raw_predictions, max_probabilities, threshold, unknown_label_id):
    """Keep predictions whose confidence is strictly above *threshold*; abstain otherwise."""
    return np.where(max_probabilities > threshold, raw_predictions, unknown_label_id)


def _compute_threshold_metrics(final_predictions, true_labels, threshold, num_training_classes):
    """Compute the accuracy / coverage / unknown-detection record for one threshold."""
    unknown_label_id = num_training_classes

    known_mask = true_labels < num_training_classes
    unknown_mask = true_labels >= num_training_classes
    known_count = known_mask.sum()
    unknown_count = unknown_mask.sum()

    # Samples with a known ground truth on which the model did not abstain
    committed_mask = (final_predictions != unknown_label_id) & known_mask
    committed_count = committed_mask.sum()

    # Overall accuracy and coverage are both measured on known ground truth only
    if known_count > 0:
        overall_accuracy = (final_predictions[known_mask] == true_labels[known_mask]).mean()
        coverage = (committed_count / known_count) * 100
    else:
        overall_accuracy = 0.0
        coverage = 0.0

    # Committed accuracy: abstentions are excluded from the denominator
    if committed_count > 0:
        committed_accuracy = (
            final_predictions[committed_mask] == true_labels[committed_mask]
        ).mean()
    else:
        committed_accuracy = 0.0

    # Unknown detection: share of out-of-training-set samples the model abstained on
    if unknown_count > 0:
        unknown_detected = ((final_predictions == unknown_label_id) & unknown_mask).sum()
        unknown_detection_rate = (unknown_detected / unknown_count) * 100
    else:
        unknown_detection_rate = 0.0

    return {
        'confidence_threshold': threshold,
        'overall_accuracy': overall_accuracy,
        'committed_accuracy': committed_accuracy,
        'coverage': coverage,
        'unknown_detection_rate': unknown_detection_rate,
        'total_samples': len(final_predictions),
        'known_samples': known_count,
        'unknown_samples': unknown_count
    }


def evaluate_model_across_thresholds(model, tokenizer, df_val, num_training_classes):
    """Run inference once and score the model at every confidence threshold.

    Tweet and note are tokenized as a text pair ("combined" input), the model
    is run in batches without gradients, and the resulting softmax
    confidences are compared against each value in CONFIDENCE_THRESHOLDS.
    Predictions whose confidence does not exceed the threshold are replaced
    by the id ``num_training_classes`` (abstain / unknown).

    Args:
        model: Sequence classification model, already on DEVICE.
        tokenizer: Tokenizer matching ``model``.
        df_val: DataFrame with ``tweet_text``, ``note_text`` and ``label_id``.
        num_training_classes: Number of classes the model was trained on; also
            used as the id for "unknown".

    Returns:
        Tuple ``(results, prediction_data)`` where ``results`` is a list with
        one metrics dict per threshold and ``prediction_data`` holds the
        arrays ``true_labels``, ``predicted_labels`` (thresholded at
        OPTIMAL_THRESHOLD), ``raw_predictions`` and ``max_probabilities``.

    Raises:
        ValueError: If ``df_val`` contains no rows.
    """
    if len(df_val) == 0:
        # Fail early with a clear message instead of an opaque np.vstack error
        raise ValueError("Validation set is empty; nothing to evaluate")

    log_validation("Creating tokenized validation dataset...")

    # Create dataset
    val_dataset = Dataset.from_pandas(
        df_val[['tweet_text', 'note_text', 'label_id']].rename(columns={'label_id': 'labels'})
    )

    # Tokenize (combined mode)
    def tokenize_function(batch):
        return tokenizer(
            batch['tweet_text'],
            batch['note_text'],
            truncation="longest_first",
            padding="max_length",
            max_length=MAX_TOKEN_LENGTH
        )

    val_dataset = val_dataset.map(tokenize_function, batched=True, batch_size=16)
    val_dataset = val_dataset.remove_columns(['tweet_text', 'note_text'])
    val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # Get model predictions
    log_validation("Running model inference...")

    from torch.utils.data import DataLoader
    # shuffle=False keeps predictions aligned with the row order of df_val
    val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    all_logits = []
    all_true_labels = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            all_logits.append(outputs.logits.cpu().numpy())
            all_true_labels.append(batch['labels'].numpy())

    # Process predictions
    logits = np.vstack(all_logits)
    true_labels = np.concatenate(all_true_labels)
    probabilities = softmax(logits, axis=1)
    raw_predictions = np.argmax(logits, axis=1)
    max_probabilities = np.max(probabilities, axis=1)

    # Evaluate each threshold
    log_validation("Evaluating confidence thresholds...")

    unknown_label_id = num_training_classes
    results = []

    for threshold in CONFIDENCE_THRESHOLDS:
        final_predictions = _apply_confidence_threshold(
            raw_predictions, max_probabilities, threshold, unknown_label_id
        )
        metrics = _compute_threshold_metrics(
            final_predictions, true_labels, threshold, num_training_classes
        )
        results.append(metrics)

        log_validation(
            f"Threshold {threshold}: Acc={metrics['committed_accuracy']:.3f}, "
            f"Cov={metrics['coverage']:.1f}%"
        )

    # Predictions at the optimal threshold are kept for the confusion matrices.
    # They are computed directly, so this also works when OPTIMAL_THRESHOLD is
    # not part of the sweep above.
    optimal_predictions = _apply_confidence_threshold(
        raw_predictions, max_probabilities, OPTIMAL_THRESHOLD, unknown_label_id
    )

    return results, {
        'true_labels': true_labels,
        'predicted_labels': optimal_predictions,
        'raw_predictions': raw_predictions,
        'max_probabilities': max_probabilities
    }

## Thesis Visualizations
Create visualizations for the thesis

In [None]:
def create_thesis_threshold_plot(results):
    """Draw committed accuracy and coverage against the confidence threshold.

    Produces a dual-axis line chart (accuracy on the left axis, coverage on
    the right), marks OPTIMAL_THRESHOLD with a dashed vertical line, saves the
    figure to ``validation_results/val_threshold_dual_axis.png`` and shows it.

    Args:
        results: List of per-threshold metric dicts.
    """

    log_validation("Creating thesis threshold analysis plot...")

    metrics = pd.DataFrame(results)
    thresholds = metrics['confidence_threshold']
    accuracy_color = 'tab:blue'
    coverage_color = 'tab:orange'

    fig, acc_axis = plt.subplots(figsize=(12, 8))

    # Left axis: committed accuracy
    acc_axis.set_xlabel('Confidence Threshold', fontsize=14)
    acc_axis.set_ylabel('Committed Accuracy', color=accuracy_color, fontsize=14)
    acc_axis.plot(thresholds, metrics['committed_accuracy'],
                  'o-', color=accuracy_color, linewidth=3, markersize=8,
                  label='Committed Accuracy')
    acc_axis.tick_params(axis='y', labelcolor=accuracy_color)
    acc_axis.grid(True, alpha=0.3)

    # Right axis: coverage, sharing the x axis with the accuracy curve
    cov_axis = acc_axis.twinx()
    cov_axis.set_ylabel('Coverage (%)', color=coverage_color, fontsize=14)
    cov_axis.plot(thresholds, metrics['coverage'],
                  's-', color=coverage_color, linewidth=3, markersize=8,
                  label='Coverage')
    cov_axis.tick_params(axis='y', labelcolor=coverage_color)

    # Dashed marker at the chosen operating point
    acc_axis.axvline(x=OPTIMAL_THRESHOLD, color='red', linestyle='--', linewidth=2, alpha=0.8)

    # One legend that lists the curves of both axes
    acc_handles, acc_labels = acc_axis.get_legend_handles_labels()
    cov_handles, cov_labels = cov_axis.get_legend_handles_labels()
    acc_axis.legend(acc_handles + cov_handles, acc_labels + cov_labels,
                    loc='center left', fontsize=12)

    plt.title('Validation Set: Committed Accuracy and Coverage vs Confidence Threshold',
              fontsize=16, pad=20)
    plt.tight_layout()

    # File name is referenced from the thesis and the generated report
    plt.savefig('validation_results/val_threshold_dual_axis.png', dpi=300, bbox_inches='tight')
    plt.show()

    log_validation("✅ Threshold analysis plot saved: validation_results/val_threshold_dual_axis.png")

def create_thesis_confusion_matrices(df_val, prediction_data, label_encoder, extended_label_encoder, num_training_classes):
    """Create the traditional and the enhanced confusion matrix figures.

    1. Traditional matrix: restricted to samples whose true label is a
       training class and on which the model committed to a prediction.
       A classification report for the same samples is saved as CSV.
    2. Enhanced matrix: rows are all true labels present in the validation
       set ("other" last); columns are the training classes followed by a
       "Low Confidence" column for abstentions.

    Args:
        df_val: Validation frame containing ``extended_label_id``; row order
            must match the arrays in ``prediction_data``.
        prediction_data: Dict with ``true_labels`` and ``predicted_labels``
            (ids; ``num_training_classes`` means the model abstained).
        label_encoder: Encoder fitted on the training classes.
        extended_label_encoder: Encoder fitted on all validation labels.
        num_training_classes: Number of classes the model can predict.

    Returns:
        The classification report as a dict, or ``None`` when there is no
        confident prediction on a known class.
    """

    log_validation("Creating confusion matrices...")

    true_labels = prediction_data['true_labels']
    predicted_labels = prediction_data['predicted_labels']
    extended_true_labels = df_val['extended_label_id'].values

    # ===== 1. TRADITIONAL CONFUSION MATRIX (known classes only) =====
    log_validation("Creating traditional confusion matrix...")

    known_mask = true_labels < num_training_classes
    confident_mask = predicted_labels < num_training_classes
    analysis_mask = known_mask & confident_mask

    report = None
    if analysis_mask.sum() > 0:
        filtered_true = true_labels[analysis_mask]
        filtered_pred = predicted_labels[analysis_mask]
        class_ids = list(range(num_training_classes))

        # Create traditional confusion matrix
        cm = confusion_matrix(filtered_true, filtered_pred, labels=class_ids)

        fig, ax = plt.subplots(figsize=(10, 8))
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
        disp.plot(ax=ax, cmap='Blues', values_format='d', colorbar=True)

        ax.set_title('Traditional Confusion Matrix\n(Known Classes, Confident Predictions Only)',
                    fontsize=14, pad=20)
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)
        plt.tight_layout()

        plt.savefig('validation_results/traditional_confusion_matrix.png', dpi=300, bbox_inches='tight')
        plt.show()

        log_validation("✅ Traditional confusion matrix saved")

        # Generate and save classification report. `labels` is passed
        # explicitly so that target_names always lines up with the class ids,
        # even when a class does not occur among the filtered samples
        # (without it scikit-learn raises a ValueError in that case).
        report = classification_report(
            filtered_true, filtered_pred,
            labels=class_ids,
            target_names=label_encoder.classes_,
            output_dict=True,
            zero_division=0
        )

        # Save classification report as CSV
        report_df = pd.DataFrame(report).transpose()
        report_df.to_csv('validation_results/classification_report.csv')

        log_validation("✅ Classification report saved")

        # Print F1 scores for reference
        log_validation("Classification performance:")
        for class_name in label_encoder.classes_:
            if class_name in report:
                f1 = report[class_name]['f1-score']
                log_validation(f"  {class_name}: F1 = {f1:.2f}")
    else:
        log_validation("⚠️ No confident predictions on known classes")

    # ===== 2. ENHANCED CONFUSION MATRIX (All classes + Low Confidence) =====
    log_validation("Creating enhanced confusion matrix...")

    # Rows: every true label that actually appears, with "other" moved last
    present_ids = sorted(df_val['extended_label_id'].unique())
    present_names = [str(extended_label_encoder.classes_[i]) for i in present_ids]
    true_class_names = (
        [name for name in present_names if name.lower() != 'other']
        + [name for name in present_names if name.lower() == 'other']
    )

    # Columns: everything the model can output, plus the abstention column.
    # Deriving the columns from the training classes (not from the true labels
    # present) ensures a confident prediction is never counted as
    # "Low Confidence" merely because its class has no true sample here.
    pred_class_names = [str(label_encoder.classes_[i]) for i in range(num_training_classes)]
    pred_class_names.append('Low Confidence')
    low_confidence_idx = num_training_classes

    # Column index equals the predicted class id; abstentions go to the last column
    mapped_predicted_labels = np.where(
        predicted_labels < num_training_classes, predicted_labels, low_confidence_idx
    ).astype(int)

    # Row index of every sample in the reordered true-label list
    row_of_name = {name: row for row, name in enumerate(true_class_names)}
    mapped_true_labels = np.array(
        [row_of_name[str(extended_label_encoder.classes_[true_ext])]
         for true_ext in extended_true_labels],
        dtype=int
    )

    # Create enhanced confusion matrix (np.add.at accumulates repeated index pairs)
    cm_enhanced = np.zeros((len(true_class_names), len(pred_class_names)), dtype=int)
    np.add.at(cm_enhanced, (mapped_true_labels, mapped_predicted_labels), 1)

    # Plot enhanced confusion matrix
    fig, ax = plt.subplots(figsize=(12, 10))
    im = ax.imshow(cm_enhanced, interpolation='nearest', cmap='Blues')
    ax.set_title('Enhanced Confusion Matrix\n(All Classes Including Low Confidence)', fontsize=14, pad=20)

    # Add colorbar
    cbar = fig.colorbar(im, ax=ax)
    cbar.ax.tick_params(labelsize=11)

    # Set ticks and labels
    ax.set_xticks(np.arange(len(pred_class_names)))
    ax.set_yticks(np.arange(len(true_class_names)))
    ax.set_xticklabels(pred_class_names, rotation=45, ha='right')
    ax.set_yticklabels(true_class_names)

    # Add text annotations (white text on dark cells, black on light cells)
    thresh = cm_enhanced.max() / 2. if cm_enhanced.max() > 0 else 0.5
    for i in range(cm_enhanced.shape[0]):
        for j in range(cm_enhanced.shape[1]):
            ax.text(j, i, format(cm_enhanced[i, j], 'd'),
                   ha="center", va="center", fontsize=10,
                   color="white" if cm_enhanced[i, j] > thresh else "black")

    ax.set_ylabel('True Label', fontsize=12)
    ax.set_xlabel('Predicted Label', fontsize=12)
    plt.tight_layout()

    # Save enhanced confusion matrix as val_confusion_matrix.png (to match expected files)
    plt.savefig('validation_results/val_confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    log_validation("✅ Enhanced confusion matrix saved as val_confusion_matrix.png")

    return report


## Performance Summary and Results
Generate comprehensive results summary.

In [None]:
def create_additional_visualizations(results):
    """Render the per-threshold metrics as a table figure.

    The table is saved to ``validation_results/threshold_table.png``. The row
    belonging to OPTIMAL_THRESHOLD is highlighted when it is present.

    The "Abstention Rate" column is the share of known-class samples on which
    the model abstained, i.e. the complement of coverage (both are measured
    on known-class samples).

    Args:
        results: List of per-threshold metric dicts.
    """

    log_validation("Creating additional visualizations...")

    results_df = pd.DataFrame(results)
    if results_df.empty:
        log_validation("⚠️ No threshold results available; table not created")
        return

    # Threshold table visualization
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.axis('tight')
    ax.axis('off')

    # Prepare table data
    table_data = []
    for _, row in results_df.iterrows():
        # Abstention on known samples is whatever coverage leaves over.
        # Without known samples coverage is reported as 0, so report 0 here too.
        abstention_rate = 100.0 - row['coverage'] if row['known_samples'] > 0 else 0.0
        table_data.append([
            f"{row['confidence_threshold']:.2f}",
            f"{row['committed_accuracy']:.3f}",
            f"{row['coverage']:.1f}%",
            f"{row['unknown_detection_rate']:.1f}%",
            f"{abstention_rate:.1f}%"
        ])

    # Create table
    table = ax.table(cellText=table_data,
                    colLabels=['Threshold', 'Committed Acc', 'Coverage', 'Unknown Detection', 'Abstention Rate'],
                    cellLoc='center',
                    loc='center',
                    colColours=['lightgray']*5)

    # Style the table
    table.auto_set_font_size(False)
    table.set_fontsize(11)
    table.scale(1.2, 2)

    # Highlight optimal threshold row (table row 0 is the header, hence +1)
    optimal_positions = np.flatnonzero(
        (results_df['confidence_threshold'] == OPTIMAL_THRESHOLD).to_numpy()
    )
    if len(optimal_positions) > 0:
        optimal_row = int(optimal_positions[0]) + 1
        for col in range(5):
            table[(optimal_row, col)].set_facecolor('#ffcccc')
            table[(optimal_row, col)].set_text_props(weight='bold')
    else:
        log_validation(f"⚠️ Threshold {OPTIMAL_THRESHOLD} not in results; no row highlighted")

    plt.title('Performance Metrics Across Confidence Thresholds', fontsize=16, pad=20)
    plt.savefig('validation_results/threshold_table.png', dpi=300, bbox_inches='tight')
    plt.show()

    log_validation("✅ Threshold table visualization saved")

def extract_interesting_examples(df_val, prediction_data, label_encoder, num_training_classes):
    """Export prediction examples for manual analysis.

    Prediction columns (``predicted_label``, ``raw_predicted_label``,
    ``confidence``, ``correct_prediction``) are added to a copy of ``df_val``
    and the following workbooks are written to ``validation_results/``:

      * ``all_predictions_with_examples.xlsx``: every sample with its
        prediction columns (this file is listed in the generated report);
      * ``low_confidence_examples.xlsx``: the 10 least confident samples;
      * ``high_conf_misclassifications.xlsx``: wrong predictions above
        OPTIMAL_THRESHOLD (only written when there are any);
      * ``lifestyle_misclassified_as_high_stakes.xlsx``: politics / science /
        health samples predicted as lifestyle (only when there are any);
      * ``economics_predictions.xlsx``: all samples predicted as economics
        (only when there are any).

    Args:
        df_val: Validation frame; row order must match ``prediction_data``.
        prediction_data: Dict with ``max_probabilities``, ``predicted_labels``
            and ``raw_predictions``.
        label_encoder: Encoder fitted on the training classes.
        num_training_classes: Id that marks an abstention / unknown prediction.

    Returns:
        Dict with the number of rows in each example category
        (``low_confidence``, ``high_conf_errors``, ``lifestyle_errors``,
        ``economics_predictions``).
    """

    log_validation("Extracting interesting prediction examples...")

    # Prepare dataframe with predictions
    probs = prediction_data['max_probabilities']
    preds = prediction_data['predicted_labels']
    raw_preds = prediction_data['raw_predictions']
    unknown_id = num_training_classes
    example_columns = ['tweet_text', 'note_text', 'label', 'predicted_label', 'confidence']

    def export_examples(frame, filename):
        """Write the standard example columns of *frame* to the results folder."""
        frame[example_columns].to_excel(f"validation_results/{filename}", index=False)

    # Add prediction information to dataframe
    df_examples = df_val.copy()
    df_examples['predicted_label'] = [
        label_encoder.classes_[p] if p < unknown_id else 'low confidence'
        for p in preds
    ]
    df_examples['raw_predicted_label'] = [
        label_encoder.classes_[rp] if rp < unknown_id else 'unknown'
        for rp in raw_preds
    ]
    df_examples['confidence'] = probs
    df_examples['correct_prediction'] = (df_examples['predicted_label'] == df_examples['label'])

    # 0. Complete prediction dataset. The generated report lists this file,
    #    and this is the only place where the full frame is available.
    df_examples.to_excel("validation_results/all_predictions_with_examples.xlsx", index=False)

    # 1. Low confidence examples (for understanding model uncertainty)
    low_conf_df = df_examples.sort_values(by='confidence').head(10)
    export_examples(low_conf_df, "low_confidence_examples.xlsx")

    # 2. High confidence misclassifications (concerning errors)
    high_conf_misclassified = df_examples[
        (df_examples['predicted_label'] != df_examples['label']) &
        (df_examples['confidence'] > OPTIMAL_THRESHOLD) &
        (df_examples['predicted_label'] != 'low confidence')
    ].sort_values(by='confidence', ascending=False)

    if len(high_conf_misclassified) > 0:
        export_examples(high_conf_misclassified, "high_conf_misclassifications.xlsx")

    # 3. Lifestyle misclassifications (high-stakes content wrongly classified as lifestyle)
    lifestyle_misclass = df_examples[
        (df_examples['predicted_label'] == 'lifestyle') &
        (df_examples['label'].isin(['politics', 'science', 'health']))
    ].sort_values(by='confidence', ascending=False)

    if len(lifestyle_misclass) > 0:
        export_examples(lifestyle_misclass, "lifestyle_misclassified_as_high_stakes.xlsx")

    # 4. Economics predictions (for manual verification due to small sample)
    economics_predictions = df_examples[
        df_examples['predicted_label'] == 'economics'
    ].sort_values(by='confidence', ascending=False)

    if len(economics_predictions) > 0:
        export_examples(economics_predictions, "economics_predictions.xlsx")

    log_validation("✅ Example extraction completed:")
    log_validation(f"  - All predictions exported: {len(df_examples)}")
    log_validation(f"  - Low confidence examples: {len(low_conf_df)}")
    log_validation(f"  - High confidence misclassifications: {len(high_conf_misclassified)}")
    log_validation(f"  - Lifestyle misclassifications: {len(lifestyle_misclass)}")
    log_validation(f"  - Economics predictions: {len(economics_predictions)}")

    return {
        'low_confidence': len(low_conf_df),
        'high_conf_errors': len(high_conf_misclassified),
        'lifestyle_errors': len(lifestyle_misclass),
        'economics_predictions': len(economics_predictions)
    }

def generate_comprehensive_validation_report(results, df_val, prediction_data, example_stats):
    """Build the Markdown validation report and write it to disk.

    The report summarizes the dataset, the metrics at OPTIMAL_THRESHOLD, a
    comparison with the previously reported test-set figures, the metrics at
    every threshold and the extracted example counts. It is written to
    ``validation_results/validation_report.md`` (UTF-8).

    Args:
        results: List of per-threshold metric dicts; must contain an entry
            for OPTIMAL_THRESHOLD.
        df_val: Validation frame (used for sample count and label distribution).
        prediction_data: Accepted for interface compatibility; not used here.
        example_stats: Dict with the keys ``low_confidence``,
            ``high_conf_errors``, ``lifestyle_errors``, ``economics_predictions``.

    Returns:
        The full report text.

    Raises:
        StopIteration: If ``results`` has no entry for OPTIMAL_THRESHOLD.
    """

    log_validation("Generating comprehensive validation report...")

    # Reference figures from the earlier test-set evaluation
    test_committed_accuracy = 0.904
    test_coverage = 91.2
    test_unknown_detection = 0.0

    optimal_results = next(r for r in results if r['confidence_threshold'] == OPTIMAL_THRESHOLD)
    results_df = pd.DataFrame(results)

    val_accuracy = optimal_results['committed_accuracy']
    val_coverage = optimal_results['coverage']
    val_unknown_detection = optimal_results['unknown_detection_rate']
    accuracy_delta_pp = (val_accuracy - test_committed_accuracy) * 100

    report = []
    report.append("# Fresh Model Validation Report\n")
    report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    report.append("="*80 + "\n")

    # Dataset summary
    report.append("## Validation Dataset Summary\n")
    report.append(f"- **Total samples**: {len(df_val)}\n")
    report.append(f"- **Known classes**: {optimal_results['known_samples']}\n")
    report.append(f"- **Unknown/Other classes**: {optimal_results['unknown_samples']}\n\n")

    # Label distribution
    label_counts = df_val['label'].value_counts()
    report.append("### Label Distribution\n")
    for label, count in label_counts.items():
        percentage = count / len(df_val) * 100
        report.append(f"- **{label}**: {count} samples ({percentage:.1f}%)\n")
    report.append("\n")

    # Model performance at optimal threshold
    report.append(f"## Model Performance (Threshold: {OPTIMAL_THRESHOLD})\n")
    report.append(f"- **Overall Accuracy**: {optimal_results['overall_accuracy']:.1%}\n")
    report.append(f"- **Committed Accuracy**: {val_accuracy:.1%}\n")
    report.append(f"- **Coverage**: {val_coverage:.1f}%\n")
    report.append(f"- **Unknown Detection Rate**: {val_unknown_detection:.1f}%\n\n")

    # Comparison with test results
    report.append("## Comparison with Test Set Results\n")
    report.append("| Metric | Test Set | Validation Set | Difference |\n")
    report.append("|--------|----------|----------------|------------|\n")
    report.append(f"| Committed Accuracy | {test_committed_accuracy:.1%} | {val_accuracy:.1%} | {accuracy_delta_pp:+.1f}pp |\n")
    report.append(f"| Coverage | {test_coverage:.1f}% | {val_coverage:.1f}% | {val_coverage - test_coverage:+.1f}pp |\n")
    report.append(f"| Unknown Detection | {test_unknown_detection:.1f}% | {val_unknown_detection:.1f}% | {val_unknown_detection - test_unknown_detection:+.1f}pp |\n\n")

    # Performance across all thresholds
    report.append("## Performance Across Confidence Thresholds\n")
    report.append("| Threshold | Committed Acc | Coverage | Unknown Detection |\n")
    report.append("|-----------|---------------|----------|-------------------|\n")

    for _, row in results_df.iterrows():
        report.append(f"| {row['confidence_threshold']:.2f} | {row['committed_accuracy']:.3f} | "
                     f"{row['coverage']:.1f}% | {row['unknown_detection_rate']:.1f}% |\n")
    report.append("\n")

    # Error analysis
    report.append("## Error Analysis\n")
    report.append("### Extracted Examples for Thesis Analysis\n")
    report.append(f"- **Low confidence examples**: {example_stats['low_confidence']} cases extracted\n")
    report.append(f"- **High confidence misclassifications**: {example_stats['high_conf_errors']} cases found\n")
    report.append(f"- **Lifestyle misclassifications**: {example_stats['lifestyle_errors']} high-stakes content wrongly labeled\n")
    report.append(f"- **Economics predictions**: {example_stats['economics_predictions']} cases for manual verification\n\n")

    # Key insights
    report.append("## Key Insights for Thesis\n")

    # State the direction of the change explicitly: the gap can be a gain as
    # well as a drop, and its size is reported without judging it.
    direction = "drop" if accuracy_delta_pp < 0 else "gain"
    report.append(f"1. **Generalization Gap**: {abs(accuracy_delta_pp):.1f} percentage point {direction} in committed accuracy relative to the test set\n")
    report.append(f"2. **Confidence Calibration**: Model showed {val_coverage:.1f}% coverage vs {test_coverage}% on test set\n")
    report.append(f"3. **Unknown Detection**: {val_unknown_detection:.1f}% success rate on out-of-domain samples\n")
    report.append(f"4. **Optimal Threshold Confirmed**: {OPTIMAL_THRESHOLD:.2f} threshold validated on fresh data\n\n")

    # Recommendations
    report.append("## Recommendations for Thesis Discussion\n")
    if val_accuracy >= 0.85:
        report.append("✅ **Strong Generalization**: Model maintains high performance on unseen data\n")

    if example_stats['high_conf_errors'] > 0:
        report.append("⚠️ **Error Analysis**: Review high-confidence misclassifications for systematic biases\n")

    if example_stats['lifestyle_errors'] > 0:
        report.append("⚠️ **High-Stakes Misclassification**: Examine politics/health content labeled as lifestyle\n")

    report.append(f"📊 **Confidence Threshold**: {OPTIMAL_THRESHOLD:.2f} provides optimal accuracy-coverage trade-off\n\n")

    # File references
    report.append("## Generated Files for Analysis\n")
    report.append("### Visualizations\n")
    report.append("- `val_threshold_dual_axis.png`: Threshold performance analysis\n")
    report.append("- `traditional_confusion_matrix.png`: Standard confusion matrix\n")
    report.append("- `val_confusion_matrix.png`: Matrix including low confidence predictions\n")
    report.append("- `threshold_table.png`: Tabular performance summary\n\n")

    report.append("### Example Files for Thesis\n")
    report.append("- `low_confidence_examples.xlsx`: Cases where model was uncertain\n")
    report.append("- `high_conf_misclassifications.xlsx`: Confident but wrong predictions\n")
    report.append("- `lifestyle_misclassified_as_high_stakes.xlsx`: High-stakes content mislabeled\n")
    report.append("- `economics_predictions.xlsx`: Economics predictions for verification\n")
    report.append("- `all_predictions_with_examples.xlsx`: Complete prediction dataset\n\n")

    report_text = ''.join(report)

    # Save report. UTF-8 is required because the text contains emoji, which
    # the platform default encoding may not be able to represent.
    with open('validation_results/validation_report.md', 'w', encoding='utf-8') as f:
        f.write(report_text)

    log_validation("✅ Comprehensive validation report generated")

    return report_text

## Main Execution
Complete validation pipeline: load data → evaluate model → create visualizations → summarize results.

In [None]:
def main():
    """Run the complete fresh-validation pipeline and print a summary.

    Steps, in order: load the validation data, set up label encoding, load
    the model and tokenizer, evaluate across confidence thresholds, create
    the threshold plot and confusion matrices, create the additional
    visualizations, extract example files, generate the markdown report, and
    save the per-threshold results as CSV and Excel.

    Returns:
        tuple: ``(results, optimal_results, report)`` where ``results`` is
        the collection of per-threshold records returned by
        ``evaluate_model_across_thresholds``, ``optimal_results`` is the
        record whose ``confidence_threshold`` equals ``OPTIMAL_THRESHOLD``,
        and ``report`` is the value returned by
        ``create_thesis_confusion_matrices``.

    Raises:
        ValueError: If no record in ``results`` matches ``OPTIMAL_THRESHOLD``.
        Exception: Any failure in a pipeline step is logged and re-raised.
    """
    banner = "=" * 60
    log_validation(banner)
    log_validation("FRESH VALIDATION OF OPTIMAL MODEL CONFIGURATION")
    log_validation(banner)

    try:
        # Load validation data
        df_val = load_validation_data(VALIDATION_FILE)

        # Setup label encoding
        df_val, label_encoder, extended_label_encoder, num_training_classes = setup_label_encoding(df_val)

        # Load model
        model, tokenizer = load_model_and_tokenizer(MODEL_PATH)

        # Evaluate across thresholds
        results, prediction_data = evaluate_model_across_thresholds(
            model, tokenizer, df_val, num_training_classes
        )

        # Create thesis visualizations
        create_thesis_threshold_plot(results)

        # Named `report_output` (not `classification_report`) so the local does
        # not shadow the sklearn function imported at the top of the file.
        report_output = create_thesis_confusion_matrices(
            df_val, prediction_data, label_encoder, extended_label_encoder, num_training_classes
        )

        # Create additional visualizations and examples
        create_additional_visualizations(results)

        example_stats = extract_interesting_examples(
            df_val, prediction_data, label_encoder, num_training_classes
        )

        # Generate comprehensive report. The returned text is not needed here;
        # the call is kept for its side effects.
        generate_comprehensive_validation_report(
            results, df_val, prediction_data, example_stats
        )

        # Pick the record for the optimal threshold. A default is supplied so
        # a missing threshold produces a descriptive error instead of a bare
        # StopIteration (whose empty message would make the failure log
        # uninformative).
        optimal_results = next(
            (r for r in results if r['confidence_threshold'] == OPTIMAL_THRESHOLD),
            None,
        )
        if optimal_results is None:
            raise ValueError(
                f"No evaluation results found for OPTIMAL_THRESHOLD={OPTIMAL_THRESHOLD}; "
                "make sure it is included in the evaluated confidence thresholds"
            )
        results_df = pd.DataFrame(results)

        # Save results in both CSV and Excel formats
        results_df.to_csv('validation_results/validation_results.csv', index=False)
        results_df.to_excel('validation_results/validation_results.xlsx', index=False)

        # Print comprehensive results
        print("\n" + "="*80)
        print("COMPREHENSIVE VALIDATION RESULTS")
        print("="*80)
        print(f"Dataset: {len(df_val)} samples ({optimal_results['known_samples']} known, {optimal_results['unknown_samples']} unknown)")
        print(f"Optimal Threshold: {OPTIMAL_THRESHOLD}")
        print()
        print("PERFORMANCE METRICS:")
        # NOTE(review): the accuracies are formatted with `.1%` (fraction in,
        # percent out) while coverage / detection rate use `.1f` plus a literal
        # '%'. Presumably the latter are already stored on a 0-100 scale;
        # confirm against evaluate_model_across_thresholds.
        print(f"  Overall Accuracy: {optimal_results['overall_accuracy']:.1%}")
        print(f"  Committed Accuracy: {optimal_results['committed_accuracy']:.1%}")
        print(f"  Coverage: {optimal_results['coverage']:.1f}%")
        print(f"  Unknown Detection Rate: {optimal_results['unknown_detection_rate']:.1f}%")
        print()
        print("EXAMPLE ANALYSIS:")
        print(f"  Low confidence examples: {example_stats['low_confidence']}")
        print(f"  High confidence errors: {example_stats['high_conf_errors']}")
        print(f"  Lifestyle misclassifications: {example_stats['lifestyle_errors']}")
        print(f"  Economics predictions: {example_stats['economics_predictions']}")
        print()
        print("FILES GENERATED:")
        print("  📊 validation_results/val_threshold_dual_axis.png")
        print("  🔍 validation_results/val_confusion_matrix.png")
        print("  🔍 validation_results/traditional_confusion_matrix.png")
        print("  📈 validation_results/threshold_table.png")
        print("  📋 validation_results/classification_report.csv")
        print("  📋 validation_results/validation_report.md")
        print("  📊 validation_results/validation_results.csv")
        print("  📊 validation_results/validation_results.xlsx")
        # NOTE(review): log_validation as defined at the top of the notebook
        # only prints to stdout; confirm that validation_log.txt is actually
        # written somewhere before advertising it here.
        print("  📝 validation_results/validation_log.txt")
        print("  📝 Example files:")
        print("    - low_confidence_examples.xlsx")
        print("    - high_conf_misclassifications.xlsx")
        print("    - lifestyle_misclassified_as_high_stakes.xlsx")
        print("    - economics_predictions.xlsx")
        print("="*80)

        log_validation("✅ Fresh validation completed successfully!")

        return results, optimal_results, report_output

    except Exception as e:
        # Top-level boundary: log the failure, then re-raise so the notebook
        # cell still surfaces the original traceback.
        log_validation(f"❌ Validation failed: {e}")
        raise

ModuleNotFoundError: No module named 'torch'