## Summary

This report analyzed the performance of DSPy taste distillation models across different configurations:

### Key Metrics
- **Baseline Performance**: Models using only the taste rubric without training
- **Optimized Performance**: Models trained using DSPy optimizers (BootstrapFinetune, etc.)
- **Tasks**: Favorite prediction and shortlist prediction
- **Model Types**: Binary classification and pairwise comparison

### Methodology
1. Parsed evaluation logs from the `saved/logs/` directory
2. Extracted performance metrics, model types, and evaluation parameters
3. Compared baseline vs optimized model performance
4. Analyzed performance by task type and optimizer used
5. Performed statistical significance testing (independent two-sample t-test on baseline vs. optimized accuracy)

### Next Steps
1. **Model Selection**: Use the best performing models for production inference
2. **Hyperparameter Tuning**: Experiment with different optimizer parameters
3. **Data Augmentation**: Collect more training examples for better performance
4. **Error Analysis**: Analyze failure cases to improve model robustness
5. **Ensemble Methods**: Combine multiple models for better predictions

The analysis provides a comprehensive view of how DSPy optimization affects taste distillation performance, helping guide future model development and deployment decisions.

In [ ]:
# Statistical Analysis and Key Insights
#
# Reads `baseline_results`, `optimized_results`, `eval_results` and
# `model_metadata`, which this cell does not define itself, and prints a
# text report: overall improvement, top models, per-optimizer statistics,
# key findings and recommendations.
print("STATISTICAL ANALYSIS")
print("=" * 50)

# Calculate performance improvements
if len(baseline_results) > 0 and len(optimized_results) > 0:
    baseline_mean = baseline_results['accuracy'].mean()
    optimized_mean = optimized_results['accuracy'].mean()
    improvement = optimized_mean - baseline_mean

    # A relative improvement is undefined for a zero baseline; report "n/a"
    # rather than printing inf/nan.
    if baseline_mean != 0:
        improvement_pct = (improvement / baseline_mean) * 100
        relative_text = f"{improvement_pct:.1f}%"
    else:
        improvement_pct = None
        relative_text = "n/a (baseline mean accuracy is 0)"

    print("Overall Performance Improvement:")
    print(f"  Baseline Mean Accuracy: {baseline_mean:.3f}")
    print(f"  Optimized Mean Accuracy: {optimized_mean:.3f}")
    print(f"  Absolute Improvement: {improvement:.3f}")
    print(f"  Relative Improvement: {relative_text}")

    # Statistical significance test (independent two-sample t-test with
    # scipy's default settings).
    from scipy import stats
    t_stat, p_value = stats.ttest_ind(optimized_results['accuracy'], baseline_results['accuracy'])
    print(f"  t-statistic: {t_stat:.3f}")
    print(f"  p-value: {p_value:.3f}")
    print(f"  Statistically significant: {'Yes' if p_value < 0.05 else 'No'}")

# Best performing models
print("\n" + "=" * 50)
print("BEST PERFORMING MODELS")
print("=" * 50)

if len(optimized_results) > 0:
    best_models = optimized_results.nlargest(5, 'accuracy')
    print("Top 5 Model Performances:")
    for idx, row in best_models.iterrows():
        print(f"  {row['model_name']}: {row['accuracy']:.3f} accuracy ({row['model_type']})")

# Performance by optimizer type
if len(model_metadata) > 0 and 'optimizer_type' in model_metadata.columns:
    print("\n" + "=" * 50)
    print("PERFORMANCE BY OPTIMIZER TYPE")
    print("=" * 50)

    if 'model_name' in optimized_results.columns:
        perf_with_optimizer = optimized_results.merge(
            model_metadata[['model_name', 'optimizer_type']],
            on='model_name',
            how='left'
        )

        optimizer_stats = perf_with_optimizer.groupby('optimizer_type')['accuracy'].agg(['count', 'mean', 'std'])
        # The loop variable is deliberately not called `stats`, so the
        # scipy.stats module imported above is not shadowed.
        for optimizer, opt_row in optimizer_stats.iterrows():
            # iterrows() upcasts the integer count to float; cast back so it
            # prints as "n=3" rather than "n=3.0".
            print(f"  {optimizer}: {opt_row['mean']:.3f} ± {opt_row['std']:.3f} (n={int(opt_row['count'])})")

print("\n" + "=" * 50)
print("KEY FINDINGS")
print("=" * 50)

findings = []
# Same condition as the improvement calculation above, so `improvement`
# and `improvement_pct` are defined whenever this branch runs.
if len(baseline_results) > 0 and len(optimized_results) > 0:
    if improvement > 0:
        if improvement_pct is not None:
            findings.append(f"DSPy optimization provides {improvement_pct:.1f}% improvement over baseline")
        else:
            findings.append(
                f"DSPy optimization provides {improvement:.3f} absolute accuracy improvement over a zero baseline"
            )
    else:
        findings.append("DSPy optimization shows no significant improvement over baseline")

if len(optimized_results) > 0:
    best_accuracy = optimized_results['accuracy'].max()
    findings.append(f"Best performing model achieved {best_accuracy:.3f} accuracy")

# Task-specific findings
for task in eval_results['task'].unique():
    if task != 'Unknown':
        task_results = eval_results[eval_results['task'] == task]
        if len(task_results) > 0:
            task_best = task_results['accuracy'].max()
            findings.append(f"Best {task} performance: {task_best:.3f}")

for i, finding in enumerate(findings, 1):
    print(f"{i}. {finding}")

print("\n" + "=" * 50)
print("RECOMMENDATIONS")
print("=" * 50)

recommendations = [
    "Focus on models with accuracy > 0.60 for production use",
    "Consider ensemble methods combining multiple optimized models",
    "Investigate the impact of training data size on performance",
    "Analyze failure cases to improve rubric quality",
    "Experiment with different DSPy optimizers for better results"
]

for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {rec}")

In [ ]:
# Create comprehensive comparison visualizations
#
# Draws a 2x2 grid from `eval_results`, `baseline_results` and
# `optimized_results` (not defined in this cell): overall accuracy,
# accuracy by model type, accuracy distribution, and accuracy by task.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('DSPy Taste Distillation: Baseline vs Optimized Performance', fontsize=16, fontweight='bold')

# 1. Overall Performance Comparison
ax1 = axes[0, 0]
# Guarded like the other three panels, so an empty result set is skipped
# consistently.
if len(eval_results) > 0:
    performance_comparison = eval_results.groupby('evaluation_type')['accuracy'].agg(['mean', 'std', 'count'])
    performance_comparison.plot(kind='bar', y='mean', yerr='std', ax=ax1, color=['lightcoral', 'lightblue'])
    ax1.set_title('Overall Performance: Baseline vs Optimized')
    ax1.set_ylabel('Accuracy')
    # The bars are grouped by evaluation_type, not by model type.
    ax1.set_xlabel('Evaluation Type')
    ax1.tick_params(axis='x', rotation=45)
    ax1.grid(axis='y', alpha=0.3)

# 2. Performance by Model Type
ax2 = axes[0, 1]
if len(eval_results) > 0:
    # Create a pivot table for better visualization:
    # rows = model_type, columns = evaluation_type, values = mean accuracy.
    pivot_data = eval_results.pivot_table(
        values='accuracy',
        index='model_type',
        columns='evaluation_type',
        aggfunc='mean'
    )
    pivot_data.plot(kind='bar', ax=ax2, color=['lightcoral', 'lightblue'])
    ax2.set_title('Performance by Model Type')
    ax2.set_ylabel('Accuracy')
    ax2.set_xlabel('Model Type')
    ax2.tick_params(axis='x', rotation=45)
    ax2.legend(title='Evaluation Type')
    ax2.grid(axis='y', alpha=0.3)

# 3. Performance Distribution
ax3 = axes[1, 0]
if len(baseline_results) > 0 and len(optimized_results) > 0:
    # Overlaid, semi-transparent histograms so both distributions stay visible.
    ax3.hist(baseline_results['accuracy'], alpha=0.7, label='Baseline', bins=10, color='lightcoral')
    ax3.hist(optimized_results['accuracy'], alpha=0.7, label='Optimized', bins=10, color='lightblue')
    ax3.set_title('Accuracy Distribution')
    ax3.set_xlabel('Accuracy')
    ax3.set_ylabel('Frequency')
    ax3.legend()
    ax3.grid(axis='y', alpha=0.3)

# 4. Performance by Task
ax4 = axes[1, 1]
if len(eval_results) > 0:
    # rows = task, columns = evaluation_type, values = mean accuracy.
    task_pivot = eval_results.pivot_table(
        values='accuracy',
        index='task',
        columns='evaluation_type',
        aggfunc='mean'
    )
    task_pivot.plot(kind='bar', ax=ax4, color=['lightcoral', 'lightblue'])
    ax4.set_title('Performance by Task')
    ax4.set_ylabel('Accuracy')
    ax4.set_xlabel('Task')
    ax4.tick_params(axis='x', rotation=45)
    ax4.legend(title='Evaluation Type')
    ax4.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Comparative Analysis and Visualizations

Let's create visualizations to compare baseline and optimized model performance.

In [ ]:
# Load model metadata for additional insights
def load_model_metadata() -> pd.DataFrame:
    """Load model metadata from JSON files.

    Reads every ``*_metadata.json`` file in ``MODELS_DIR`` and builds one row
    per file. Two columns are added to whatever keys the JSON object holds:

    * ``metadata_file`` - the file name the row came from.
    * ``model_name`` - the file stem with the trailing ``_metadata`` removed.

    Files that cannot be read, are not valid JSON, or do not contain a JSON
    object are reported with a printed message and skipped.

    Returns:
        A DataFrame with one row per successfully loaded metadata file
        (empty when none are found or loadable).
    """
    suffix = '_metadata'
    metadata_list = []

    # Sorted so the row order does not depend on directory enumeration order.
    for metadata_file in sorted(MODELS_DIR.glob("*_metadata.json")):
        try:
            # Explicit UTF-8: do not depend on the platform default encoding.
            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading {metadata_file}: {e}")
            continue

        if not isinstance(metadata, dict):
            print(
                f"Error loading {metadata_file}: "
                f"expected a JSON object, got {type(metadata).__name__}"
            )
            continue

        metadata['metadata_file'] = metadata_file.name
        # The glob pattern guarantees the stem ends with the suffix. Slice it
        # off rather than str.replace(), which would also strip any earlier
        # "_metadata" inside the model name.
        metadata['model_name'] = metadata_file.stem[:-len(suffix)]
        metadata_list.append(metadata)

    return pd.DataFrame(metadata_list)

# Load model metadata
# Prints a summary of the training metadata and, when `optimized_results`
# carries a `model_name` column, accuracy statistics per optimizer type.
model_metadata = load_model_metadata()
if len(model_metadata) > 0:
    print("Model Training Metadata:")
    print("=" * 30)
    print(f"Number of trained models: {len(model_metadata)}")

    # Show key training parameters (only the columns actually present)
    key_cols = ['model_name', 'optimizer_type', 'training_examples', 'favorite_examples', 'non_favorite_examples']
    available_cols = [col for col in key_cols if col in model_metadata.columns]
    print("\nTraining Parameters:")
    print(model_metadata[available_cols].to_string(index=False))

    # Merge with performance data
    if 'model_name' in optimized_results.columns:
        # Select only metadata columns that exist, so a metadata file set
        # without `optimizer_type` or `training_examples` does not raise
        # KeyError.
        merge_cols = [
            col for col in ['model_name', 'optimizer_type', 'training_examples']
            if col in model_metadata.columns
        ]
        performance_with_metadata = optimized_results.merge(
            model_metadata[merge_cols],
            on='model_name',
            how='left'
        )

        print("\nPerformance by Optimizer Type:")
        if 'optimizer_type' in performance_with_metadata.columns:
            perf_by_optimizer = performance_with_metadata.groupby('optimizer_type')['accuracy'].agg(['count', 'mean', 'std'])
            print(perf_by_optimizer)
else:
    print("No model metadata found.")

In [ ]:
# Analyze optimized model performance
# Keeps the rows of `eval_results` flagged `is_optimized`, prints summary
# statistics, and adds a `model_name` column derived from `model_path`.
optimized_results = eval_results[eval_results['is_optimized']].copy()

print("DSPy Optimized Model Performance Summary:")
print("=" * 50)

if len(optimized_results) > 0:
    # Overall optimized statistics
    print(f"Number of optimized evaluations: {len(optimized_results)}")
    print(f"Average accuracy: {optimized_results['accuracy'].mean():.3f}")
    print(f"Standard deviation: {optimized_results['accuracy'].std():.3f}")
    print(f"Min accuracy: {optimized_results['accuracy'].min():.3f}")
    print(f"Max accuracy: {optimized_results['accuracy'].max():.3f}")

    # Performance by model type
    print("\nPerformance by Model Type:")
    optimized_by_type = optimized_results.groupby('model_type')['accuracy'].agg(['count', 'mean', 'std'])
    print(optimized_by_type)

    # Performance by task
    print("\nPerformance by Task:")
    optimized_by_task = optimized_results.groupby('task')['accuracy'].agg(['count', 'mean', 'std'])
    print(optimized_by_task)

    # Extract model name from path for analysis
    def extract_model_name(model_path):
        """Return the file stem of *model_path*, or None when no path was parsed."""
        if model_path is None:
            return None
        return Path(model_path).stem

    optimized_results['model_name'] = optimized_results['model_path'].apply(extract_model_name)

    # Performance by specific model
    print("\nPerformance by Specific Model:")
    optimized_by_model = optimized_results.groupby('model_name')['accuracy'].agg(['count', 'mean', 'std'])
    print(optimized_by_model)

    # Show top performing models
    print("\nTop 10 Individual Results:")
    display_cols = ['log_file', 'model_type', 'task', 'accuracy', 'test_size', 'model_name']
    top_results = optimized_results.nlargest(10, 'accuracy')[display_cols]
    print(top_results.to_string(index=False))
else:
    print("No optimized results found in the logs.")

## DSPy Optimized Model Performance

Now let's analyze the performance of models trained with DSPy optimizers.

In [ ]:
# Analyze baseline performance
# Keeps the rows of `eval_results` flagged `is_baseline` and prints overall,
# per-model-type and per-task accuracy statistics plus the individual rows.
baseline_results = eval_results[eval_results['is_baseline']].copy()

print("Baseline Model Performance Summary:")
print("=" * 50)

if len(baseline_results) > 0:
    # Overall baseline statistics
    print(f"Number of baseline evaluations: {len(baseline_results)}")
    print(f"Average accuracy: {baseline_results['accuracy'].mean():.3f}")
    print(f"Standard deviation: {baseline_results['accuracy'].std():.3f}")
    print(f"Min accuracy: {baseline_results['accuracy'].min():.3f}")
    print(f"Max accuracy: {baseline_results['accuracy'].max():.3f}")

    # Performance by model type
    print("\nPerformance by Model Type:")
    baseline_by_type = baseline_results.groupby('model_type')['accuracy'].agg(['count', 'mean', 'std'])
    print(baseline_by_type)

    # Performance by task
    print("\nPerformance by Task:")
    baseline_by_task = baseline_results.groupby('task')['accuracy'].agg(['count', 'mean', 'std'])
    print(baseline_by_task)

    # Show individual results
    print("\nIndividual Baseline Results:")
    display_cols = ['log_file', 'model_type', 'task', 'accuracy', 'test_size']
    print(baseline_results[display_cols].to_string(index=False))
else:
    print("No baseline results found in the logs.")

## Baseline Model Performance

Let's analyze the performance of baseline (untrained) models that rely only on the taste rubric.

In [ ]:
# Categorize results by model type and task
def categorize_results(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with three derived label columns.

    Added columns:

    * ``model_type`` - from keywords in ``log_file``.
    * ``task`` - from keywords in ``log_file``.
    * ``evaluation_type`` - from the ``is_baseline`` / ``is_optimized`` flags.

    The input frame is not modified.
    """
    def model_type_for(log_name):
        # 'pairwise' wins over both 'favorite' and 'shortlist'.
        if 'pairwise' in log_name:
            return 'Pairwise Classification'
        if 'favorite' in log_name:
            return 'Binary Classification (Favorite)'
        if 'shortlist' in log_name:
            return 'Binary Classification (Shortlist)'
        return 'Unknown'

    def task_for(log_name):
        # First matching keyword decides; 'favorite' is checked first.
        keyword_labels = (
            ('favorite', 'Favorite Prediction'),
            ('shortlist', 'Shortlist Prediction'),
        )
        for keyword, label in keyword_labels:
            if keyword in log_name:
                return label
        return 'Unknown'

    def evaluation_type_for(row):
        # The baseline flag takes precedence when both flags are set.
        if row['is_baseline']:
            return 'Baseline'
        if row['is_optimized']:
            return 'Optimized'
        return 'Unknown'

    labelled = df.copy()
    log_names = labelled['log_file']

    labelled['model_type'] = log_names.apply(model_type_for)
    labelled['task'] = log_names.apply(task_for)
    labelled['evaluation_type'] = labelled.apply(evaluation_type_for, axis=1)

    return labelled

# Add the model_type / task / evaluation_type label columns.
eval_results = categorize_results(eval_results)

# Display summary statistics
print("Dataset Summary:")
print(f"Total evaluations: {len(eval_results)}")
print(f"Baseline evaluations: {len(eval_results[eval_results['is_baseline']])}")
print(f"Optimized evaluations: {len(eval_results[eval_results['is_optimized']])}")
print("\nModel types:")
print(eval_results['model_type'].value_counts())
print("\nEvaluation types:")
print(eval_results['evaluation_type'].value_counts())

In [ ]:
def parse_log_file(log_path: Path) -> Dict:
    """Parse a single log file to extract performance metrics.

    Args:
        log_path: Path of the evaluation log to read (decoded as UTF-8).

    Returns:
        A dict with these keys:

        * ``log_file`` - file name of *log_path*.
        * ``accuracy`` - float after ``🎯 Accuracy: ``, or None if absent.
        * ``test_size`` - int example count, or None if neither size line
          is present; the "full test set" line wins when both appear.
        * ``is_baseline`` - True when the baseline marker text is present.
        * ``is_optimized`` - True when a trained model was reported loaded.
        * ``model_path`` - ``.json`` path of the loaded model, or None.
        * ``dataset_path`` - ``.json`` path of the dataset, or None.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Explicit UTF-8: the accuracy marker contains a non-ASCII emoji, so
    # decoding with a platform default such as cp1252 would fail or never match.
    with open(log_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract accuracy
    accuracy_match = re.search(r'🎯 Accuracy: (\d+\.\d+)', content)
    accuracy = float(accuracy_match.group(1)) if accuracy_match else None

    # Extract test size
    test_size_match = re.search(r'Evaluating on (\d+) test examples', content)
    test_size = int(test_size_match.group(1)) if test_size_match else None

    # Extract full test set info (overrides the sampled size when present)
    full_test_match = re.search(r'Evaluating on full test set: (\d+) examples', content)
    if full_test_match:
        test_size = int(full_test_match.group(1))

    # Determine if baseline or optimized (the two flags are independent)
    is_baseline = "baseline evaluation with untrained model" in content
    is_optimized = "Loaded trained model from:" in content

    # Extract model info
    model_match = re.search(r'Loaded trained model from: (.+\.json)', content)
    model_path = model_match.group(1) if model_match else None

    # Extract dataset info
    dataset_match = re.search(r'Using dataset: (.+\.json)', content)
    dataset_path = dataset_match.group(1) if dataset_match else None

    return {
        'log_file': log_path.name,
        'accuracy': accuracy,
        'test_size': test_size,
        'is_baseline': is_baseline,
        'is_optimized': is_optimized,
        'model_path': model_path,
        'dataset_path': dataset_path
    }

def load_all_logs() -> pd.DataFrame:
    """Parse every ``*eval*.log`` file in ``LOGS_DIR`` into a DataFrame.

    Each log is passed to ``parse_log_file``; logs with no parsed accuracy
    are dropped, and logs that raise while parsing are reported with a
    printed message and skipped.
    """
    parsed_rows = []
    candidate_logs = list(LOGS_DIR.glob("*eval*.log"))

    for log_file in candidate_logs:
        try:
            parsed = parse_log_file(log_file)
            has_accuracy = parsed['accuracy'] is not None
        except Exception as e:
            print(f"Error parsing {log_file}: {e}")
            continue

        # Only include logs with valid accuracy
        if has_accuracy:
            parsed_rows.append(parsed)

    return pd.DataFrame(parsed_rows)

# Load all evaluation results
# load_all_logs() parses every "*eval*.log" file under LOGS_DIR and keeps
# only logs where an accuracy value was found.
eval_results = load_all_logs()
print(f"Loaded {len(eval_results)} evaluation results")
# Trailing expression: as the last line of a notebook cell it is rendered as
# a preview of the first rows.
eval_results.head()

## Data Loading and Preprocessing

First, let's load and parse the evaluation logs to extract performance metrics.

# DSPy Taste Distillation: Baseline vs Optimized Performance Report

This notebook compares the performance of baseline (untrained) DSPy models against optimized models using various DSPy optimizers for taste distillation tasks.

## Overview

This project focuses on "taste distillation" - training models to predict personal preferences from article titles using Hacker News and Reader data. We compare two main approaches:

1. **Baseline**: Untrained DSPy models using only the taste rubric
2. **Optimized**: DSPy models trained with various optimizers (BootstrapFinetune, MIPROv2, etc.)

## Models Evaluated

- **Binary Classification**: `dspy_favorite` - Predicts whether an article is a favorite (True/False)
- **Pairwise Classification**: `dspy_pairwise` - Compares two articles and selects the preferred one

In [ ]:
import pandas as pd
import numpy as np
import json
import os
import re
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('default')
sns.set_palette("husl")

# Project root
# NOTE(review): "../" is relative to the current working directory, so these
# paths presumably assume the notebook runs from a directory one level below
# the project root - confirm.
PROJECT_ROOT = Path("../")
# Directory globbed for "*eval*.log" files by load_all_logs().
LOGS_DIR = PROJECT_ROOT / "saved" / "logs"
# Directory globbed for "*_metadata.json" files by load_model_metadata().
MODELS_DIR = PROJECT_ROOT / "saved" / "models"