In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from pathlib import Path
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Publication-style matplotlib defaults, applied in three groups.
# Group 1: serif typeface and enlarged text sizes.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'DejaVu Serif', 'serif'],
    'font.size': 13,
    'axes.titlesize': 15,
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.titlesize': 16,
})
# Group 2: on-screen and saved-figure resolution, tight bounding box on save.
plt.rcParams.update({
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'savefig.bbox': 'tight',
})
# Group 3: open (top/right-less) axes, no default grid, thicker axis and tick lines.
plt.rcParams.update({
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.grid': False,
    'grid.alpha': 0.3,
    'axes.linewidth': 1.2,
    'xtick.major.width': 1.2,
    'ytick.major.width': 1.2,
})

# Project layout for Experiment 2 (with artist features); all paths hang off
# the parent of the current working directory.
PROJECT_ROOT = Path('..').resolve()
FEATURES_DIR = PROJECT_ROOT.joinpath('features')
MODELS_DIR = PROJECT_ROOT.joinpath('models', 'saved', 'experiment2_with_artist')
DATA_DIR = PROJECT_ROOT.joinpath('data')
FIGURES_DIR = PROJECT_ROOT.joinpath('results', 'figures')
# Create the figure output folder up front so later savefig calls cannot fail on it.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Models directory: {MODELS_DIR}")
print(f"Figures will be saved to: {FIGURES_DIR}")
print("\nExperiment 2: 414 features (23 audio+artist + 5 text + 2 sentiment + 384 embeddings)")

## 1. Load Data and Models

In [None]:
# Test-split feature blocks for Experiment 2 (artist data included).
# Per the project notes, X_test_audio.npy holds 23 columns (21 audio + 2 artist).
X_test_audio = np.load(FEATURES_DIR / 'X_test_audio.npy')
X_test_text_stats = np.load(FEATURES_DIR / 'X_test_text_stats.npy')
X_test_sentiment = np.load(FEATURES_DIR / 'X_test_sentiment.npy')
X_test_embeddings = np.load(FEATURES_DIR / 'X_test_embeddings.npy')

# Stack the blocks column-wise in the order audio+artist, text, sentiment, embeddings
# (expected total: 23 + 5 + 2 + 384 = 414).
X_test = np.hstack((X_test_audio, X_test_text_stats, X_test_sentiment, X_test_embeddings))
print(f"X_test shape: {X_test.shape}")
for _label, _block in (
    ('Audio+Artist', X_test_audio),
    ('Text Stats', X_test_text_stats),
    ('Sentiment', X_test_sentiment),
    ('Embeddings', X_test_embeddings),
    ('Total', X_test),
):
    print(f"  {_label}: {_block.shape[1]} features")

# One target vector per regression task, keyed by target name.
TARGETS = ['valence', 'energy', 'danceability', 'popularity']
y_test = {}
for target in TARGETS:
    target_values = np.load(FEATURES_DIR / f'y_test_{target}.npy')
    y_test[target] = target_values
    print(f"y_test_{target}: {target_values.shape}")

# Song-level metadata (genre, year) used later to segment the errors.
df_test = pd.read_csv(DATA_DIR / 'test.csv')
print(f"\nTest DataFrame shape: {df_test.shape}")
print(f"Test samples: {len(df_test):,} songs")
print(f"Columns available: {list(df_test.columns)[:15]}...")

In [None]:
# Model used for each target in Experiment 2. These are the presumed best
# performers from the Experiment 2 validation results; the metrics printed
# below are their scores on the test split.
BEST_MODELS = dict(
    valence='XGBoost_tuned',
    energy='XGBoost_tuned',
    danceability='XGBoost_tuned',
    popularity='CatBoost',
)

print("Loading Experiment 2 best models...")
print("=" * 70)

# Per-target containers; a target is absent from all three when its model file is missing.
models, predictions, residuals = {}, {}, {}

for target, model_name in BEST_MODELS.items():
    model_path = MODELS_DIR / f"{model_name}_{target}.pkl"
    if not model_path.exists():
        print(f"WARNING: Model not found: {model_path}")
        continue

    # NOTE(review): joblib.load unpickles the file; only load trusted model artifacts.
    models[target] = joblib.load(model_path)
    predictions[target] = models[target].predict(X_test)
    # Residual convention throughout the notebook: actual minus predicted.
    residuals[target] = y_test[target] - predictions[target]

    r2 = r2_score(y_test[target], predictions[target])
    mae = mean_absolute_error(y_test[target], predictions[target])
    rmse = np.sqrt(mean_squared_error(y_test[target], predictions[target]))
    print(f"{target:12s}: {model_name:20s} (R²={r2:.4f}, RMSE={rmse:.4f}, MAE={mae:.4f})")

print("=" * 70)

## 2. Residual Analysis

In [None]:
def plot_predicted_vs_actual(y_true, y_pred, target_name, ax=None,
                             max_points=10000, seed=42):
    """Scatter plot of predicted vs actual values with identity line.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values, paired by position.
    target_name : str
        Target name; capitalized for the axis labels and title.
    ax : matplotlib Axes, optional
        Axes to draw on. A new 6x6 figure is created when omitted.
    max_points : int or None, default 10000
        Maximum number of points drawn in the scatter. Larger inputs are
        randomly subsampled without replacement; ``None`` disables subsampling.
    seed : int or None, default 42
        Seed for the subsample so the figure is identical across runs.
        ``None`` draws a fresh random subsample each call.

    Returns
    -------
    matplotlib Axes
        The axes that were drawn on.
    """
    # Positional indexing below requires plain arrays (not label-indexed Series).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if ax is None:
        _, ax = plt.subplots(figsize=(6, 6))

    # Subsample for clarity if too many points; a local Generator keeps the
    # draw reproducible without touching NumPy's global random state.
    n_samples = len(y_true)
    if max_points is not None and n_samples > max_points:
        idx = np.random.default_rng(seed).choice(n_samples, max_points, replace=False)
        y_true_plot, y_pred_plot = y_true[idx], y_pred[idx]
    else:
        y_true_plot, y_pred_plot = y_true, y_pred

    ax.scatter(y_true_plot, y_pred_plot, alpha=0.2, s=12, c='#2c3e50', edgecolors='none')

    # Identity line spans the full data range (not just the plotted subsample).
    lims = [min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max())]
    ax.plot(lims, lims, 'r--', lw=2, label='Perfect prediction', alpha=0.8)

    # Metrics are computed on all samples, never on the subsample.
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    ax.set_xlabel(f'Actual {target_name.capitalize()}', fontweight='bold')
    ax.set_ylabel(f'Predicted {target_name.capitalize()}', fontweight='bold')
    ax.set_title(f'{target_name.capitalize()}\n$R^2$={r2:.3f}, RMSE={rmse:.3f}',
                 fontweight='bold', pad=12)
    ax.legend(loc='lower right', frameon=True, fancybox=False,
              edgecolor='black', framealpha=0.9)
    ax.grid(True, alpha=0.2, linestyle='--')

    return ax

In [None]:
# 2x2 grid: one predicted-vs-actual panel per target that has predictions.
fig, axes = plt.subplots(2, 2, figsize=(14, 14))
axes = axes.flatten()

for i, target in enumerate(TARGETS):
    if target not in predictions:
        continue  # model file was missing; leave this panel empty
    plot_predicted_vs_actual(y_test[target], predictions[target], target, axes[i])

fig.suptitle('Predicted vs Actual Values',
             fontsize=18, fontweight='bold', y=0.995)
fig.tight_layout()
figure_path = FIGURES_DIR / 'exp2_error_predicted_vs_actual.png'
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()
print(f"Saved: {figure_path}")

In [None]:
def plot_error_distribution(residuals, target_name, ax=None):
    """Draw a 60-bin histogram of residuals with zero and mean reference lines.

    Parameters
    ----------
    residuals : array-like exposing ``.mean()`` and ``.std()``
        Prediction errors (actual minus predicted).
    target_name : str
        Target name; capitalized for the panel title.
    ax : matplotlib Axes, optional
        Axes to draw on. A new 6x5 figure is created when omitted.

    Returns
    -------
    matplotlib Axes
        The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 5))

    mean_error = residuals.mean()
    error_std = residuals.std()
    box_style = dict(frameon=True, fancybox=False, edgecolor='black', framealpha=0.9)

    ax.hist(residuals, bins=60, color='#3498db', edgecolor='white', alpha=0.85, linewidth=0.5)
    # Red dashed line marks an unbiased model; orange marks the observed mean error.
    ax.axvline(0, color='red', linestyle='--', lw=2, label='Zero error', alpha=0.8)
    ax.axvline(mean_error, color='orange', linestyle='-', lw=2,
               label=f'Mean={mean_error:.4f}', alpha=0.8)

    ax.set_xlabel('Prediction Error (Actual - Predicted)', fontweight='bold')
    ax.set_ylabel('Frequency', fontweight='bold')
    ax.set_title(f'{target_name.capitalize()} Error Distribution\nStd={error_std:.4f}',
                 fontweight='bold', pad=12)
    ax.legend(loc='upper right', **box_style)
    ax.grid(True, alpha=0.2, linestyle='--', axis='y')

    return ax

In [None]:
# 2x2 grid: residual histogram for every target that has residuals.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for i, target in enumerate(TARGETS):
    if target not in residuals:
        continue  # no model for this target; leave the panel empty
    plot_error_distribution(residuals[target], target, axes[i])

fig.suptitle('Experiment 2: Error Distribution',
             fontsize=18, fontweight='bold', y=0.995)
fig.tight_layout()
figure_path = FIGURES_DIR / 'exp2_error_distribution.png'
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()
print(f"Saved: {figure_path}")

In [None]:
def plot_residual_vs_predicted(y_pred, residuals, target_name, ax=None,
                               max_points=10000, seed=42):
    """Residual plot to check for heteroscedasticity.

    Draws residuals against predictions, a zero reference line and, when there
    are enough samples, a centered rolling-mean trend of the residuals.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    residuals : array-like
        Residuals paired by position with ``y_pred``.
    target_name : str
        Target name; capitalized for the axis label and title.
    ax : matplotlib Axes, optional
        Axes to draw on. A new 6x5 figure is created when omitted.
    max_points : int or None, default 10000
        Maximum number of points drawn in the scatter. Larger inputs are
        randomly subsampled without replacement; ``None`` disables subsampling.
    seed : int or None, default 42
        Seed for the subsample so the figure is identical across runs.
        ``None`` draws a fresh random subsample each call.

    Returns
    -------
    matplotlib Axes
        The axes that were drawn on.
    """
    # Positional indexing below requires plain arrays (not label-indexed Series).
    y_pred = np.asarray(y_pred)
    residuals = np.asarray(residuals)

    if ax is None:
        _, ax = plt.subplots(figsize=(6, 5))

    # Subsample for clarity; a local Generator keeps the draw reproducible
    # without touching NumPy's global random state.
    n_samples = len(y_pred)
    if max_points is not None and n_samples > max_points:
        idx = np.random.default_rng(seed).choice(n_samples, max_points, replace=False)
        y_pred_plot, res_plot = y_pred[idx], residuals[idx]
    else:
        y_pred_plot, res_plot = y_pred, residuals

    ax.scatter(y_pred_plot, res_plot, alpha=0.2, s=12, c='#2c3e50', edgecolors='none')
    ax.axhline(0, color='red', linestyle='--', lw=2, alpha=0.8)

    # Smoothed trend line over ALL samples (not the subsample): rolling mean of
    # residuals ordered by prediction, window = 5% of the data. Skipped for
    # small inputs where the window would be 10 points or fewer.
    sorted_idx = np.argsort(y_pred)
    window = n_samples // 20
    if window > 10:
        rolling_mean = pd.Series(residuals[sorted_idx]).rolling(window, center=True).mean()
        ax.plot(y_pred[sorted_idx], rolling_mean, color='orange', lw=2.5,
                label='Trend', alpha=0.9)
        ax.legend(loc='best', frameon=True, fancybox=False,
                  edgecolor='black', framealpha=0.9)

    ax.set_xlabel(f'Predicted {target_name.capitalize()}', fontweight='bold')
    ax.set_ylabel('Residuals', fontweight='bold')
    ax.set_title(f'{target_name.capitalize()} Residuals',
                 fontweight='bold', pad=12)
    ax.grid(True, alpha=0.2, linestyle='--')

    return ax

In [None]:
# 2x2 grid: residual-vs-predicted panel for each target with predictions and residuals.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for i, target in enumerate(TARGETS):
    if target not in predictions or target not in residuals:
        continue  # nothing to draw for this target
    plot_residual_vs_predicted(predictions[target], residuals[target], target, axes[i])

fig.suptitle('Experiment 2: Residual Analysis',
             fontsize=18, fontweight='bold', y=0.995)
fig.tight_layout()
figure_path = FIGURES_DIR / 'exp2_error_residual_plots.png'
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()
print(f"Saved: {figure_path}")

## 3. Error Segmentation by Genre

In [None]:
# Show the genre balance of the test set, or list the columns if there is no genre field.
if 'genre' not in df_test.columns:
    print("Available columns:", df_test.columns.tolist())
else:
    print("Genre distribution in test set:")
    print(df_test['genre'].value_counts())

In [None]:
def analyze_error_by_genre(df, y_true, y_pred, target_name):
    """Calculate error metrics by genre.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata with a ``genre`` column; rows are paired with ``y_true`` by position.
    y_true, y_pred : array-like
        Actual and predicted values of equal length.
    target_name : str
        Unused; kept so all ``analyze_error_by_*`` helpers share a signature.

    Returns
    -------
    (analysis_df, genre_stats) or None
        ``analysis_df`` has one row per sample (genre, actual, predicted,
        error, abs_error, squared_error). ``genre_stats`` has one row per
        genre, sorted by RMSE descending and rounded to 4 decimals.
        Returns ``None`` (after printing a message) when ``df`` has no
        ``genre`` column.
    """
    if 'genre' not in df.columns:
        print("Genre column not found")
        return None

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Genres are matched to targets purely by row position, so a row-count
    # mismatch means the per-genre numbers may be computed on misaligned rows.
    if len(df) != len(y_true):
        print(f"WARNING: metadata has {len(df)} rows but {len(y_true)} targets; "
              f"genres are paired by position and may be misaligned")

    # Create analysis dataframe
    error = y_true - y_pred
    analysis_df = pd.DataFrame({
        'genre': df['genre'].values[:len(y_true)],
        'actual': y_true,
        'predicted': y_pred,
        'error': error,
        'abs_error': np.abs(error),
        'squared_error': error ** 2
    })

    # Aggregate by genre (kept at full precision until the very end)
    genre_stats = analysis_df.groupby('genre').agg({
        'actual': ['count', 'mean', 'std'],
        'error': ['mean', 'std'],
        'abs_error': 'mean',
        'squared_error': 'mean'
    })

    genre_stats.columns = ['count', 'mean_actual', 'std_actual',
                           'mean_error', 'std_error', 'mae', 'mse']
    # RMSE must come from the unrounded MSE: rounding a small MSE to 4 decimals
    # before the square root distorts the result noticeably.
    genre_stats['rmse'] = np.sqrt(genre_stats['mse'])
    # Sort on full-precision RMSE, then round once for display.
    genre_stats = genre_stats.sort_values('rmse', ascending=False).round(4)

    return analysis_df, genre_stats

In [None]:
# Per-target genre breakdown: sample-level frames and per-genre summary tables.
genre_analysis, genre_stats = {}, {}

for target in TARGETS:
    if target not in predictions:
        continue
    result = analyze_error_by_genre(df_test, y_test[target], predictions[target], target)
    if result is None:
        continue  # no genre column; the helper already printed a message
    genre_analysis[target], genre_stats[target] = result
    print(f"\n{target.upper()} - Error by Genre:")
    print(genre_stats[target][['count', 'mae', 'rmse', 'mean_error']].to_string())

In [None]:
# Plot error by genre: one horizontal bar chart per target, genres ordered by
# RMSE (lowest at the bottom bar position 0, highest at the top).
if genre_stats:
    fig, axes = plt.subplots(2, 2, figsize=(14, 11))
    axes = axes.flatten()

    for i, target in enumerate(TARGETS):
        if target in genre_stats:
            stats = genre_stats[target].sort_values('rmse', ascending=True)

            # Build the green-to-red ramp with exactly one colour per bar.
            # A fixed-length ramp would be truncated (fewer genres) or cycled
            # (more genres), breaking the low-to-high colour ordering.
            colors = plt.cm.RdYlGn_r(np.linspace(0.2, 0.8, len(stats)))

            bars = axes[i].barh(range(len(stats)), stats['rmse'], color=colors,
                               edgecolor='white', linewidth=1.5)
            axes[i].set_yticks(range(len(stats)))
            axes[i].set_yticklabels(stats.index, fontweight='bold')
            axes[i].set_xlabel('RMSE', fontweight='bold')
            axes[i].set_title(f'{target.capitalize()} - RMSE by Genre',
                            fontweight='bold', pad=12)
            axes[i].grid(True, alpha=0.2, linestyle='--', axis='x')

            # Add value labels just past the end of each bar
            for j, (idx, row) in enumerate(stats.iterrows()):
                axes[i].text(row['rmse'] + 0.003, j, f"{row['rmse']:.3f}",
                           va='center', fontsize=11, fontweight='bold')

    plt.suptitle('Experiment 2: Error by Genre',
                 fontsize=18, fontweight='bold', y=0.995)
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / 'exp2_error_by_genre.png', dpi=300, bbox_inches='tight')
    plt.show()
    print(f"Saved: {FIGURES_DIR / 'exp2_error_by_genre.png'}")

In [None]:
# Box plots of the signed error per genre, one panel per target.
if genre_analysis:
    fig, axes = plt.subplots(2, 2, figsize=(16, 11))
    axes = axes.flatten()

    for i, target in enumerate(TARGETS):
        if target not in genre_analysis:
            continue
        panel = axes[i]
        df_plot = genre_analysis[target]

        # Genres run left to right by increasing median ABSOLUTE error,
        # while the boxes themselves show the signed error.
        median_abs_error = df_plot.groupby('genre')['abs_error'].median()
        order = median_abs_error.sort_values().index

        sns.boxplot(data=df_plot, x='genre', y='error', order=order,
                    ax=panel, palette='RdYlGn_r', showfliers=False, linewidth=1.5)
        panel.axhline(0, color='red', linestyle='--', lw=2, alpha=0.8)
        panel.set_xlabel('Genre', fontweight='bold')
        panel.set_ylabel('Prediction Error', fontweight='bold')
        panel.set_title(f'{target.capitalize()} - Error Distribution by Genre',
                        fontweight='bold', pad=12)
        panel.tick_params(axis='x', rotation=45)
        panel.grid(True, alpha=0.2, linestyle='--', axis='y')

    fig.suptitle('Experiment 2: Error Distribution by Genre',
                 fontsize=18, fontweight='bold', y=0.995)
    fig.tight_layout()
    figure_path = FIGURES_DIR / 'exp2_error_boxplot_by_genre.png'
    fig.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()
    print(f"Saved: {figure_path}")

## 4. Error Segmentation by Year

In [None]:
def analyze_error_by_year(df, y_true, y_pred, target_name):
    """Calculate error metrics by release year (binned into decades).

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata with a ``year``, ``release_year`` or ``Year`` column (first
        match wins); rows are paired with ``y_true`` by position.
    y_true, y_pred : array-like
        Actual and predicted values of equal length.
    target_name : str
        Unused; kept so all ``analyze_error_by_*`` helpers share a signature.

    Returns
    -------
    (analysis_df, decade_stats) or None
        ``analysis_df`` has one row per sample (year, decade, actual,
        predicted, abs_error, squared_error); rows with a missing or
        non-numeric year keep a missing decade. ``decade_stats`` is indexed by
        integer decade (e.g. 1990) with columns count, mae, mse, rmse; samples
        without a usable year are excluded from it. Returns ``None`` (after
        printing a message) when no year column exists.
    """
    year_col = next((c for c in ('year', 'release_year', 'Year') if c in df.columns), None)
    if year_col is None:
        print("Year column not found")
        return None

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Years are matched to targets purely by row position.
    if len(df) != len(y_true):
        print(f"WARNING: metadata has {len(df)} rows but {len(y_true)} targets; "
              f"years are paired by position and may be misaligned")

    # Coerce to numeric so string or otherwise malformed years become NaN
    # instead of raising on the floor division below.
    years = pd.to_numeric(pd.Series(df[year_col].values[:len(y_true)]), errors='coerce')
    decades = (years // 10) * 10

    error = y_true - y_pred
    analysis_df = pd.DataFrame({
        'year': years,
        'decade': decades,
        'actual': y_true,
        'predicted': y_pred,
        'abs_error': np.abs(error),
        'squared_error': error ** 2
    })

    # Aggregate by decade (groupby drops rows whose decade is missing)
    decade_stats = analysis_df.groupby('decade').agg({
        'year': 'count',
        'abs_error': 'mean',
        'squared_error': 'mean'
    }).rename(columns={'year': 'count', 'abs_error': 'mae', 'squared_error': 'mse'})
    # A single missing year turns the whole column into floats; cast the group
    # keys back to int so decades render as "1990" rather than "1990.0".
    decade_stats.index = decade_stats.index.astype(int)
    decade_stats['rmse'] = np.sqrt(decade_stats['mse'])

    return analysis_df, decade_stats

In [None]:
# Per-target decade breakdown: sample-level frames and per-decade summary tables.
year_analysis, year_stats = {}, {}

for target in TARGETS:
    if target not in predictions:
        continue
    result = analyze_error_by_year(df_test, y_test[target], predictions[target], target)
    if result is None:
        continue  # no year column; the helper already printed a message
    year_analysis[target], year_stats[target] = result
    print(f"\n{target.upper()} - Error by Decade:")
    print(year_stats[target].to_string())

In [None]:
# RMSE per decade, one bar chart per target, annotated with the sample count.
if year_stats:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    axes = axes.flatten()

    for i, target in enumerate(TARGETS):
        if target not in year_stats:
            continue
        panel = axes[i]
        # Only decades backed by at least 100 test songs are shown.
        stats = year_stats[target]
        stats = stats.loc[stats['count'] >= 100]

        bars = panel.bar(stats.index.astype(str), stats['rmse'],
                         color='#3498db', edgecolor='white', alpha=0.85, linewidth=1.5)
        panel.set_xlabel('Decade', fontweight='bold')
        panel.set_ylabel('RMSE', fontweight='bold')
        panel.set_title(f'{target.capitalize()} - RMSE by Decade',
                        fontweight='bold', pad=12)
        panel.tick_params(axis='x', rotation=45)
        panel.grid(True, alpha=0.2, linestyle='--', axis='y')

        # Vertical "n=..." label slightly above each bar.
        for j, (count, rmse) in enumerate(zip(stats['count'], stats['rmse'])):
            panel.text(j, rmse + 0.008, f"n={int(count)}",
                       ha='center', fontsize=10, rotation=90, fontweight='bold')

    fig.suptitle('Experiment 2: Error by Decade',
                 fontsize=18, fontweight='bold', y=0.995)
    fig.tight_layout()
    figure_path = FIGURES_DIR / 'exp2_error_by_decade.png'
    fig.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()
    print(f"Saved: {figure_path}")

## 5. Error Segmentation by Target Range

In [None]:
def analyze_error_by_target_range(y_true, y_pred, target_name, n_bins=5):
    """Analyze errors across different ranges of the target variable.

    Samples are assigned to ``n_bins`` percentile-based bins of the ACTUAL
    values, and error statistics are computed per bin.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of equal length.
    target_name : str
        Unused; kept so all ``analyze_error_by_*`` helpers share a signature.
    n_bins : int, default 5
        Number of percentile bins (5 gives quintiles).

    Returns
    -------
    (analysis_df, bin_stats)
        ``analysis_df`` has one row per sample (bin, bin_label, actual,
        predicted, error, abs_error). ``bin_stats`` is indexed by bin number
        with columns count, mean_actual, mean_error, mae, rmse, bin_label.
        Only bins that actually contain samples appear, so heavily tied
        targets (repeated percentile edges) can yield fewer than ``n_bins`` rows.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Bin edges at evenly spaced percentiles of the actual values
    bin_edges = np.percentile(y_true, np.linspace(0, 100, n_bins + 1))
    bin_labels = [f"{lo:.2f}-{hi:.2f}" for lo, hi in zip(bin_edges[:-1], bin_edges[1:])]

    # Digitize against the interior edges only, so the minimum lands in bin 0
    # and the maximum in bin n_bins - 1.
    bins = np.digitize(y_true, bin_edges[1:-1])

    error = y_true - y_pred
    analysis_df = pd.DataFrame({
        'bin': bins,
        'bin_label': [bin_labels[b] for b in bins],
        'actual': y_true,
        'predicted': y_pred,
        'error': error,
        'abs_error': np.abs(error)
    })

    # Aggregate by bin
    bin_stats = analysis_df.groupby('bin').agg({
        'actual': ['count', 'mean'],
        'error': 'mean',
        'abs_error': 'mean'
    })
    bin_stats.columns = ['count', 'mean_actual', 'mean_error', 'mae']
    # Vectorized RMSE: grouped mean of squared error, then sqrt. This avoids
    # DataFrameGroupBy.apply, whose handling of grouping columns is deprecated
    # in recent pandas releases.
    squared_error = analysis_df['error'] ** 2
    bin_stats['rmse'] = np.sqrt(squared_error.groupby(analysis_df['bin']).mean())

    # Map bin labels based on actual bin indices present in the data
    bin_stats['bin_label'] = [bin_labels[b] for b in bin_stats.index]

    return analysis_df, bin_stats

In [None]:
# Per-target breakdown of errors across quintiles of the actual target values.
range_analysis, range_stats = {}, {}

for target in TARGETS:
    if target not in predictions:
        continue
    analysis_df, stats = analyze_error_by_target_range(
        y_test[target], predictions[target], target, n_bins=5
    )
    range_analysis[target], range_stats[target] = analysis_df, stats

    print(f"\n{target.upper()} - Error by {target.capitalize()} Range:")
    shown_columns = ['count', 'mean_actual', 'mae', 'rmse', 'mean_error', 'bin_label']
    print(stats[shown_columns].to_string())

In [None]:
# Grouped bars (MAE next to RMSE) for every value-range bin, one panel per target.
fig, axes = plt.subplots(2, 2, figsize=(14, 11))
axes = axes.flatten()

for i, target in enumerate(TARGETS):
    if target not in range_stats:
        continue
    panel = axes[i]
    stats = range_stats[target]

    x = range(len(stats))
    width = 0.35
    # MAE bars sit half a bar-width left of each tick, RMSE bars half a width right.
    left_positions = [pos - width/2 for pos in x]
    right_positions = [pos + width/2 for pos in x]

    bars1 = panel.bar(left_positions, stats['mae'], width,
                      label='MAE', color='#3498db', alpha=0.85,
                      edgecolor='white', linewidth=1.5)
    bars2 = panel.bar(right_positions, stats['rmse'], width,
                      label='RMSE', color='#e74c3c', alpha=0.85,
                      edgecolor='white', linewidth=1.5)

    panel.set_xticks(x)
    panel.set_xticklabels(stats['bin_label'], rotation=35, ha='right')
    panel.set_xlabel(f'{target.capitalize()} Range', fontweight='bold')
    panel.set_ylabel('Error', fontweight='bold')
    panel.set_title(f'{target.capitalize()} - Error by Value Range',
                    fontweight='bold', pad=12)
    panel.legend(frameon=True, fancybox=False, edgecolor='black', framealpha=0.9)
    panel.grid(True, alpha=0.2, linestyle='--', axis='y')

fig.suptitle('Experiment 2: Error by Target Value Range',
             fontsize=18, fontweight='bold', y=0.995)
fig.tight_layout()
figure_path = FIGURES_DIR / 'exp2_error_by_target_range.png'
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()
print(f"Saved: {figure_path}")

In [None]:
# Mean signed error per value-range bin: reveals over/under-prediction at the extremes.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for i, target in enumerate(TARGETS):
    if target not in range_stats:
        continue
    panel = axes[i]
    stats = range_stats[target]
    positions = range(len(stats))

    # Red bars: negative mean error (over-prediction); green bars: everything else.
    colors = np.where(stats['mean_error'] < 0, '#e74c3c', '#27ae60').tolist()

    bars = panel.bar(positions, stats['mean_error'],
                     color=colors, alpha=0.85, edgecolor='white', linewidth=1.5)
    panel.axhline(0, color='black', linestyle='-', lw=1.5)
    panel.set_xticks(positions)
    panel.set_xticklabels(stats['bin_label'], rotation=35, ha='right')
    panel.set_xlabel(f'{target.capitalize()} Range', fontweight='bold')
    panel.set_ylabel('Mean Error (Actual - Predicted)', fontweight='bold')
    panel.set_title(f'{target.capitalize()} - Prediction Bias by Range\n(Positive = Under-prediction)',
                    fontweight='bold', pad=12)
    panel.grid(True, alpha=0.2, linestyle='--', axis='y')

    # Value label at the tip of each bar: above positive bars, below the rest.
    for bar in bars:
        height = bar.get_height()
        anchor = 'bottom' if height > 0 else 'top'
        panel.text(bar.get_x() + bar.get_width()/2., height,
                   f'{height:.3f}', ha='center', va=anchor,
                   fontsize=10, fontweight='bold')

fig.suptitle('Experiment 2: Prediction Bias Analysis',
             fontsize=18, fontweight='bold', y=0.995)
fig.tight_layout()
figure_path = FIGURES_DIR / 'exp2_error_bias_by_range.png'
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()
print(f"Saved: {figure_path}")

## 6. Failure Case Analysis

In [None]:
def identify_worst_predictions(df, y_true, y_pred, target_name, n_worst=20):
    """Identify and analyze the worst predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Song metadata; rows are paired with ``y_true`` by position.
    y_true, y_pred : array-like
        Actual and predicted values of equal length.
    target_name : str
        Unused; kept for a signature parallel to the other analysis helpers.
    n_worst : int, default 20
        Number of rows to return. Zero or a negative value yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        Up to ``n_worst`` rows ordered by absolute error, largest first, with
        columns id, actual, predicted, error, abs_error followed by whichever
        of name, track_name, title, artist, genre, year exist in ``df``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Clamp explicitly: a ``[-n_worst:]`` slice with n_worst == 0 would select
    # EVERY row rather than none.
    n_worst = max(int(n_worst), 0)

    abs_errors = np.abs(y_true - y_pred)
    # Stable sort makes the ordering of tied errors deterministic across runs.
    worst_idx = np.argsort(abs_errors, kind='stable')[::-1][:n_worst]

    # Get song info for worst predictions
    worst_df = df.iloc[worst_idx].copy()

    # Prefer an explicit id column if present, otherwise use the original index
    id_col = next((c for c in ['id', 'track_id', 'song_id'] if c in df.columns), None)
    if id_col:
        worst_df['id'] = worst_df[id_col].values
    else:
        worst_df['id'] = worst_df.index  # fallback to the DataFrame index

    worst_df['actual'] = y_true[worst_idx]
    worst_df['predicted'] = y_pred[worst_idx]
    worst_df['error'] = y_true[worst_idx] - y_pred[worst_idx]
    worst_df['abs_error'] = abs_errors[worst_idx]

    # Select relevant columns: fixed error columns plus any available metadata
    cols_to_show = ['id', 'actual', 'predicted', 'error', 'abs_error']
    cols_to_show += [col for col in ['name', 'track_name', 'title', 'artist', 'genre', 'year']
                     if col in worst_df.columns]

    return worst_df[cols_to_show]

In [None]:
# Collect and print the 15 largest absolute errors for every target.
worst_predictions = {}

for target in TARGETS:
    if target not in predictions:
        continue
    worst_df = identify_worst_predictions(
        df_test, y_test[target], predictions[target], target, n_worst=15
    )
    worst_predictions[target] = worst_df

    rule = '=' * 60
    print(f"\n{rule}")
    print(f"{target.upper()} - WORST PREDICTIONS (Top 15)")
    print(f"{rule}")
    # Genre is appended only when the metadata provides it.
    display_cols = ['id', 'actual', 'predicted', 'abs_error']
    display_cols += ['genre'] if 'genre' in worst_df.columns else []
    print(worst_df[display_cols].head(15).to_string())

In [None]:
# Analyze characteristics of worst predictions
def analyze_failure_characteristics(worst_df, all_df, target_name, y_true_all=None):
    """Compare characteristics of worst predictions vs overall dataset.

    Prints (returns nothing):
      * the genre share among the worst predictions next to the overall genre
        share, flagging genres whose failure share exceeds 1.5x their overall
        share (only when both frames have a ``genre`` column);
      * mean and standard deviation of the actual target among the failures,
        and among all samples when ``y_true_all`` is supplied.

    Parameters
    ----------
    worst_df : pandas.DataFrame
        Worst-prediction rows; must contain an ``actual`` column.
    all_df : pandas.DataFrame
        Metadata for the full evaluation set.
    target_name : str
        Target name used in the printed headings.
    y_true_all : array-like, optional
        Actual target values for the full evaluation set. Without it the
        "overall" half of the comparison cannot be computed and is omitted.
    """
    print(f"\n{target_name.upper()} - Failure Case Characteristics:")

    # Genre distribution in failures vs overall
    if 'genre' in worst_df.columns and 'genre' in all_df.columns:
        print("\nGenre distribution in worst predictions:")
        failure_genres = worst_df['genre'].value_counts(normalize=True)
        overall_genres = all_df['genre'].value_counts(normalize=True)

        # Genres absent from the failures get 0% after the outer alignment.
        comparison = pd.DataFrame({
            'Failures (%)': (failure_genres * 100).round(1),
            'Overall (%)': (overall_genres * 100).round(1)
        }).fillna(0)
        comparison['Overrepresented'] = comparison['Failures (%)'] > comparison['Overall (%)'] * 1.5
        print(comparison.to_string())

        overrep_genres = comparison[comparison['Overrepresented']].index.tolist()
        if overrep_genres:
            print(f"\nOverrepresented genres in failures: {overrep_genres}")

    # Actual value distribution in failures (and overall, when available)
    print(f"\nActual {target_name} in failures vs overall:")
    print(f"  Failures: mean={worst_df['actual'].mean():.3f}, std={worst_df['actual'].std():.3f}")
    if y_true_all is not None:
        # pandas std (sample std) keeps this comparable with the failures line.
        overall_actual = pd.Series(np.asarray(y_true_all, dtype=float))
        print(f"  Overall:  mean={overall_actual.mean():.3f}, std={overall_actual.std():.3f}")

for target in TARGETS:
    if target in worst_predictions:
        analyze_failure_characteristics(worst_predictions[target], df_test, target,
                                        y_true_all=y_test[target])

## 7. Summary Statistics

In [None]:
# One summary row per target that has predictions: fit metrics plus residual statistics.
summary_data = []

for target in TARGETS:
    if target not in predictions:
        continue
    y_true, y_pred, res = y_test[target], predictions[target], residuals[target]

    row = {
        'Target': target.capitalize(),
        'Model': BEST_MODELS[target],
        'R²': r2_score(y_true, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'MAE': mean_absolute_error(y_true, y_pred),
        'Mean Error': res.mean(),
        'Std Error': res.std(),
        'Max |Error|': np.abs(res).max(),
        # Skewness of the residual distribution as computed by pandas.
        'Error Skewness': pd.Series(res).skew(),
    }
    summary_data.append(row)

summary_df = pd.DataFrame(summary_data)
print("\n" + "="*90)
print("EXPERIMENT 2 - ERROR ANALYSIS SUMMARY (WITH ARTIST FEATURES)")
print("="*90)
print(summary_df.round(4).to_string(index=False))
print("="*90)

In [None]:
# Persist the summary table under results/metrics/experiment2_with_artist/.
results_dir = PROJECT_ROOT.joinpath('results', 'metrics', 'experiment2_with_artist')
results_dir.mkdir(parents=True, exist_ok=True)
summary_path = results_dir / 'error_analysis_summary.csv'
summary_df.to_csv(summary_path, index=False)
print(f"\nSaved summary to: {summary_path}")

## 8. Key Findings

In [None]:

# Closing report: list every exp2_error_*.png found in the figures folder.
print("\nFigures saved to:", FIGURES_DIR)
print("\nFiles created:")
for figure_file in sorted(FIGURES_DIR.glob('exp2_error_*.png')):
    print(f"  {figure_file.name}")

banner = "=" * 90
print("\n" + banner)
print("ERROR ANALYSIS COMPLETE - Ready for thesis integration")
print(banner)