In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from datetime import datetime
from pathlib import Path
from IPython.display import display
import warnings

# Silence all library warnings so the rendered notebook output stays readable.
warnings.filterwarnings('ignore')

# Base look: seaborn "paper" style sheet, enlarged context fonts, "deep" palette.
plt.style.use('seaborn-v0_8-paper')
sns.set_context("paper", font_scale=1.6)
sns.set_palette("deep")

# Publication-quality matplotlib defaults (bold, larger fonts), applied in one
# batch. These come after the seaborn calls so they take precedence over any
# font sizes seaborn's context has just set.
plt.rcParams.update({
    'figure.figsize': (14, 7),
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'DejaVu Serif'],
    'font.size': 13,
    'axes.labelsize': 15,
    'axes.titlesize': 17,
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
    'xtick.labelsize': 13,
    'ytick.labelsize': 13,
    'legend.fontsize': 12,
    'legend.frameon': True,
    'legend.edgecolor': 'black',
    'legend.title_fontsize': 13,
    'axes.linewidth': 1.5,
    'grid.alpha': 0.3,
    'grid.linestyle': '--',
    'xtick.major.width': 1.2,
    'ytick.major.width': 1.2,
})

# pandas display: show every column, four decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

for status_line in (
    "Libraries loaded successfully!",
    "Enhanced academic publication style configured",
    "Experiment 2: WITH Artist Features (414 features)",
):
    print(status_line)

In [None]:
# Paths for Experiment 2. The project root is taken to be the parent of the
# current working directory.
PROJECT_ROOT = Path.cwd().parent
_EXPERIMENT_NAME = "experiment2_with_artist"

FEATURES_DIR = PROJECT_ROOT.joinpath("features")
MODELS_DIR = PROJECT_ROOT.joinpath("models", "saved", _EXPERIMENT_NAME)
RESULTS_DIR = PROJECT_ROOT.joinpath("results", "metrics", _EXPERIMENT_NAME)
FIGURES_DIR = PROJECT_ROOT.joinpath("results", "figures")

# Regression targets analysed in this notebook.
TARGETS = ['valence', 'energy', 'danceability', 'popularity']

# Every base model appears twice: as-is and with the "_tuned" suffix,
# in the order base, base_tuned, next base, ...
_BASE_MODELS = (
    'CatBoost', 'LightGBM', 'XGBoost',
    'ExtraTrees', 'MLPRegressor', 'RandomForest',
)
SELECTED_MODELS = [
    variant
    for base in _BASE_MODELS
    for variant in (base, f"{base}_tuned")
]

print(f"Models directory: {MODELS_DIR}")
print(f"Results directory: {RESULTS_DIR}")
print(f"Features directory: {FEATURES_DIR}")
print(f"Targets: {TARGETS}")
print(f"Selected models: {len(SELECTED_MODELS)} models")

---
## 📋 Load Feature Names

In [None]:
def load_feature_names(features_dir=None):
    """Build the ordered list of feature names for Experiment 2.

    The list is the concatenation of four groups, in this order:
    audio features (read from ``audio_feature_names.txt`` when present,
    otherwise a built-in default list of 23 names), 5 text statistics,
    2 sentiment scores and 384 embedding dimensions (``emb_0`` .. ``emb_383``).

    Parameters
    ----------
    features_dir : path-like, optional
        Directory that may contain ``audio_feature_names.txt``. Defaults to
        the notebook-level ``FEATURES_DIR``.

    Returns
    -------
    list of str
        Feature names; 414 entries when the default audio names are used.
    """
    base_dir = FEATURES_DIR if features_dir is None else Path(features_dir)

    # Audio features (23 for Experiment 2 - includes 2 artist features)
    audio_names_path = base_dir / "audio_feature_names.txt"
    if audio_names_path.exists():
        # Explicit encoding so the result does not depend on the platform default.
        with open(audio_names_path, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            # Blank lines and lines starting with '#' are skipped.
            audio_features = [name for name in stripped_lines
                              if name and not name.startswith('#')]
        print(f"Loaded {len(audio_features)} audio features from file")
    else:
        # Fallback: use standard Spotify audio feature names.
        # NOTE(review): this order (8 audio, 10 genre, year, mode, key_sin,
        # 2 artist) does not match the index layout in create_feature_groups()
        # (7 audio, year, mode, 2 key, 10 genre, 2 artist) - confirm which one
        # reflects the real training matrix before trusting fallback labels.
        audio_features = [
            'danceability', 'energy', 'loudness', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'tempo',
            'genre_Rock', 'genre_Pop', 'genre_Hip-Hop', 'genre_Electronic',
            'genre_Jazz', 'genre_Classical', 'genre_Country', 'genre_R&B',
            'genre_Indie', 'genre_Metal', 'year_normalized', 'mode',
            'key_sin', 'log_total_artist_followers', 'avg_artist_popularity'
        ]
        print("Audio feature file not found, using default names")

    # Text stats (5)
    text_stats = ['word_count', 'unique_word_count', 'unique_ratio', 'avg_word_length', 'char_count']

    # Sentiment (2)
    sentiment = ['sentiment_polarity', 'sentiment_subjectivity']

    # Embeddings (384)
    embedding_names = [f'emb_{i}' for i in range(384)]

    return [*audio_features, *text_stats, *sentiment, *embedding_names]

# Load the names once; later cells reuse `feature_names`.
feature_names = load_feature_names()
print(f"Total features loaded: {len(feature_names)}")

# Preview the leading entries (zip stops after 32 names or at the end of the list).
print("\nFirst 32 features (including artist features):")
for position, label in zip(range(32), feature_names):
    print(f" {position:3d}. {label}")

---
## 🔧 Define Helper Functions

In [None]:
def get_feature_importance(model, model_name, feature_names):
    """Extract a normalized per-feature importance table from a fitted model.

    The first matching source is used:

    1. ``model.feature_importances_`` (method ``'feature_importances_'``);
    2. ``model.coefs_[0]``: absolute weights summed along axis 1
       (method ``'mlp_first_layer'``);
    3. ``model.coef_``: absolute coefficients, averaged over axis 0 when
       the array has more than one dimension (method ``'coef_'``).

    Parameters
    ----------
    model : object
        Fitted estimator exposing one of the attributes above.
    model_name : str
        Model label; accepted for interface compatibility, not used here.
    feature_names : sequence of str
        One name per model input feature, in input order.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``feature``, ``importance``, ``method``, sorted by descending
        importance. Importances are divided by their sum when that sum is
        positive. ``None`` when no importance source is available or when the
        number of importances does not match ``len(feature_names)``.
    """
    # Tree-based models with feature_importances_
    if hasattr(model, 'feature_importances_'):
        raw = model.feature_importances_
        method = 'feature_importances_'

    # MLPRegressor - use first layer weights
    elif hasattr(model, 'coefs_') and len(model.coefs_) > 0:
        # Sum absolute weights from input to first hidden layer
        raw = np.abs(np.asarray(model.coefs_[0])).sum(axis=1)
        method = 'mlp_first_layer'

    # Linear models with coef_
    elif hasattr(model, 'coef_'):
        raw = np.abs(np.asarray(model.coef_))
        if raw.ndim > 1:
            raw = raw.mean(axis=0)
        method = 'coef_'

    else:
        return None

    if raw is None:
        return None

    # Coerce to an ndarray so list-like or scalar importances support
    # .sum() and element-wise division below.
    importance = np.atleast_1d(np.asarray(raw))

    # Verify shape
    if len(importance) != len(feature_names):
        print(f"Shape mismatch: {len(importance)} vs {len(feature_names)}")
        return None

    # Normalize so importances are comparable across model families
    total = importance.sum()
    if total > 0:
        importance = importance / total

    return pd.DataFrame({
        'feature': feature_names,
        'importance': importance,
        'method': method
    }).sort_values('importance', ascending=False)

def analyze_model_importance(target, model_name, feature_names):
    """Load one saved model and return its feature-importance table.

    The model is read from ``MODELS_DIR / f"{model_name}_{target}.pkl"``.
    On success the table produced by ``get_feature_importance`` is returned
    with two extra columns, ``target`` and ``model``. ``None`` is returned
    when the file is missing, when no importance could be extracted, or when
    any exception is raised while loading or extracting (the exception is
    printed, not re-raised).
    """
    model_path = MODELS_DIR / f"{model_name}_{target}.pkl"

    # Guard clause: nothing to analyse without a saved model.
    if not model_path.exists():
        return None

    try:
        # NOTE: joblib.load unpickles the file; only load trusted model files.
        loaded_model = joblib.load(model_path)
        importance_df = get_feature_importance(loaded_model, model_name, feature_names)

        if importance_df is None:
            return None

        # Tag every row so tables from different models can be concatenated.
        importance_df['target'] = target
        importance_df['model'] = model_name
        return importance_df

    except Exception as exc:
        print(f"Error loading {model_name}: {exc}")
        return None

print("Helper functions defined")

---
## 🚀 Extract Feature Importance from All Models

In [None]:

print("Extracting feature importance...\n")

# One importance table per (target, model) pair that could be analysed.
all_importance = []

for target in TARGETS:
    print(f"\n{target.upper()}")

    for model_name in SELECTED_MODELS:
        result = analyze_model_importance(target, model_name, feature_names)
        if result is None:
            # Covers a missing model file, a load error, or no usable importances.
            print(f"   {model_name:20s} → Failed")
            continue

        all_importance.append(result)
        # Tables are sorted by descending importance, so row 0 is the top feature.
        top_feat = result.iloc[0]['feature']
        top_imp = result.iloc[0]['importance']
        print(f"   {model_name:20s} → {top_feat} ({top_imp:.3f})")

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with an actionable message instead.
if not all_importance:
    raise RuntimeError(
        f"No feature importance could be extracted for any model/target; "
        f"check that trained models exist in {MODELS_DIR}"
    )

# Combine all results
all_importance_df = pd.concat(all_importance, ignore_index=True)

print(f"\nExtracted importance for {len(all_importance)} model-target combinations")
print(f"Total data points: {len(all_importance_df):,}")

In [None]:
# Save raw data
# `timestamp` is reused by the summary cell so both files share one suffix.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
raw_path = RESULTS_DIR / f"feature_importance_raw_{timestamp}.csv"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the results directory exists before writing.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Saving to: {raw_path}")
print(f"DataFrame shape: {all_importance_df.shape}")
print(f"Directory exists: {RESULTS_DIR.exists()}")

all_importance_df.to_csv(raw_path, index=False)

# Verify file was saved
if raw_path.exists():
    file_size = raw_path.stat().st_size / 1024
    print(f"Saved raw importance data: {raw_path.name} ({file_size:.1f} KB)")
else:
    print("ERROR: File was not saved!")

---
## 📊 Top 20 Features per Target

In [None]:
print("TOP 20 FEATURES PER TARGET (averaged across models)\n")

summary_data = []
banner = '=' * 70

for target in TARGETS:
    # Mean importance of each feature over all models for this target,
    # keeping the 20 largest.
    target_df = all_importance_df.loc[all_importance_df['target'] == target]
    top20 = target_df.groupby('feature')['importance'].mean().nlargest(20)

    print(f"\n{banner}")
    print(target.upper())
    print(banner)

    # Ranks start at 1; the same ranked pairs feed the printout and the table.
    ranked = list(enumerate(top20.items(), start=1))
    for i, (feat, imp) in ranked:
        print(f"   {i:2d}. {feat:<30s} {imp:.4f}")
    summary_data.extend(
        {'target': target, 'rank': i, 'feature': feat, 'avg_importance': imp}
        for i, (feat, imp) in ranked
    )

# Persist the ranking next to the raw data, sharing its timestamp suffix.
summary_df = pd.DataFrame(summary_data)
summary_path = RESULTS_DIR / f"feature_importance_summary_{timestamp}.csv"
summary_df.to_csv(summary_path, index=False)
print(f"\nSaved summary: {summary_path.name}")

---
## 📈 Visualization 1: Top 20 Features per Target

In [None]:
def plot_top_features(all_importance, target, top_n=20):
    """Draw a horizontal bar chart of the ``top_n`` features for one target.

    Importance is averaged per feature over all rows of ``all_importance``
    whose ``target`` column equals ``target``; the largest ``top_n`` averages
    are plotted, most important at the top, each bar labelled with its value.
    Returns the matplotlib Figure (it is not saved or shown here).
    """
    per_target = all_importance[all_importance['target'] == target].copy()
    top_features = (
        per_target.groupby('feature')['importance']
        .mean()
        .sort_values(ascending=False)
        .head(top_n)
    )
    n_bars = len(top_features)
    positions = range(n_bars)

    fig, ax = plt.subplots(figsize=(14, 9))
    bars = ax.barh(
        positions,
        top_features.values,
        color=plt.cm.viridis(np.linspace(0, 1, n_bars)),
        edgecolor='black',
        alpha=0.85,
        linewidth=1.5,
    )

    # Feature names on the y axis, largest bar first.
    ax.set_yticks(positions)
    ax.set_yticklabels(top_features.index, fontweight='bold', fontsize=12)
    ax.invert_yaxis()

    ax.set_xlabel('Average Importance', fontsize=15, fontweight='bold', labelpad=10)
    ax.set_title(f'Experiment 2: Top {top_n} Features for {target.upper()}',
                 fontsize=17, fontweight='bold', pad=20)
    ax.grid(True, alpha=0.3, axis='x', linestyle='--', linewidth=1.2)

    # Open frame: hide top/right spines, thicken the remaining two.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    for side in ('left', 'bottom'):
        ax.spines[side].set_linewidth(1.5)
    ax.tick_params(axis='both', width=1.2)

    # Add value labels just past the end of each bar
    for bar, val in zip(bars, top_features.values):
        label_y = bar.get_y() + bar.get_height() / 2
        ax.text(val + 0.001, label_y, f'{val:.3f}',
                va='center', fontsize=11, fontweight='bold')

    plt.tight_layout()
    return fig

# Generate plots for all targets
# savefig does not create missing directories, so ensure the target exists.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

for target in TARGETS:
    fig = plot_top_features(all_importance_df, target, top_n=20)
    fig_path = FIGURES_DIR / f"experiment2_feature_importance_{target}.png"
    fig.savefig(fig_path, dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure once saved and shown so figures do not pile up in memory.
    plt.close(fig)
    print(f"Saved: {fig_path.name}")

---
## 📊 Visualization 2: Feature Group Importance

In [None]:
def create_feature_groups():
    """Return a mapping from feature-group name to its positional indices.

    Each group covers a contiguous run of positions in the 414-entry feature
    vector; together the groups span indices 0 through 413 with no gaps.
    """
    # (group name, first index, one-past-last index)
    layout = (
        ('audio_continuous', 0, 7),
        ('year', 7, 8),
        ('mode', 8, 9),
        ('key_cyclical', 9, 11),
        ('genre', 11, 21),
        ('artist', 21, 23),  # NEW: log_total_artist_followers + avg_artist_popularity
        ('text_stats', 23, 28),
        ('sentiment', 28, 30),
        ('embeddings', 30, 414),
    )
    return {name: list(range(start, stop)) for name, start, stop in layout}

def aggregate_by_group(importance_df, feature_names, groups=None):
    """Sum feature importance within each feature group.

    Parameters
    ----------
    importance_df : pandas.DataFrame
        Table with at least the columns ``feature`` and ``importance``.
    feature_names : sequence of str
        Feature names in positional order; group indices refer to it.
    groups : dict, optional
        Mapping ``group name -> list of positional indices``. Defaults to
        the layout returned by ``create_feature_groups()``.

    Returns
    -------
    pandas.Series
        Total importance per group, sorted in descending order. Indices
        outside ``feature_names`` are ignored, so a group with no valid
        index contributes 0.
    """
    if groups is None:
        groups = create_feature_groups()

    n_features = len(feature_names)
    group_importance = {}

    for group_name, indices in groups.items():
        # Translate positions to names; skip positions outside the list
        # (negative ones would otherwise silently wrap around).
        group_features = [feature_names[i] for i in indices if 0 <= i < n_features]
        mask = importance_df['feature'].isin(group_features)
        group_importance[group_name] = importance_df.loc[mask, 'importance'].sum()

    return pd.Series(group_importance).sort_values(ascending=False)

# Plot with enhanced styling
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D', '#28A745', '#6C757D', '#17A2B8', '#FFC107', '#FF69B4']

# Pin one colour to each group. Bars are re-sorted per target, so assigning
# colours by bar position would give the same group different colours in
# different panels.
group_colors = dict(zip(create_feature_groups(), colors))

for ax, target in zip(axes.flatten(), TARGETS):
    target_df = all_importance_df[all_importance_df['target'] == target]

    # Aggregate by group, one Series per model
    group_data = []
    for model in target_df['model'].unique():
        model_df = target_df[target_df['model'] == model]
        group_imp = aggregate_by_group(model_df, feature_names)
        group_data.append(group_imp)

    # Average across models (the DataFrame aligns the Series by group name)
    group_df = pd.DataFrame(group_data)
    avg_group = group_df.mean().sort_values(ascending=True)

    bars = ax.barh(range(len(avg_group)), avg_group.values,
                   color=[group_colors[name] for name in avg_group.index],
                   edgecolor='black', alpha=0.85, linewidth=1.5)
    ax.set_yticks(range(len(avg_group)))
    ax.set_yticklabels(avg_group.index, fontweight='bold', fontsize=12)
    ax.set_xlabel('Importance', fontsize=14, fontweight='bold', labelpad=8)
    ax.set_title(f'{target.upper()}', fontsize=15, fontweight='bold', pad=15)
    ax.grid(True, alpha=0.3, axis='x', linestyle='--', linewidth=1.2)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(1.5)
    ax.spines['bottom'].set_linewidth(1.5)
    ax.tick_params(axis='both', width=1.2)

    # Add value labels
    for bar, val in zip(bars, avg_group.values):
        ax.text(val + 0.005, bar.get_y() + bar.get_height()/2,
                f'{val:.2%}', va='center', fontsize=11, fontweight='bold')

fig.suptitle('Feature Group Importance by Target',
             fontsize=17, fontweight='bold', y=0.995)
fig.tight_layout()

# savefig does not create missing directories, so ensure the target exists.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
fig_path = FIGURES_DIR / "experiment2_feature_groups.png"
fig.savefig(fig_path, dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)
print(f"Saved: {fig_path.name}")

---
## 🔥 Visualization 3: Model Comparison Heatmaps

In [None]:
def _draw_importance_heatmap(ax, target_df, models, top_features, title):
    """Draw one feature-by-model importance heatmap for ``models`` on ``ax``."""
    panel_df = target_df[
        target_df['model'].isin(models) & target_df['feature'].isin(top_features)
    ]

    if panel_df.empty:
        # None of the requested models produced importances: show a
        # placeholder instead of failing on an empty pivot table.
        ax.text(0.5, 0.5, 'No models available', ha='center', va='center',
                transform=ax.transAxes, fontsize=14, fontweight='bold')
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_title(title, fontsize=16, fontweight='bold', pad=20)
        return

    # Rows follow the overall ranking; reindex tolerates features that are
    # missing from this subset (they appear as empty rows).
    pivot = panel_df.pivot(
        index='feature', columns='model', values='importance'
    ).reindex(top_features)

    sns.heatmap(pivot, annot=True, fmt='.3f', cmap='YlOrRd', ax=ax,
                linewidths=2, linecolor='white',
                cbar_kws={'label': 'Importance', 'shrink': 0.85},
                annot_kws={'fontsize': 10, 'weight': 'bold'})
    ax.set_title(title, fontsize=16, fontweight='bold', pad=20)
    ax.set_xlabel('Model', fontsize=14, fontweight='bold', labelpad=10)
    ax.set_ylabel('Feature', fontsize=14, fontweight='bold', labelpad=10)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontweight='bold', fontsize=12)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontweight='bold', fontsize=12)
    ax.tick_params(axis='both', width=1.2)


def plot_model_comparison(all_importance, target):
    """Compare per-model feature importance for one target in two heatmaps.

    The 15 features with the highest mean importance over all models of
    ``target`` form the rows. The left panel shows the default models and the
    right panel the ``_tuned`` models from ``SELECTED_MODELS``; names
    containing ``'MLPRegressor'`` are excluded from both panels. Returns the
    matplotlib Figure (it is not saved or shown here).
    """
    target_df = all_importance[all_importance['target'] == target].copy()

    # Separate tuned and default models (excluding MLPRegressor)
    tree_models = [m for m in SELECTED_MODELS if 'MLPRegressor' not in m]
    default_models = [m for m in tree_models if not m.endswith('_tuned')]
    tuned_models = [m for m in tree_models if m.endswith('_tuned')]

    # Get top 15 features overall
    top_features = target_df.groupby('feature')['importance'].mean().nlargest(15).index.tolist()

    # Create figure with 1 row, 2 columns - Enhanced sizing
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(22, 10))

    _draw_importance_heatmap(ax1, target_df, default_models, top_features,
                             f'{target.upper()} - Default Models')
    _draw_importance_heatmap(ax2, target_df, tuned_models, top_features,
                             f'{target.upper()} - Tuned Models')

    plt.suptitle(f'Experiment 2: Feature Importance Comparison - {target.upper()}',
                 fontsize=18, fontweight='bold', y=0.98)
    plt.tight_layout()
    return fig

# Generate heatmaps for all targets
# savefig does not create missing directories, so ensure the target exists.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

for target in TARGETS:
    fig = plot_model_comparison(all_importance_df, target)
    fig_path = FIGURES_DIR / f"experiment2_feature_importance_heatmap_{target}.png"
    fig.savefig(fig_path, dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure once saved and shown so figures do not pile up in memory.
    plt.close(fig)
    print(f"Saved: {fig_path.name}")

---
## 🔑 Key Insights: Embedding Contribution

In [None]:
print("EMBEDDING CONTRIBUTION ANALYSIS\n")
print("="*70)

# Names of the 384 embedding dimensions, matching the emb_<i> naming used
# when the feature-name list is built.
embedding_cols = [f'emb_{i}' for i in range(384)]

insights = []
for target in TARGETS:
    target_df = all_importance_df[all_importance_df['target'] == target]
    # Share of the pooled importance (all models of this target) that falls
    # on embedding features.
    emb_importance = target_df[target_df['feature'].isin(embedding_cols)]['importance'].sum()
    total_importance = target_df['importance'].sum()
    # Guard against division by zero when a target has no importance at all.
    emb_pct = (emb_importance / total_importance * 100) if total_importance > 0 else 0

    print(f"   {target.upper():15s} → Embeddings: {emb_pct:5.1f}% of total importance")
    insights.append({'target': target, 'embedding_contribution_pct': emb_pct})

insights_df = pd.DataFrame(insights)
display(insights_df.style.format({'embedding_contribution_pct': '{:.2f}%'}).background_gradient(
    subset=['embedding_contribution_pct'], cmap='YlOrRd'
))

---
## ✅ Summary

In [None]:
print("\n" + "="*70)
print("FEATURE IMPORTANCE ANALYSIS COMPLETE")
print("="*70)
# Report attempted and successful combinations separately: some model/target
# pairs may have failed, so the product is not necessarily len(all_importance).
n_attempted = len(SELECTED_MODELS) * len(TARGETS)
print(f"Models analyzed: {len(SELECTED_MODELS)} × {len(TARGETS)} targets = {n_attempted} combinations "
      f"({len(all_importance)} extracted successfully)")
print(f"Results saved to: {RESULTS_DIR}")
print(f"Figures saved to: {FIGURES_DIR}")