# Preprocessed Feature Files Analysis

This notebook analyzes all preprocessed `.npy` feature files to verify data quality and feature distributions.

**Features analyzed:**
- Audio features (23 including artist features)
- Text statistics (5 features)
- Sentiment (2 features)
- Embeddings (384 features)
- **Total: 414 features**

**Experiment 2 includes:**
- `log_total_artist_followers` (log-transformed)
- `avg_artist_popularity` (0-100 scale)


## 1. Setup and Configuration

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from pathlib import Path
from typing import Dict, Any

# Publication look: seaborn "paper" theme, enlarged fonts, "deep" palette
plt.style.use('seaborn-v0_8-paper')
sns.set_context("paper", font_scale=1.4)
sns.set_palette("deep")

# Matplotlib overrides layered on top of the theme. They are kept in one
# mapping so the whole figure style can be read (and tweaked) in one place.
publication_rc = {
    # Figure size and resolution (300 dpi both on screen and on disk)
    'figure.figsize': (12, 6),
    'figure.dpi': 300,
    'savefig.dpi': 300,
    # Serif typography; the second font is the fallback
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'DejaVu Serif'],
    'font.size': 11,
    # Axis titles and labels
    'axes.labelsize': 12,
    'axes.titlesize': 14,
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
    # Tick labels
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    # Legend box
    'legend.fontsize': 10,
    'legend.frameon': True,
    'legend.edgecolor': 'black',
    # Axes frame and grid
    'axes.linewidth': 1.2,
    'grid.alpha': 0.3,
    'grid.linestyle': '--',
}
for rc_key, rc_value in publication_rc.items():
    plt.rcParams[rc_key] = rc_value

%matplotlib inline

# Pandas console output: never truncate columns, allow 120-character rows
for option_name, option_value in (('display.max_columns', None), ('display.width', 120)):
    pd.set_option(option_name, option_value)

# Input directory with the preprocessed arrays, and the figure output
# directory (created on demand, including missing parents)
features_dir = Path('../features')
output_dir = Path('../results') / 'figures' / 'feature_eda'
output_dir.mkdir(parents=True, exist_ok=True)

# Audio column names for Experiment 2, assembled from three groups in the
# listed order: core audio descriptors, one-hot genres, artist features.
_CORE_AUDIO_NAMES = [
    'acousticness', 'instrumentalness', 'liveness', 'speechiness',
    'loudness', 'tempo', 'duration_ms', 'year', 'mode',
    'key_sin', 'key_cos',
]
_GENRE_LABELS = [
    'Blues', 'Classical', 'Country', 'Electronic', 'Folk',
    'Hip-Hop', 'Jazz', 'Pop', 'R&B', 'Rock',
]
_ARTIST_NAMES = ['log_total_artist_followers', 'avg_artist_popularity']  # new in Experiment 2

AUDIO_FEATURE_NAMES = [
    *_CORE_AUDIO_NAMES,
    *[f'genre_{label}' for label in _GENRE_LABELS],
    *_ARTIST_NAMES,
]

TEXT_STAT_NAMES = [
    'word_count',
    'unique_word_count',
    'unique_ratio',
    'avg_word_length',
    'char_count',
]

SENTIMENT_NAMES = ['sentiment_polarity', 'sentiment_subjectivity']

# Echo the resolved configuration so the notebook output documents it
print("Libraries imported and paths configured")
print("Academic publication style configured")
print(f"  Features directory: {features_dir.absolute()}")
print(f"  Output directory: {output_dir.absolute()}")
print("\nFeature names defined:")
print(f"  Audio features: {len(AUDIO_FEATURE_NAMES)} (includes 2 artist features)")
print(f"  Text statistics: {len(TEXT_STAT_NAMES)}")
print(f"  Sentiment features: {len(SENTIMENT_NAMES)}")

## 2. Feature File Inventory

List all available .npy and .pkl files in the features directory.

In [None]:
# Inventory of the serialized artifacts found in the features directory
features_dir = Path("../features/")
npy_files = sorted(features_dir.glob("*.npy"))
pkl_files = sorted(features_dir.glob("*.pkl"))

BYTES_PER_KB = 1024
BYTES_PER_MB = 1024**2
rule = "=" * 60

# Arrays are listed with their size in megabytes
print(f"NumPy arrays (.npy): {len(npy_files)}")
print(rule)
for npy_path in npy_files:
    print(f"  {npy_path.name:<35} {npy_path.stat().st_size / BYTES_PER_MB:>7.2f} MB")

# Pickled objects are listed with their size in kilobytes
print(f"\nPickle files (.pkl): {len(pkl_files)}")
print(rule)
for pkl_path in pkl_files:
    print(f"  {pkl_path.name:<35} {pkl_path.stat().st_size / BYTES_PER_KB:>7.2f} KB")

## 3. Load All Features

Load audio, text statistics, sentiment, embeddings, and target variables.

In [None]:
# Loaded matrices, addressed as features[<feature type>][<split>]
features = {}
SPLITS = ('train', 'val', 'test')

# One row per feature type:
#   (dict key and file-name suffix, label printed on success,
#    message printed when any of the three split files is missing)
feature_sources = (
    ('audio', 'Audio features', "Audio features not found"),
    ('text_stats', 'Text stats', "Text statistics not found"),
    ('sentiment', 'Sentiment', "Sentiment features not found"),
    ('embeddings', 'Embeddings',
     "Embeddings not found (run: python run_preprocessing.py --steps embeddings)"),
)

for source_key, source_label, missing_message in feature_sources:
    try:
        # All three splits must load; if one file is missing the feature
        # type is left out of `features` entirely.
        features[source_key] = {
            split: np.load(features_dir / f'X_{split}_{source_key}.npy')
            for split in SPLITS
        }
        print(f"{source_label}: {features[source_key]['train'].shape}")
    except FileNotFoundError:
        print(missing_message)

# Regression targets, addressed as targets[<target name>][<split>]
targets = {}
target_names = ['valence', 'energy', 'danceability', 'popularity']
for target in target_names:
    try:
        targets[target] = {
            split: np.load(features_dir / f'y_{split}_{target}.npy')
            for split in SPLITS
        }
    except FileNotFoundError:
        print(f"Target '{target}' not found")

print(f"\n{len(targets)} targets loaded: {list(targets)}")

## 4. Feature Dimensions Analysis

In [None]:
# Build one table row per loaded feature type with the shape of each split.
dims_columns = ['Feature Type', 'Train Shape', 'Val Shape', 'Test Shape', 'N Features']
dims_data = []
for feat_name, feat_dict in features.items():
    dims_data.append({
        'Feature Type': feat_name,
        'Train Shape': str(feat_dict['train'].shape),
        'Val Shape': str(feat_dict['val'].shape),
        'Test Shape': str(feat_dict['test'].shape),
        'N Features': feat_dict['train'].shape[1],
    })

# Explicit columns keep the table (and the 'N Features' sum below) valid even
# when no feature file could be loaded and `dims_data` is empty.
dims_df = pd.DataFrame(dims_data, columns=dims_columns)

print("="*80)
print("TABLE 1: Feature Matrix Dimensions")
print("="*80)
print(dims_df.to_string(index=False))
print("="*80)

print(f"\nTotal features available: {dims_df['N Features'].sum()}")

if features:
    # Sample counts are reported from the first feature type. Every feature
    # type should describe the same rows, so any disagreement is flagged.
    for split_label, split_key in (('Training', 'train'), ('Validation', 'val'), ('Test', 'test')):
        counts = {name: arrays[split_key].shape[0] for name, arrays in features.items()}
        n_samples = next(iter(counts.values()))
        print(f"{split_label} samples: {n_samples:,}")
        if len(set(counts.values())) > 1:
            print(f"  WARNING: {split_key} sample counts differ across feature types: {counts}")
else:
    # The loading cell tolerates missing files, so `features` may be empty.
    print("No feature files were loaded; sample counts unavailable")

## 5. Data Quality Checks

Check for NaN values, infinite values, and compute basic statistics.

In [None]:
def check_array_quality(arr: np.ndarray, name: str) -> Dict[str, Any]:
    """Summarize NaN/Inf counts and basic statistics for one array.

    Min/Max/Mean/Std are computed over the finite entries only. Otherwise a
    single NaN or Inf would turn every statistic into NaN/Inf, hiding the
    range of the remaining data exactly when a quality problem exists. For
    arrays without NaN/Inf the statistics are computed on ``arr`` itself.

    Args:
        arr: Numeric array to inspect (any shape).
        name: Label stored under the ``'Feature'`` key of the result.

    Returns:
        Dict with keys ``'Feature'``, ``'Shape'`` (stringified shape),
        ``'NaN Count'``, ``'Inf Count'``, ``'Min'``, ``'Max'``, ``'Mean'`` and
        ``'Std'``. The four statistics are ``None`` when the array has no
        finite entries (including when it is empty).
    """
    nan_mask = np.isnan(arr)
    inf_mask = np.isinf(arr)
    non_finite = nan_mask | inf_mask

    # Only copy/filter when needed so clean arrays give the exact same
    # statistics as calling the reductions on `arr` directly.
    values = arr[~non_finite] if non_finite.any() else arr
    has_values = values.size > 0

    return {
        'Feature': name,
        'Shape': str(arr.shape),
        'NaN Count': nan_mask.sum(),
        'Inf Count': inf_mask.sum(),
        'Min': values.min() if has_values else None,
        'Max': values.max() if has_values else None,
        'Mean': values.mean() if has_values else None,
        'Std': values.std() if has_values else None,
    }

# Run the quality check on every (feature type, split) array
quality_results = []
for feat_name, feat_dict in features.items():
    quality_results.extend(
        check_array_quality(split_array, f"{feat_name}_{split_label}")
        for split_label, split_array in feat_dict.items()
    )

quality_df = pd.DataFrame(quality_results)

# Columns shown in the printed table (the 'Shape' column is left out)
report_columns = ['Feature', 'NaN Count', 'Inf Count', 'Min', 'Max', 'Mean', 'Std']
rule = "=" * 80

print(rule)
print("TABLE 2: Data Quality Assessment")
print(rule)
print(quality_df[report_columns].to_string(index=False))
print(rule)

# Totals across all arrays decide between a warning and an all-clear line
total_nan = quality_df['NaN Count'].sum()
total_inf = quality_df['Inf Count'].sum()

nan_line = f"\nWARNING: {total_nan} NaN values detected!" if total_nan > 0 else "\nNo NaN values detected"
inf_line = f"WARNING: {total_inf} Inf values detected!" if total_inf > 0 else "No Inf values detected"
print(nan_line)
print(inf_line)

## 6. Feature Distributions

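Summarize per-feature statistics for each feature type and plot histograms of the first six features (feature types with more than 50 columns, such as the 384-dimensional embeddings, are summarized but not plotted).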

In [None]:
# Column names for each feature type (also used by later cells)
feature_names_map = {
    'audio': AUDIO_FEATURE_NAMES,
    'text_stats': TEXT_STAT_NAMES,
    'sentiment': SENTIMENT_NAMES,
    'embeddings': [f'emb_{i}' for i in range(384)]  # 384-d embeddings
}

for feat_name, feat_dict in features.items():
    print(f"\n{'='*80}")
    print(f"{feat_name.upper()} - Training Set Statistics")
    print('='*80)

    X = feat_dict['train']
    n_features = X.shape[1]

    # Resolve exactly one name per column: known names first, then generic
    # 'feature_<i>' placeholders for any column without a name. Without the
    # padding, an array wider than its name list makes the DataFrame below
    # fail with a length-mismatch error.
    known_names = feature_names_map.get(feat_name, [])
    feature_names = list(known_names[:n_features])
    feature_names += [f'feature_{i}' for i in range(len(feature_names), n_features)]

    # Summary statistics per feature (column-wise over the training rows)
    stats = pd.DataFrame({
        'feature': feature_names,
        'mean': X.mean(axis=0),
        'std': X.std(axis=0),
        'min': X.min(axis=0),
        'max': X.max(axis=0),
        'median': np.median(X, axis=0),
    })

    print(f"Features: {n_features}")
    print(f"Mean range: [{stats['mean'].min():.4f}, {stats['mean'].max():.4f}]")
    print(f"Std range:  [{stats['std'].min():.4f}, {stats['std'].max():.4f}]")

    # The full per-feature table is only printed for small feature sets
    if n_features <= 30:
        print(f"\nFirst {min(30, n_features)} features:")
        print(stats.head(30).to_string(index=False))

    # Histograms of the first (up to) six features; skipped for
    # high-dimensional feature types
    if n_features <= 50:
        n_plots = min(6, n_features)
        fig, axes = plt.subplots(2, 3, figsize=(18, 11))
        axes = axes.flatten()

        for i in range(n_plots):
            ax = axes[i]
            ax.hist(X[:, i], bins=50, alpha=0.8, edgecolor='black',
                    color='steelblue', linewidth=1.2)
            ax.set_title(feature_names[i], fontsize=17, fontweight='bold', pad=10)
            ax.set_xlabel('Value', fontsize=15, fontweight='bold')
            ax.set_ylabel('Frequency', fontsize=15, fontweight='bold')
            ax.grid(True, alpha=0.3, linestyle='--')
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.tick_params(axis='y', labelsize=12)
            ax.tick_params(axis='x', labelsize=12)

        # Hide unused subplots
        for i in range(n_plots, 6):
            axes[i].set_visible(False)

        fig.suptitle(f'{feat_name.upper()} - Feature Distributions',
                     fontsize=18, fontweight='bold')

        plt.tight_layout()
        plt.savefig(output_dir / f'{feat_name}_distributions.png', dpi=300, bbox_inches='tight')
        plt.show()
        # Release the 300-dpi figure so figures do not pile up in memory
        # across loop iterations.
        plt.close(fig)
        print(f"Saved to {feat_name}_distributions.png")

## 7. Target Distributions

Compare train/val/test distributions for all target variables.

In [None]:
# One subplot per target on a fixed 2x2 grid, overlaying the three splits
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Target Variables: Train/Validation/Test Distribution Comparison',
             fontsize=18, fontweight='bold', y=0.995)

colors = {'train': '#3498db', 'val': '#e74c3c', 'test': '#2ecc71'}
split_labels = (('train', 'Train'), ('val', 'Validation'), ('test', 'Test'))

for idx, (target_name, target_dict) in enumerate(targets.items()):
    y_train = target_dict['train']
    y_val = target_dict['val']
    y_test = target_dict['test']

    ax = axes[idx // 2, idx % 2]

    # Use ONE set of bin edges for all three splits. Binning each split
    # independently gives every histogram its own edges and bar widths, which
    # makes the overlaid bars impossible to compare.
    pooled = np.concatenate([np.ravel(y_train), np.ravel(y_val), np.ravel(y_test)])
    bin_edges = np.histogram_bin_edges(pooled, bins=50)

    # Overlapping histograms (raw counts, so larger splits have taller bars)
    for split_key, split_label in split_labels:
        ax.hist(target_dict[split_key], bins=bin_edges, alpha=0.6, label=split_label,
                edgecolor='black', color=colors[split_key], linewidth=1.2)

    ax.set_title(f'{target_name.capitalize()}', fontsize=15, fontweight='bold', pad=12)
    ax.set_xlabel('Value', fontsize=13, fontweight='bold')
    ax.set_ylabel('Frequency', fontsize=13, fontweight='bold')
    ax.legend(loc='best', frameon=True, shadow=True, edgecolor='black')
    ax.grid(True, alpha=0.3, linestyle='--')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Print statistics
    print(f"\n{target_name.upper()}:")
    print(f"  Train: μ={y_train.mean():.4f}, σ={y_train.std():.4f}, "
          f"range=[{y_train.min():.4f}, {y_train.max():.4f}]")
    print(f"  Val:   μ={y_val.mean():.4f}, σ={y_val.std():.4f}, "
          f"range=[{y_val.min():.4f}, {y_val.max():.4f}]")
    print(f"  Test:  μ={y_test.mean():.4f}, σ={y_test.std():.4f}, "
          f"range=[{y_test.min():.4f}, {y_test.max():.4f}]")

# Targets whose files were missing are absent from `targets`; hide their
# subplots instead of leaving empty axes in the saved figure.
for unused_ax in axes.flat[len(targets):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig(output_dir / 'target_distributions.png', dpi=300, bbox_inches='tight')
plt.show()
print("\nSaved to target_distributions.png")

## 8. Scalers and Transformers Inspection

Examine saved scaler/transformer objects and their properties.

In [None]:
# Display name -> file name of each fitted preprocessing object
scaler_files = {
    'audio_scaler': 'audio_scaler.pkl',
    'text_stats_scaler': 'text_stats_scaler.pkl',
    'sentiment_scaler': 'sentiment_scaler.pkl',
    'audio_power_transformer': 'audio_power_transformer.pkl',
}

# Fitted array attributes previewed by their first five entries, in print
# order; each is only shown when the loaded object has it.
previewed_attrs = (('mean_', 'Mean'), ('scale_', 'Scale'), ('var_', 'Variance'))

print("Scalers and Transformers")
print("=" * 80)

for scaler_name, filename in scaler_files.items():
    filepath = features_dir / filename
    if not filepath.exists():
        print(f"\n{scaler_name}: Not found")
        continue

    # NOTE: joblib.load unpickles arbitrary objects; only point this at
    # files produced by a trusted preprocessing run.
    scaler = joblib.load(filepath)
    print(f"\n{scaler_name}:")
    print(f"  Type: {type(scaler).__name__}")

    for attr_name, attr_label in previewed_attrs:
        if hasattr(scaler, attr_name):
            print(f"  {attr_label} (first 5): {getattr(scaler, attr_name)[:5]}")
    if hasattr(scaler, 'n_features_in_'):
        print(f"  N Features: {scaler.n_features_in_}")

## 9. Correlation Analysis

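Compute a single Pearson correlation matrix over the non-genre audio, text-statistic, and sentiment features (embeddings are excluded), and identify highly correlated feature pairs (|r| > 0.8).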

In [None]:
# Assemble one training matrix for the correlation analysis: audio columns
# without the one-hot genres, plus text statistics and sentiment.
rule = '=' * 80
print(f"\n{rule}")
print("COMBINED FEATURE CORRELATION ANALYSIS (excluding genres)")
print(rule)

combined_features = []  # column blocks, concatenated below
combined_names = []     # one name per column of the combined matrix

for feat_name, feat_dict in features.items():
    X = feat_dict['train']
    n_features = X.shape[1]

    # Known names for this feature type, or generic placeholders
    feature_names = feature_names_map.get(feat_name, [f'feature_{i}' for i in range(n_features)])

    # Embeddings are skipped: too high-dimensional for a readable matrix
    if feat_name == 'embeddings':
        continue

    if feat_name == 'audio':
        # Keep every audio column whose name is not a 'genre_*' indicator
        kept_columns = [i for i in range(n_features) if not feature_names[i].startswith('genre_')]
        audio_block = X[:, kept_columns]
        combined_features.append(audio_block)
        combined_names.extend(feature_names[i] for i in kept_columns)
        print(f"Audio features (non-genre): {audio_block.shape[1]} features")
    else:
        combined_features.append(X)
        combined_names.extend(feature_names[:n_features])
        print(f"{feat_name.capitalize()}: {n_features} features")

# Stack the column blocks side by side (rows are the training samples)
X_combined = np.concatenate(combined_features, axis=1)
print(f"\nCombined feature matrix: {X_combined.shape}")
print(f"Total features in correlation matrix: {len(combined_names)}")

# Pearson correlation between feature columns; np.corrcoef treats rows as
# variables, hence the transpose.
corr = np.corrcoef(X_combined.T)

# Heatmap appearance, grouped by concern
heatmap_options = dict(
    # Diverging colour scale fixed to the full correlation range
    cmap='RdBu_r', center=0, vmin=-1, vmax=1,
    # Square cells separated by thin white lines
    square=True, linewidths=0.8, linecolor='white',
    # Correlation value written inside every cell
    annot=True, fmt='.2f', annot_kws={"fontsize": 7, "weight": "bold"},
    cbar_kws={"shrink": 0.75, "label": "Pearson Correlation (r)"},
    xticklabels=combined_names,
    yticklabels=combined_names,
)

fig, ax = plt.subplots(figsize=(16, 14))
sns.heatmap(corr, ax=ax, **heatmap_options)

ax.set_title('Combined Feature Correlation Matrix (Audio + Text + Sentiment)',
             fontsize=18, fontweight='bold', pad=20)

# Slanted x labels and horizontal y labels keep the feature names legible
plt.xticks(rotation=45, ha='right', fontsize=9, fontweight='bold')
plt.yticks(rotation=0, fontsize=9, fontweight='bold')

plt.tight_layout()
heatmap_path = output_dir / 'combined_correlation_matrix.png'
plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')
plt.show()
print("\nSaved to combined_correlation_matrix.png")

# Report feature pairs whose absolute correlation exceeds 0.8
rule = '=' * 80
print(f"\n{rule}")
print("HIGHLY CORRELATED FEATURE PAIRS (|r| > 0.8)")
print(rule)

# Every unordered pair (i < j) once, in row-major order
n_names = len(combined_names)
index_pairs = ((i, j) for i in range(n_names) for j in range(i + 1, n_names))
high_corr = [
    (combined_names[i], combined_names[j], corr[i, j])
    for i, j in index_pairs
    if abs(corr[i, j]) > 0.8
]

if not high_corr:
    print("No highly correlated pairs found (|r| > 0.8)")
else:
    # Strongest relationships first, regardless of sign
    high_corr.sort(key=lambda pair: abs(pair[2]), reverse=True)
    print(f"Found {len(high_corr)} highly correlated pairs:\n")
    for name_a, name_b, r_value in high_corr:
        print(f"  {name_a:25s} ↔ {name_b:25s}: r = {r_value:+.3f}")

# Summary of the off-diagonal correlations (each pair counted once)
rule = '=' * 80
print(f"\n{rule}")
print("CORRELATION MATRIX STATISTICS")
print(rule)

# Strict upper triangle: excludes the diagonal and mirrored duplicates
upper_tri_corr = corr[np.triu_indices_from(corr, k=1)]
abs_upper = np.abs(upper_tri_corr)

print(f"  Mean absolute correlation: {abs_upper.mean():.3f}")
print(f"  Median absolute correlation: {np.median(abs_upper):.3f}")
print(f"  Max correlation: {upper_tri_corr.max():.3f}")
print(f"  Min correlation: {upper_tri_corr.min():.3f}")
print(f"  Pairs with |r| > 0.8: {len(high_corr)}")
for threshold in (0.5, 0.3):
    print(f"  Pairs with |r| > {threshold}: {(abs_upper > threshold).sum()}")


## 9b. Feature Importance by Variance

Rank features by their training-set variance (useful for dimensionality assessment). Variance is only a target-agnostic proxy here; it does not measure predictive importance.

In [None]:
# Analyze training-set variance for each feature type
for feat_name, feat_dict in features.items():
    X = feat_dict['train']
    n_features = X.shape[1]

    # Resolve exactly one name per column: known names first, then generic
    # 'feature_<i>' placeholders, so the DataFrame below cannot fail on a
    # length mismatch when an array is wider than its name list.
    known_names = feature_names_map.get(feat_name, [])
    feature_names = list(known_names[:n_features])
    feature_names += [f'feature_{i}' for i in range(len(feature_names), n_features)]

    print(f"\n{'='*80}")
    print(f"{feat_name.upper()} - Variance Analysis")
    print('='*80)

    # Column-wise variance over the training rows
    variances = X.var(axis=0)

    # Features ranked from highest to lowest variance
    var_df = pd.DataFrame({
        'feature': feature_names,
        'variance': variances,
    }).sort_values('variance', ascending=False)

    print(f"Variance range: [{variances.min():.6f}, {variances.max():.6f}]")
    print(f"Mean variance: {variances.mean():.6f}")

    if n_features <= 30:
        # Small feature sets: print the top of the ranking and plot all bars
        print("\nTop 10 features by variance:")
        print(var_df.head(10).to_string(index=False))

        # Plot variance - Academic style
        fig, ax = plt.subplots(figsize=(14, 7))
        positions = range(len(var_df))
        bars = ax.bar(positions, var_df['variance'].values,
                      color='steelblue', alpha=0.75, edgecolor='black', linewidth=1.2)

        # Label every bar with its feature name; without labels the ranking
        # cannot be read off the saved figure.
        ax.set_xticks(positions)
        ax.set_xticklabels(var_df['feature'], rotation=45, ha='right', fontsize=9)

        ax.set_xlabel('Feature (ranked by variance)', fontsize=13, fontweight='bold')
        ax.set_ylabel('Variance', fontsize=13, fontweight='bold')
        ax.set_title(f'{feat_name.upper()} - Feature Variance Ranking',
                     fontsize=16, fontweight='bold', pad=15)
        ax.grid(True, alpha=0.3, axis='y', linestyle='--')
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        # Highlight top 3
        for i in range(min(3, len(bars))):
            bars[i].set_color('#e74c3c')
            bars[i].set_alpha(0.9)

        plt.tight_layout()
        plt.savefig(output_dir / f'{feat_name}_variance.png', dpi=300, bbox_inches='tight')
        plt.show()
        # Release the 300-dpi figure so figures do not pile up in memory
        # across loop iterations.
        plt.close(fig)
        print(f"Saved to {feat_name}_variance.png")
    else:
        # For high-dimensional features (embeddings): report only how many
        # columns have near-zero variance
        low_var_count = (variances < 0.01).sum()
        print(f"Low variance features (var < 0.01): {low_var_count} ({low_var_count/n_features*100:.1f}%)")

In [None]:
# Final report: feature inventory, split sizes, files and figures on disk.

# Feature count this notebook expects for Experiment 2:
# 23 audio + 5 text statistics + 2 sentiment + 384 embedding dimensions.
EXPECTED_TOTAL_FEATURES = 414
rule = "=" * 80

print(rule)
print("FEATURE FILES EDA - SUMMARY REPORT (EXPERIMENT 2)")
print(rule)

total_features = sum(feat_dict['train'].shape[1] for feat_dict in features.values())
# Split sizes are read from the first loaded feature type (None if nothing loaded)
first_feature = next(iter(features.values()), None)
total_samples_train = first_feature['train'].shape[0] if first_feature is not None else 0

print("\nFEATURE INVENTORY")
print(rule)
print(f"  Total features: {total_features} ← Experiment 2 (includes artist features)")
for feat_name, feat_dict in features.items():
    n_feat = feat_dict['train'].shape[1]
    pct = (n_feat / total_features * 100) if total_features > 0 else 0
    print(f"  • {feat_name.capitalize():15s}: {n_feat:3d} features ({pct:5.1f}%)")

# Static description of what Experiment 2 adds (not derived from loaded data)
print("\nNEW FEATURES FOR EXPERIMENT 2:")
print(rule)
print("  Audio features: 21 → 23 (+2 artist features)")
print("    • log_total_artist_followers (log-transformed)")
print("    • avg_artist_popularity (0-100 scale)")
print("  Total features: 412 → 414 (+2)")

print("\nDATASET SPLITS")
print(rule)
if first_feature is not None:
    train_size = total_samples_train
    val_size = first_feature['val'].shape[0]
    test_size = first_feature['test'].shape[0]
    total_size = train_size + val_size + test_size

    if total_size > 0:
        print(f"  Training:   {train_size:7,} samples ({train_size/total_size*100:5.1f}%)")
        print(f"  Validation: {val_size:7,} samples ({val_size/total_size*100:5.1f}%)")
        print(f"  Test:       {test_size:7,} samples ({test_size/total_size*100:5.1f}%)")
        print(f"  {'─'*80}")
        print(f"  Total:      {total_size:7,} samples (100.0%)")
    else:
        # Avoid dividing by zero when every split is empty
        print("  All splits are empty (0 samples)")

print("\nFILES PROCESSED")
print(rule)
print(f"  NumPy arrays (.npy): {len(npy_files)}")
print(f"  Pickle files (.pkl): {len(pkl_files)}")

print("\nVISUALIZATIONS GENERATED")
print(rule)
# Counts every PNG currently in the output directory, which may include
# figures left over from earlier runs.
viz_files = sorted(output_dir.glob("*.png"))
print(f"  Total: {len(viz_files)} publication-quality figures")
for vf in viz_files:
    print(f"  • {vf.name}")

print("\nEXPERIMENT 2 STATUS:")
print(rule)
if total_features == EXPECTED_TOTAL_FEATURES:
    print("  Artist features preprocessed and scaled")
    print("  New splits created from songs.csv")
    print(f"  {total_features} total features ready for model training")
    print("  Next: Train enhanced models with artist features")
else:
    # Do not claim readiness when the loaded data does not match expectations
    print(f"  WARNING: loaded {total_features} features, "
          f"but Experiment 2 expects {EXPECTED_TOTAL_FEATURES}")
    print("  Check the FEATURE INVENTORY above for missing or outdated feature files")

print(f"\nAll visualizations saved to: {output_dir.absolute()}")
print("Resolution: 300 DPI (publication quality)")
print(rule)