In [12]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import randint, uniform, loguniform
import numpy as np
import matplotlib.pyplot as plt
import os
import warnings
from utils_io import load_step, save_step

# Session setup: suppress every warning category for the rest of the kernel
# session and make sure the directory used for saved figures exists.
warnings.filterwarnings('ignore')
os.makedirs('figures', exist_ok=True)

# Fetch the four named steps through the project's load_step helper.
# Targets first, then the scaled feature matrices.
y_train = load_step("y_train")
y_test = load_step("y_test")
X_train_scaled = load_step("X_train_scaled")
X_test_scaled = load_step("X_test_scaled")

# Section banner: a 60-character rule above and below the title.
print(
    "=" * 60,
    "XGBoost Hyperparameter Optimization for Spotify Popularity",
    "=" * 60,
    sep="\n",
)

# 1. Basic XGBoost Model (Baseline)
# Fits one regressor with fixed, untuned settings and scores it on the test
# split. The resulting xgb_model / y_pred_xgb / r2_xgb are read again by the
# tuning step as its reference point and fallback.
print("\n1. Training Basic XGBoost Model (Baseline)...")
try:
    xgb_model = xgb.XGBRegressor(
        **{
            "n_estimators": 100,
            "max_depth": 5,
            "learning_rate": 0.1,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "random_state": 42,
            "verbosity": 0,
        }
    )

    xgb_model.fit(X_train_scaled, y_train)
    y_pred_xgb = xgb_model.predict(X_test_scaled)

    # Test-set metrics for the baseline.
    mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
    rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
    r2_xgb = r2_score(y_test, y_pred_xgb)

    print(
        "\nBasic XGBoost Results:\n"
        f"  MAE:  {mae_xgb:.4f}\n"
        f"  RMSE: {rmse_xgb:.4f}\n"
        f"  R²:   {r2_xgb:.4f}"
    )
except Exception as exc:
    # Best-effort: report the failure and let the notebook continue.
    # Note that the baseline names above stay undefined in that case.
    print(f"Error in basic model: {exc}")

# 2. Advanced Hyperparameter Tuning with RandomizedSearch
# The search budget is defined once so the progress message below can never
# drift from the values actually handed to RandomizedSearchCV.
N_SEARCH_ITER = 200
N_CV_FOLDS = 5

print("\n2. Advanced Hyperparameter Tuning (RandomizedSearchCV)...")
print(f"   This will test {N_SEARCH_ITER} random combinations with {N_CV_FOLDS}-fold CV...")

try:
    # Parameter distributions sampled by the randomized search.
    # Lists are sampled uniformly; scipy distributions are sampled via .rvs().
    param_dist = {
        "max_depth": [2, 3, 4, 5, 6, 8],
        "min_child_weight": [1, 3, 5, 10, 20],
        "learning_rate": loguniform(0.01, 0.3),
        "subsample": uniform(0.6, 0.4),        # uniform(loc, scale) -> 0.6–1.0
        "colsample_bytree": uniform(0.6, 0.4), # uniform(loc, scale) -> 0.6–1.0
        "n_estimators": [200, 400, 600],
        "gamma": loguniform(1e-4, 10),
        "reg_alpha": loguniform(1e-4, 10),
        "reg_lambda": loguniform(1e-4, 10),
    }

    # NOTE(review): both the estimator and the search below request all cores
    # (n_jobs=-1). This may oversubscribe the CPU; consider n_jobs=1 on the
    # estimator and letting the search own the parallelism — confirm by timing.
    xgb_base = xgb.XGBRegressor(random_state=42, verbosity=0, n_jobs=-1)

    random_search = RandomizedSearchCV(
        xgb_base,
        param_dist,
        n_iter=N_SEARCH_ITER,
        cv=N_CV_FOLDS,           # k-fold cross-validation
        scoring='r2',
        n_jobs=-1,               # Parallel processing
        verbose=1,
        random_state=42
    )

    print("\nStarting RandomizedSearchCV...")
    random_search.fit(X_train_scaled, y_train)

    print(f"\nBest Parameters Found:")
    for param, value in random_search.best_params_.items():
        print(f"  {param}: {value}")

    print(f"\nBest CV R² Score: {random_search.best_score_:.4f}")

    # Evaluate the refit best estimator on the held-out test set
    best_xgb = random_search.best_estimator_
    y_pred_best_xgb = best_xgb.predict(X_test_scaled)

    mae_best = mean_absolute_error(y_test, y_pred_best_xgb)
    rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best_xgb))
    r2_best = r2_score(y_test, y_pred_best_xgb)

    print(f"\nTuned XGBoost Results (Test Set):")
    print(f"  MAE:  {mae_best:.4f}")
    print(f"  RMSE: {rmse_best:.4f}")
    print(f"  R²:   {r2_best:.4f}")

    # Calculate relative improvement over the baseline R². The baseline step
    # swallows its own errors, so r2_xgb may not exist; in that case skip the
    # comparison instead of discarding a perfectly good tuned model.
    try:
        baseline_r2 = r2_xgb
    except NameError:
        baseline_r2 = None

    if baseline_r2 is None:
        print("\nBaseline R² unavailable; skipping improvement calculation.")
    else:
        # Guard against division by zero when the baseline R² is exactly 0.
        improvement = ((r2_best - baseline_r2) / abs(baseline_r2)) * 100 if baseline_r2 != 0 else 0
        print(f"\n✓ Improvement over Baseline: {improvement:+.2f}%")

except Exception as e:
    print(f"Error in hyperparameter tuning: {e}")
    # Fall back to the baseline model so downstream cells still have a model.
    # If the baseline step failed too, its names are undefined; surface that
    # as one clear error instead of a bare NameError from this handler.
    try:
        best_xgb = xgb_model
        y_pred_best_xgb = y_pred_xgb
        r2_best = r2_xgb
    except NameError as missing:
        raise RuntimeError(
            "Hyperparameter tuning failed and no baseline model is available "
            f"to fall back on ({missing})."
        ) from e



XGBoost Hyperparameter Optimization for Spotify Popularity

1. Training Basic XGBoost Model (Baseline)...

Basic XGBoost Results:
  MAE:  14.1561
  RMSE: 17.9386
  R²:   0.2351

2. Advanced Hyperparameter Tuning (RandomizedSearchCV)...
   This will test 150 random combinations with 5-fold CV...

Starting RandomizedSearchCV...
Fitting 5 folds for each of 200 candidates, totalling 1000 fits

Best Parameters Found:
  colsample_bytree: 0.9201426031289446
  gamma: 0.001474213587072223
  learning_rate: 0.0627571303739234
  max_depth: 8
  min_child_weight: 3
  n_estimators: 600
  reg_alpha: 0.04621563586871436
  reg_lambda: 0.00418474274949478
  subsample: 0.7332007652232791

Best CV R² Score: 0.3358

Tuned XGBoost Results (Test Set):
  MAE:  12.4516
  RMSE: 16.6035
  R²:   0.3447

✓ Improvement over Baseline: +46.64%


In [13]:
# Render the gain-based feature importance of best_xgb as a styled horizontal
# bar chart and save it to figures/xgboost_feature_importance.png.
try:
    # Spotify Colors
    spotify_green = '#1DB954'
    spotify_dark = '#191414'

    fig, ax = plt.subplots(figsize=(10, 3.8))

    try:
        # Plot with custom styling. plot_importance draws onto `ax` and
        # returns that Axes; the name is kept for any later cell that reads it.
        importance_data = xgb.plot_importance(
            best_xgb,
            max_num_features=12,
            ax=ax,
            importance_type='gain',
            height=0.45
        )

        # Styling
        ax.set_facecolor('white')
        fig.patch.set_facecolor('white')

        # Labels (override the defaults set by plot_importance)
        ax.set_xlabel('Importance Score', fontsize=10, fontweight='bold', color=spotify_dark)
        ax.set_ylabel('Features', fontsize=10, fontweight='bold', color=spotify_dark)
        ax.set_title('Feature Importance Analysis', fontsize=11, fontweight='bold',
                     color=spotify_dark, pad=12)

        # Remove spines and grid
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_color('#E0E0E0')
        ax.spines['bottom'].set_color('#E0E0E0')
        ax.grid(False)

        # Hide the value labels XGBoost already drew; they are replaced
        # further down with labels rounded to 2 decimal places.
        for text in ax.texts:
            text.set_visible(False)

        # Bar colors with black borders. set_color affects face and edge,
        # so the edge colour must be set after it.
        bars = ax.patches
        for bar in bars:
            bar.set_color(spotify_green)
            bar.set_alpha(0.9)
            bar.set_edgecolor('black')
            bar.set_linewidth(1.2)

        # Format x-axis to 2 decimal places
        ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{x:.2f}'))

        # Remove y-axis ticks
        ax.tick_params(axis='y', left=False)
        ax.tick_params(axis='x', labelsize=9)

        # Add value labels on bars with 2 decimal places ONLY.
        # The gap is given in points rather than data units, so the label sits
        # just right of the bar whatever the magnitude of the gain values is.
        for bar in bars:
            width = bar.get_width()
            ax.annotate(
                f'{width:.2f}',
                xy=(width, bar.get_y() + bar.get_height() / 2),
                xytext=(4, 0),
                textcoords='offset points',
                ha='left', va='center', fontsize=8.5, fontweight='bold', color=spotify_dark
            )

        # Layout and export
        fig.subplots_adjust(left=0.35, right=0.92, top=0.92, bottom=0.1)
        fig.savefig('figures/xgboost_feature_importance.png', dpi=300, bbox_inches='tight',
                    facecolor='white', edgecolor='none')
        print("✓ Feature importance plot saved")
    finally:
        # Always release the figure, even when styling or saving fails.
        plt.close(fig)

except Exception as e:
    print(f"Error creating plot: {e}")





✓ Feature importance plot saved
