In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('dataset.csv')

# Audio descriptor columns used as model inputs
features = ['danceability', 'energy', 'loudness', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence',
            'tempo', 'duration_ms', 'explicit']

# Integer-encode the categorical columns.
# fit_transform refits the encoder on every call, so a single instance can be
# reused; after this block `le` holds the mapping of the last column only.
le = LabelEncoder()
df['key_encoded'] = le.fit_transform(df['key'])
df['mode_encoded'] = le.fit_transform(df['mode'])
df['time_signature_encoded'] = le.fit_transform(df['time_signature'])

features.extend(['key_encoded', 'mode_encoded', 'time_signature_encoded'])

# Define the design matrix and the regression target
X = df[features]
y = df['popularity']

# Train/test split (80/20, fixed seed).
# The split is done BEFORE imputation so that the held-out rows cannot
# influence the statistics used to fill missing values.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values: column means are estimated on the training rows only
# and then applied to both partitions.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Keep `X` NaN-free (as it was before this change) for any later cell that
# uses the full matrix; rows are put back in their original order.
X = pd.concat([X_train, X_test]).reindex(X.index)


# Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Predictions on the held-out split
y_pred_rf = rf_model.predict(X_test)

# Evaluation
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest - MSE: {mse_rf:.2f}, RÂ²: {r2_rf:.2f}")

Random Forest - MSE: 219.43, RÂ²: 0.56


In [5]:
# Baseline XGBoost regressor: 100 trees, fixed seed, all available cores.
# fit() returns the estimator itself, so construction and training are chained.
xgb_model = XGBRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Predict on the held-out split
y_pred_xgb = xgb_model.predict(X_test)

# Score the predictions (R² and MSE are independent of each other)
r2_xgb = r2_score(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)

print(f"XGBoost - MSE: {mse_xgb:.2f}, RÂ²: {r2_xgb:.2f}")

XGBoost - MSE: 362.34, RÂ²: 0.27


In [20]:
from sklearn.model_selection import RandomizedSearchCV


def _fraction_window(center, half_width=0.1, n_points=5, low=0.1, high=1.0):
    """Return candidate sampling fractions around ``center``, kept inside [low, high].

    ``subsample`` and ``colsample_bytree`` are fractions and must not exceed 1.
    A plain ``center +/- half_width`` window overshoots to 1.1 when the best
    phase-1 value is 1.0, so the grid is clipped; ``np.unique`` then removes the
    duplicates that clipping creates.
    """
    grid = np.linspace(center - half_width, center + half_width, n_points)
    return np.unique(np.clip(grid, low, high))


# Phase 1: broad search
print("=== PHASE 1: Recherche large ===")
phase1_params = {
    'max_depth': [3, 5, 7, 9, 11],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [500, 1000, 1500],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

phase1_search = RandomizedSearchCV(
    XGBRegressor(random_state=42),
    phase1_params,
    n_iter=50,
    cv=3,  # quick CV for phase 1
    scoring='r2',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

phase1_search.fit(X_train, y_train)
best_phase1 = phase1_search.best_params_

# Phase 2: refinement around the best phase-1 parameters
print("\n=== PHASE 2: Affinage ===")
phase2_params = {
    # Tree depth is floored at 1 so the "-1" neighbour is always a valid depth.
    'max_depth': [max(1, best_phase1['max_depth']-1), best_phase1['max_depth'], best_phase1['max_depth']+1],
    'learning_rate': np.linspace(best_phase1['learning_rate']*0.5, best_phase1['learning_rate']*1.5, 5),
    'n_estimators': [best_phase1['n_estimators']-200, best_phase1['n_estimators'], best_phase1['n_estimators']+200],
    # Fractions are clipped to stay <= 1.0 (see _fraction_window).
    'subsample': _fraction_window(best_phase1['subsample']),
    'colsample_bytree': _fraction_window(best_phase1['colsample_bytree']),
    'reg_alpha': [0, 0.1, 0.5, 1, 2],
    'reg_lambda': [0.5, 1, 2, 3]
}

phase2_search = RandomizedSearchCV(
    XGBRegressor(random_state=42),
    phase2_params,
    n_iter=100,
    cv=5,  # more robust CV for the refinement
    scoring='r2',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

phase2_search.fit(X_train, y_train)

# best_estimator_ is the model refit with the best phase-2 parameters
print(f"RÂ² final (test): {r2_score(y_test, phase2_search.best_estimator_.predict(X_test)):.4f}")

=== PHASE 1: Recherche large ===
Fitting 3 folds for each of 50 candidates, totalling 150 fits

=== PHASE 2: Affinage ===
Fitting 5 folds for each of 100 candidates, totalling 500 fits


KeyboardInterrupt: 

In [6]:
# More complete version: exhaustive hyper-parameter search
from sklearn.model_selection import GridSearchCV

# Candidate values for XGBoost (2 x 2 x 2 x 2 = 16 combinations)
xgb_param_grid = dict(
    max_depth=[3, 6],
    learning_rate=[0.01, 0.1],
    n_estimators=[100, 200],
    subsample=[0.8, 1.0],
)

# 5-fold cross-validated grid search scored on R²; fit() returns the search
# object, so it is chained onto the constructor.
xgb_grid = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=xgb_param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# best_score_ is the mean cross-validated R² of the best combination
print(f"Meilleurs paramÃ¨tres XGB: {xgb_grid.best_params_}")
print(f"Meilleur score XGB: {xgb_grid.best_score_:.2f}")

Meilleurs paramÃ¨tres XGB: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}
Meilleur score XGB: 0.24


In [None]:
# Advanced randomized search for XGBoost
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import time

# Search space; the learning rate is sampled from 15 log-spaced values
# between 1e-3 and 1.
xgb_param_dist = dict(
    max_depth=[2, 3, 4, 5, 6, 7, 8, 9],
    learning_rate=np.logspace(-3, 0, 15),
    n_estimators=[50, 100, 200, 300, 500, 800, 1000],
    subsample=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    reg_alpha=[0, 0.001, 0.01, 0.1, 0.5, 1, 2, 5],
    reg_lambda=[0.001, 0.01, 0.1, 0.5, 1, 1.5, 2, 5],
)

# 100 sampled candidates, each evaluated with 5-fold CV on R²
random_search_advanced = RandomizedSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_distributions=xgb_param_dist,
    n_iter=100,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# Wall-clock timing of the whole search
start_time = time.time()
random_search_advanced.fit(X_train, y_train)
end_time = time.time()

print(f"Temps d'exÃ©cution: {(end_time - start_time)/60:.2f} minutes")

# Report the winning configuration, one parameter per line
print("\nMeilleurs paramÃ¨tres:")
best_params = random_search_advanced.best_params_
for param, value in best_params.items():
    print(f"{param}: {value}")

# Compare the cross-validated score with the score on the held-out split
y_pred_advanced = random_search_advanced.best_estimator_.predict(X_test)
r2_advanced = r2_score(y_test, y_pred_advanced)

print(f"\nRÂ² validation croisÃ©e: {random_search_advanced.best_score_:.4f}")
print(f"RÂ² jeu de test: {r2_advanced:.4f}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Temps d'exÃ©cution: 4.35 minutes

Meilleurs paramÃ¨tres:
subsample: 1.0
reg_lambda: 5
reg_alpha: 0.1
n_estimators: 800
max_depth: 8
learning_rate: 0.13894954943731375
colsample_bytree: 0.6

RÂ² validation croisÃ©e: 0.4509
RÂ² jeu de test: 0.4945
XGBoost par dÃ©faut RÂ²: 0.2657
Random Forest RÂ²: 0.5554


In [11]:
# Advanced feature engineering
print("Feature Engineering...")

# 1. Interactions between audio features
df['energy_loudness_interaction'] = df['energy'] * (df['loudness'] + 60) / 60  # loudness is negative, hence the +60 shift
df['dance_valence_synergy'] = df['danceability'] * df['valence']
df['acoustic_speech_balance'] = df['acousticness'] - df['speechiness']
df['energy_tempo_ratio'] = df['energy'] / (df['tempo'] + 0.001)  # +0.001 keeps the denominator non-zero

# 2. Polynomial features and transformations
df['energy_squared'] = df['energy'] ** 2
df['danceability_squared'] = df['danceability'] ** 2
df['loudness_normalized'] = (df['loudness'] - df['loudness'].min()) / (df['loudness'].max() - df['loudness'].min())
df['duration_minutes'] = df['duration_ms'] / 60000

# 3. Ratios and combinations (+0.001 keeps the denominators non-zero)
df['speech_to_instrument'] = df['speechiness'] / (df['instrumentalness'] + 0.001)
df['acoustic_energy_ratio'] = df['acousticness'] / (df['energy'] + 0.001)
df['valence_liveness_interaction'] = df['valence'] * df['liveness']

# 4. Duration-based features
df['is_long_track'] = (df['duration_ms'] > 240000).astype(int)  # longer than 4 minutes
df['tempo_duration_interaction'] = df['tempo'] * df['duration_minutes']

# Update the feature list
new_features = ['energy_loudness_interaction', 'dance_valence_synergy', 'acoustic_speech_balance',
                'energy_tempo_ratio', 'energy_squared', 'danceability_squared',
                'loudness_normalized', 'duration_minutes', 'speech_to_instrument',
                'acoustic_energy_ratio', 'valence_liveness_interaction', 'is_long_track',
                'tempo_duration_interaction']

# Append only the names that are not already present: a plain extend() would
# duplicate them every time this cell is re-run, and df[features] would then
# contain repeated columns.
for feature_name in new_features:
    if feature_name not in features:
        features.append(feature_name)

# Rebuild X with the new features
X = df[features]

# Re-split the data with the new features (same seed and ratio as before).
# The split is done BEFORE imputation so that the held-out rows cannot
# influence the statistics used to fill missing values.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values with column means estimated on the training rows only
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Keep `X` NaN-free (as it was before this change), rows in original order
X = pd.concat([X_train, X_test]).reindex(X.index)

print(f"Nouvelles features ajoutÃ©es: {len(new_features)}")
print(f"Total features: {len(features)}")

# Retrain XGBoost with the best parameters found
print("\nEntraÃ®nement de XGBoost avec les meilleurs paramÃ¨tres et nouvelles features...")

# Hard-coded hyper-parameters (copied from a previous random-search run)
best_params = {
    'subsample': 1.0,
    'reg_lambda': 5,
    'reg_alpha': 0.1,
    'n_estimators': 800,
    'max_depth': 8,
    'learning_rate': 0.13894954943731375,
    'colsample_bytree': 0.6
}

xgb_optimized = XGBRegressor(**best_params, random_state=42, n_jobs=-1)
xgb_optimized.fit(X_train, y_train)

# Predictions and evaluation
y_pred_optimized = xgb_optimized.predict(X_test)
r2_optimized = r2_score(y_test, y_pred_optimized)
mse_optimized = mean_squared_error(y_test, y_pred_optimized)

print(f"\nRÃ©sultats avec feature engineering:")
print(f"XGBoost optimisÃ© - MSE: {mse_optimized:.2f}, RÂ²: {r2_optimized:.4f}")
print(f"Random Forest     - RÂ²: {r2_rf:.4f}")

# Test-set R² of the previous XGBoost model, used as the comparison baseline
PREVIOUS_XGB_R2 = 0.4945

# Check whether XGBoost now beats Random Forest
if r2_optimized > r2_rf:
    print("âœ… SUCCÃˆS: XGBoost dÃ©passe enfin Random Forest!")
else:
    improvement = r2_optimized - PREVIOUS_XGB_R2
    # The "+" format flag prints the real sign, so a regression shows as
    # "-0.0100" instead of the malformed "+-0.0100".
    print(f"AmÃ©lioration: {improvement:+.4f} par rapport Ã  l'ancien XGBoost")

Feature Engineering...
Nouvelles features ajoutÃ©es: 13
Total features: 27

EntraÃ®nement de XGBoost avec les meilleurs paramÃ¨tres et nouvelles features...

RÃ©sultats avec feature engineering:
XGBoost optimisÃ© - MSE: 242.48, RÂ²: 0.5086
Random Forest     - RÂ²: 0.5554
AmÃ©lioration: +0.0141 par rapport Ã  l'ancien XGBoost


In [17]:
# Simple ensemble approach
from sklearn.ensemble import VotingRegressor

print("\nTest avec ensemble XGBoost + Random Forest...")

# Combine the two models in a voting ensemble and train it.
# NOTE(review): `xgb_best` and the `*_full` splits are not created in the
# cells visible here - confirm they are defined earlier in the notebook.
ensemble = VotingRegressor(
    estimators=[
        ('xgb', xgb_best),
        ('rf', rf_model),  # the original Random Forest
    ]
).fit(X_train_full, y_train_full)

# Score the ensemble on the held-out split
y_pred_ensemble = ensemble.predict(X_test_full)
r2_ensemble = r2_score(y_test_full, y_pred_ensemble)

print(f"Ensemble XGBoost + Random Forest - RÂ²: {r2_ensemble:.4f}")

# Only the Random Forest score is compared here
if r2_ensemble > r2_rf:
    print("ðŸŽ¯ L'ensemble surpasse les deux modÃ¨les individuels!")


Test avec ensemble XGBoost + Random Forest...
Ensemble XGBoost + Random Forest - RÂ²: 0.5476
