In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('dataset.csv')

# Sélectionner les features numériques pertinentes
features = ['danceability', 'energy', 'loudness', 'speechiness', 
           'acousticness', 'instrumentalness', 'liveness', 'valence', 
           'tempo', 'duration_ms', 'explicit']

# Encoder les variables catégorielles
le = LabelEncoder()
df['key_encoded'] = le.fit_transform(df['key'])
df['mode_encoded'] = le.fit_transform(df['mode'])
df['time_signature_encoded'] = le.fit_transform(df['time_signature'])

features.extend(['key_encoded', 'mode_encoded', 'time_signature_encoded'])

# Définir X et y
X = df[features]
y = df['popularity']

# Gérer les valeurs manquantes
X = X.fillna(X.mean())

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Modèle 1: Régression linéaire
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Modèle 2: Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Modèle 3: XGBoost
xgb_model = XGBRegressor()
xgb_model.fit(X_train,y_train)

# Prédictions
y_pred_lr = lr_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)

# Évaluation
def evaluer_modele(y_true, y_pred, nom_modele):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"\n{nom_modele}:")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {np.sqrt(mse):.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R²: {r2:.4f}")
    return mse, r2, mae

print("=== COMPARAISON DES MODÈLES ===")
mse_lr, r2_lr, mae_lr = evaluer_modele(y_test, y_pred_lr, "Régression Linéaire")
mse_rf, r2_rf, mae_rf = evaluer_modele(y_test, y_pred_rf, "Random Forest")
mse_xgb, r2_xgb, mae_xgb = evaluer_modele(y_test, y_pred_xgb, "XGBoost")

=== COMPARAISON DES MODÈLES ===

Régression Linéaire:
MSE: 482.4113
RMSE: 21.9639
MAE: 18.3351
R²: 0.0224

Random Forest:
MSE: 219.4256
RMSE: 14.8130
MAE: 10.5074
R²: 0.5554

XGBoost:
MSE: 362.3391
RMSE: 19.0352
MAE: 15.2201
R²: 0.2657


In [5]:
from sklearn.model_selection import GridSearchCV

# GridSearch minimal
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.2]
}

xgb_grid = GridSearchCV(
    XGBRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

xgb_grid.fit(X_train, y_train)

# Évaluation
print("Meilleurs paramètres:", xgb_grid.best_params_)
y_pred_xgb_optimized = xgb_grid.best_estimator_.predict(X_test)
evaluer_modele(y_test, y_pred_xgb_optimized, "XGBoost Optimisé")

Meilleurs paramètres: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}

XGBoost Optimisé:
MSE: 374.8450
RMSE: 19.3609
MAE: 15.5924
R²: 0.2404


(374.8450012207031, 0.24040621519088745, 15.592398643493652)