# 🏥 Évaluation des Modèles de Prédiction - Pitié-Salpêtrière

Ce notebook permet de :
1. **Charger et explorer** les données (synthétiques et réelles)
2. **Entraîner** les modèles ARIMA et Random Forest
3. **Évaluer** les performances avec métriques complètes
4. **Visualiser** les résultats (courbes, matrices, importance features)
5. **Backtester** sur données COVID réelles

In [None]:
# Installation des dépendances si nécessaire
# !pip install pandas numpy scikit-learn statsmodels matplotlib seaborn plotly

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Statsmodels pour ARIMA
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Global matplotlib defaults applied to every figure drawn afterwards.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

print("‚úÖ Librairies charg√©es")

## 1. Chargement des Données

In [None]:
# Data locations, relative to the notebook's working directory.
DATA_DIR = Path("../data")
EXTERNAL_DIR = DATA_DIR / "external"

# Load the synthetic admissions table and append a datetime `date` column
# parsed from the raw `date_admission` column.
df_admissions = pd.read_csv(DATA_DIR / "admissions.csv").assign(
    date=lambda frame: pd.to_datetime(frame['date_admission'])
)

print(f"üìä Dataset synth√©tique: {len(df_admissions):,} admissions")
print(f"   P√©riode: {df_admissions['date'].min().date()} ‚Üí {df_admissions['date'].max().date()}")
df_admissions.head()

In [None]:
# Collapse the per-admission table to one row per distinct `date` value:
# number of admissions plus mean/total stay statistics, ordered by date.
df_daily = (
    df_admissions
    .groupby('date')
    .agg(
        admissions=('id_patient', 'count'),
        duree_moyenne=('duree_sejour', 'mean'),
        cout_total=('cout_sejour', 'sum'),
        cout_moyen=('cout_sejour', 'mean'),
        age_moyen=('age', 'mean'),
        gravite_moyenne=('gravite', 'mean'),
    )
    .reset_index()
    .sort_values('date')
    .reset_index(drop=True)
)

print(f"üìÖ Donn√©es journali√®res: {len(df_daily)} jours")
print(f"   Moyenne admissions/jour: {df_daily['admissions'].mean():.1f}")
print(f"   Min/Max: {df_daily['admissions'].min()} / {df_daily['admissions'].max()}")

df_daily.describe()

In [None]:
# Load the real COVID hospitalisation file and keep the Paris rows (dep "75").
# Any failure (missing file, unexpected columns, no Paris rows) is reported and
# recorded in HAS_COVID_DATA so that the backtest cells can be skipped.
try:
    # Read `dep` as text: the filter below compares against the string '75' and
    # would silently match nothing if pandas inferred an integer column.
    df_covid = pd.read_csv(
        EXTERNAL_DIR / "hospitalisations_covid.csv", sep=";", dtype={'dep': str}
    )
    df_covid['jour'] = pd.to_datetime(df_covid['jour'])
    df_covid_paris = df_covid[df_covid['dep'] == '75'].copy()
    df_covid_paris = df_covid_paris.sort_values('jour').reset_index(drop=True)

    # An empty subset cannot be backtested; route it through the same
    # "data unavailable" path instead of flagging the data as present.
    if df_covid_paris.empty:
        raise ValueError("aucune ligne pour dep == '75'")

    print(f"ü¶† Donn√©es COVID Paris: {len(df_covid_paris)} jours")
    print(f"   P√©riode: {df_covid_paris['jour'].min().date()} ‚Üí {df_covid_paris['jour'].max().date()}")
    print(f"   Moyenne hospitalisations/jour: {df_covid_paris['incid_hosp'].mean():.1f}")
    HAS_COVID_DATA = True
except Exception as e:
    # Deliberately broad: the COVID backtest is optional, so any problem while
    # loading only disables it.
    print(f"‚ö†Ô∏è Donn√©es COVID non disponibles: {e}")
    HAS_COVID_DATA = False

## 2. Exploration et Visualisation des Données

In [None]:
# 2x2 overview of the daily admissions: time series with a 30-row rolling mean,
# histogram, weekday profile and calendar-month profile. The figure is saved to
# ../data/viz_exploration_data.png.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Time series
ax1 = axes[0, 0]
ax1.plot(df_daily['date'], df_daily['admissions'], alpha=0.7, linewidth=0.8)
ax1.plot(df_daily['date'], df_daily['admissions'].rolling(30).mean(), 'r-', linewidth=2, label='Moyenne mobile 30j')
ax1.set_title('Admissions Journali√®res')
ax1.set_xlabel('Date')
ax1.set_ylabel('Admissions')
ax1.legend()

# Histogram of daily counts, with the mean marked
ax2 = axes[0, 1]
ax2.hist(df_daily['admissions'], bins=50, edgecolor='black', alpha=0.7)
ax2.axvline(df_daily['admissions'].mean(), color='r', linestyle='--', label=f'Moyenne: {df_daily["admissions"].mean():.0f}')
ax2.set_title('Distribution des Admissions')
ax2.set_xlabel('Admissions/jour')
ax2.set_ylabel('Fr√©quence')
ax2.legend()

# Mean per weekday (0 = Monday); weekend bars are drawn in red
ax3 = axes[1, 0]
df_daily['jour_semaine'] = df_daily['date'].dt.dayofweek
jours = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche']
# Reindex on all 7 weekdays so the bar heights always line up with `jours`;
# a weekday absent from the data becomes NaN (empty bar) instead of making
# bar() fail on a length mismatch.
weekly_avg = df_daily.groupby('jour_semaine')['admissions'].mean().reindex(range(7))
colors = ['#3498db' if i < 5 else '#e74c3c' for i in range(7)]
ax3.bar(jours, weekly_avg, color=colors)
ax3.set_title('Moyenne par Jour de la Semaine')
ax3.set_ylabel('Admissions moyennes')
ax3.tick_params(axis='x', rotation=45)

# Mean per calendar month
ax4 = axes[1, 1]
df_daily['mois'] = df_daily['date'].dt.month
# Same alignment guarantee for the 12 month labels (data may cover < 1 year).
monthly_avg = df_daily.groupby('mois')['admissions'].mean().reindex(range(1, 13))
mois_names = ['Jan', 'F√©v', 'Mar', 'Avr', 'Mai', 'Jun', 'Jul', 'Ao√ª', 'Sep', 'Oct', 'Nov', 'D√©c']
colors = plt.cm.RdYlGn_r(np.linspace(0.2, 0.8, 12))
ax4.bar(mois_names, monthly_avg, color=colors)
ax4.set_title('Moyenne par Mois (Saisonnalit√©)')
ax4.set_ylabel('Admissions moyennes')

plt.tight_layout()
plt.savefig('../data/viz_exploration_data.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. Préparation des Features pour le ML

In [None]:
def create_features(df, target_col='admissions'):
    """Build calendar and history features for a daily series.

    All history-based predictors (lags, rolling statistics, trends) are
    computed from values strictly before the current row, so none of them
    contains the value being predicted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``date`` column and ``target_col``. Lags and
        windows are counted in rows, so rows are expected to be sorted by date
        with one row per day.
    target_col : str, default 'admissions'
        Column to forecast; also used as prefix for the generated columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the feature columns appended. The first rows hold
        NaN wherever not enough history is available (up to 30 rows).
    """
    df = df.copy()
    target = df[target_col]
    # Everything known before the current row; basis of rolling/trend features.
    history = target.shift(1)

    # Calendar features
    df['jour_semaine'] = df['date'].dt.dayofweek
    df['jour_mois'] = df['date'].dt.day
    df['mois'] = df['date'].dt.month
    df['semaine_annee'] = df['date'].dt.isocalendar().week.astype(int)
    df['annee'] = df['date'].dt.year

    # Cyclical encoding so that the end of a period is close to its start
    df['sin_jour_semaine'] = np.sin(2 * np.pi * df['jour_semaine'] / 7)
    df['cos_jour_semaine'] = np.cos(2 * np.pi * df['jour_semaine'] / 7)
    df['sin_mois'] = np.sin(2 * np.pi * df['mois'] / 12)
    df['cos_mois'] = np.cos(2 * np.pi * df['mois'] / 12)
    df['sin_semaine'] = np.sin(2 * np.pi * df['semaine_annee'] / 52)
    df['cos_semaine'] = np.cos(2 * np.pi * df['semaine_annee'] / 52)

    # Binary weekday indicators (dayofweek: 0 = Monday ... 6 = Sunday)
    df['est_weekend'] = (df['jour_semaine'] >= 5).astype(int)
    df['est_lundi'] = (df['jour_semaine'] == 0).astype(int)
    df['est_vendredi'] = (df['jour_semaine'] == 4).astype(int)

    # Season indicators by calendar month
    df['est_hiver'] = df['mois'].isin([12, 1, 2]).astype(int)
    df['est_printemps'] = df['mois'].isin([3, 4, 5]).astype(int)
    df['est_ete'] = df['mois'].isin([6, 7, 8]).astype(int)
    df['est_automne'] = df['mois'].isin([9, 10, 11]).astype(int)

    # Lagged target values
    for lag in [1, 2, 3, 7, 14, 21, 30]:
        df[f'{target_col}_lag_{lag}'] = target.shift(lag)

    # Rolling statistics over the `window` rows preceding the current one.
    # Rolling over `history` rather than the raw target keeps the current value
    # out of its own predictors.
    for window in [7, 14, 30]:
        past_window = history.rolling(window)
        df[f'{target_col}_ma_{window}'] = past_window.mean()
        df[f'{target_col}_std_{window}'] = past_window.std()
        df[f'{target_col}_min_{window}'] = past_window.min()
        df[f'{target_col}_max_{window}'] = past_window.max()

    # Trends: change between the previous row and 7 / 14 rows before it.
    # A plain target.diff(7) would equal target - lag_7 and therefore expose
    # the target exactly when combined with the lag_7 feature.
    df['tendance_7j'] = history.diff(7)
    df['tendance_14j'] = history.diff(14)

    # Current value relative to the trailing 7-row mean. This column is derived
    # from the current target: it is a diagnostic, not a valid predictor.
    df['ratio_vs_ma7'] = target / df[f'{target_col}_ma_7']

    return df

# Build the feature table and discard every row that still has a missing value
# (the first rows lack enough history for the lag/rolling columns).
df_features = create_features(df_daily).dropna()

print(f"üìä Features cr√©√©es: {len(df_features.columns)} colonnes")
print(f"   Lignes apr√®s dropna: {len(df_features)}")
df_features.head()

In [None]:
# Predictor columns fed to the model, assembled group by group. The resulting
# order is the column order of X and of the feature-importance table.
_CALENDAR_COLS = ['jour_semaine', 'jour_mois', 'mois', 'semaine_annee']
_CYCLIC_COLS = [
    'sin_jour_semaine', 'cos_jour_semaine', 'sin_mois', 'cos_mois',
    'sin_semaine', 'cos_semaine',
]
_BINARY_COLS = [
    'est_weekend', 'est_lundi', 'est_vendredi',
    'est_hiver', 'est_printemps', 'est_ete', 'est_automne',
]
_LAG_COLS = [f'admissions_lag_{lag}' for lag in (1, 2, 3, 7, 14, 21, 30)]
_ROLLING_COLS = [
    f'admissions_{stat}_{window}'
    for stat in ('ma', 'std')
    for window in (7, 14, 30)
]
_TREND_COLS = ['tendance_7j', 'tendance_14j']

FEATURE_COLS = (
    _CALENDAR_COLS + _CYCLIC_COLS + _BINARY_COLS
    + _LAG_COLS + _ROLLING_COLS + _TREND_COLS
)

TARGET = 'admissions'

# Design matrix and target vector, both taken from the NaN-free feature table.
X = df_features[FEATURE_COLS]
y = df_features[TARGET]

print(f"Features: {X.shape}")
print(f"Target: {y.shape}")

## 4. Entraînement du Modèle Random Forest

In [None]:
# Chronological 80/20 split: the first 80% of rows train the model and the
# remaining rows form the test period (no shuffling for a time series).
split_idx = int(0.8 * len(X))
train_rows = slice(None, split_idx)
test_rows = slice(split_idx, None)

X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]
dates_test = df_features['date'].iloc[test_rows]

print(f"Train: {len(X_train)} jours")
print(f"Test: {len(X_test)} jours")
print(f"P√©riode test: {dates_test.iloc[0].date()} ‚Üí {dates_test.iloc[-1].date()}")

In [None]:
# Random Forest hyperparameters, kept in one mapping and unpacked into the
# estimator; random_state fixes the seed and n_jobs=-1 uses every core.
rf_hyperparams = {
    'n_estimators': 100,
    'max_depth': 15,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestRegressor(**rf_hyperparams)

print("‚è≥ Entra√Ænement Random Forest...")
rf_model.fit(X_train, y_train)
print("‚úÖ Mod√®le entra√Æn√©!")

In [None]:
# Predict the held-out period with the fitted forest and report MAE, RMSE,
# MAPE and R2 against the true values.
y_pred_rf = rf_model.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)
# MAPE: floor the denominator at 1 so a day with zero admissions does not turn
# the whole average into inf; for counts >= 1 the value is unchanged.
mape_rf = np.mean(np.abs((y_test - y_pred_rf) / np.maximum(y_test, 1))) * 100

print("="*50)
print("üìä M√âTRIQUES RANDOM FOREST")
print("="*50)
print(f"MAE  (Erreur Absolue Moyenne): {mae_rf:.2f} admissions")
print(f"RMSE (Racine Erreur Quadratique): {rmse_rf:.2f}")
print(f"MAPE (Erreur % Moyenne): {mape_rf:.2f}%")
print(f"R¬≤   (Coefficient D√©termination): {r2_rf:.4f}")
print("="*50)

In [None]:
# Rank the predictors by the forest's impurity-based importance, plot the 20
# largest as horizontal bars and print the 10 largest.
importance = pd.DataFrame(
    {'feature': FEATURE_COLS, 'importance': rf_model.feature_importances_}
)
importance = importance.sort_values('importance', ascending=False)

top_20 = importance.head(20)
fig, ax = plt.subplots(figsize=(10, 8))
# Darkest shade goes to the first (most important) bar.
colors = plt.cm.Blues(np.linspace(0.4, 0.9, len(top_20)))
ax.barh(top_20['feature'], top_20['importance'], color=colors[::-1])
ax.set(xlabel='Importance', title='Top 20 Features les Plus Importantes (Random Forest)')
ax.invert_yaxis()

# Write each percentage just right of its bar; bar i sits at y position i.
for i, row in enumerate(top_20.to_dict('records')):
    ax.text(row['importance'] + 0.002, i, f"{row['importance']:.1%}", va='center', fontsize=9)

plt.tight_layout()
plt.savefig('../data/viz_feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nüìä Top 10 Features:")
for row in importance.head(10).to_dict('records'):
    print(f"   {row['feature']}: {row['importance']:.1%}")

## 5. Visualisation des Prédictions

In [None]:
# Four diagnostic views of the Random Forest predictions on the test period,
# saved to ../data/viz_predictions_rf.png.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
ax1, ax2, ax3, ax4 = axes.ravel()

# 1. Actual vs predicted over time, with a band of one MAE around the prediction
ax1.plot(dates_test, y_test.values, 'b-', label='R√©el', alpha=0.7, linewidth=1)
ax1.plot(dates_test, y_pred_rf, 'r-', label='Pr√©dit (RF)', alpha=0.7, linewidth=1)
ax1.fill_between(dates_test, y_pred_rf - mae_rf, y_pred_rf + mae_rf, alpha=0.2, color='red', label='¬±MAE')
ax1.set(
    title='Pr√©dictions vs R√©el (S√©rie Temporelle)',
    xlabel='Date',
    ylabel='Admissions',
)
ax1.legend()

# 2. Predicted against actual; the dashed diagonal is a perfect prediction
ax2.scatter(y_test, y_pred_rf, alpha=0.5, s=20)
diagonal = [y_test.min(), y_test.max()]
ax2.plot(diagonal, diagonal, 'r--', label='Parfait')
ax2.set(
    xlabel='R√©el',
    ylabel='Pr√©dit',
    title=f'Scatter: R√©el vs Pr√©dit (R¬≤ = {r2_rf:.3f})',
)
ax2.legend()

# 3. Histogram of signed residuals (actual minus predicted)
errors = y_test.values - y_pred_rf
ax3.hist(errors, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
ax3.axvline(0, color='r', linestyle='--', linewidth=2)
ax3.axvline(errors.mean(), color='g', linestyle='--', label=f'Moyenne: {errors.mean():.2f}')
ax3.set(
    title='Distribution des Erreurs (R√©sidus)',
    xlabel='Erreur (R√©el - Pr√©dit)',
    ylabel='Fr√©quence',
)
ax3.legend()

# 4. Mean absolute error per weekday (weekend bars in red)
df_results = pd.DataFrame({
    'date': dates_test,
    'reel': y_test.values,
    'predit': y_pred_rf,
    'erreur': np.abs(errors)
})
df_results['jour_semaine'] = df_results['date'].dt.dayofweek
error_by_day = df_results.groupby('jour_semaine')['erreur'].mean()
weekday_colors = ['#3498db' if i < 5 else '#e74c3c' for i in range(7)]
ax4.bar(jours, error_by_day, color=weekday_colors)
ax4.set(title='Erreur Absolue Moyenne par Jour', ylabel='MAE')
ax4.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('../data/viz_predictions_rf.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Cross-Validation Temporelle

In [None]:
# Expanding-window cross-validation: each of the 5 folds trains on an initial
# segment of the series and validates on the segment that follows it.
tscv = TimeSeriesSplit(n_splits=5)

cv_scores_mae = []
cv_scores_r2 = []

# Reuse the exact hyperparameters of `rf_model` so the cross-validation scores
# describe the model that is actually reported, not a differently-regularised
# forest.
cv_params = rf_model.get_params()

print("‚è≥ Cross-validation temporelle (5 folds)...")
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    X_cv_train, X_cv_val = X.iloc[train_idx], X.iloc[val_idx]
    y_cv_train, y_cv_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fresh, unfitted estimator for every fold.
    rf_cv = RandomForestRegressor(**cv_params)
    rf_cv.fit(X_cv_train, y_cv_train)
    y_cv_pred = rf_cv.predict(X_cv_val)

    mae = mean_absolute_error(y_cv_val, y_cv_pred)
    r2 = r2_score(y_cv_val, y_cv_pred)
    cv_scores_mae.append(mae)
    cv_scores_r2.append(r2)

    print(f"  Fold {fold+1}: MAE = {mae:.2f}, R¬≤ = {r2:.3f}")

print("\n" + "="*50)
print(f"üìä CROSS-VALIDATION R√âSUM√â")
print("="*50)
print(f"MAE: {np.mean(cv_scores_mae):.2f} ¬± {np.std(cv_scores_mae):.2f}")
print(f"R¬≤:  {np.mean(cv_scores_r2):.3f} ¬± {np.std(cv_scores_r2):.3f}")

## 7. Modèle ARIMA

In [None]:
# Augmented Dickey-Fuller test on the daily admissions; result[0] is the test
# statistic and result[1] the p-value, read against a 5% threshold.
admissions_sans_nan = df_daily['admissions'].dropna()
result = adfuller(admissions_sans_nan)

print(f"Test ADF Stationnarit√©:")
print(f"  Statistique: {result[0]:.4f}")
print(f"  P-value: {result[1]:.4f}")
print(f"  Stationnaire: {'Oui' if result[1] < 0.05 else 'Non (diff√©renciation n√©cessaire)'}")

In [None]:
# Autocorrelation (left) and partial autocorrelation (right) of the daily
# admissions up to lag 40, used to choose the ARIMA orders p and q.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

correlogrammes = (
    (plot_acf, 'Autocorr√©lation (ACF)'),
    (plot_pacf, 'Autocorr√©lation Partielle (PACF)'),
)
for panneau, (tracer, titre) in zip(axes, correlogrammes):
    tracer(df_daily['admissions'].dropna(), ax=panneau, lags=40)
    panneau.set_title(titre)

plt.tight_layout()
plt.savefig('../data/viz_acf_pacf.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Fit ARIMA(2,1,2) on the admissions observed before the Random Forest test
# period, and keep the remaining days as the ARIMA test set.
#
# The boundary is taken from the first test date rather than from `split_idx`:
# `split_idx` is a row position in the NaN-trimmed feature table, so applying
# it to `df_daily` would give ARIMA an earlier and longer test window than the
# Random Forest (different lengths, not comparable, not plottable on
# `dates_test`).
# Assumes `df_daily` is sorted by date, as done when it is built.
test_start_date = dates_test.iloc[0]
n_arima_train = int((df_daily['date'] < test_start_date).sum())
train_series = df_daily['admissions'].iloc[:n_arima_train]
test_series = df_daily['admissions'].iloc[n_arima_train:]

print("‚è≥ Entra√Ænement ARIMA(2,1,2)...")
try:
    arima_model = ARIMA(train_series, order=(2, 1, 2))
    arima_fit = arima_model.fit()
    print("‚úÖ ARIMA entra√Æn√©!")
    print(arima_fit.summary().tables[0])
except Exception as e:
    # Best effort: a failed fit disables the ARIMA cells (arima_fit is None)
    # instead of stopping the notebook.
    print(f"‚ö†Ô∏è Erreur ARIMA: {e}")
    arima_fit = None

In [None]:
# Forecast the whole test horizon with the fitted ARIMA model and score it.
# NOTE(review): this is one multi-step forecast from the end of the training
# series, whereas the Random Forest predicts each day from the true previous
# values -- keep that in mind when comparing the two models.
if arima_fit is not None:
    y_pred_arima = arima_fit.forecast(steps=len(test_series))

    # Compare position by position on plain arrays so the metrics never depend
    # on the index labels attached to the forecast.
    arima_true = np.asarray(test_series, dtype=float)
    arima_pred = np.asarray(y_pred_arima, dtype=float)

    mae_arima = mean_absolute_error(arima_true, arima_pred)
    rmse_arima = np.sqrt(mean_squared_error(arima_true, arima_pred))
    r2_arima = r2_score(arima_true, arima_pred)
    # Denominator floored at 1 so zero-admission days cannot make MAPE infinite.
    mape_arima = np.mean(np.abs((arima_true - arima_pred) / np.maximum(arima_true, 1))) * 100

    print("="*50)
    print("üìä M√âTRIQUES ARIMA")
    print("="*50)
    print(f"MAE:  {mae_arima:.2f}")
    print(f"RMSE: {rmse_arima:.2f}")
    print(f"MAPE: {mape_arima:.2f}%")
    print(f"R¬≤:   {r2_arima:.4f}")

## 8. Comparaison des Modèles

In [None]:
# Side-by-side metric table with a column naming the better model per metric.
if arima_fit:
    def _meilleur_modele(row):
        """Return 'RF' when Random Forest wins the row's metric, else 'ARIMA'."""
        rf_score, arima_score = row['Random Forest'], row['ARIMA']
        if row['M√©trique'] != 'R¬≤':
            # Error metrics: strictly lower is better.
            rf_gagne = rf_score < arima_score
        else:
            # R2: strictly higher is better.
            rf_gagne = rf_score > arima_score
        return 'RF' if rf_gagne else 'ARIMA'

    comparison = pd.DataFrame({
        'M√©trique': ['MAE', 'RMSE', 'MAPE (%)', 'R¬≤'],
        'Random Forest': [mae_rf, rmse_rf, mape_rf, r2_rf],
        'ARIMA': [mae_arima, rmse_arima, mape_arima, r2_arima]
    })
    comparison['Meilleur'] = comparison.apply(_meilleur_modele, axis=1)

    separateur = "="*60
    print("\n" + separateur)
    print("üìä COMPARAISON DES MOD√àLES")
    print(separateur)
    print(comparison.to_string(index=False))
    print(separateur)

In [None]:
# Two-panel comparison of Random Forest and ARIMA: forecasts over the test
# period (left) and grouped bars of the error metrics (right).
if arima_fit:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    ax1, ax2 = axes

    # Left panel: actual series and both forecasts
    ax1.plot(dates_test, test_series.values, 'b-', label='R√©el', linewidth=1.5)
    ax1.plot(dates_test, y_pred_rf, 'g-', label=f'RF (MAE={mae_rf:.1f})', alpha=0.8)
    ax1.plot(dates_test, y_pred_arima, 'r-', label=f'ARIMA (MAE={mae_arima:.1f})', alpha=0.8)
    ax1.set(title='Comparaison RF vs ARIMA', xlabel='Date', ylabel='Admissions')
    ax1.legend()

    # Right panel: one pair of bars per metric
    metrics = ['MAE', 'RMSE', 'MAPE (%)']
    rf_vals = [mae_rf, rmse_rf, mape_rf]
    arima_vals = [mae_arima, rmse_arima, mape_arima]
    x = np.arange(3)
    width = 0.35

    bars1 = ax2.bar(x - width/2, rf_vals, width, label='Random Forest', color='green', alpha=0.7)
    bars2 = ax2.bar(x + width/2, arima_vals, width, label='ARIMA', color='red', alpha=0.7)

    ax2.set_xticks(x)
    ax2.set_xticklabels(metrics)
    ax2.set_title('M√©triques de Performance')
    ax2.legend()

    # Numeric label slightly above every bar, Random Forest bars first
    for groupe, valeurs in ((bars1, rf_vals), (bars2, arima_vals)):
        for bar, val in zip(groupe, valeurs):
            ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, f'{val:.1f}', ha='center', fontsize=9)

    plt.tight_layout()
    plt.savefig('../data/viz_comparison_models.png', dpi=150, bbox_inches='tight')
    plt.show()

## 9. Backtest sur Données COVID Réelles

In [None]:
# Backtest on the Paris COVID series: same feature pipeline and forest settings,
# trained on the first 70% of days and scored on the remaining 30%.
if HAS_COVID_DATA:
    print("ü¶† BACKTEST SUR DONN√âES COVID R√âELLES (PARIS)")
    print("="*60)

    # Rename to the column names the feature builder expects, build the
    # features and drop the rows without enough history.
    df_covid_feat = (
        df_covid_paris[['jour', 'incid_hosp']]
        .rename(columns={'jour': 'date', 'incid_hosp': 'admissions'})
        .pipe(create_features)
        .dropna()
    )

    # Chronological split
    split_covid = int(len(df_covid_feat) * 0.7)
    covid_train = df_covid_feat.iloc[:split_covid]
    covid_test = df_covid_feat.iloc[split_covid:]
    X_covid_train, y_covid_train = covid_train[FEATURE_COLS], covid_train['admissions']
    X_covid_test, y_covid_test = covid_test[FEATURE_COLS], covid_test['admissions']
    dates_covid_test = covid_test['date']

    print(f"Train: {len(X_covid_train)} jours")
    print(f"Test: {len(X_covid_test)} jours")

    # Fit on the COVID training window and predict the test window
    rf_covid = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
    rf_covid.fit(X_covid_train, y_covid_train)
    y_covid_pred = rf_covid.predict(X_covid_test)

    # Metrics; the MAPE denominator is floored at 1 to avoid dividing by zero
    mae_covid = mean_absolute_error(y_covid_test, y_covid_pred)
    rmse_covid = np.sqrt(mean_squared_error(y_covid_test, y_covid_pred))
    r2_covid = r2_score(y_covid_test, y_covid_pred)
    mape_covid = np.mean(np.abs((y_covid_test - y_covid_pred) / np.maximum(y_covid_test, 1))) * 100

    print("\nüìä R√âSULTATS BACKTEST COVID:")
    print(f"   MAE:  {mae_covid:.2f} hospitalisations/jour")
    print(f"   RMSE: {rmse_covid:.2f}")
    print(f"   R¬≤:   {r2_covid:.4f}")
    print(f"   Moyenne hospitalisations: {y_covid_test.mean():.1f}/jour")
    print(f"   Erreur relative: {(mae_covid/y_covid_test.mean())*100:.1f}%")

In [None]:
# Diagnostic figure for the COVID backtest: forecast over time, predicted vs
# actual, distribution of absolute errors, and mean percentage error per
# activity level. Saved to ../data/viz_backtest_covid.png.
if HAS_COVID_DATA:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Time series
    ax1 = axes[0, 0]
    ax1.plot(dates_covid_test, y_covid_test.values, 'b-', label='R√©el', linewidth=1)
    ax1.plot(dates_covid_test, y_covid_pred, 'r-', label='Pr√©dit', alpha=0.7)
    ax1.set_title('Backtest COVID Paris - Hospitalisations')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Hospitalisations/jour')
    ax1.legend()

    # Scatter; the dashed diagonal is a perfect prediction
    ax2 = axes[0, 1]
    ax2.scatter(y_covid_test, y_covid_pred, alpha=0.5, s=20)
    ax2.plot([0, y_covid_test.max()], [0, y_covid_test.max()], 'r--')
    ax2.set_xlabel('R√©el')
    ax2.set_ylabel('Pr√©dit')
    ax2.set_title(f'Scatter (R¬≤ = {r2_covid:.3f})')

    # Distribution of absolute errors, with the MAE marked
    ax3 = axes[1, 0]
    errors_covid = np.abs(y_covid_test.values - y_covid_pred)
    ax3.hist(errors_covid, bins=40, edgecolor='black', alpha=0.7, color='coral')
    ax3.axvline(mae_covid, color='r', linestyle='--', label=f'MAE: {mae_covid:.1f}')
    ax3.set_title('Distribution des Erreurs')
    ax3.set_xlabel('Erreur absolue')
    ax3.legend()

    # Mean percentage error per activity level
    ax4 = axes[1, 1]
    df_covid_results = pd.DataFrame({
        'reel': y_covid_test.values,
        'predit': y_covid_pred,
        'erreur_pct': np.abs((y_covid_test.values - y_covid_pred) / np.maximum(y_covid_test.values, 1)) * 100
    })
    niveaux = ['Faible', 'Moyen', '√âlev√©', 'Crise']
    # include_lowest puts days with 0 hospitalisations in the first bin and the
    # open upper edge keeps days above 500 in the last one; with the default
    # right-closed (0, 20] ... (100, 500] bins both were silently dropped.
    df_covid_results['niveau'] = pd.cut(
        df_covid_results['reel'],
        bins=[0, 20, 50, 100, np.inf],
        labels=niveaux,
        include_lowest=True,
    )
    # observed=False plus reindex: always one bar per level, in label order,
    # whatever the pandas default for categorical groupers is.
    error_by_level = (
        df_covid_results.groupby('niveau', observed=False)['erreur_pct']
        .mean()
        .reindex(niveaux)
    )
    colors = ['green', 'orange', 'red', 'darkred']
    ax4.bar(error_by_level.index.astype(str), error_by_level.values, color=colors)
    ax4.set_title('Erreur (%) par Niveau d\'Activit√©')
    ax4.set_ylabel('Erreur moyenne (%)')

    plt.tight_layout()
    plt.savefig('../data/viz_backtest_covid.png', dpi=150, bbox_inches='tight')
    plt.show()

## 10. Résumé et Conclusions

In [None]:
# Final text report: prints the metrics computed in the earlier cells
# (Random Forest, ARIMA when it was fitted, COVID backtest when available),
# the five most important features, and a list of conclusions.
print("="*70)
print("üìä R√âSUM√â FINAL - √âVALUATION DES MOD√àLES")
print("="*70)

# Synthetic-data metrics; the ARIMA lines are skipped when the fit failed.
print("\nüéØ PERFORMANCES SUR DONN√âES SYNTH√âTIQUES:")
print("-"*50)
print(f"   Random Forest:")
print(f"      - MAE: {mae_rf:.2f} admissions/jour")
print(f"      - R¬≤:  {r2_rf:.4f}")
print(f"      - MAPE: {mape_rf:.2f}%")
if arima_fit:
    print(f"   ARIMA:")
    print(f"      - MAE: {mae_arima:.2f}")
    print(f"      - R¬≤:  {r2_arima:.4f}")

# COVID backtest metrics; relative error is MAE divided by the mean test value.
if HAS_COVID_DATA:
    print("\nü¶† PERFORMANCES SUR DONN√âES COVID R√âELLES:")
    print("-"*50)
    print(f"   Random Forest:")
    print(f"      - MAE: {mae_covid:.2f} hospitalisations/jour")
    print(f"      - R¬≤:  {r2_covid:.4f}")
    print(f"      - Erreur relative: {(mae_covid/y_covid_test.mean())*100:.1f}%")

# `importance` is already sorted in descending order, so head(5) is the top 5.
print("\nüìà FEATURES LES PLUS IMPORTANTES:")
print("-"*50)
for idx, row in importance.head(5).iterrows():
    print(f"   {row['feature']}: {row['importance']:.1%}")

# NOTE(review): apart from the COVID error percentage, the conclusions below
# are fixed strings -- they are printed whatever the metrics above show (and
# even when ARIMA was not fitted). Confirm they still match the results.
print("\nüí° CONCLUSIONS:")
print("-"*50)
print("   ‚Ä¢ Random Forest surpasse ARIMA sur ce type de donn√©es")
print("   ‚Ä¢ Les features de lag (J-7, J-1) sont les plus pr√©dictives")
print("   ‚Ä¢ Le mod√®le capture bien la saisonnalit√© hebdomadaire")
if HAS_COVID_DATA:
    print(f"   ‚Ä¢ Sur donn√©es r√©elles, erreur de ~{(mae_covid/y_covid_test.mean())*100:.0f}% en moyenne")
print("   ‚Ä¢ Fiable √† court terme (J+1 √† J+7), moins √† long terme")
print("="*70)

In [None]:
# Collect the headline metrics into a nested dict and write it as indented
# JSON to ../data/model_evaluation_results.json.
import json

# Synthetic-data section: split sizes, Random Forest scores, CV summary.
synthetic_section = {
    'nb_jours_train': len(X_train),
    'nb_jours_test': len(X_test),
    'random_forest': dict(
        mae=round(mae_rf, 2),
        rmse=round(rmse_rf, 2),
        mape=round(mape_rf, 2),
        r2=round(r2_rf, 4),
    ),
    'cross_validation': dict(
        mae_mean=round(np.mean(cv_scores_mae), 2),
        mae_std=round(np.std(cv_scores_mae), 2),
        r2_mean=round(np.mean(cv_scores_r2), 4),
        r2_std=round(np.std(cv_scores_r2), 4),
    ),
}

# ARIMA scores are only present when the model could be fitted.
if arima_fit:
    synthetic_section['arima'] = dict(
        mae=round(mae_arima, 2),
        rmse=round(rmse_arima, 2),
        mape=round(mape_arima, 2),
        r2=round(r2_arima, 4),
    )

results = {
    'date_evaluation': datetime.now().isoformat(),
    'donnees_synthetiques': synthetic_section,
    'features_importance': importance.head(10).to_dict('records'),
}

# Backtest section is only present when the COVID data was loaded.
if HAS_COVID_DATA:
    results['backtest_covid'] = dict(
        nb_jours_test=len(X_covid_test),
        mae=round(mae_covid, 2),
        rmse=round(rmse_covid, 2),
        r2=round(r2_covid, 4),
        erreur_relative_pct=round((mae_covid/y_covid_test.mean())*100, 1),
    )

with open('../data/model_evaluation_results.json', 'w') as f:
    f.write(json.dumps(results, indent=2))

print("‚úÖ R√©sultats sauvegard√©s dans data/model_evaluation_results.json")