# 🤖 Modèle d'IA - Prédiction d'Activité Hospitalière

Ce notebook entraîne et évalue les modèles de prédiction pour l'activité hospitalière de Pitié-Salpêtrière.

**Modèles:**
- Random Forest (Machine Learning)
- ARIMA (Time Series)
- Ensemble (Combinaison pondérée)

**Données:**
- Admissions synthétiques calibrées sur données réelles SAE
- Données COVID réelles (Santé Publique France)
- Indicateurs Hospi-Diag

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# ML
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Time Series
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller

# Global configuration: default figure size and a fixed NumPy seed.
plt.rcParams['figure.figsize'] = (12, 6)
np.random.seed(42)

# Input/output locations, relative to the current working directory.
DATA_DIR = Path('../data')
MODEL_DIR = DATA_DIR / 'models'
# parents=True: without it, mkdir raises FileNotFoundError when '../data'
# itself does not exist yet (e.g. on a fresh checkout).
MODEL_DIR.mkdir(parents=True, exist_ok=True)
(DATA_DIR / 'graphs').mkdir(parents=True, exist_ok=True)

print("‚úÖ Biblioth√®ques charg√©es")

## 1. Chargement des données

In [None]:
# Read the raw admissions table and derive a datetime 'date' column
# from the 'date_admission' column.
df_admissions = pd.read_csv(DATA_DIR / 'admissions.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date_admission'])
)

# Short report: row count, covered period, available columns.
admission_dates = df_admissions['date']
print(f"üìä Admissions charg√©es: {len(df_admissions):,} lignes")
print(f"üìÖ P√©riode: {admission_dates.min().date()} ‚Üí {admission_dates.max().date()}")
print(f"\nüìã Colonnes: {list(df_admissions.columns)}")

In [None]:
# Simple daily aggregation: one row per calendar day that has at least one
# admission, holding the number of admission records on that day.
df_daily = (
    df_admissions
    .groupby(df_admissions['date'].dt.date)
    .size()
    .reset_index()
    .set_axis(['date', 'admissions'], axis=1)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
)

daily_counts = df_daily['admissions']
print(f"üìà Donn√©es journali√®res: {len(df_daily)} jours")
print(f"   Admissions/jour: {daily_counts.mean():.0f} (min: {daily_counts.min()}, max: {daily_counts.max()})")

df_daily.head()

## 2. Exploration des données

In [None]:
# 2x2 exploratory figure: raw series, distribution, weekday profile and
# monthly profile of daily admissions.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Time series
ax1 = axes[0, 0]
ax1.plot(df_daily['date'], df_daily['admissions'], alpha=0.7, linewidth=0.8)
ax1.set_title('Admissions journali√®res', fontweight='bold')
ax1.set_xlabel('Date')
ax1.set_ylabel('Admissions')

# Distribution, with the mean marked by a dashed vertical line
ax2 = axes[0, 1]
mean_admissions = df_daily['admissions'].mean()
ax2.hist(df_daily['admissions'], bins=30, color='#3498db', edgecolor='black', alpha=0.7)
ax2.axvline(mean_admissions, color='red', linestyle='--', label=f"Moyenne: {mean_admissions:.0f}")
ax2.set_title('Distribution des admissions', fontweight='bold')
ax2.legend()

# By day of week (this adds a 'dow' column to df_daily)
ax3 = axes[1, 0]
df_daily['dow'] = df_daily['date'].dt.dayofweek
dow_names = ['Lun', 'Mar', 'Mer', 'Jeu', 'Ven', 'Sam', 'Dim']
dow_stats = df_daily.groupby('dow')['admissions'].mean()
# Label only the weekdays actually present: passing the full 7-name list
# would raise a shape mismatch in bar() if some weekday has no data.
dow_labels = [dow_names[d] for d in dow_stats.index]
ax3.bar(dow_labels, dow_stats.values, color='#2ecc71', edgecolor='black')
ax3.set_title('Moyenne par jour de semaine', fontweight='bold')
ax3.set_ylabel('Admissions moyennes')

# By month (this adds a 'month' column to df_daily)
ax4 = axes[1, 1]
df_daily['month'] = df_daily['date'].dt.month
month_names = ['Jan', 'F√©v', 'Mar', 'Avr', 'Mai', 'Juin', 'Juil', 'Ao√ªt', 'Sep', 'Oct', 'Nov', 'D√©c']
month_stats = df_daily.groupby('month')['admissions'].mean()
# Same guard as for weekdays: data covering less than a full year yields
# fewer than 12 groups, so labels are derived from the groups present.
month_labels = [month_names[m - 1] for m in month_stats.index]
colors = ['#e74c3c' if m in [1, 2, 12] else '#3498db' for m in month_stats.index]  # winter in red
ax4.bar(month_labels, month_stats.values, color=colors, edgecolor='black')
ax4.set_title('Moyenne par mois (saisonnalit√© - rouge=hiver)', fontweight='bold')
ax4.set_ylabel('Admissions moyennes')

plt.tight_layout()
# Resolves to the same location as before ('../data/graphs/...'), but built
# from DATA_DIR so the output folder is defined in one place.
plt.savefig(DATA_DIR / 'graphs' / 'exploration_admissions.png', dpi=150)
plt.show()

## 3. Feature Engineering

In [None]:
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the feature table used by the ML model.

    Works on a copy of ``df``; the input frame is not modified.

    Args:
        df: Frame with a datetime ``date`` column and a numeric
            ``admissions`` column. Lag and rolling features are computed
            by row position, so rows are assumed to be in chronological
            order with one row per day.

    Returns:
        A new frame with the original columns plus calendar, cyclic,
        indicator, lag, rolling and trend features. It has the same number
        of rows as the input, and no NaN remains as long as ``admissions``
        holds at least one non-missing value.
    """
    df = df.copy()

    # Basic calendar features
    df['day_of_week'] = df['date'].dt.dayofweek
    df['day_of_month'] = df['date'].dt.day
    df['month'] = df['date'].dt.month
    df['week_of_year'] = df['date'].dt.isocalendar().week.astype(int)
    df['year'] = df['date'].dt.year

    # Cyclic encoding, so that Sunday/Monday and December/January are neighbours
    df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

    # Binary indicators
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['is_monday'] = (df['day_of_week'] == 0).astype(int)
    df['is_winter'] = df['month'].isin([12, 1, 2]).astype(int)
    df['is_summer'] = df['month'].isin([7, 8]).astype(int)

    # Lag features (value observed `lag` rows earlier)
    for lag in [1, 7, 14]:
        df[f'lag_{lag}'] = df['admissions'].shift(lag)

    # Rolling statistics over previous rows only: shift(1) keeps the current
    # row's own value out of its window.
    previous = df['admissions'].shift(1)
    df['rolling_mean_7'] = previous.rolling(window=7, min_periods=1).mean()
    df['rolling_std_7'] = previous.rolling(window=7, min_periods=1).std()
    df['rolling_mean_14'] = previous.rolling(window=14, min_periods=1).mean()

    # Linear trend (row position)
    df['trend'] = range(len(df))

    # Fill the NaN of the first rows: back-fill from the first available
    # value, then fall back to the mean of admissions for anything left.
    # DataFrame.bfill() replaces fillna(method='bfill'), which is deprecated
    # in pandas 2.1+ and rejected by pandas 3.
    df = df.bfill().fillna(df['admissions'].mean())

    return df

# Build the modelling table from the daily series.
df_features = create_features(df_daily)
n_feature_rows, n_feature_columns = df_features.shape
print(f"‚úÖ Features cr√©√©es: {n_feature_rows} lignes, {n_feature_columns} colonnes")

# Model inputs: every column except the date, the target and the
# 'dow' / 'month' columns.
excluded_columns = ('date', 'admissions', 'dow', 'month')
feature_cols = [name for name in df_features.columns if name not in excluded_columns]
print(f"\nüìä Features ({len(feature_cols)}): {feature_cols[:10]}...")

## 4. Entraînement Random Forest

In [None]:
# Design matrix and target as NumPy arrays.
X = df_features[feature_cols].to_numpy()
y = df_features['admissions'].to_numpy()

# Chronological split: the first 80% of rows train the model and the
# remaining 20% are held out (no shuffling).
split_idx = int(len(X) * 0.8)
X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]
dates_test = df_features['date'].values[split_idx:]

print(f"üìä Train: {len(X_train)} jours")
print(f"üìä Test: {len(X_test)} jours")

# Fit the forest; random_state is fixed so reruns give the same model.
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 5,
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)
print("\n‚úÖ Mod√®le Random Forest entra√Æn√©")

In [None]:
# Predict on the held-out period.
y_pred_rf = rf_model.predict(X_test)

# Error metrics on the test set. MAPE is the mean absolute error relative
# to the actual value, expressed in percent.
residuals_rf = y_test - y_pred_rf
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
mape_rf = np.mean(np.abs(residuals_rf / y_test)) * 100
r2_rf = r2_score(y_test, y_pred_rf)

rf_report = [
    "üìä M√âTRIQUES RANDOM FOREST:",
    f"   MAE:  {mae_rf:.2f} admissions",
    f"   RMSE: {rmse_rf:.2f} admissions",
    f"   MAPE: {mape_rf:.1f}%",
    f"   R¬≤:   {r2_rf:.3f}",
]
for report_line in rf_report:
    print(report_line)

In [None]:
# Rank features by the importance reported by the fitted forest.
importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': rf_model.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features, best at the top.
top_features = importance.head(15)
n_top = len(top_features)
bar_positions = range(n_top)

fig, ax = plt.subplots(figsize=(10, 8))
colors = plt.cm.viridis(np.linspace(0, 1, n_top))
ax.barh(bar_positions, top_features['importance'], color=colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Importance')
ax.set_title('Top 15 Features - Random Forest', fontweight='bold')
ax.invert_yaxis()
plt.tight_layout()
plt.savefig('../data/graphs/feature_importance_rf.png', dpi=150)
plt.show()

# Text summary of the five highest-ranked features.
print("\nüîç Top 5 features:")
top_five = importance.head(5)
for rank, (feature_name, score) in enumerate(
    zip(top_five['feature'], top_five['importance']), start=1
):
    print(f"   {rank}. {feature_name}: {score:.3f}")

## 5. Entraînement ARIMA

In [None]:
# Augmented Dickey-Fuller test on the daily series; the series is reported
# as stationary when the p-value is below 0.05.
adf_result = adfuller(df_daily['admissions'])
adf_statistic, adf_pvalue = adf_result[0], adf_result[1]
stationarity_label = 'Oui ‚úì' if adf_pvalue < 0.05 else 'Non ‚úó'

print("üìä Test ADF (stationnarit√©):")
print(f"   Statistique: {adf_statistic:.4f}")
print(f"   p-value: {adf_pvalue:.4f}")
print(f"   Stationnaire: {stationarity_label}")

In [None]:
# Same chronological split as the Random Forest (reuses split_idx).
admissions_series = df_daily['admissions']
train_data = admissions_series.iloc[:split_idx]
test_data = admissions_series.iloc[split_idx:]

# Fit ARIMA(5,1,2) on the training part only.
print("‚è≥ Entra√Ænement ARIMA(5,1,2)...")
arima_model = ARIMA(train_data, order=(5, 1, 2))
arima_fitted = arima_model.fit()
print("‚úÖ ARIMA entra√Æn√©")

# One multi-step forecast covering the whole test horizon.
# NOTE(review): the Random Forest is scored with lag features built from
# actual past values, so the two models are not evaluated under the same
# conditions. Confirm this is intended.
y_pred_arima = arima_fitted.forecast(steps=len(test_data))

# Error metrics, computed the same way as for the Random Forest.
actual_values = test_data.values
forecast_values = y_pred_arima.values
mae_arima = mean_absolute_error(test_data, y_pred_arima)
rmse_arima = np.sqrt(mean_squared_error(test_data, y_pred_arima))
mape_arima = np.mean(np.abs((actual_values - forecast_values) / actual_values)) * 100
r2_arima = r2_score(test_data, y_pred_arima)

arima_report = [
    "\nüìä M√âTRIQUES ARIMA:",
    f"   MAE:  {mae_arima:.2f} admissions",
    f"   RMSE: {rmse_arima:.2f} admissions",
    f"   MAPE: {mape_arima:.1f}%",
    f"   R¬≤:   {r2_arima:.3f}",
]
for report_line in arima_report:
    print(report_line)

## 6. Comparaison et Visualisation

In [None]:
# Two stacked panels: predictions against actual values, then errors.
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
ax1, ax2 = axes[0], axes[1]

arima_values = y_pred_arima.values
error_rf = y_test - y_pred_rf
error_arima = y_test - arima_values

# Top panel: actual series with both models' predictions.
line_style = {'alpha': 0.7, 'linewidth': 1}
ax1.plot(dates_test, y_test, 'b-', label='R√©el', **line_style)
ax1.plot(dates_test, y_pred_rf, 'r--', label=f'Random Forest (MAE={mae_rf:.1f})', **line_style)
ax1.plot(dates_test, arima_values, 'g:', label=f'ARIMA (MAE={mae_arima:.1f})', **line_style)
ax1.set_title('Pr√©dictions vs R√©el', fontweight='bold')
ax1.set_xlabel('Date')
ax1.set_ylabel('Admissions')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Bottom panel: signed error (actual minus predicted) as shaded areas.
ax2.fill_between(dates_test, 0, error_rf, alpha=0.3, color='red', label='Erreur RF')
ax2.fill_between(dates_test, 0, error_arima, alpha=0.3, color='green', label='Erreur ARIMA')
ax2.axhline(0, color='black', linestyle='-', linewidth=0.5)
ax2.set_title('Erreurs de pr√©diction', fontweight='bold')
ax2.set_xlabel('Date')
ax2.set_ylabel('Erreur (R√©el - Pr√©dit)')
ax2.legend()

plt.tight_layout()
plt.savefig('../data/graphs/comparaison_modeles.png', dpi=150)
plt.show()

In [None]:
# Side-by-side table of both models' metrics, formatted as text.
metric_formats = ['.2f', '.2f', '.1f', '.3f']
rf_values = [mae_rf, rmse_rf, mape_rf, r2_rf]
arima_values_for_table = [mae_arima, rmse_arima, mape_arima, r2_arima]

comparison = pd.DataFrame({
    'M√©trique': ['MAE', 'RMSE', 'MAPE (%)', 'R¬≤'],
    'Random Forest': [format(value, spec) for value, spec in zip(rf_values, metric_formats)],
    'ARIMA': [format(value, spec) for value, spec in zip(arima_values_for_table, metric_formats)],
})

separator = "=" * 50
print("\n" + separator)
print("COMPARAISON DES MOD√àLES")
print(separator)
print(comparison.to_string(index=False))

## 7. Sauvegarde et Résumé

In [None]:
# Persist the trained Random Forest.
model_path = MODEL_DIR / 'random_forest_model.pkl'
joblib.dump(rf_model, model_path)
print(f"‚úÖ Mod√®le sauvegard√©: {model_path}")

# Persist the evaluation metrics as JSON.
import json
# Cast to built-in float so the JSON encoder never depends on the NumPy
# scalar type returned by the metric computations.
metrics = {
    'random_forest': {
        'mae': float(mae_rf), 'rmse': float(rmse_rf),
        'mape': float(mape_rf), 'r2': float(r2_rf),
    },
    'arima': {
        'mae': float(mae_arima), 'rmse': float(rmse_arima),
        'mape': float(mape_arima), 'r2': float(r2_arima),
    },
    'train_size': len(X_train),
    'test_size': len(X_test)
}

metrics_path = MODEL_DIR / 'model_metrics.json'
with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)
print(f"‚úÖ M√©triques: {metrics_path}")

# Summary
print("\n" + "="*60)
print("üìä R√âSUM√â MOD√àLE IA")
print("="*60)
# The best model is the one with the lower MAE. Both reported figures are
# taken from that model: previously the MAPE came from
# min(mape_rf, mape_arima), which could belong to the other model.
if mae_rf < mae_arima:
    best, best_metrics = 'Random Forest', metrics['random_forest']
else:
    best, best_metrics = 'ARIMA', metrics['arima']
print(f"\nüèÜ Meilleur mod√®le: {best}")
print(f"   MAE: {best_metrics['mae']:.1f} admissions/jour")
print(f"   Pr√©cision: {100 - best_metrics['mape']:.1f}%")