In [None]:
# Installation des biblioth√®ques n√©cessaires
import sys
!{sys.executable} -m pip install pandas numpy scikit-learn xgboost matplotlib seaborn joblib --quiet

In [None]:
# Import des biblioth√®ques
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import joblib

# Configuration de l'affichage
pd.set_option('display.max_columns', None)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("‚úÖ Biblioth√®ques charg√©es avec succ√®s!")

## 📂 1. Chargement et Exploration des Données

In [None]:
# Load the dataset.
df = pd.read_csv('../ml/healthcare_dataset.csv')

# Parse `date` right after loading. The period printed below is then a true
# chronological min/max (min/max on raw strings is only correct for ISO-formatted
# dates), and later cells that use the `.dt` accessor no longer depend on a
# plotting cell having converted the column first.
df['date'] = pd.to_datetime(df['date'])

print(f"üìä Dataset charg√©: {df.shape[0]} lignes, {df.shape[1]} colonnes")
print(f"üìÖ P√©riode: {df['date'].min().date()} √† {df['date'].max().date()}")
print(f"\nüè• Services: {df['service'].nunique()} uniques")
print(df['service'].unique())

# Preview of the first rows
df.head()

In [None]:
# Statistiques descriptives
print("üìà Statistiques descriptives:")
df.describe()

In [None]:
# Missing-value check: count nulls per column and list only the affected columns.
print("üîç Valeurs manquantes:")
missing = df.isna().sum()
if missing.sum() != 0:
    print(missing[missing > 0])
else:
    print("‚úÖ Aucune valeur manquante d√©tect√©e!")

## 📊 2. Visualisations Exploratoires

In [None]:
# √âvolution des co√ªts totaux par service
df['date'] = pd.to_datetime(df['date'])
df_pivot = df.pivot_table(values='cout_total', index='date', columns='service', aggfunc='sum')

plt.figure(figsize=(15, 8))
for col in df_pivot.columns:
    plt.plot(df_pivot.index, df_pivot[col], marker='o', label=col, linewidth=2)

plt.title('üìà √âvolution des Co√ªts Totaux par Service (2024)', fontsize=16, fontweight='bold')
plt.xlabel('Date', fontsize=12)
plt.ylabel('Co√ªt Total (‚Ç¨)', fontsize=12)
plt.legend(loc='best', fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Distribution des co√ªts par service
plt.figure(figsize=(14, 6))
sns.boxplot(data=df, x='service', y='cout_total', palette='Set2')
plt.title('üì¶ Distribution des Co√ªts par Service', fontsize=16, fontweight='bold')
plt.xlabel('Service', fontsize=12)
plt.ylabel('Co√ªt Total (‚Ç¨)', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Matrice de corr√©lation
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numeric_cols].corr()

plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', 
            center=0, square=True, linewidths=1)
plt.title('üî• Matrice de Corr√©lation des Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

## 🔧 3. Feature Engineering et Préparation des Données

In [None]:
# Feature engineering: build `df_processed` from a copy so that `df` stays untouched.
df_processed = df.copy()

# The `.dt` accessors below need a datetime column. Converting here keeps this cell
# independent of whichever earlier cell may (or may not) have parsed `date`.
df_processed['date'] = pd.to_datetime(df_processed['date'])

# Label-encode the categorical columns. Each fitted encoder is kept in its own
# variable because later cells reuse them (future predictions, model export).
le_service = LabelEncoder()
le_saison = LabelEncoder()
le_jour = LabelEncoder()
le_meteo = LabelEncoder()

df_processed['service_encoded'] = le_service.fit_transform(df_processed['service'])
df_processed['saison_encoded'] = le_saison.fit_transform(df_processed['saison'])
df_processed['jour_semaine_encoded'] = le_jour.fit_transform(df_processed['jour_semaine'])
df_processed['meteo_encoded'] = le_meteo.fit_transform(df_processed['meteo'])

# Calendar features derived from the date
df_processed['jour_annee'] = df_processed['date'].dt.dayofyear
df_processed['trimestre'] = df_processed['date'].dt.quarter


def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator, with 0.0 wherever the denominator is 0.

    A plain division yields inf (or NaN for 0/0) on such rows, and infinite values
    make StandardScaler reject the feature matrix. Rows whose denominator is NaN
    stay NaN, so genuinely missing data is not silently turned into zeros.
    """
    return (numerator / denominator).where(denominator != 0, 0.0)


# Ratio features. `cout_par_patient` is derived from the cost target and is kept
# for exploration only: it is not part of any feature list used for training.
df_processed['cout_par_patient'] = _safe_ratio(df_processed['cout_total'], df_processed['patients_count'])
df_processed['actes_par_patient'] = _safe_ratio(df_processed['actes_count'], df_processed['patients_count'])
df_processed['efficacite_personnel'] = _safe_ratio(df_processed['patients_count'], df_processed['personnel_present'])

print("‚úÖ Feature Engineering termin√©!")
print(f"üìä Nouvelles dimensions: {df_processed.shape}")
df_processed.head()

## 🤖 4. Modèle 1: Prédiction des Coûts Totaux

In [None]:
# Feature selection for the total-cost model.
# The order of this list defines the column order of the matrix given to the
# scaler and the models; the prediction and export cells reuse the same list.
features_cout = [
    'service_encoded', 'patients_count', 'actes_count', 'sejours_actifs',
    'duree_moyenne_sejour', 'taux_occupation', 'personnel_present',
    'equipements_utilises', 'urgences_admissions', 'interventions_chirurgicales',
    'examens_radiologie', 'consultations', 'hospitalisations', 'tarif_moyen',
    'saison_encoded', 'jour_semaine_encoded', 'est_weekend', 'est_ferie',
    'meteo_encoded', 'temperature', 'mois', 'jour_annee', 'trimestre',
    'actes_par_patient', 'efficacite_personnel'
]

# Feature matrix and target (total cost)
X_cout = df_processed[features_cout]
y_cout = df_processed['cout_total']

# Random 80/20 train/test split with a fixed seed.
# NOTE(review): the rows are dated observations; a random split mixes past and
# future days across train and test — confirm this is intended for a forecasting use.
X_train_cout, X_test_cout, y_train_cout, y_test_cout = train_test_split(
    X_cout, y_cout, test_size=0.2, random_state=42
)

# Standardisation: the scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler_cout = StandardScaler()
X_train_cout_scaled = scaler_cout.fit_transform(X_train_cout)
X_test_cout_scaled = scaler_cout.transform(X_test_cout)

print(f"‚úÖ Donn√©es pr√©par√©es pour pr√©diction des co√ªts")
print(f"üìä Train: {X_train_cout.shape}, Test: {X_test_cout.shape}")

In [None]:
# Train several candidate regressors on the scaled cost features and record
# train/test metrics for each of them in `results_cout`.
models_cout = {
    'Random Forest': RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, max_depth=7, learning_rate=0.1, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=200, max_depth=7, learning_rate=0.1, random_state=42, n_jobs=-1)
}

# Model name -> dict holding the fitted model, its metrics and its test predictions
results_cout = {}

print("üöÄ Entra√Ænement des mod√®les...\n")
for name, model in models_cout.items():
    print(f"‚è≥ Entra√Ænement: {name}")
    model.fit(X_train_cout_scaled, y_train_cout)
    
    # Predictions on both splits; the train predictions only feed the train R²
    y_pred_train = model.predict(X_train_cout_scaled)
    y_pred_test = model.predict(X_test_cout_scaled)
    
    # Metrics: R² on both splits, MAE and RMSE on the test split only
    r2_train = r2_score(y_train_cout, y_pred_train)
    r2_test = r2_score(y_test_cout, y_pred_test)
    mae = mean_absolute_error(y_test_cout, y_pred_test)
    rmse = np.sqrt(mean_squared_error(y_test_cout, y_pred_test))
    
    # Keep the fitted model and its test predictions for the comparison,
    # plotting and export cells further down
    results_cout[name] = {
        'model': model,
        'r2_train': r2_train,
        'r2_test': r2_test,
        'mae': mae,
        'rmse': rmse,
        'predictions': y_pred_test
    }
    
    print(f"‚úÖ {name}:")
    print(f"   R¬≤ Train: {r2_train:.4f}")
    print(f"   R¬≤ Test: {r2_test:.4f}")
    print(f"   MAE: {mae:.2f}‚Ç¨")
    print(f"   RMSE: {rmse:.2f}‚Ç¨\n")

In [None]:
# Model comparison table: one row per trained model, built from `results_cout`.
comparison_df = pd.DataFrame(
    [
        {
            'Mod√®le': model_label,
            'R¬≤ Train': metrics['r2_train'],
            'R¬≤ Test': metrics['r2_test'],
            'MAE (‚Ç¨)': metrics['mae'],
            'RMSE (‚Ç¨)': metrics['rmse'],
        }
        for model_label, metrics in results_cout.items()
    ],
    columns=['Mod√®le', 'R¬≤ Train', 'R¬≤ Test', 'MAE (‚Ç¨)', 'RMSE (‚Ç¨)'],
)

print("üìä Comparaison des Mod√®les de Pr√©diction des Co√ªts:\n")
print(comparison_df.to_string(index=False))

# The best model is the one with the highest test R²; on a tie, idxmax keeps the
# first model in table order.
best_row = comparison_df['R¬≤ Test'].idxmax()
best_model_name = comparison_df.loc[best_row, 'Mod√®le']
best_model_cout = results_cout[best_model_name]['model']
print(f"\nüèÜ Meilleur mod√®le: {best_model_name}")

In [None]:
# Visualisation des pr√©dictions vs r√©el
y_pred_best = results_cout[best_model_name]['predictions']

plt.figure(figsize=(12, 6))
plt.scatter(y_test_cout, y_pred_best, alpha=0.6, s=80, edgecolors='k')
plt.plot([y_test_cout.min(), y_test_cout.max()], 
         [y_test_cout.min(), y_test_cout.max()], 
         'r--', lw=3, label='Pr√©diction parfaite')
plt.xlabel('Co√ªts R√©els (‚Ç¨)', fontsize=12)
plt.ylabel('Co√ªts Pr√©dits (‚Ç¨)', fontsize=12)
plt.title(f'üéØ Pr√©diction vs R√©el - {best_model_name}', fontsize=16, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Feature importance of the selected cost model.
# The check is on the attribute rather than on a list of model names: all three
# candidates (Random Forest, Gradient Boosting, XGBoost) expose
# `feature_importances_`, and a name list silently skipped Gradient Boosting.
if hasattr(best_model_cout, 'feature_importances_'):
    # Pair each importance with its feature name (same order as `features_cout`)
    # and keep the 15 largest.
    feature_importance = pd.DataFrame({
        'feature': features_cout,
        'importance': best_model_cout.feature_importances_
    }).sort_values('importance', ascending=False).head(15)
    
    plt.figure(figsize=(12, 8))
    sns.barplot(data=feature_importance, x='importance', y='feature', palette='viridis')
    plt.title(f'üîù Top 15 Features les plus importantes - {best_model_name}', 
              fontsize=16, fontweight='bold')
    plt.xlabel('Importance', fontsize=12)
    plt.ylabel('Feature', fontsize=12)
    plt.tight_layout()
    plt.show()

## 🤖 5. Modèle 2: Prédiction du Nombre de Patients

In [None]:
# Features for the patient-count model.
# NOTE(review): `taux_occupation`, `personnel_present` and `equipements_utilises`
# look like same-day operational values — confirm they are known at prediction time.
features_patients = [
    'service_encoded', 'taux_occupation', 'personnel_present',
    'equipements_utilises', 'saison_encoded', 'jour_semaine_encoded',
    'est_weekend', 'est_ferie', 'meteo_encoded', 'temperature',
    'mois', 'jour_annee', 'trimestre'
]

# Feature matrix and target (number of patients)
X_patients = df_processed[features_patients]
y_patients = df_processed['patients_count']

# Random 80/20 split with a fixed seed, then standardisation
X_train_pat, X_test_pat, y_train_pat, y_test_pat = train_test_split(
    X_patients, y_patients, test_size=0.2, random_state=42
)

# The scaler is fitted on the training split only and reused for the test split
scaler_patients = StandardScaler()
X_train_pat_scaled = scaler_patients.fit_transform(X_train_pat)
X_test_pat_scaled = scaler_patients.transform(X_test_pat)

# Train XGBoost. The original comment calls it the best model, but no model
# comparison is run for this target in this cell.
model_patients = XGBRegressor(n_estimators=200, max_depth=7, learning_rate=0.1, 
                               random_state=42, n_jobs=-1)
model_patients.fit(X_train_pat_scaled, y_train_pat)

# Evaluation on the held-out test split
y_pred_pat = model_patients.predict(X_test_pat_scaled)
r2_pat = r2_score(y_test_pat, y_pred_pat)
mae_pat = mean_absolute_error(y_test_pat, y_pred_pat)

print(f"‚úÖ Mod√®le Pr√©diction Patients:")
print(f"   R¬≤ Score: {r2_pat:.4f}")
print(f"   MAE: {mae_pat:.2f} patients")

## 🤖 6. Modèle 3: Prédiction du Taux d'Occupation

In [None]:
# Features for the occupancy-rate model
features_occupation = [
    'service_encoded', 'patients_count', 'sejours_actifs', 'personnel_present',
    'saison_encoded', 'jour_semaine_encoded', 'est_weekend', 'est_ferie',
    'mois', 'trimestre'
]

# Feature matrix and target (occupancy rate)
X_occupation = df_processed[features_occupation]
y_occupation = df_processed['taux_occupation']

# Random 80/20 split with a fixed seed, then standardisation
X_train_occ, X_test_occ, y_train_occ, y_test_occ = train_test_split(
    X_occupation, y_occupation, test_size=0.2, random_state=42
)

# The scaler is fitted on the training split only and reused for the test split
scaler_occupation = StandardScaler()
X_train_occ_scaled = scaler_occupation.fit_transform(X_train_occ)
X_test_occ_scaled = scaler_occupation.transform(X_test_occ)

# Training (same XGBoost hyper-parameters as the patient-count model)
model_occupation = XGBRegressor(n_estimators=200, max_depth=7, learning_rate=0.1,
                                 random_state=42, n_jobs=-1)
model_occupation.fit(X_train_occ_scaled, y_train_occ)

# Evaluation on the held-out test split
y_pred_occ = model_occupation.predict(X_test_occ_scaled)
r2_occ = r2_score(y_test_occ, y_pred_occ)
mae_occ = mean_absolute_error(y_test_occ, y_pred_occ)

print(f"‚úÖ Mod√®le Pr√©diction Taux d'Occupation:")
print(f"   R¬≤ Score: {r2_occ:.4f}")
print(f"   MAE: {mae_occ:.4f}")

## 💾 7. Sauvegarde des Modèles et Encodeurs

In [None]:
# Persist every artefact needed to reproduce predictions outside the notebook.
# The files are joblib pickles: only load them back from a trusted location.
import os

models_dir = '../ml/models'
os.makedirs(models_dir, exist_ok=True)

# Fitted label encoders, keyed by the source column they were fitted on
encoders = {
    'service': le_service,
    'saison': le_saison,
    'jour_semaine': le_jour,
    'meteo': le_meteo
}

# Feature lists, keyed by prediction target
features_info = {
    'cout': features_cout,
    'patients': features_patients,
    'occupation': features_occupation
}

# File name -> object, in write order: models, scalers, encoders, feature lists
artefacts = {
    'model_cout.pkl': best_model_cout,
    'model_patients.pkl': model_patients,
    'model_occupation.pkl': model_occupation,
    'scaler_cout.pkl': scaler_cout,
    'scaler_patients.pkl': scaler_patients,
    'scaler_occupation.pkl': scaler_occupation,
    'encoders.pkl': encoders,
    'features_info.pkl': features_info,
}
for artefact_name, artefact in artefacts.items():
    joblib.dump(artefact, f'{models_dir}/{artefact_name}')

print("‚úÖ Tous les mod√®les et artefacts sauvegard√©s dans:", models_dir)
print("\nüì¶ Fichiers cr√©√©s:")
# Lists everything present in the directory, including files from earlier runs
for file in os.listdir(models_dir):
    print(f"   - {file}")

## 🔮 8. Génération de Prédictions Futures

In [None]:
# Fonction pour cr√©er des donn√©es futures
def create_future_data(service_name, days_ahead=30, start_date='2024-12-15',
                       source_df=None, seed=42):
    """
    Build a DataFrame of synthetic future daily rows for one service.

    Each row carries calendar features computed from its date, a season derived
    from the month, a randomly drawn temperature and weather label, and the
    service's historical mean of every activity column (identical on all rows).

    Parameters
    ----------
    service_name : str
        Value to match in the `service` column of the historical data.
    days_ahead : int, default 30
        Number of consecutive daily rows to generate.
    start_date : str or datetime-like, default '2024-12-15'
        First generated date; anything `pd.to_datetime` accepts.
    source_df : pandas.DataFrame, optional
        Historical data to average. Defaults to the notebook-level `df`.
    seed : int or None, default 42
        Seed of the temperature/weather draws, so that re-running the notebook
        yields the same rows. Pass None for fresh randomness on every call.

    Returns
    -------
    pandas.DataFrame
        One row per generated day (empty when `days_ahead` is 0).

    Raises
    ------
    ValueError
        If no historical row matches `service_name`.
    """
    history = df if source_df is None else source_df
    service_data = history[history['service'] == service_name]
    if service_data.empty:
        # Without this guard the failure surfaces later as an opaque
        # "cannot convert float NaN to integer" from int(mean).
        raise ValueError(f"Aucune donnée historique pour le service {service_name!r}")

    rng = np.random.default_rng(seed)
    first_day = pd.to_datetime(start_date)

    # Historical means do not depend on the date, so they are computed once here
    # rather than on every loop iteration. The flag marks count-like columns whose
    # mean is truncated to an int; list order fixes the output column order.
    mean_columns = [
        ('patients_count', True),
        ('actes_count', True),
        ('sejours_actifs', True),
        ('duree_moyenne_sejour', False),
        ('taux_occupation', False),
        ('personnel_present', True),
        ('equipements_utilises', True),
        ('urgences_admissions', True),
        ('interventions_chirurgicales', True),
        ('examens_radiologie', True),
        ('consultations', True),
        ('hospitalisations', True),
        ('tarif_moyen', False),
    ]
    baseline = {}
    for column, truncate in mean_columns:
        mean_value = service_data[column].mean()
        baseline[column] = int(mean_value) if truncate else mean_value

    # NOTE(review): these labels must match the ones seen by the `meteo` encoder
    # at training time — confirm against the dataset.
    weather_labels = ['ensoleille', 'nuageux', 'pluie', 'neige']

    future_data = []
    for offset in range(days_ahead):
        date = first_day + timedelta(days=offset)

        # Calendar features
        row = {
            'date': date,
            'service': service_name,
            'mois': date.month,
            'jour_annee': date.dayofyear,
            'trimestre': date.quarter,
            # NOTE(review): strftime('%A') is locale-dependent (English names under
            # the default locale) — confirm it matches the `jour_semaine` labels
            # used for training.
            'jour_semaine': date.strftime('%A').lower(),
            'est_weekend': 1 if date.weekday() >= 5 else 0,
            'est_ferie': 0,  # Simplification: public holidays are not modelled
        }

        # Season and a temperature drawn from that season's range (upper bound exclusive)
        if date.month in (12, 1, 2):
            row['saison'] = 'hiver'
            row['temperature'] = rng.integers(0, 8)
        elif date.month in (3, 4, 5):
            row['saison'] = 'printemps'
            row['temperature'] = rng.integers(10, 20)
        elif date.month in (6, 7, 8):
            row['saison'] = 'ete'
            row['temperature'] = rng.integers(20, 35)
        else:
            row['saison'] = 'automne'
            row['temperature'] = rng.integers(10, 18)

        # Weather is drawn uniformly, independently of the season
        row['meteo'] = rng.choice(weather_labels)

        # Historical means of the service (same values on every row)
        row.update(baseline)

        future_data.append(row)

    return pd.DataFrame(future_data)

# Test avec un service
future_urgences = create_future_data('Urgences', days_ahead=30)
print("‚úÖ Donn√©es futures g√©n√©r√©es pour les Urgences (30 prochains jours)")
future_urgences.head()

In [None]:
# Fonction de pr√©diction compl√®te
def predict_future_costs(future_df):
    """
    Predict total costs for a DataFrame of future rows.

    Applies the same preparation as the training cells: label encoding with the
    fitted notebook-level encoders, derived ratio features, column selection in
    `features_cout` order, scaling with `scaler_cout`, then prediction with
    `best_model_cout`.

    Parameters
    ----------
    future_df : pandas.DataFrame
        Rows to score. Must contain the raw categorical columns (`service`,
        `saison`, `jour_semaine`, `meteo`) and every non-derived column listed
        in `features_cout`. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `future_df` with the encoded columns, the ratio features and
        a `cout_predit` column added.

    Raises
    ------
    ValueError
        Propagated from the label encoders when a category was not seen when
        they were fitted.
    """

    def _safe_ratio(numerator, denominator):
        # 0.0 where the denominator is 0: a plain division gives inf/NaN there,
        # and the scaler rejects infinite inputs. This can happen because
        # historical means are truncated to int and may reach 0.
        return (numerator / denominator).where(denominator != 0, 0.0)

    future_processed = future_df.copy()

    # Encode categoricals with the encoders fitted on the training data
    column_encoders = {
        'service': le_service,
        'saison': le_saison,
        'jour_semaine': le_jour,
        'meteo': le_meteo,
    }
    for column, encoder in column_encoders.items():
        future_processed[f'{column}_encoded'] = encoder.transform(future_processed[column])

    # Derived ratio features expected by `features_cout`
    future_processed['actes_par_patient'] = _safe_ratio(
        future_processed['actes_count'], future_processed['patients_count'])
    future_processed['efficacite_personnel'] = _safe_ratio(
        future_processed['patients_count'], future_processed['personnel_present'])

    # Select the features in training order, scale them, then predict
    X_future = future_processed[features_cout]
    X_future_scaled = scaler_cout.transform(X_future)
    future_processed['cout_predit'] = best_model_cout.predict(X_future_scaled)

    return future_processed

# Pr√©dire pour les Urgences
predictions_urgences = predict_future_costs(future_urgences)

# Visualisation
plt.figure(figsize=(14, 6))
plt.plot(predictions_urgences['date'], predictions_urgences['cout_predit'], 
         marker='o', linewidth=2, markersize=6, color='#17A2A6', label='Co√ªt pr√©dit')
plt.fill_between(predictions_urgences['date'], 
                 predictions_urgences['cout_predit'] * 0.9,
                 predictions_urgences['cout_predit'] * 1.1,
                 alpha=0.2, color='#17A2A6', label='Intervalle de confiance ¬±10%')
plt.title('üîÆ Pr√©diction des Co√ªts - Service Urgences (30 prochains jours)', 
          fontsize=16, fontweight='bold')
plt.xlabel('Date', fontsize=12)
plt.ylabel('Co√ªt Pr√©dit (‚Ç¨)', fontsize=12)
plt.legend(fontsize=11)
plt.grid(alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print(f"\nüìä Statistiques des Pr√©dictions:")
print(f"   Co√ªt moyen pr√©dit: {predictions_urgences['cout_predit'].mean():.2f}‚Ç¨")
print(f"   Co√ªt min pr√©dit: {predictions_urgences['cout_predit'].min():.2f}‚Ç¨")
print(f"   Co√ªt max pr√©dit: {predictions_urgences['cout_predit'].max():.2f}‚Ç¨")
print(f"   Co√ªt total pr√©vu (30j): {predictions_urgences['cout_predit'].sum():.2f}‚Ç¨")

In [None]:
# Pr√©dictions pour tous les services
all_predictions = {}

for service in df['service'].unique():
    future_data = create_future_data(service, days_ahead=30)
    predictions = predict_future_costs(future_data)
    all_predictions[service] = predictions

# Visualisation comparative
plt.figure(figsize=(16, 8))
colors = ['#3B82F6', '#10B981', '#8B5CF6', '#F59E0B', '#EF4444', '#EC4899', '#06B6D4', '#84CC16']

for idx, (service, pred) in enumerate(all_predictions.items()):
    plt.plot(pred['date'], pred['cout_predit'], 
             marker='o', linewidth=2, label=service, color=colors[idx % len(colors)])

plt.title('üîÆ Pr√©dictions des Co√ªts par Service (30 prochains jours)', 
          fontsize=16, fontweight='bold')
plt.xlabel('Date', fontsize=12)
plt.ylabel('Co√ªt Pr√©dit (‚Ç¨)', fontsize=12)
plt.legend(loc='best', fontsize=10)
plt.grid(alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Tableau r√©capitulatif
summary = []
for service, pred in all_predictions.items():
    summary.append({
        'Service': service,
        'Co√ªt Moyen (‚Ç¨)': f"{pred['cout_predit'].mean():.2f}",
        'Co√ªt Total 30j (‚Ç¨)': f"{pred['cout_predit'].sum():.2f}",
        'Min (‚Ç¨)': f"{pred['cout_predit'].min():.2f}",
        'Max (‚Ç¨)': f"{pred['cout_predit'].max():.2f}"
    })

summary_df = pd.DataFrame(summary)
print("\nüìä R√©sum√© des Pr√©dictions par Service (30 prochains jours):\n")
print(summary_df.to_string(index=False))

## 📈 9. Métriques Finales et Performance

In [None]:
# R√©sum√© final de tous les mod√®les
print("="*80)
print("üéØ R√âSUM√â FINAL DES MOD√àLES ML")
print("="*80)

print(f"\n1Ô∏è‚É£ MOD√àLE PR√âDICTION DES CO√õTS ({best_model_name})")
print(f"   R¬≤ Score: {results_cout[best_model_name]['r2_test']:.4f}")
print(f"   MAE: {results_cout[best_model_name]['mae']:.2f}‚Ç¨")
print(f"   RMSE: {results_cout[best_model_name]['rmse']:.2f}‚Ç¨")

print(f"\n2Ô∏è‚É£ MOD√àLE PR√âDICTION NOMBRE DE PATIENTS")
print(f"   R¬≤ Score: {r2_pat:.4f}")
print(f"   MAE: {mae_pat:.2f} patients")

print(f"\n3Ô∏è‚É£ MOD√àLE PR√âDICTION TAUX D'OCCUPATION")
print(f"   R¬≤ Score: {r2_occ:.4f}")
print(f"   MAE: {mae_occ:.4f}")

print("\n" + "="*80)
print("‚úÖ TOUS LES MOD√àLES SONT OP√âRATIONNELS ET SAUVEGARD√âS")
print("="*80)

print("\nüì¶ Fichiers sauvegard√©s:")
print("   - model_cout.pkl (Pr√©diction co√ªts)")
print("   - model_patients.pkl (Pr√©diction patients)")
print("   - model_occupation.pkl (Pr√©diction taux occupation)")
print("   - scaler_*.pkl (Normalisateurs)")
print("   - encoders.pkl (Encodeurs cat√©goriels)")
print("   - features_info.pkl (Information sur les features)")

print("\nüöÄ Pr√™t pour l'int√©gration avec le backend Spring Boot!")

In [None]:
# Executive summary: build a text report from the dataset and the selected cost
# model, print it, and write it next to the saved models.
summary_text = f"""
{'='*80}
üè• R√âSUM√â EX√âCUTIF - MOD√àLES ML HEALTHCARE DASHBOARD
{'='*80}

üìä DATASET:
   ‚Ä¢ P√©riode analys√©e: {df['date'].min()} √† {df['date'].max()}
   ‚Ä¢ Nombre total d'observations: {len(df):,}
   ‚Ä¢ Services couverts: {df['service'].nunique()}
   ‚Ä¢ Features utilis√©es: {len(features_cout)}

üèÜ MEILLEUR MOD√àLE: {best_model_name}
   ‚Ä¢ R¬≤ Score (Test): {results_cout[best_model_name]['r2_test']:.4f}
   ‚Ä¢ MAE: {results_cout[best_model_name]['mae']:.2f}‚Ç¨
   ‚Ä¢ RMSE: {results_cout[best_model_name]['rmse']:.2f}‚Ç¨
   ‚Ä¢ Pr√©cision moyenne: {(1 - results_cout[best_model_name]['mae'] / y_test_cout.mean()) * 100:.2f}%

üìà PERFORMANCE GLOBALE:
   ‚Ä¢ Co√ªt moyen pr√©dit: {results_cout[best_model_name]['predictions'].mean():.2f}‚Ç¨
   ‚Ä¢ Co√ªt moyen r√©el: {y_test_cout.mean():.2f}‚Ç¨
   ‚Ä¢ Erreur relative moyenne: {(results_cout[best_model_name]['mae'] / y_test_cout.mean() * 100):.2f}%

üéØ TOP 3 FEATURES LES PLUS IMPORTANTES:
"""

# Checked on the attribute so that Gradient Boosting, which also exposes
# `feature_importances_`, is not left with an empty "top 3" section.
if hasattr(best_model_cout, 'feature_importances_'):
    top_features = pd.DataFrame({
        'feature': features_cout,
        'importance': best_model_cout.feature_importances_
    }).sort_values('importance', ascending=False).head(3)
    
    # Number the lines by rank (1, 2, 3). The row label is the feature's position
    # in `features_cout`, so using it as the number printed arbitrary values.
    for rank, (_, row) in enumerate(top_features.iterrows(), start=1):
        summary_text += f"   {rank}. {row['feature']}: {row['importance']:.4f}\n"

summary_text += f"""
üí° RECOMMANDATIONS:
   1. D√©ployer le mod√®le {best_model_name} en production
   2. Mettre en place un monitoring continu des pr√©dictions
   3. R√©-entra√Æner le mod√®le tous les 3 mois avec nouvelles donn√©es
   4. Surveiller particuli√®rement les services avec MAE √©lev√©
   5. Int√©grer les pr√©dictions dans le dashboard temps r√©el

‚úÖ STATUT: Tous les mod√®les entra√Æn√©s et sauvegard√©s avec succ√®s!
{'='*80}
"""

print(summary_text)

# Save the summary. The target directory is created by the model-export cell,
# which therefore has to run before this one.
with open('../ml/models/model_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary_text)
    
print("\nüìù R√©sum√© sauvegard√© dans: ml/models/model_summary.txt")

## 🎓 11. Conclusions et Recommandations

In [None]:
# Comparaison visuelle des 3 mod√®les
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, (name, result) in enumerate(results_cout.items()):
    axes[idx].scatter(y_test_cout, result['predictions'], alpha=0.6, s=60, edgecolors='k')
    axes[idx].plot([y_test_cout.min(), y_test_cout.max()], 
                   [y_test_cout.min(), y_test_cout.max()], 
                   'r--', lw=2)
    axes[idx].set_title(f'{name}\nR¬≤ = {result["r2_test"]:.4f}', 
                        fontsize=13, fontweight='bold')
    axes[idx].set_xlabel('Co√ªts R√©els (‚Ç¨)', fontsize=11)
    axes[idx].set_ylabel('Co√ªts Pr√©dits (‚Ç¨)', fontsize=11)
    axes[idx].grid(alpha=0.3)

plt.suptitle('üî¨ Comparaison des 3 Mod√®les ML', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Tableau comparatif
print("\nüìä Tableau Comparatif des Mod√®les:\n")
comparison_full = pd.DataFrame({
    'Mod√®le': list(results_cout.keys()),
    'R¬≤ Train': [f"{r['r2_train']:.4f}" for r in results_cout.values()],
    'R¬≤ Test': [f"{r['r2_test']:.4f}" for r in results_cout.values()],
    'MAE': [f"{r['mae']:.2f}‚Ç¨" for r in results_cout.values()],
    'RMSE': [f"{r['rmse']:.2f}‚Ç¨" for r in results_cout.values()],
    'Surapprentissage': [f"{(r['r2_train'] - r['r2_test']):.4f}" for r in results_cout.values()]
})
print(comparison_full.to_string(index=False))

In [None]:
# Analyse de la performance par service
service_performance = []

for service_code in df_processed['service_encoded'].unique():
    service_mask = X_test_cout['service_encoded'] == service_code
    if service_mask.sum() > 0:
        y_test_service = y_test_cout[service_mask]
        y_pred_service = results_cout[best_model_name]['predictions'][service_mask]
        
        mae = mean_absolute_error(y_test_service, y_pred_service)
        r2 = r2_score(y_test_service, y_pred_service)
        
        service_name = df_processed[df_processed['service_encoded'] == service_code]['service'].iloc[0]
        
        service_performance.append({
            'Service': service_name,
            'MAE (‚Ç¨)': mae,
            'R¬≤ Score': r2,
            '√âchantillons': service_mask.sum()
        })

perf_df = pd.DataFrame(service_performance).sort_values('MAE (‚Ç¨)')

# Visualisation
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# MAE par service
axes[0].barh(perf_df['Service'], perf_df['MAE (‚Ç¨)'], color='coral')
axes[0].set_title('üìä MAE par Service', fontsize=14, fontweight='bold')
axes[0].set_xlabel('MAE (‚Ç¨)', fontsize=12)
axes[0].grid(alpha=0.3, axis='x')

# R¬≤ par service
colors = ['green' if r2 > 0.8 else 'orange' if r2 > 0.6 else 'red' for r2 in perf_df['R¬≤ Score']]
axes[1].barh(perf_df['Service'], perf_df['R¬≤ Score'], color=colors)
axes[1].set_title('üìà R¬≤ Score par Service', fontsize=14, fontweight='bold')
axes[1].set_xlabel('R¬≤ Score', fontsize=12)
axes[1].axvline(x=0.8, color='green', linestyle='--', linewidth=2, alpha=0.5, label='Excellent (>0.8)')
axes[1].axvline(x=0.6, color='orange', linestyle='--', linewidth=2, alpha=0.5, label='Bon (>0.6)')
axes[1].legend()
axes[1].grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

print("\nüìä Performance du Mod√®le par Service:\n")
print(perf_df.to_string(index=False))

In [None]:
# Learning Curves - √âvaluation de l'apprentissage
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
    best_model_cout, X_train_cout_scaled, y_train_cout,
    cv=5, n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='neg_mean_absolute_error'
)

train_scores_mean = -np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure(figsize=(12, 7))
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2, color='blue')
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2, color='orange')

plt.plot(train_sizes, train_scores_mean, 'o-', color='blue', linewidth=2, 
         markersize=8, label='Score Train')
plt.plot(train_sizes, test_scores_mean, 'o-', color='orange', linewidth=2, 
         markersize=8, label='Score Validation')

plt.title(f'üìö Learning Curves - {best_model_name}', fontsize=16, fontweight='bold')
plt.xlabel('Taille du Set d\'Entra√Ænement', fontsize=13)
plt.ylabel('MAE (Mean Absolute Error)', fontsize=13)
plt.legend(loc='best', fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print(f"‚úÖ Learning curves g√©n√©r√©es pour {best_model_name}")

In [None]:
# Feature-impact analysis with SHAP when possible, otherwise with the model's own
# feature importances. The fallback runs both when shap is not installed and when
# the selected model is not XGBoost, so the cell always shows an importance view.
show_standard_importance = False

try:
    import shap
    
    # SHAP explainer, limited to the XGBoost case
    if best_model_name == 'XGBoost':
        explainer = shap.TreeExplainer(best_model_cout)
        # First 100 test rows only, to keep the computation short
        shap_values = explainer.shap_values(X_test_cout_scaled[:100])
        
        plt.figure(figsize=(12, 8))
        # SHAP values are computed on the scaled matrix; the unscaled rows are
        # passed for display so the plot shows values in their original units.
        shap.summary_plot(shap_values, X_test_cout.iloc[:100], 
                         feature_names=features_cout, show=False)
        plt.title('üîç SHAP - Importance et Impact des Features', fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.show()
        
        print("‚úÖ Analyse SHAP compl√©t√©e!")
    else:
        print("‚ö†Ô∏è SHAP analysis disponible uniquement pour XGBoost")
        show_standard_importance = True
        
except ImportError:
    print("‚ö†Ô∏è Biblioth√®que SHAP non install√©e. Installer avec: pip install shap")
    show_standard_importance = True

if show_standard_importance:
    print("üìä Utilisation de Feature Importance standard √† la place")
    
    # Checked on the attribute so that Gradient Boosting is covered as well
    if hasattr(best_model_cout, 'feature_importances_'):
        # Ascending sort + tail(20): the largest importance ends up at the top
        # of the horizontal bar chart.
        feature_imp = pd.DataFrame({
            'feature': features_cout,
            'importance': best_model_cout.feature_importances_
        }).sort_values('importance', ascending=True).tail(20)
        
        plt.figure(figsize=(12, 10))
        plt.barh(feature_imp['feature'], feature_imp['importance'], color='teal')
        plt.title('üîù Top 20 Features - Importance Standard', fontsize=16, fontweight='bold')
        plt.xlabel('Importance', fontsize=12)
        plt.ylabel('Feature', fontsize=12)
        plt.grid(alpha=0.3, axis='x')
        plt.tight_layout()
        plt.show()

In [None]:
# Distribution des erreurs de pr√©diction
errors = y_test_cout - results_cout[best_model_name]['predictions']

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Histogramme des erreurs
axes[0, 0].hist(errors, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
axes[0, 0].axvline(x=0, color='red', linestyle='--', linewidth=2, label='Erreur nulle')
axes[0, 0].set_title('üìä Distribution des Erreurs de Pr√©diction', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Erreur (R√©el - Pr√©dit)', fontsize=12)
axes[0, 0].set_ylabel('Fr√©quence', fontsize=12)
axes[0, 0].legend()
axes[0, 0].grid(alpha=0.3)

# 2. Q-Q Plot
from scipy import stats
stats.probplot(errors, dist="norm", plot=axes[0, 1])
axes[0, 1].set_title('üìà Q-Q Plot - Normalit√© des Erreurs', fontsize=14, fontweight='bold')
axes[0, 1].grid(alpha=0.3)

# 3. Erreurs par service
service_errors = pd.DataFrame({
    'service': X_test_cout['service_encoded'],
    'error': errors
})
service_names = df_processed.groupby('service_encoded')['service'].first()
service_errors['service_name'] = service_errors['service'].map(service_names)

axes[1, 0].boxplot([service_errors[service_errors['service_name'] == s]['error'].values 
                     for s in service_names.sort_values().unique()],
                    labels=service_names.sort_values().unique())
axes[1, 0].set_title('üì¶ Distribution des Erreurs par Service', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Service', fontsize=12)
axes[1, 0].set_ylabel('Erreur (‚Ç¨)', fontsize=12)
axes[1, 0].tick_params(axis='x', rotation=45)
axes[1, 0].grid(alpha=0.3)
axes[1, 0].axhline(y=0, color='red', linestyle='--', linewidth=1)

# 4. Erreurs absolues vs Valeurs pr√©dites
axes[1, 1].scatter(results_cout[best_model_name]['predictions'], np.abs(errors), 
                   alpha=0.5, s=50, edgecolors='k')
axes[1, 1].set_title('üéØ Erreur Absolue vs Pr√©diction', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Valeur Pr√©dite (‚Ç¨)', fontsize=12)
axes[1, 1].set_ylabel('Erreur Absolue (‚Ç¨)', fontsize=12)
axes[1, 1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nüìä Statistiques des Erreurs:")
print(f"   Erreur moyenne: {errors.mean():.2f}‚Ç¨")
print(f"   √âcart-type: {errors.std():.2f}‚Ç¨")
print(f"   Erreur m√©diane: {errors.median():.2f}‚Ç¨")
print(f"   Erreur max: {errors.max():.2f}‚Ç¨")
print(f"   Erreur min: {errors.min():.2f}‚Ç¨")

In [None]:
# Matrice de confusion pour l'√©valuation binaire
from sklearn.metrics import confusion_matrix, classification_report

# Classification: Co√ªts √©lev√©s (au-dessus de la m√©diane) vs faibles
y_pred_class = (results_cout[best_model_name]['predictions'] > y_test_cout.median()).astype(int)
y_test_class = (y_test_cout > y_test_cout.median()).astype(int)

cm = confusion_matrix(y_test_class, y_pred_class)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True, square=True,
            xticklabels=['Co√ªt Faible', 'Co√ªt √âlev√©'],
            yticklabels=['Co√ªt Faible', 'Co√ªt √âlev√©'])
plt.title('üéØ Matrice de Confusion - Classification des Co√ªts', fontsize=16, fontweight='bold')
plt.ylabel('Valeur R√©elle', fontsize=13)
plt.xlabel('Valeur Pr√©dite', fontsize=13)
plt.tight_layout()
plt.show()

print("\nüìä Rapport de Classification:\n")
print(classification_report(y_test_class, y_pred_class, 
                          target_names=['Co√ªt Faible', 'Co√ªt √âlev√©']))

In [None]:
# ROC curve for the derived binary task "cost above the test-set median",
# using the regression output of the selected model as the ranking score.
from sklearn.metrics import roc_curve, auc

best_predictions = results_cout[best_model_name]['predictions']
median_cost = y_test_cout.median()

# Binary ground truth: 1 when the real cost is above the test-set median
y_test_binary = (y_test_cout > median_cost).astype(int)
# Despite its name this holds hard 0/1 labels, not probabilities. It is not used
# by the ROC computation below and is kept only because the cell defined it.
y_pred_proba = (best_predictions > median_cost).astype(int)

# Min-max scale the predictions to [0, 1] to use them as scores. These are not
# probabilities; the scaling is monotonic, so it does not change the ROC curve.
# The zero-span guard avoids a division by zero (NaN scores) when every
# prediction is identical.
pred_min = best_predictions.min()
pred_span = best_predictions.max() - pred_min
if pred_span > 0:
    y_scores = (best_predictions - pred_min) / pred_span
else:
    y_scores = np.zeros_like(best_predictions, dtype=float)

fpr, tpr, thresholds = roc_curve(y_test_binary, y_scores)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=3, label=f'Courbe ROC (AUC = {roc_auc:.2f})')
# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Al√©atoire')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Taux de Faux Positifs (FPR)', fontsize=13)
plt.ylabel('Taux de Vrais Positifs (TPR)', fontsize=13)
plt.title(f'üìà Courbe ROC - {best_model_name}\nPr√©diction Co√ªts √âlev√©s vs Faibles', fontsize=16, fontweight='bold')
plt.legend(loc="lower right", fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print(f"‚úÖ AUC Score: {roc_auc:.4f}")

## 📊 10. Visualisations Avancées et Analyses ML