In [23]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [24]:
# Chargement des données depuis un fichier CSV
data = pd.read_csv("readmission_des_patients.csv")

# Affichage des premières lignes pour vérification
data.head()

Unnamed: 0,subject_id,HADM_ID,gender,dob,HeightInMeters,WeightInKilograms,BMI,SmokerStatus,ECigaretteUsage,AlcoholDrinkers,...,AdmissionDayOfWeek,DIAGNOSIS,ADMISSION_LOCATION,ADMISSION_TYPE,length_of_stay,facility_cost,procedure_cost,medication_cost,lab_test_cost,total_cost
0,10017,199207,F,2075-09-21 00:00:00,177999997138977,9525,301299991607666,Former smoker,Never used e-cigarettes in my entire life,0,...,2,HUMERAL FRACTURE,EMERGENCY ROOM ADMIT,EMERGENCY,8,8000.0,3000.0,42000.0,47700.0,100700.0
1,10013,165520,F,2038-09-03 00:00:00,177999997138977,712099990844727,225300006866455,Never smoked,Never used e-cigarettes in my entire life,1,...,5,SEPSIS,TRANSFER FROM HOSP/EXTRAM,EMERGENCY,2,2000.0,1500.0,3600.0,14800.0,21900.0
2,10011,105331,F,2090-06-05 00:00:00,160000002384186,716699981689453,279899997711182,Former smoker,Never used e-cigarettes in my entire life,0,...,4,HEPATITIS B,TRANSFER FROM HOSP/EXTRAM,EMERGENCY,13,13000.0,3000.0,0.0,70000.0,86000.0
3,10006,142345,F,2094-03-05 00:00:00,162999999523163,848199996948242,320999984741211,Former smoker,Never used e-cigarettes in my entire life,0,...,3,SEPSIS,EMERGENCY ROOM ADMIT,EMERGENCY,8,8000.0,10500.0,21600.0,196300.0,236400.0
4,10019,177759,M,2114-06-20 00:00:00,167999994754791,780199966430664,277600002288818,Never smoked,Never used e-cigarettes in my entire life,0,...,7,ALCOHOLIC HEPATITIS,TRANSFER FROM HOSP/EXTRAM,EMERGENCY,0,0.0,6000.0,0.0,28700.0,34700.0


In [26]:
# Afficher les noms des colonnes du DataFrame
print(data.columns)


Index(['subject_id', 'HADM_ID', 'gender', 'dob', 'HeightInMeters',
       'WeightInKilograms', 'BMI', 'SmokerStatus', 'ECigaretteUsage',
       'AlcoholDrinkers', 'CovidPos', 'LANGUAGE', 'ETHNICITY',
       'MARITAL_STATUS', 'HadHeartAttack', 'HadAngina', 'HadStroke',
       'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder',
       'HadKidneyDisease', 'HadArthritis', 'HadDiabetes',
       'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'ChestScan',
       'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap',
       'AdmissionDate', 'AdmissionYear', 'AdmissionDayOfWeek', 'DIAGNOSIS',
       'ADMISSION_LOCATION', 'ADMISSION_TYPE', 'length_of_stay',
       'facility_cost', 'procedure_cost', 'medication_cost', 'lab_test_cost',
       'total_cost'],
      dtype='object')


In [27]:

# Sélection des colonnes pour l'encodage
categorical_cols = ['AlcoholDrinkers', 'CovidPos', 'HadHeartAttack', 'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD']

# Encoder les variables catégorielles (True -> 1, False -> 0)
for col in categorical_cols:
    data[col] = data[col].replace({True: 1, False: 0})

# Sélectionner les variables d'entrée (X) et la variable cible (y)
X = data[['AlcoholDrinkers', 'CovidPos', 'HadHeartAttack', 'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD']]  # Exemple d'entrée
y = data['length_of_stay']  # Durée du séjour

# Séparation des données en train et test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalisation des données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Entraîner un modèle de régression linéaire
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Prédictions sur les données de test
y_pred = model.predict(X_test_scaled)

# Évaluer le modèle
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Afficher les résultats
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 501.85889903115464
R-squared: 0.07116092149700881


In [28]:

# Encodage des colonnes catégorielles (True/False -> 1/0)
categorical_cols = ['AlcoholDrinkers', 'CovidPos', 'HadHeartAttack', 'HadAngina', 
                    'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD']
for col in categorical_cols:
    data[col] = data[col].replace({True: 1, False: 0})

# Sélection des features (vous pouvez en ajouter selon votre dataset)
features = ['AlcoholDrinkers', 'CovidPos', 'HadHeartAttack', 'HadAngina',
            'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD']
X = data[features]
y = data['length_of_stay']

# Séparation des données
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalisation
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Modèle : Gradient Boosting Regressor
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train_scaled, y_train)

# Prédictions
y_pred = model.predict(X_test_scaled)

# Évaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Affichage des résultats
print(f'Mean Squared Error: {mse:.2f}')
print(f'R-squared Score: {r2:.2f}')

Mean Squared Error: 484.07
R-squared Score: 0.10


In [31]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Encodage des colonnes catégorielles (True/False -> 1/0)
categorical_cols = ['AlcoholDrinkers', 'CovidPos', 'HadHeartAttack', 'HadAngina', 
                    'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD']
for col in categorical_cols:
    data[col] = data[col].replace({True: 1, False: 0})

# Définir les features
features = ['AlcoholDrinkers', 'CovidPos', 'HadHeartAttack', 'HadAngina',
            'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD']
X = data[features]
y = data['length_of_stay']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalisation
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# GridSearchCV pour le fine-tuning
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0]
}

gbr = GradientBoostingRegressor(random_state=42)

grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid,
                           cv=5, scoring='r2', n_jobs=-1, verbose=1)

grid_search.fit(X_train_scaled, y_train)

# Meilleur modèle
best_model = grid_search.best_estimator_

# Prédiction
y_pred = best_model.predict(X_test_scaled)

# Évaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Résultats
print("Best parameters found:", grid_search.best_params_)
print(f'Mean Squared Error: {mse:.2f}')
print(f'R-squared Score: {r2:.2f}')

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
Mean Squared Error: 454.20
R-squared Score: 0.16


In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score



# 2. Encoder les colonnes booléennes (True/False)
bool_cols = ['AlcoholDrinkers', 'CovidPos', 'HadHeartAttack', 'HadAngina',
             'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD']
for col in bool_cols:
    if col in data.columns:
        data[col] = data[col].astype(int)

# 3. Encoder automatiquement les colonnes catégorielles (type object)
object_cols = data.select_dtypes(include='object').columns
data = pd.get_dummies(data, columns=object_cols, drop_first=True)

# 4. Définir les features et la target
X = data.drop(columns=['length_of_stay'])  # Variable cible
y = data['length_of_stay']

# 5. Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Normalisation
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 7. GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

# 8. Évaluation
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test_scaled)

print("✅ Best parameters found:", grid_search.best_params_)
print("📉 Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("📈 R-squared Score:", r2_score(y_test, y_pred))


Fitting 5 folds for each of 48 candidates, totalling 240 fits
✅ Best parameters found: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
📉 Mean Squared Error: 281.53845518822624
📈 R-squared Score: 0.4789293967188296


In [34]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(5, 30),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', None]
}

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                   n_iter=100, cv=5, scoring='neg_mean_squared_error', 
                                   n_jobs=-1, random_state=42, verbose=2)
random_search.fit(X_train_scaled, y_train)

best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

print("✅ Best params:", random_search.best_params_)
print("📉 MSE:", mean_squared_error(y_test, y_pred))
print("📈 R²:", r2_score(y_test, y_pred))


Fitting 5 folds for each of 100 candidates, totalling 500 fits
✅ Best params: {'max_depth': 26, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 223}
📉 MSE: 85.98860516306071
📈 R²: 0.8408525246127863


In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import randint
import joblib

# 🔄 Chargement des données (ajoute ton propre chemin si nécessaire)
# data = pd.read_csv("chemin/vers/ton_fichier.csv")

# ✅ 1. Encodage des booléens
bool_cols = ['AlcoholDrinkers', 'CovidPos', 'HadHeartAttack', 'HadAngina',
             'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD']
for col in bool_cols:
    if col in data.columns:
        data[col] = data[col].astype(int)

# ✅ 2. Encodage des variables catégorielles
object_cols = data.select_dtypes(include='object').columns
data = pd.get_dummies(data, columns=object_cols, drop_first=True)

# ✅ 3. Features et Target
X = data.drop(columns=['length_of_stay'])
y = data['length_of_stay']

# ✅ 4. Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ 5. Normalisation
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ✅ 6. Modèle de base
rf = RandomForestRegressor(random_state=42)


# ✅ 8. RandomizedSearchCV (plus puissant)
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(5, 30),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', None]
}

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                   n_iter=100, cv=5, scoring='neg_mean_squared_error', 
                                   n_jobs=-1, random_state=42, verbose=2)
random_search.fit(X_train_scaled, y_train)

# ✅ 9. Évaluation finale
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

print("✅ Best params from RandomizedSearchCV:", random_search.best_params_)
print("📉 Mean Squared Error :", mean_squared_error(y_test, y_pred))
print("📈 R² Score:", r2_score(y_test, y_pred))

# ✅ 10. Exporter le modèle et le scaler
joblib.dump(best_model, 'best_rf_model.pkl')



Fitting 5 folds for each of 100 candidates, totalling 500 fits
✅ Best params from RandomizedSearchCV: {'max_depth': 26, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 223}
📉 Mean Squared Error: 85.98860516306071
📈 R² Score: 0.8408525246127863


['best_rf_model.pkl']

In [38]:
loaded_model = joblib.load('best_rf_model.pkl')

