In [1]:
# Installer les packages nécessaires
!pip install pandas scikit-learn matplotlib seaborn joblib




In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import joblib
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
from google.colab import files

uploaded_files = files.upload()  # upload fichiers dans dossier data

dfs = []
for filename in uploaded_files.keys():
    df = pd.read_csv(filename, delimiter=',')
    dfs.append(df)

# Combiner tous les fichiers
data = pd.concat(dfs, ignore_index=True)
print(data.shape)
data.head()


Saving Apr_2023.csv to Apr_2023 (1).csv
Saving Aug_2022.csv to Aug_2022 (1).csv
Saving Dec_2022.csv to Dec_2022 (1).csv
(802872, 18)


Unnamed: 0,Timestamp,Battery_Active_Power,Battery_Active_Power_Set_Response,PVPCS_Active_Power,GE_Body_Active_Power,GE_Active_Power,GE_Body_Active_Power_Set_Response,FC_Active_Power_FC_END_Set,FC_Active_Power,FC_Active_Power_FC_end_Set_Response,Island_mode_MCCB_Active_Power,MG-LV-MSB_AC_Voltage,Receiving_Point_AC_Voltage,Island_mode_MCCB_AC_Voltage,Island_mode_MCCB_Frequency,MG-LV-MSB_Frequency,Inlet_Temperature_of_Chilled_Water,Outlet_Temperature
0,2023/04/01 00:00:01,-0.1,0.0,0.0,110.0,87.0,122.0,40.0,38.0,40.0,-123.0,488.0,486.0,488.0,60.040001,60.040001,15.1,15.5
1,2023/04/01 00:00:11,-0.3,0.0,0.0,118.0,120.0,122.0,40.0,38.0,40.0,-87.0,488.0,486.0,488.0,60.040001,60.040001,15.1,15.5
2,2023/04/01 00:00:21,0.0,0.0,0.0,116.0,124.0,122.0,40.0,38.0,40.0,-116.0,488.0,486.0,488.0,60.040001,60.040001,15.1,15.5
3,2023/04/01 00:00:31,-0.1,0.0,0.0,110.0,94.300003,122.0,40.0,38.0,40.0,-115.0,488.0,486.0,488.0,60.049999,60.049999,15.1,15.5
4,2023/04/01 00:00:41,0.0,0.0,0.0,116.0,116.0,122.0,40.0,38.0,40.0,-128.0,488.0,486.0,488.0,60.049999,60.049999,15.1,15.5


In [4]:
# --- Conversion timestamp ---
data['Timestamp'] = pd.to_datetime(data['Timestamp'], errors='coerce')

# --- Resampling journalier : moyenne des valeurs par jour --- =>Pour ne pas avoire des donnes repeter
data_daily = (
    data.set_index('Timestamp')
        .resample('1D')
        .mean()
        .reset_index()
)

# --- Renommage des colonnes ---
data_daily.rename(columns={
    'Battery_Active_Power': 'battery_power',
    'Battery_Active_Power_Set_Response': 'battery_set_response',
    'PVPCS_Active_Power': 'pv_power',
    'GE_Body_Active_Power': 'ge_power_body',
    'GE_Active_Power': 'ge_power_total',
    'GE_Body_Active_Power_Set_Response': 'ge_body_set_response',
    'FC_Active_Power_FC_END_Set': 'fc_setpoint',
    'FC_Active_Power': 'fc_power',
    'FC_Active_Power_FC_end_Set_Response': 'fc_set_response',
    'Island_mode_MCCB_Active_Power': 'mccb_power',
    'MG-LV-MSB_AC_Voltage': 'mg_lv_voltage',
    'Receiving_Point_AC_Voltage': 'receiving_voltage',
    'Island_mode_MCCB_AC_Voltage': 'mccb_voltage',
    'Island_mode_MCCB_Frequency': 'mccb_frequency',
    'MG-LV-MSB_Frequency': 'mg_lv_frequency',
    'Inlet_Temperature_of_Chilled_Water': 'temp_inlet',
    'Outlet_Temperature': 'temp_outlet'
}, inplace=True)

# --- Remplacer NaN par 0 ---
data_daily = data_daily.fillna(0)

# --- Colonnes pour la prédiction ---
features = [
    'battery_power',
    'battery_set_response',
    'pv_power',
    'ge_power_body',
    'ge_power_total',
    'ge_body_set_response',
    'fc_setpoint',
    'fc_power',
    'fc_set_response',
    'mccb_power',
    'receiving_voltage',
    'mccb_voltage',
    'mccb_frequency',
    'mg_lv_frequency',
    'temp_inlet',
    'temp_outlet'
]

target = 'mg_lv_voltage'

# --- Vérifier colonnes manquantes ---
for f in features:
    if f not in data_daily.columns:
        print(f"⚠️ Colonne manquante : {f}, ajoutée avec 0")
        data_daily[f] = 0

# --- Créer X et y ---
X = data_daily[features]
y = data_daily[target]

print(f"Taille X après réduction journalière : {X.shape}")


Taille X après réduction journalière : (273, 16)


In [5]:
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pandas as pd

# --- Suppression des features peu corrélées avec la target ---
corr_matrix = X.corrwith(y).abs()
useful_features = corr_matrix[corr_matrix > 0.05].index.tolist()
X_filtered = X[useful_features]

print("Features conservées :", useful_features)


Features conservées : ['battery_power', 'battery_set_response', 'pv_power', 'ge_power_body', 'ge_power_total', 'ge_body_set_response', 'fc_setpoint', 'fc_power', 'fc_set_response', 'mccb_power', 'receiving_voltage', 'mccb_voltage', 'mccb_frequency', 'mg_lv_frequency', 'temp_inlet', 'temp_outlet']


In [6]:

# --- Définir KFold répété ---
rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

# --- Définir GridSearch pour RandomForest ---
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}


rf = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=rkf,
    scoring='r2',
    n_jobs=-1
)


**NB:**
***K-Fold Cross Validation (CV)***

* Divise dataset en K sous-parties (folds).

* Le modèle est entraîné sur K-1 folds et testé sur le fold restant.



***Grid Search***

* Explore plusieurs combinaisons d’hyperparamètres définis dans une grille.

In [7]:

# --- Entraînement ---
grid_search.fit(X_filtered, y)

best_model = grid_search.best_estimator_

print("Meilleurs paramètres :", grid_search.best_params_)
print("Meilleur R2 CV :", grid_search.best_score_)

# --- Évaluation sur l'ensemble complet ---
y_pred = best_model.predict(X_filtered)
r2 = r2_score(y, y_pred)
mse = mean_squared_error(y, y_pred)

print(f"R2 sur tout le dataset : {r2:.3f}")
print(f"MSE sur tout le dataset : {mse:.3f}")


Meilleurs paramètres : {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Meilleur R2 CV : 0.8760880689594414
R2 sur tout le dataset : 0.960
MSE sur tout le dataset : 42718.429


**Meilleurs hyperparamètres :**

* max_depth = None

* max_features = None

* min_samples_leaf = 1

* min_samples_split = 2

* n_estimators = 500

**Performance (validation croisée) :**

R² CV (moyen) = 0.876 → le modèle explique environ 87.6% de la variance sur les données de validation.

**Performance (tout le dataset) :**

R² global = 0.960 → le modèle explique environ 96% de la variance.

MSE global = 42718.43 → erreur quadratique moyenne sur les prédictions.

In [11]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

xgb = XGBRegressor(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1],
    'colsample_bytree': [0.6, 0.8, 1],
}

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X, y)

print("Meilleurs paramètres :", grid_search.best_params_)
print("Meilleur R2 :", grid_search.best_score_)


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Meilleurs paramètres : {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1}
Meilleur R2 : 0.14853664143010697


In [15]:
best_model2 = grid_search.best_estimator_
from sklearn.metrics import r2_score, mean_squared_error

y_pred2 = best_model2.predict(X_filtered)
r2 = r2_score(y, y_pred2)
mse = mean_squared_error(y, y_pred2)

print(f"R2 Score: {r2:.3f}")
print(f"MSE: {mse:.3f}")


R2 Score: 0.889
MSE: 119277.870


* R² Score = 0.889 → le modèle explique 88.9 % de la variance des données, donc très performant.

* MSE = 119 277.870 → erreur moyenne quadratique ; évaluer la précision selon l’échelle de la variable cible.

**Conclusion :** le modèle Random Forest prédit bien les valeurs avec une bonne qualité d’ajustement.

In [8]:
# Sauvegarde du modèle
import joblib
joblib.dump(best_model, "random_forest_model.pkl")
print("✅ Modèle sauvegardé dans random_forest_model.pkl")


✅ Modèle sauvegardé dans random_forest_model.pkl
