In [None]:
import s3fs
import datetime
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
import lightgbm

import warnings
import optuna
from tqdm import TqdmExperimentalWarning

import plotly.graph_objects as go

# Lecture des données
### Ici je lis les données qui sont stockées sur un bucket S3 d'AWS

In [None]:
# Nom du bucket et chemin du fichier
bucket_name = "david-mlops-bucket"
fichier_s3  = "presse a balle/baler2 consumption.csv"


# URL complète vers le fichier
s3_path = f"s3://{bucket_name}/{fichier_s3}"
data    = pd.read_csv(s3_path,sep=",",decimal=".")

In [3]:
data.head(5)

Unnamed: 0,Date,baler_power_kw,ball_tick_before,ball_tick_after,ball_area,material
0,2023-01-01 00:00:00,4438.748,16.998037,9.217524,760.266235,material_0
1,2023-01-01 00:06:00,4351.82,16.97756,9.218289,760.719604,material_0
2,2023-01-01 00:12:00,4716.703,16.932531,9.16925,761.623596,material_0
3,2023-01-01 00:18:00,4423.5105,16.97716,9.211286,761.676086,material_0
4,2023-01-01 00:24:00,4405.9885,16.961489,9.189891,761.159485,material_0


In [None]:
# Transformation de la Date en datetime et utilisation de celle-ci comme index

data['Date'] = data['Date'].apply(lambda x: datetime.datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S"))
data.set_index('Date',inplace=True)


# On ne conserve que l'année 2023
data = data[data.index<= "2024-01-01 00:00:00"]

In [5]:
data.describe()

Unnamed: 0,baler_power_kw,ball_tick_before,ball_tick_after,ball_area
count,87601.0,87601.0,87601.0,87601.0
mean,6025.376031,20.188281,12.564014,1234.670916
std,1441.353819,2.117603,2.288283,231.92801
min,1428.06925,15.417809,7.403808,719.457275
25%,4862.937,18.920767,11.089732,1041.442749
50%,5941.809,20.108919,12.374262,1253.362549
75%,7070.8825,21.339254,13.83996,1394.922607
max,12677.891,35.0,27.092447,1936.587769


# Preprocesseur
### construction du preprocesseur qui effectue une standardisation des features numériques et un encodage des variables catégorielles

In [None]:
def BuildPrepro(data:pd.DataFrame, features:list):

    """
    Function that build the preprocessor of the model pipeline
    Taking into account possible categorical variables

    """

    num_feat = [f for f in data[features].columns if data.dtypes[f]==np.float64]
    cat_feat = [f for f in data[features].columns if data.dtypes[f]==object]

    if len(cat_feat)>0:
        num_prepro  = StandardScaler()
        cat_prepro  = OneHotEncoder(handle_unknown='ignore')
        prepro =  ColumnTransformer([('num',num_prepro,num_feat),('cat',cat_prepro, cat_feat)])
    else:
        num_prepro = StandardScaler()
        prepro =  ColumnTransformer([('num',num_prepro,num_feat)])

    return prepro

In [7]:
features = ['ball_tick_before', 'ball_tick_after', 'ball_area','material']
target   = 'baler_power_kw'
prepro = BuildPrepro(data = data, features = features)

In [10]:
prepro

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


# Recherche des meilleurs hyperparamètres

In [None]:
def objective(trial,X_trs,Y):


    warnings.simplefilter('ignore')

    train_x, test_x, train_y, test_y = train_test_split(X_trs, Y, test_size=0.2,random_state=42)

    callbacks = [lightgbm.early_stopping(100, verbose=0), lightgbm.log_evaluation(period=0)]

    model = LGBMRegressor(verbosity = -1)

    
    param = { 
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5,0.6,0.7]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.02,0.04,0.08,0.12]),
        'max_depth': trial.suggest_categorical('max_depth', [4,5,6]),
        'n_estimators':trial.suggest_int('n_estimators',200,500,10),
        'num_leaves' : trial.suggest_int('num_leaves',100,200,20),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'subsample': trial.suggest_categorical('subsample', [0.7,0.8,0.9])
    }

    fixed_hp =   {
            'metric': 'rmse', 
            'random_state': 48,
            'verbose': -1
        }

    for p, pv in fixed_hp.items():
        param[p] = pv

    model = LGBMRegressor(**param)

    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],callbacks=callbacks)

    preds_train = model.predict(train_x)  
    rmse_train = root_mean_squared_error(train_y, preds_train)
    preds_test = model.predict(test_x)
    rmse_test = root_mean_squared_error(test_y, preds_test)

    alpha_overfit = 0.4
    score_final = alpha_overfit*rmse_train + (1-alpha_overfit)*np.abs(rmse_train-rmse_test)
    
    return score_final



In [None]:
def FindHyperParams(data:pd.DataFrame, target:str, features:list):


    optuna.logging.set_verbosity(optuna.logging.INFO)
    warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)


    data.dropna(inplace=True)
    Y  = data[target]
    X  = data[features]

    X_trs = prepro.fit_transform(X)
    
    study = optuna.create_study(direction='minimize')

    Ntrial = 50


    study.optimize(lambda trial: objective(trial, X_trs, Y), n_trials = Ntrial)
    best_params = study.best_trial.params

    return best_params



In [11]:
best_params = FindHyperParams(data=data,target=target,features=features)

[I 2025-10-07 14:17:04,014] A new study created in memory with name: no-name-08889631-d67a-4438-aae5-9e087ab0147f
[I 2025-10-07 14:17:05,421] Trial 0 finished with value: 204.34212579762135 and parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.02, 'max_depth': 4, 'n_estimators': 360, 'num_leaves': 200, 'reg_alpha': 0.039756352589000274, 'reg_lambda': 1.3348861991583914, 'subsample': 0.7}. Best is trial 0 with value: 204.34212579762135.
[I 2025-10-07 14:17:06,557] Trial 1 finished with value: 192.8512346588364 and parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.04, 'max_depth': 4, 'n_estimators': 320, 'num_leaves': 120, 'reg_alpha': 0.004136872358206065, 'reg_lambda': 1.5486565216320067, 'subsample': 0.7}. Best is trial 1 with value: 192.8512346588364.
[I 2025-10-07 14:17:07,292] Trial 2 finished with value: 182.84760429330726 and parameters: {'colsample_bytree': 0.5, 'learning_rate': 0.08, 'max_depth': 4, 'n_estimators': 300, 'num_leaves': 160, 'reg_alpha': 0.0452903212

## Je relance le modèle avec les meilleur hyper paramètres

In [None]:
ml_pipeline = Pipeline([("preprocessor",prepro),("model", LGBMRegressor(**best_params))])
Y  = data[target]
X  = data[features]
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.2,random_state=42)

ml_pipeline.fit(train_x,train_y)

preds_train = ml_pipeline.predict(train_x)  
preds_test  = ml_pipeline.predict(test_x)  
r2_train = r2_score(train_y,preds_train)
r2_test = r2_score(test_y,preds_test)

In [13]:
print(f"R2 score training {r2_train}; R2 score test {r2_test}")

R2 score training 0.9233249724876981; R2 score test 0.9153967232774238


In [19]:
Y.index

DatetimeIndex(['2023-01-01 00:00:00', '2023-01-01 00:06:00',
               '2023-01-01 00:12:00', '2023-01-01 00:18:00',
               '2023-01-01 00:24:00', '2023-01-01 00:30:00',
               '2023-01-01 00:36:00', '2023-01-01 00:42:00',
               '2023-01-01 00:48:00', '2023-01-01 00:54:00',
               ...
               '2023-12-31 23:06:00', '2023-12-31 23:12:00',
               '2023-12-31 23:18:00', '2023-12-31 23:24:00',
               '2023-12-31 23:30:00', '2023-12-31 23:36:00',
               '2023-12-31 23:42:00', '2023-12-31 23:48:00',
               '2023-12-31 23:54:00', '2024-01-01 00:00:00'],
              dtype='datetime64[ns]', name='Date', length=87601, freq=None)

# Visualisation

In [20]:
df_result = pd.DataFrame(index = Y.index, data = {'mesure':Y.values,'modele':ml_pipeline.predict(X)})

In [None]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_result.index,
    y=df_result['mesure'],
    mode="lines",
    name="Conso mesurée"
))


fig.add_trace(go.Scatter(
    x=df_result.index,
    y=df_result['modele'],
    mode="lines",
    name="Conso modélisée"
))


fig.update_layout(
    title="Comparaison modèle/mesure",
    xaxis_title="Date",
    yaxis_title="Power (kW)"
)

fig.show()