# Lecture des données

In [80]:
import pandas as pd

file_path = "data/presse balles/baler2 consumption.csv"

data = pd.read_csv(file_path,sep=",",decimal=".")

In [87]:
features = ['ball_tick_before', 'ball_tick_after', 'ball_area','material']
target   = 'baler_power_kw'

In [None]:
import datetime
data['Date'] = data['Date'].apply(lambda x: datetime.datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S"))
data.set_index('Date',inplace=True)

data = data[data.index<= "2024-01-01 00:00:00"]

In [90]:
data

Unnamed: 0_level_0,baler_power_kw,ball_tick_before,ball_tick_after,ball_area,material
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01 00:00:00,4438.7480,16.998037,9.217524,760.266235,material_0
2023-01-01 00:06:00,4351.8200,16.977560,9.218289,760.719604,material_0
2023-01-01 00:12:00,4716.7030,16.932531,9.169250,761.623596,material_0
2023-01-01 00:18:00,4423.5105,16.977160,9.211286,761.676086,material_0
2023-01-01 00:24:00,4405.9885,16.961489,9.189891,761.159485,material_0
...,...,...,...,...,...
2023-12-31 23:36:00,6198.0045,20.833031,13.594015,1204.955811,material_19
2023-12-31 23:42:00,5842.2795,20.769112,13.540275,1205.182129,material_19
2023-12-31 23:48:00,6079.5735,20.827662,13.586188,1205.799316,material_19
2023-12-31 23:54:00,6201.0450,19.291733,12.027036,1186.562256,material_19


# Preprocessor building

In [91]:
def BuildPrepro(data:pd.DataFrame, features:list):

    """
    Function that build the preprocessor of the model pipeline
    Taking into account possible categorical variables

    """

    from sklearn.preprocessing import OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler
    import numpy as np
    
    num_feat = [f for f in data[features].columns if data.dtypes[f]==np.float64]
    cat_feat = [f for f in data[features].columns if data.dtypes[f]==object]

    if len(cat_feat)>0:
        num_prepro  = StandardScaler()
        cat_prepro  = OneHotEncoder(handle_unknown='ignore')
        prepro =  ColumnTransformer([('num',num_prepro,num_feat),('cat',cat_prepro, cat_feat)])
    else:
        num_prepro = StandardScaler()
        prepro =  ColumnTransformer([('num',num_prepro,num_feat)])

    return prepro

In [92]:
prepro = BuildPrepro(data = data, features = features)

In [101]:
prepro

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


# Modèle building

In [59]:
from lightgbm import LGBMRegressor
import lightgbm

In [93]:
def objective(trial,X_trs,Y):

    from sklearn.model_selection import train_test_split
    from sklearn.metrics import root_mean_squared_error
    import numpy as np
    import warnings
    warnings.simplefilter('ignore')

    train_x, test_x, train_y, test_y = train_test_split(X_trs, Y, test_size=0.2,random_state=42)

    callbacks = [lightgbm.early_stopping(100, verbose=0), lightgbm.log_evaluation(period=0)]

    model = LGBMRegressor(verbosity = -1)

    
    param = { 
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5,0.6,0.7]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.02,0.04,0.08,0.12]),
        'max_depth': trial.suggest_categorical('max_depth', [4,5,6]),
        'n_estimators':trial.suggest_int('n_estimators',200,500,10),
        'num_leaves' : trial.suggest_int('num_leaves',100,200,20),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'subsample': trial.suggest_categorical('subsample', [0.7,0.8,0.9])
    }

    fixed_hp =   {
            'metric': 'rmse', 
            'random_state': 48,
            'verbose': -1
        }

    for p, pv in fixed_hp.items():
        param[p] = pv

    model = LGBMRegressor(**param)

    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],callbacks=callbacks)

    preds_train = model.predict(train_x)  
    rmse_train = root_mean_squared_error(train_y, preds_train)
    preds_test = model.predict(test_x)
    rmse_test = root_mean_squared_error(test_y, preds_test)

    alpha_overfit = 0.4
    score_final = alpha_overfit*rmse_train + (1-alpha_overfit)*np.abs(rmse_train-rmse_test)
    
    return score_final



In [94]:
def FindHyperParams(data:pd.DataFrame, target:str, features:list):

    import optuna
    import warnings
    from tqdm import TqdmExperimentalWarning

    optuna.logging.set_verbosity(optuna.logging.INFO)
    warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)


    data.dropna(inplace=True)
    Y  = data[target]
    X  = data[features]

    X_trs = prepro.fit_transform(X)
    
    study = optuna.create_study(direction='minimize')

    Ntrial = 50


    study.optimize(lambda trial: objective(trial, X_trs, Y), n_trials = Ntrial)
    best_params = study.best_trial.params

    return best_params



In [95]:
best_params = FindHyperParams(data=data,target=target,features=features)

[I 2025-10-07 12:34:13,734] A new study created in memory with name: no-name-57e58656-9b20-4675-9cc0-0a8a27282ce3
[I 2025-10-07 12:34:14,322] Trial 0 finished with value: 186.74072482648586 and parameters: {'colsample_bytree': 0.5, 'learning_rate': 0.08, 'max_depth': 4, 'n_estimators': 210, 'num_leaves': 180, 'reg_alpha': 0.30778300937699216, 'reg_lambda': 0.08664467559927622, 'subsample': 0.7}. Best is trial 0 with value: 186.74072482648586.
[I 2025-10-07 12:34:15,306] Trial 1 finished with value: 176.9639436521972 and parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.12, 'max_depth': 4, 'n_estimators': 330, 'num_leaves': 100, 'reg_alpha': 0.04503768054417753, 'reg_lambda': 0.004042776168968736, 'subsample': 0.9}. Best is trial 1 with value: 176.9639436521972.
[I 2025-10-07 12:34:16,087] Trial 2 finished with value: 189.0547717347045 and parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.04, 'max_depth': 5, 'n_estimators': 230, 'num_leaves': 200, 'reg_alpha': 0.2196992221

In [97]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

ml_pipeline = Pipeline([("preprocessor",prepro),("model", LGBMRegressor(**best_params))])
Y  = data[target]
X  = data[features]
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.2,random_state=42)

ml_pipeline.fit(train_x,train_y)

preds_train = ml_pipeline.predict(train_x)  
preds_test  = ml_pipeline.predict(test_x)  
r2_train = r2_score(train_y,preds_train)
r2_test = r2_score(test_y,preds_test)

In [99]:
print(f"R2 score training {r2_train}; R2 score test {r2_test}")

R2 score training 0.9287114552346329; R2 score test 0.9181654104135107


In [100]:
r2_test

0.9181654104135107