In [1]:
import pandas as pd
import datetime

## Lecture des données

In [32]:


file_path = "data/dataprocess_2.csv"

data = pd.read_csv(file_path,sep=",",decimal=".")

data['DateProd'] = data['DateProd'].apply(lambda x: datetime.datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S"))
data.set_index('DateProd',inplace=True)

In [33]:
features = ['type','outl','delta_tick','width','var_env']
target   = 'conso'

data = data[[target]+features]

In [82]:
data.head()

Unnamed: 0_level_0,conso,type,outl,delta_tick,width,var_env
DateProd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-01-01 00:54:00,71.952273,prod_5,43.352014,180.909545,1312.553135,1104.613101
2025-01-01 01:06:00,92.218043,prod_3,35.288263,181.543252,1329.950497,1109.759769
2025-01-01 01:12:00,94.023651,autres,27.451722,182.084694,1375.44961,1110.096384
2025-01-01 01:24:00,82.887874,prod_5,40.885472,180.703321,1439.928797,1097.483233
2025-01-01 01:30:00,122.355329,autres,26.984735,182.766184,1463.714585,1102.624784


## Suppression des outliers

In [34]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

cols = ['conso', 'outl','delta_tick','width','var_env']

# Création d'une grille 2 lignes × 3 colonnes
fig = make_subplots(rows=2, cols=3, subplot_titles=cols)

# Ajout des boxplots
for i, col in enumerate(cols):
    row = i // 3 + 1
    col_pos = i % 3 + 1
    fig.add_trace(go.Box(y=data[col], name=col), row=row, col=col_pos)

fig.update_layout(
    height=600, width=1000,
    title_text="Boxplots des 5 variables (avant suppression outliers)",
    showlegend=False
)

fig.show()


In [35]:
data = data[data.conso>0] # On ne s'intéresse qu'aux consommations > 0
data = data[data.outl>2]
data = data[data.var_env>950]


In [36]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

cols = ['conso', 'outl','delta_tick','width','var_env']

# Création d'une grille 2 lignes × 3 colonnes
fig = make_subplots(rows=2, cols=3, subplot_titles=cols)

# Ajout des boxplots
for i, col in enumerate(cols):
    row = i // 3 + 1
    col_pos = i % 3 + 1
    fig.add_trace(go.Box(y=data[col], name=col), row=row, col=col_pos)

fig.update_layout(
    height=600, width=1000,
    title_text="Boxplots des 5 variables (après suppression outliers)",
    showlegend=False
)

fig.show()

## Construction du preprocesseur

In [37]:
def BuildPrepro(data:pd.DataFrame, features:list):

    """
    Function that build the preprocessor of the model pipeline
    Taking into account possible categorical variables

    """

    from sklearn.preprocessing import OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler
    import numpy as np
    
    num_feat = [f for f in data[features].columns if data.dtypes[f]==np.float64]
    cat_feat = [f for f in data[features].columns if data.dtypes[f]==object]

    if len(cat_feat)>0:
        num_prepro  = StandardScaler()
        cat_prepro  = OneHotEncoder(handle_unknown='ignore')
        prepro =  ColumnTransformer([('num',num_prepro,num_feat),('cat',cat_prepro, cat_feat)])
    else:
        num_prepro = StandardScaler()
        prepro =  ColumnTransformer([('num',num_prepro,num_feat)])

    return prepro

## Réduction de dimensionnalité

In [47]:
nb_type = len(data.type.unique())
print(f"nombre de catégories {nb_type}")

nombre de catégories 90


Le nombre de catégories est assez élevé. On effectue une réduction de dimensionnalité afin de ne conserver que les catégories les plus influentes.
Ici, j'utilise la propriété feature_importance des arbres de régression pour classe le poids des catégories

In [None]:
from sklearn.tree import DecisionTreeRegressor
y = data[target]
X = data[features]
prepro = BuildPrepro(data = data, features = features)
X_onehot = prepro.fit_transform(X)
model_reduction =DecisionTreeRegressor()
model_reduction.fit(X_onehot, y)

In [None]:
df_feat_importance = pd.DataFrame(data={"features":prepro.get_feature_names_out().tolist(), # dataframe avec le poids obtenus pour chaque features
                                        "importance":model_reduction.feature_importances_})


In [52]:
df_feat_importance.head(7)

Unnamed: 0,features,importance
0,num__outl,0.229581
1,num__delta_tick,0.164293
2,num__width,0.325669
3,num__var_env,0.185987
4,cat__type_prod_0,0.00213
5,cat__type_prod_1,0.003265
6,cat__type_prod_10,0.001666


In [None]:
# On ne regarde que les features qui sont associées à un produit
df_feat_importance_prod = df_feat_importance[df_feat_importance['features'].str.contains('prod')]
df_feat_importance_prod.sort_values(by='importance',ascending=False,inplace=True)
df_feat_importance_prod['perc'] = 100*df_feat_importance_prod['importance']/df_feat_importance_prod['importance'].sum()
df_feat_importance_prod['percum'] = df_feat_importance_prod['perc'].cumsum()

# on garde les catégories dont le cumul de l'importance est < 80 %
categorie_conservees = df_feat_importance_prod[df_feat_importance_prod.percum<80]['features'].to_list()

# on supprime la chaine "cat__type_" ajoutée par le transformeur
categorie_conservees =  [c.replace("cat__type_", "") for c in categorie_conservees] 

# On crée un dictionnaire pour mapper les types de produits les plus influents dans le dataframe "data"

dico_map_prod = {
    p: p if p in categorie_conservees else 'autres'
    for p in data.type.unique()
}

data['type'] = data['type'].map(dico_map_prod)

## Construction du modèle
Ici in utilise un lightgbm

In [75]:
from lightgbm import LGBMRegressor
import lightgbm

In [76]:
def objective(trial,X_trs,Y):

    from sklearn.model_selection import train_test_split
    from sklearn.metrics import root_mean_squared_error
    import numpy as np
    import warnings
    warnings.simplefilter('ignore')

    train_x, test_x, train_y, test_y = train_test_split(X_trs, Y, test_size=0.2,random_state=42)

    callbacks = [lightgbm.early_stopping(100, verbose=0), lightgbm.log_evaluation(period=0)]

    model = LGBMRegressor(verbosity = -1)

    
    param = { 
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5,0.6,0.7]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.02,0.04,0.08,0.12]),
        'max_depth': trial.suggest_categorical('max_depth', [4,5,6]),
        'n_estimators':trial.suggest_int('n_estimators',200,500,10),
        'num_leaves' : trial.suggest_int('num_leaves',100,200,20),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'subsample': trial.suggest_categorical('subsample', [0.7,0.8,0.9])
    }

    fixed_hp =   {
            'metric': 'rmse', 
            'random_state': 48,
            'verbose': -1
        }

    for p, pv in fixed_hp.items():
        param[p] = pv

    model = LGBMRegressor(**param)

    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],callbacks=callbacks)

    preds_train = model.predict(train_x)  
    rmse_train = root_mean_squared_error(train_y, preds_train)
    preds_test = model.predict(test_x)
    rmse_test = root_mean_squared_error(test_y, preds_test)

    alpha_overfit = 0.4
    score_final = alpha_overfit*rmse_train + (1-alpha_overfit)*np.abs(rmse_train-rmse_test)
    
    return score_final

In [77]:
def FindHyperParams(data:pd.DataFrame, target:str, features:list):

    import optuna
    import warnings
    from tqdm import TqdmExperimentalWarning

    optuna.logging.set_verbosity(optuna.logging.INFO)
    warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)


    data.dropna(inplace=True)
    Y  = data[target]
    X  = data[features]

    prepro = BuildPrepro(data = data, features = features)

    X_trs = prepro.fit_transform(X)
    
    study = optuna.create_study(direction='minimize')

    Ntrial = 50


    study.optimize(lambda trial: objective(trial, X_trs, Y), n_trials = Ntrial)
    best_params = study.best_trial.params

    return best_params


In [78]:
best_params = FindHyperParams(data=data,target=target,features=features)


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

[I 2025-10-09 18:04:23,773] A new study created in memory with name: no-name-dd00fa2e-e262-4923-b038-270b15ee255a
[I 2025-10-09 18:04:27,774] Trial 0 finished with value: 12.650588118712353 and parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.02, 'max_depth': 4, 'n_estimators': 360, 'num_leaves': 180, 'reg_alpha': 0.15233282647912497, 'reg_lambda': 0.07737800224228827, 'subsample': 0.7}. Best is trial 0 with value: 12.650588118712353.
[I 2025-10-09 18:04:28,356] Trial 1 finished with value: 12.705203679965214 and parameters: {'colsample_bytree': 0.5, 'learning_rate': 0.12, 'max_depth': 4, 'n_estimators': 330, 'num_leaves': 100, 'reg_alpha': 0.00520802805440569, 'reg_lambda': 0.42236168133070023, 'subsample': 0.8}. Best is trial 0 with value: 12.650588118712353.
[I 2025-10-09 18:04:29,119] Trial 2 finished with value: 12.640146684209462 and parameters: {

In [79]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

ml_pipeline = Pipeline([("preprocessor",prepro),("model", LGBMRegressor(**best_params))])
Y  = data[target]
X  = data[features]
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.2,random_state=42)

ml_pipeline.fit(train_x,train_y)

preds_train = ml_pipeline.predict(train_x)  
preds_test  = ml_pipeline.predict(test_x)  
r2_train = r2_score(train_y,preds_train)
r2_test = r2_score(test_y,preds_test)

In [81]:
r2_test

0.23835836068876026