# Lecture des données

In [18]:
import pandas as pd

data = pd.read_csv("data/dktcc.csv",sep=";",decimal=",")

In [None]:
file_excel = "data/Données UltiWatt_Coils_2019_2020_2021.xlsx"

df = pd.read_excel(file_excel)


In [None]:
col_kept = ['Date heure défourn','reco_metal_bra_og' ,'Long Bra',
             'Poids Bra',
             'Epais Bra théor.',
             'Larg Brame théor.',
             'lonbo_ret',
             'Epf 1',
             'Epf 2',
             'Epf 3',
             'Epf 4',
             'Epf 5',
             'Epf 6',
             'Epf 7',
             'pu_moy_f1_fin',
             'pu_moy_f2_fin',
             'pu_moy_f3_fin',
             'pu_moy_f4_fin',
             'pu_moy_f5_fin',
             'pu_moy_f6_fin',
             'pu_moy_f7_fin',
             'Tpf1']


                        

In [48]:
df['Tpf1']

0    989.729065
1    989.480591
2    985.722351
3    992.792542
4    991.517456
Name: Tpf1, dtype: float64

In [24]:
data["METAL"] = data["METAL"].astype(str)

In [25]:
#target = 'conso'
#features = ['tconserv', 'width', 'height','length', 'text', 'product']

target = 'PU_MOY_F2'
features = ['D_EPF_2','D_EPF_1', 'LARG_S_E5', 'METAL']


In [39]:
data = data.iloc[1:150000]

# Preprocessor building

In [26]:
def BuildPrepro(data:pd.DataFrame, features:list):

    """
    Function that build the preprocessor of the model pipeline
    Taking into account possible categorical variables

    """

    from sklearn.preprocessing import OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler
    import numpy as np
    
    num_feat = [f for f in data[features].columns if data.dtypes[f]==np.float64]
    cat_feat = [f for f in data[features].columns if data.dtypes[f]==object]

    if len(cat_feat)>0:
        num_prepro  = StandardScaler()
        cat_prepro  = OneHotEncoder(handle_unknown='ignore')
        prepro =  ColumnTransformer([('num',num_prepro,num_feat),('cat',cat_prepro, cat_feat)])
    else:
        num_prepro = StandardScaler()
        prepro =  ColumnTransformer([('num',num_prepro,num_feat)])

    return prepro

In [27]:
prepro = BuildPrepro(data = data, features = features)

In [14]:
prepro

# Modèle building

In [28]:
from lightgbm import LGBMRegressor
import lightgbm
from sklearn.pipeline import Pipeline

model = Pipeline([("preprocessor",prepro),("model", LGBMRegressor())])

In [20]:
model

In [29]:
def objective(trial,X_trs,Y):

    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error
    import numpy as np
    import warnings
    warnings.simplefilter('ignore')

    train_x, test_x, train_y, test_y = train_test_split(X_trs, Y, test_size=0.2,random_state=42)

    callbacks = [lightgbm.early_stopping(100, verbose=0), lightgbm.log_evaluation(period=0)]

    model = LGBMRegressor(verbosity = -1)

    
    param = { 
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5,0.6,0.7]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.02,0.04,0.08,0.12]),
        'max_depth': trial.suggest_categorical('max_depth', [4,5,6]),
        'n_estimators':trial.suggest_int('n_estimators',200,500,10),
        'num_leaves' : trial.suggest_int('num_leaves',100,200,20),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'subsample': trial.suggest_categorical('subsample', [0.7,0.8,0.9])
    }

    fixed_hp =   {
            'metric': 'rmse', 
            'random_state': 48,
            'verbose': -1
        }

    for p, pv in fixed_hp.items():
        param[p] = pv

    model = LGBMRegressor(**param)

    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],callbacks=callbacks)

    preds_train = model.predict(train_x)  
    rmse_train = mean_squared_error(train_y, preds_train,squared=False)
    preds_test = model.predict(test_x)
    rmse_test = mean_squared_error(test_y, preds_test,squared=False)

    alpha_overfit = 0.4
    score_final = alpha_overfit*rmse_train + (1-alpha_overfit)*np.abs(rmse_train-rmse_test)
    
    return score_final



In [30]:
def FindHyperParams(data:pd.DataFrame, target:str, features:list):

    import optuna
    import warnings
    from tqdm import TqdmExperimentalWarning

    optuna.logging.set_verbosity(optuna.logging.INFO)
    warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)


    data.dropna(inplace=True)
    Y  = data[target]
    X  = data[features]

    X_trs = prepro.fit_transform(X)
    
    study = optuna.create_study(direction='minimize')

    Ntrial = 100


    study.optimize(lambda trial: objective(trial, X_trs, Y), n_trials = Ntrial)
    best_params = study.best_trial.params

    return best_params



In [40]:
best_params = FindHyperParams(data=data,target=target,features=features)

[I 2025-10-06 15:28:11,729] A new study created in memory with name: no-name-74caaefe-2fa2-4024-8b7b-c4bbd205a2d4
[I 2025-10-06 15:28:16,558] Trial 0 finished with value: 173681.8316799301 and parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.08, 'max_depth': 6, 'n_estimators': 360, 'num_leaves': 200, 'reg_alpha': 0.06002613093229995, 'reg_lambda': 0.10881650848384285, 'subsample': 0.8}. Best is trial 0 with value: 173681.8316799301.
[I 2025-10-06 15:28:20,082] Trial 1 finished with value: 199346.0759753144 and parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.02, 'max_depth': 6, 'n_estimators': 220, 'num_leaves': 160, 'reg_alpha': 2.5338995922171885, 'reg_lambda': 0.2881980682386507, 'subsample': 0.7}. Best is trial 0 with value: 173681.8316799301.
[I 2025-10-06 15:28:26,390] Trial 2 finished with value: 176808.23261873447 and parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.04, 'max_depth': 6, 'n_estimators': 470, 'num_leaves': 120, 'reg_alpha': 2.29683617939630

In [41]:
best_params

{'colsample_bytree': 0.7,
 'learning_rate': 0.12,
 'max_depth': 6,
 'n_estimators': 500,
 'num_leaves': 120,
 'reg_alpha': 0.06230713696353929,
 'reg_lambda': 0.009534854073191164,
 'subsample': 0.7}

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

ml_pipeline = Pipeline([("preprocessor",prepro),("model", LGBMRegressor(**best_params))])
Y  = data[target]
X  = data[features]
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.2,random_state=42)

ml_pipeline.fit(train_x,train_y)

preds_train = ml_pipeline.predict(train_x)  
preds_test  = ml_pipeline.predict(test_x)  
r2_train = r2_score(train_y,preds_train)


In [44]:
r2_test = r2_score(test_y,preds_test)

In [45]:
r2_test

0.9146337030596321