# Pipeline entraînement + SHAP + prédiction (LightGBM + MCA batch)
Notebook nettoyé et reproductible.


version ordinateur

In [42]:
!python -m pip install optuna

import numpy as np
import pandas as pd
import prince
import lightgbm as lgb
import shap

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.17.2-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading sqlalchemy-2.0.45-cp313-cp313-win_amd64.whl.metadata (9.8 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet>=1 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.3.0-cp313-cp313-win_amd64.whl.metadata (4.2 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
Downloading alembic-1.17.2-py3-none-any.whl (248 kB)
Downloading sqlalchemy-2.0.45-cp313-cp313-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------------------------------------- 2.1/2.1 MB 23.5 MB/s  0:00:00
Downloading greenlet-3.3.0-cp313-cp313-win_amd64.whl



In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
import prince

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# -------------------------
# PARAMS
# -------------------------
# Input / output file locations.
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"
OUT_PATH = "submission.csv"

# Identifier column and the regression targets (one model is trained per target).
ID_COL = "id"
TARGET_COLS = ["wip", "investissement", "satisfaction"]

# Number of trailing feature columns treated as "param" columns; every column
# before them is treated as a "demand" column.
N_PARAM_COLS = 51
# Whether the identifier column is removed from the feature matrices.
DROP_ID_COL_IN_X = True

SEED = 42
K_MCA = 20  # number of MCA components

TEST_SIZE = 0.2
EARLY_STOPPING_ROUNDS = 200
N_TRIALS = 40  # 30-60 is a reasonable range

# Base LightGBM parameters (GPU), shared by every model; tuned values are
# layered on top of this dict.
LGB_BASE = {
    "objective": "regression",
    "metric": "mae",
    "boosting_type": "gbdt",
    "n_estimators": 20000,
    "random_state": SEED,
    "n_jobs": -1,
    "verbosity": -1,
    "device_type": "gpu",
    "gpu_use_dp": False,
    "max_bin": 255,
}

# -------------------------
# 1) Load + split X/Y
# -------------------------
# Read the raw train / test tables from the configured CSV paths.
train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)

# Validate the schema with explicit exceptions rather than `assert`:
# assertions are stripped when Python runs with -O and carry no message,
# so a malformed file would otherwise fail later with an obscure error.
for _label, _frame in (("train", train_df), ("test", test_df)):
    if ID_COL not in _frame.columns:
        raise ValueError(f"Column {ID_COL!r} is missing from the {_label} data")
_missing_targets = [col for col in TARGET_COLS if col not in train_df.columns]
if _missing_targets:
    raise ValueError(f"Target columns missing from the train data: {_missing_targets}")

# Features are every non-target column; targets are kept as their own frame.
X_train = train_df.drop(columns=TARGET_COLS)
Y_train = train_df[TARGET_COLS].copy()

# Keep the test identifiers aside: they are written to the submission file
# even when the id column is removed from the features below.
test_ids = test_df[ID_COL].values
X_test = test_df.copy()

if DROP_ID_COL_IN_X:
    X_train = X_train.drop(columns=[ID_COL])
    X_test  = X_test.drop(columns=[ID_COL])

# -------------------------
# 2) demand / param
# -------------------------
def _split_demand_param(frame, n_param):
    """Split `frame` column-wise into (demand, param) copies.

    The last `n_param` columns form the "param" part and everything before
    them the "demand" part. With `n_param == 0` the whole frame is "demand"
    and "param" is an empty frame that shares the same index.
    """
    if n_param == 0:
        return frame.copy(), pd.DataFrame(index=frame.index)
    return frame.iloc[:, :-n_param].copy(), frame.iloc[:, -n_param:].copy()


# The split is positional, so it is applied identically to train and test.
X_train_demand, X_train_param = _split_demand_param(X_train, N_PARAM_COLS)
X_test_demand, X_test_param = _split_demand_param(X_test, N_PARAM_COLS)

# -------------------------
# 3) train/val split
# -------------------------
# The demand features, param features and targets go through a single
# train_test_split call, with a fixed seed, so the three tables are split
# together.
_split_options = {"test_size": TEST_SIZE, "random_state": SEED, "shuffle": True}
Xd_tr, Xd_va, Xp_tr, Xp_va, Y_tr, Y_va = train_test_split(
    X_train_demand, X_train_param, Y_train, **_split_options
)

# -------------------------
# 4) MCA fit + transform
# -------------------------
# NOTE(review): the MCA is fitted on every training row (X_train_demand),
# which includes the rows just held out as the validation fold. The
# validation MAE used for tuning is therefore computed on features that have
# already seen those rows. Fitting on Xd_tr only would avoid this - confirm
# which behaviour is intended before changing it.
print(f"[MCA] Fit sur {X_train_demand.shape}", flush=True)
mca = prince.MCA(n_components=K_MCA, n_iter=3, random_state=SEED)
mca = mca.fit(X_train_demand)

def mca_transform(df):
    """Project `df` with the fitted module-level `mca` model.

    Returns a DataFrame indexed like `df` whose columns are renamed
    ``MCA_1`` ... ``MCA_k``, one per component returned by the transform.
    """
    coords = pd.DataFrame(mca.transform(df), index=df.index)
    coords.columns = [f"MCA_{rank}" for rank in range(1, coords.shape[1] + 1)]
    return coords

# MCA coordinates for the training fold, the validation fold and the test set
# (computed in that order).
Xmca_tr, Xmca_va, Xmca_te = (
    mca_transform(part) for part in (Xd_tr, Xd_va, X_test_demand)
)

# -------------------------
# 5) Build X_final
# -------------------------
def _assemble_features(demand, components, param):
    """Concatenate the three feature groups side by side.

    Each part gets a fresh 0..n-1 index before the column-wise concat, so the
    rows are matched by position. When a column name appears more than once,
    only its first occurrence is kept.
    """
    parts = [block.reset_index(drop=True) for block in (demand, components, param)]
    wide = pd.concat(parts, axis=1)
    first_occurrence = ~wide.columns.duplicated()
    return wide.loc[:, first_occurrence].copy()


X_final_tr = _assemble_features(Xd_tr, Xmca_tr, Xp_tr)

# Validation and test follow the training column layout exactly; a column
# they lack is filled with 0.
X_final_va = _assemble_features(Xd_va, Xmca_va, Xp_va).reindex(
    columns=X_final_tr.columns, fill_value=0
)
X_final_te = _assemble_features(X_test_demand, Xmca_te, X_test_param).reindex(
    columns=X_final_tr.columns, fill_value=0
)

print("Shapes:", X_final_tr.shape, X_final_va.shape, X_final_te.shape, flush=True)

# -------------------------
# 6) Optuna tuning per target
# -------------------------
def tune_one_target(Xtr, ytr, Xva, yva, n_trials=40):
    """Tune LightGBM hyper-parameters for a single target with Optuna.

    Every trial builds an ``LGBMRegressor`` from ``LGB_BASE`` plus the sampled
    values, fits it on ``(Xtr, ytr)`` with early stopping monitored on
    ``(Xva, yva)``, and is scored by the validation MAE at the best iteration.

    Args:
        Xtr: Training features.
        ytr: Training target values.
        Xva: Validation features (early stopping and trial score).
        yva: Validation target values.
        n_trials: Number of Optuna trials to run.

    Returns:
        A ``(best_params, best_value)`` tuple: the sampled parameters of the
        best trial (``LGB_BASE`` entries are not included) and its
        validation MAE.
    """
    def objective(trial):
        # Start from a copy of the shared base so trials never mutate it.
        params = dict(LGB_BASE)
        params.update({
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.10, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 31, 255),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 200),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 0, 10),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10.0),
        })

        m = lgb.LGBMRegressor(**params)
        m.fit(
            Xtr, ytr,
            eval_set=[(Xva, yva)],
            eval_metric="mae",
            callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=False)]
        )
        # Fall back to the full tree budget when no best iteration is reported.
        best_iter = int(m.best_iteration_ or params["n_estimators"])
        pred = m.predict(Xva, num_iteration=best_iter)
        return mean_absolute_error(yva, pred)

    # Seed the sampler: without it every run explores a different sequence of
    # trials, so the selected parameters (and the submission) are not
    # repeatable even though the rest of the pipeline is seeded with SEED.
    # NOTE(review): GPU training itself may still introduce small run-to-run
    # differences - confirm on the target machine.
    sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
    return study.best_params, study.best_value

# Test-set predictions keyed by target name, and one
# (target, validation MAE, best_iter) row per target for the final summary.
pred_test = {}
report = []

for t in TARGET_COLS:
    print(f"\n=== Tuning target: {t} ===", flush=True)
    ytr, yva = Y_tr[t].values, Y_va[t].values

    best_params, best_mae = tune_one_target(
        X_final_tr, ytr, X_final_va, yva, n_trials=N_TRIALS
    )
    print(f"[BEST] MAE={best_mae:.6f} | params={best_params}", flush=True)

    # The final model is refitted on train+val with the tuned parameters and
    # a fixed number of trees. To get that number, first fit once more with
    # early stopping so best_iter is consistent with best_params.
    tmp = lgb.LGBMRegressor(**(LGB_BASE | best_params))
    tmp.fit(
        X_final_tr,
        ytr,
        eval_set=[(X_final_va, yva)],
        eval_metric="mae",
        callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=False)],
    )
    # Fall back to the full tree budget when no best iteration is reported.
    best_iter = int(tmp.best_iteration_ or LGB_BASE["n_estimators"])
    print(f"[REFIT] best_iter={best_iter}", flush=True)

    # Stack the training and validation folds for the final fit.
    X_all = pd.concat([X_final_tr, X_final_va], axis=0, ignore_index=True)
    y_all = np.concatenate((ytr, yva))

    final = lgb.LGBMRegressor(**(LGB_BASE | best_params | {"n_estimators": best_iter}))
    final.fit(X_all, y_all)

    pred_test[t] = final.predict(X_final_te)
    report.append((t, best_mae, best_iter))

# -------------------------
# 7) submission.csv
# -------------------------
# Identifier column first, then one prediction column per target, in
# TARGET_COLS order.
sub = pd.DataFrame(
    {ID_COL: test_ids, **{col: pred_test[col] for col in TARGET_COLS}}
)
sub.to_csv(OUT_PATH, index=False)

print("\nSaved:", OUT_PATH, flush=True)

# Per-target recap of the validation MAE found by Optuna and the tree count
# used for the final fit.
print("\nRésumé (val Optuna):", flush=True)
for t, mae, best_iter in report:
    print(f"- {t:15s} | MAE={mae:.6f} | best_iter={best_iter}", flush=True)
