# Pipeline entraînement + SHAP + prédiction (LightGBM + MCA batch)
Notebook nettoyé et reproductible.


version ordinateur

In [None]:
python -m pip install lightgbm prince shap


import numpy as np
import pandas as pd
import prince
import lightgbm as lgb
import shap

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


In [None]:
# =========================
# SINGLE CELL – GPU SERVER PIPELINE (A100) + submission.csv
# =========================
# Heads-up (author's note): this will not run on a regular PC.

# -------------------------
# TUNABLE PARAMETERS
# -------------------------
# Input / output locations.
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"
OUT_PATH = "submission.csv"

# Column roles.
ID_COL = "id"
TARGET_COLS = ["wip", "investissement", "satisfaction"]

# Number of trailing "param" columns in the feature frame.
# Author's note: this should be 51 for the 51 param_year columns; it was
# left at 0 by mistake and the run reportedly still worked.
N_PARAM_COLS = 0
DROP_ID_COL_IN_X = True

# MCA settings (SEED is also reused for the split, LightGBM and SHAP sampling).
SEED = 42
K_MCA = 20

# SHAP-based feature selection settings.
USE_SHAP_SELECTION = True
SHAP_COVER = 0.90
SHAP_BG = 1000
SHAP_SAMPLE = 2000

# LightGBM settings (GPU enabled).
EARLY_STOPPING_ROUNDS = 200
LGB_PARAMS = {
    "objective": "regression",
    "metric": "mae",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "n_estimators": 20000,
    "num_leaves": 127,
    "min_data_in_leaf": 20,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "reg_lambda": 1.0,
    "random_state": SEED,
    "n_jobs": -1,
    "verbosity": -1,
    # GPU-specific options
    "device_type": "gpu",
    "gpu_use_dp": False,
    "max_bin": 255,
}

# -------------------------
# 1) Data loading
# -------------------------
train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)

# Schema checks. Explicit exceptions are used instead of bare `assert`s:
# asserts are removed when Python runs with -O and carry no message, so a
# malformed CSV would only fail later with a much less helpful error.
_missing_train = [c for c in [ID_COL, *TARGET_COLS] if c not in train_df.columns]
if _missing_train:
    raise KeyError(f"{TRAIN_PATH} is missing required column(s): {_missing_train}")
if ID_COL not in test_df.columns:
    raise KeyError(f"{TEST_PATH} is missing the id column {ID_COL!r}")

# Features = every non-target column; targets are kept in a separate frame.
X_train = train_df.drop(columns=TARGET_COLS)
Y_train = train_df[TARGET_COLS].copy()
X_test  = test_df.copy()

# Keep the test ids before the id column is (optionally) dropped; they are
# needed to build the submission file.
test_ids = X_test[ID_COL].values

if DROP_ID_COL_IN_X:
    X_train = X_train.drop(columns=[ID_COL])
    X_test  = X_test.drop(columns=[ID_COL])

# -------------------------
# 2) Demand / param split
# -------------------------
# The last N_PARAM_COLS columns are the "param" features; everything before
# them is the "demand" block.
if not 0 <= N_PARAM_COLS <= X_train.shape[1]:
    raise ValueError(
        f"N_PARAM_COLS={N_PARAM_COLS} is outside [0, {X_train.shape[1]}] "
        "(number of feature columns)"
    )

# N_PARAM_COLS == 0 must be special-cased: `iloc[:, :-0]` is `iloc[:, :0]`
# (no columns at all) and `iloc[:, -0:]` is every column, so the generic
# slices below would silently hand ALL columns to the param block and leave
# the demand block empty.
if N_PARAM_COLS == 0:
    X_train_demand = X_train.copy()
    X_train_param  = pd.DataFrame(index=X_train.index)
    X_test_demand  = X_test.copy()
    X_test_param   = pd.DataFrame(index=X_test.index)
else:
    X_train_demand = X_train.iloc[:, :-N_PARAM_COLS].copy()
    X_train_param  = X_train.iloc[:, -N_PARAM_COLS:].copy()
    X_test_demand  = X_test.iloc[:, :-N_PARAM_COLS].copy()
    X_test_param   = X_test.iloc[:, -N_PARAM_COLS:].copy()

# -------------------------
# 3) Train / validation split
# -------------------------
# A single shuffled 80/20 split, applied in one call to the demand block,
# the param block and the targets.
Xd_tr, Xd_va, Xp_tr, Xp_va, Y_tr, Y_va = train_test_split(
    *(X_train_demand, X_train_param, Y_train),
    shuffle=True,
    random_state=SEED,
    test_size=0.2,
)

# -------------------------
# 4) MCA (fitted on the WHOLE demand training frame)
# -------------------------
print(f"[MCA] Fit sur {X_train_demand.shape}")
mca = prince.MCA(n_components=K_MCA, n_iter=3, random_state=SEED)
mca = mca.fit(X_train_demand)


def mca_transform(df):
    """Project `df` with the fitted module-level `mca` model.

    Returns a DataFrame indexed like `df` whose columns are renamed
    MCA_1, MCA_2, ... in positional order.
    """
    coords = pd.DataFrame(mca.transform(df), index=df.index)
    coords.columns = [f"MCA_{rank}" for rank in range(1, coords.shape[1] + 1)]
    return coords


# Same projection for the train split, the validation split and the test set.
Xmca_tr, Xmca_va, Xmca_te = (
    mca_transform(part) for part in (Xd_tr, Xd_va, X_test_demand)
)

# -------------------------
# 5) Final X = demand + MCA + param
# -------------------------
def _assemble_features(*parts):
    """Concatenate `parts` column-wise on a fresh 0..n-1 row index.

    When a column label appears more than once, only its first occurrence
    is kept. A copy is returned.
    """
    merged = pd.concat([part.reset_index(drop=True) for part in parts], axis=1)
    first_occurrence = ~merged.columns.duplicated()
    return merged.loc[:, first_occurrence].copy()


X_final_tr = _assemble_features(Xd_tr, Xmca_tr, Xp_tr)
X_final_va = _assemble_features(Xd_va, Xmca_va, Xp_va)
X_final_te = _assemble_features(X_test_demand, Xmca_te, X_test_param)

print("Shapes:", X_final_tr.shape, X_final_va.shape, X_final_te.shape)

# -------------------------
# 6) SHAP – sélection de features
# -------------------------
def shap_select_features(model, X, cover=0.9, bg=1000, sample=2000, seed=42):
    rng = np.random.default_rng(seed)
    n = len(X)

    bg_idx = rng.choice(n, size=min(bg, n), replace=False)
    sm_idx = rng.choice(n, size=min(sample, n), replace=False)

    X_bg = X.iloc[bg_idx]
    X_s  = X.iloc[sm_idx]

    explainer = shap.TreeExplainer(model, data=X_bg, feature_perturbation="interventional")
    sv = explainer.shap_values(X_s)

    if isinstance(sv, list):
        sv = sv[0]

    imp = np.mean(np.abs(sv), axis=0)
    imp = pd.Series(imp, index=X.columns).sort_values(ascending=False)

    cumsum = imp.cumsum() / imp.sum()
    selected = list(cumsum[cumsum <= cover].index)

    if len(selected) == 0:
        selected = [imp.index[0]]

    return selected
# -------------------------
# 7) Per-target training + final refit + test prediction
# -------------------------
pred_test = {}
report = []

for t in TARGET_COLS:
    print(f"\n=== Target: {t} ===")

    ytr, yva = Y_tr[t].values, Y_va[t].values

    # Full-feature model, early-stopped on the validation split.
    stopper = lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=False)
    model_full = lgb.LGBMRegressor(**LGB_PARAMS)
    model_full.fit(
        X_final_tr,
        ytr,
        eval_set=[(X_final_va, yva)],
        eval_metric="mae",
        callbacks=[stopper],
    )

    # Fall back to the configured tree count when no best iteration is set.
    best_iter = model_full.best_iteration_
    if not best_iter:
        best_iter = LGB_PARAMS["n_estimators"]
    best_iter = int(best_iter)

    # Validation score of the full-feature model.
    pred_va = model_full.predict(X_final_va, num_iteration=best_iter)
    mae = mean_absolute_error(yva, pred_va)
    print(f"[VAL] best_iter={best_iter} | MAE={mae:.6f}")

    # Feature subset: every column, or the SHAP selection computed on the
    # training split only.
    if not USE_SHAP_SELECTION:
        feats = list(X_final_tr.columns)
        print(f"[SHAP] disabled -> using all features: {len(feats)}")
    else:
        feats = shap_select_features(
            model_full,
            X_final_tr,
            cover=SHAP_COVER,
            bg=SHAP_BG,
            sample=SHAP_SAMPLE,
            seed=SEED,
        )
        print(f"[SHAP] selected={len(feats)} / {X_final_tr.shape[1]}")

    # Restrict the three frames to the chosen columns.
    Xtr_sel, Xva_sel, Xte_sel = (
        frame.loc[:, feats] for frame in (X_final_tr, X_final_va, X_final_te)
    )

    # Final refit on train + validation with the tree count fixed to best_iter.
    X_all = pd.concat([Xtr_sel, Xva_sel], axis=0, ignore_index=True)
    y_all = np.concatenate([ytr, yva])

    model_final = lgb.LGBMRegressor(**dict(LGB_PARAMS, n_estimators=best_iter))
    model_final.fit(X_all, y_all)

    # Test-set prediction for this target.
    pred_test[t] = model_final.predict(Xte_sel)

    report.append((t, mae, best_iter, len(feats)))

# -------------------------
# 8) Write submission.csv
# -------------------------
# Id column first, then one prediction column per target in TARGET_COLS order.
sub = pd.DataFrame(
    {ID_COL: test_ids, **{name: pred_test[name] for name in TARGET_COLS}}
)
sub.to_csv(OUT_PATH, index=False)

print("\nSaved:", OUT_PATH)
print("\nRésumé validation:")
# One summary line per target: validation MAE, retained iteration count and
# number of features used for the final refit.
for row in report:
    t, mae, best_iter, nfeats = row
    print(f"- {t:15s} | MAE={mae:.6f} | best_iter={best_iter} | feats={nfeats}")


FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'