# 🧠 Data Challenge 2025 – Détection de gaz

Ce notebook présente la solution développée pour le **Data Challenge 2025 (ENS / Bertin Technologies)**.  
L’objectif du challenge est de **prédire le niveau d'alarme de 23 gaz** à partir de mesures capteurs multivariées.  

Le mod√®le final repose sur :
- des **Extra Trees Regressors** entraînés sur différentes variantes (avec et sans features "row-wise"),  
- un **blending optimisé** des prédictions (global et par cible),  
- une **calibration linéaire** et un **shrinkage vers la moyenne** pour stabiliser les sorties.  

Le notebook permet de :
- prétraiter les données,  
- entraîner les modèles,  
- et **générer automatiquement la soumission finale** au format attendu par la plateforme du challenge.

La performance est évaluée avec la **Weighted RMSE**, une racine d’erreur quadratique moyenne
pondérée.

## 📦 1. Imports & Chargement des données
Lecture des fichiers `x_train.csv`, `y_train.csv` et `x_test.csv` à partir du dossier `../DATA`.  
On affiche les dimensions pour vérification rapide.


In [2]:

from __future__ import annotations
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Tuple
import time

import numpy as np
import pandas as pd

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.linear_model import LinearRegression

# Load the challenge data from the ../DATA folder (resolved to an absolute path).
DATA_DIR = Path("../DATA").resolve()
# Explicit check rather than `assert`: assertions are stripped when Python runs
# with -O, which would defer the failure to a less readable read_csv error.
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Le dossier DATA est introuvable: {DATA_DIR}")

X_train = pd.read_csv(DATA_DIR / "x_train.csv")
y_train = pd.read_csv(DATA_DIR / "y_train.csv")
X_test  = pd.read_csv(DATA_DIR / "x_test.csv")

# Quick shape report so a wrong or truncated file is noticed immediately.
print("‚úÖ Donn√©es charg√©es :")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}")


‚úÖ Donn√©es charg√©es :
X_train: (202933, 14), y_train: (202933, 24), X_test: (134673, 14)


## ⚙️ 2. Fonctions utilitaires
Fonctions réutilisées dans tout le pipeline : métrique (weighted RMSE), folds stratifiés,
nettoyage/featurisation ligne, calibration linéaire et shrinkage vers la moyenne.


In [3]:


def weighted_rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Challenge metric: weighted root-mean-squared error.

    Predictions are clipped to [0, 1] first. Each squared error is weighted by
    1.2 where the true value is >= 0.5 and by 1.0 elsewhere. The weighted
    errors are averaged along axis 1 (per row), then across rows, and the
    square root of that average is returned as a Python float.

    Both arguments are expected to be 2-D arrays of the same shape
    (the reduction uses ``axis=1``).
    """
    clipped = np.clip(y_pred, 0.0, 1.0)
    residual = clipped - y_true
    weights = np.where(y_true >= 0.5, 1.2, 1.0)
    row_mse = (weights * residual ** 2).mean(axis=1)
    return float(np.sqrt(row_mse.mean()))


def make_stratified_folds(y: pd.DataFrame, n_splits=5, random_state=42):
    """Build shuffled CV folds stratified on the per-row mean of the targets.

    Rows are ranked by their target mean (percentile ranks), the ranks are cut
    into ``nbins`` equal-width bins (between 2 and 10, bounded by
    ``len(y) // n_splits``) and the bin labels are used as strata for
    ``StratifiedKFold``. If every row lands in the same bin, a plain shuffled
    ``KFold`` is used instead and a warning is printed.

    Returns a list of ``(train_indices, validation_indices)`` pairs.
    """
    n_rows = len(y)
    row_mean = y.mean(axis=1).to_numpy()
    pct_rank = pd.Series(row_mean).rank(method="average", pct=True).to_numpy()

    nbins = max(2, min(10, n_rows // n_splits))
    strata = np.floor(pct_rank * nbins).astype(int)
    # A percentile rank of exactly 1.0 floors to `nbins`; fold it into the last bin.
    strata[strata == nbins] = nbins - 1

    # The splitters only need the number of rows, not the actual features.
    placeholder = np.zeros(n_rows)

    if len(np.unique(strata)) >= 2:
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        return list(splitter.split(placeholder, strata))

    print("‚ö†Ô∏è Stratification impossible (bins uniques) ‚Üí fallback KFold.")
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    return list(splitter.split(placeholder))


def clean_features(X_train, X_test, lower_q=0.01, upper_q=0.99, drop_humidity=True):
    """Clip numeric features to train quantiles and optionally drop 'Humidity'.

    For every numeric column except 'ID', the ``lower_q`` and ``upper_q``
    quantiles are computed on the training frame only and used to clip that
    column in both the training and the test frame, limiting the influence of
    outliers. When ``drop_humidity`` is true and a 'Humidity' column exists in
    the training frame, it is removed from both frames (the original author
    notes it was too noisy in the EDA).

    The inputs are left untouched; two new DataFrames ``(train, test)`` are
    returned.
    """
    Xtr, Xte = X_train.copy(), X_test.copy()

    numeric_cols = [
        name for name in Xtr.columns
        if name != "ID" and pd.api.types.is_numeric_dtype(Xtr[name])
    ]
    for name in numeric_cols:
        train_col = Xtr[name]
        # Bounds come from train only, then get applied to both frames.
        lo = train_col.quantile(lower_q)
        hi = train_col.quantile(upper_q)
        Xtr[name] = train_col.clip(lo, hi)
        Xte[name] = Xte[name].clip(lo, hi)

    if drop_humidity and "Humidity" in Xtr.columns:
        Xtr = Xtr.drop(columns=["Humidity"])
        Xte = Xte.drop(columns=["Humidity"])
    return Xtr, Xte


def add_rowwise_features(X: pd.DataFrame,
                         exclude_cols=("ID",),
                         prefix="rw_") -> pd.DataFrame:
    """Append NaN-aware per-row summary statistics to a copy of ``X``.

    Statistics are computed across all numeric columns not listed in
    ``exclude_cols``. The added columns, each named ``prefix + <stat>``, are:
    mean, median, std, min, max, range, q25, q75, iqr, mad (median absolute
    deviation from the row median), l1, l2, n_nan and nan_ratio.

    If there is no numeric column to aggregate, the copy is returned unchanged.
    The input frame is never modified.
    """
    result = X.copy()
    numeric = [
        col for col in result.columns
        if col not in exclude_cols and pd.api.types.is_numeric_dtype(result[col])
    ]
    if not numeric:
        return result

    values = result[numeric].to_numpy(dtype=float)
    n_features = values.shape[1]

    # Quantities that are reused by several derived statistics.
    row_median = np.nanmedian(values, axis=1)
    row_min = np.nanmin(values, axis=1)
    row_max = np.nanmax(values, axis=1)
    q25 = np.nanpercentile(values, 25, axis=1)
    q75 = np.nanpercentile(values, 75, axis=1)
    missing = np.isnan(values).sum(axis=1).astype(float)

    # Insertion order defines the order of the appended columns.
    stats = {
        "mean": np.nanmean(values, axis=1),
        "median": row_median,
        "std": np.nanstd(values, axis=1),
        "min": row_min,
        "max": row_max,
        "range": row_max - row_min,
        "q25": q25,
        "q75": q75,
        "iqr": q75 - q25,
        "mad": np.nanmedian(np.abs(values - row_median[:, None]), axis=1),
        "l1": np.nansum(np.abs(values), axis=1),
        "l2": np.sqrt(np.nansum(values * values, axis=1)),
        "n_nan": missing,
        "nan_ratio": missing / n_features,
    }
    extra = pd.DataFrame(
        {f"{prefix}{stat}": column for stat, column in stats.items()},
        index=result.index,
    )
    return pd.concat([result, extra], axis=1)


def calibrate_predictions_linear(y_true_df, oof_pred_df, test_pred_df, eps=1e-9):
    """Linearly calibrate predictions, independently for each target column.

    For every column ``j`` an ordinary least-squares line is fitted from the
    out-of-fold (OOF) predictions to the true values, then applied to both the
    OOF and the test predictions. This corrects scale/offset bias between the
    OOF predictions and the ground truth. When the OOF predictions of a column
    are (near-)constant (standard deviation below ``eps``) no line can be
    fitted reliably, so both outputs for that column are set to the mean of
    the true values.

    All outputs are clipped to [0, 1].

    The inputs are never modified: the function works on float copies of the
    prediction arrays. Writing into ``DataFrame.values`` directly would
    overwrite the caller's OOF frame (and fails outright when pandas returns a
    read-only array under Copy-on-Write).

    Parameters:
        y_true_df: DataFrame of true targets, one column per target.
        oof_pred_df: DataFrame of OOF predictions, same column layout.
        test_pred_df: DataFrame of test predictions, same column layout.
        eps: threshold on the OOF standard deviation below which a column is
            treated as constant.

    Returns:
        ``(oof_calibrated, test_calibrated)`` as new DataFrames carrying the
        columns of ``y_true_df`` and ``test_pred_df`` respectively (default
        integer index).
    """
    y_true = y_true_df.to_numpy()
    # Explicit copies: the arrays below are overwritten column by column.
    y_oof = oof_pred_df.to_numpy(dtype=float, copy=True)
    y_te = test_pred_df.to_numpy(dtype=float, copy=True)

    for j in range(y_true.shape[1]):
        # Fancy indexing yields an independent (n, 1) copy, so `x` keeps the
        # raw predictions even after y_oof[:, j] is overwritten.
        x = y_oof[:, [j]]
        if np.std(x) < eps:
            # Degenerate column: fall back to the target mean.
            fallback = float(y_true[:, j].mean())
            y_oof[:, j] = fallback
            y_te[:, j] = fallback
        else:
            lr = LinearRegression()
            lr.fit(x, y_true[:, j])
            y_oof[:, j] = lr.predict(x)
            y_te[:, j] = lr.predict(y_te[:, [j]])

    y_oof = np.clip(y_oof, 0.0, 1.0)
    y_te = np.clip(y_te, 0.0, 1.0)
    return (
        pd.DataFrame(y_oof, columns=y_true_df.columns),
        pd.DataFrame(y_te, columns=test_pred_df.columns),
    )

def shrink_to_mean(test_pred_df, y_train_df, alpha=0.95):
    """Shrink all predictions towards the per-target training mean.

    Computes ``alpha * pred + (1 - alpha) * mean(y_train)`` with one mean per
    column of ``y_train_df``, clips the result to [0, 1] and returns it as a
    new DataFrame with the columns of ``test_pred_df`` (default integer index).
    ``alpha=1`` leaves the predictions unchanged apart from clipping.
    """
    target_means = y_train_df.mean(axis=0).values
    raw = test_pred_df.values
    pull = 1 - alpha
    shrunk = alpha * raw + pull * target_means
    return pd.DataFrame(shrunk.clip(0.0, 1.0), columns=test_pred_df.columns)

def shrink_to_mean_per_target(pred_df: pd.DataFrame, y_train_df: pd.DataFrame, alphas: np.ndarray) -> pd.DataFrame:
    """Shrink predictions towards the training mean with one alpha per target.

    ``alphas`` is an array-like of shape (n_targets,); column ``j`` becomes
    ``alphas[j] * pred[:, j] + (1 - alphas[j]) * mean(y_train[:, j])``.
    The result is clipped to [0, 1] and returned as a new DataFrame with the
    columns of ``pred_df`` (default integer index).
    """
    # Row vectors of shape (1, T) broadcast against the (N, T) predictions.
    keep = np.asarray(alphas).reshape(1, -1)
    centre = y_train_df.mean(axis=0).to_numpy().reshape(1, -1)
    shrunk = keep * pred_df.to_numpy() + (1.0 - keep) * centre
    return pd.DataFrame(shrunk.clip(0.0, 1.0), columns=pred_df.columns)


## 🌲 3. Variantes ExtraTrees
Définition d'une petite famille de modèles ExtraTrees (avec et sans features "row-wise"),
plusieurs seeds et réglages (profondeur, bootstrap). Les paramètres par défaut sont
centralisés et chaque variante ne précise que ses overrides.


In [4]:

# Configuration of one ExtraTrees variant (name, use of row-wise features, sklearn parameter overrides)
@dataclass
class ETConfig:
    """Description of a single ExtraTrees model variant used in the blend."""
    name: str      # label used in the training logs and in the score summary
    use_rw: bool   # True -> train on the frames with row-wise features, False -> base frames
    params: Dict   # keyword overrides forwarded to make_et(); omitted keys use make_et's defaults


def make_et(**kw) -> ExtraTreesRegressor:
    """Build an ExtraTreesRegressor from the shared defaults plus overrides.

    Only the hyper-parameters listed in ``defaults`` below can be overridden
    through ``kw``; any other keyword is silently ignored. ``n_jobs`` is always
    set to -1.
    """
    defaults = {
        "n_estimators": 500,
        "max_features": 0.7,
        "min_samples_split": 8,
        "min_samples_leaf": 2,
        "max_depth": None,
        "bootstrap": False,
        "max_samples": None,
        "random_state": 42,
    }
    # Per-key lookup: an override wins, otherwise the default is used.
    chosen = {key: kw.get(key, fallback) for key, fallback in defaults.items()}
    return ExtraTreesRegressor(n_jobs=-1, **chosen)

# Model family that gets trained and blended: base and row-wise feature sets,
# a depth-capped and a bootstrapped setting, plus two extra seeds.
VARIANTS: List[ETConfig] = [
    ETConfig(name="ET-Base", use_rw=False, params={"random_state": 42}),
    ETConfig(name="ET-RW", use_rw=True, params={"random_state": 42}),
    ETConfig(
        name="ET-DepthCap",
        use_rw=False,
        params={"random_state": 42, "max_depth": 24, "max_features": 0.8, "min_samples_leaf": 1},
    ),
    ETConfig(
        name="ET-Bootstrap",
        use_rw=True,
        params={
            "random_state": 42,
            "bootstrap": True,
            "max_samples": 0.8,
            "max_features": 0.6,
            "min_samples_leaf": 3,
        },
    ),
    ETConfig(name="ET-Seed13", use_rw=False, params={"random_state": 13}),
    ETConfig(name="ET-Seed71", use_rw=True, params={"random_state": 71}),
]


In [5]:


def run_et_variant(
    cfg: ETConfig,
    X_base_tr: pd.DataFrame,
    X_base_te: pd.DataFrame,
    X_rw_tr: pd.DataFrame,
    X_rw_te: pd.DataFrame,
    y_fit: pd.DataFrame,
    n_splits: int = 5,
    seed: int = 42,
) -> Tuple[np.ndarray, np.ndarray, float]:
    """Cross-validate one ExtraTrees variant.

    Folds are stratified on the per-row target mean (``seed`` controls the
    fold shuffling only; the model seed comes from ``cfg.params``). For each
    fold one ExtraTrees model per target is fitted through
    ``MultiOutputRegressor``; validation predictions fill the out-of-fold
    matrix and test predictions are averaged over the folds.

    Returns ``(oof, test, score)`` where ``oof`` has shape
    (n_train, n_targets), ``test`` has shape (n_test, n_targets) and ``score``
    is the OOF weighted RMSE.
    """
    splits = make_stratified_folds(y_fit, n_splits=n_splits, random_state=seed)

    # Pick the feature set of this variant; 'ID' is never a feature.
    if cfg.use_rw:
        train_src, test_src = X_rw_tr, X_rw_te
    else:
        train_src, test_src = X_base_tr, X_base_te
    Xtr = train_src.drop(columns=["ID"], errors="ignore")
    Xte = test_src.drop(columns=["ID"], errors="ignore")

    n_targets = y_fit.shape[1]
    oof = np.zeros((len(Xtr), n_targets), dtype=float)
    te = np.zeros((len(Xte), n_targets), dtype=float)

    started = time.time()
    for fold_no, (fit_rows, held_rows) in enumerate(splits, start=1):
        print(f"üöÄ {cfg.name} ‚Äî Fold {fold_no}/{n_splits}")

        model = MultiOutputRegressor(make_et(**cfg.params), n_jobs=1)
        model.fit(Xtr.iloc[fit_rows], y_fit.iloc[fit_rows])

        oof[held_rows] = model.predict(Xtr.iloc[held_rows])
        # Each fold contributes an equal share of the final test prediction.
        te += model.predict(Xte) / n_splits
    elapsed = time.time() - started

    # OOF score with the challenge metric.
    score = weighted_rmse(y_fit.values, oof)
    print(f"üéØ {cfg.name} ‚Äî OOF Weighted RMSE: {score:.6f} | ‚è± {elapsed:.1f}s\n")

    return oof, te, score

## 🧩 4. Blend des modèles
Optimisation des poids par tirage aléatoire sur le simplexe (loi de Dirichlet) pour combiner les OOF.




In [6]:

# √âchantillonne des poids (n_samples, k) sur le simplex via Dirichlet.
def _dirichlet_weights(k: int, n_samples: int = 4000, temperature: float = 1.0, rng: np.random.Generator | None = None):
    rng = np.random.default_rng() if rng is None else rng
    alpha = np.ones(k) * temperature
    return rng.dirichlet(alpha, size=n_samples)  # (n_samples, k)


def optimize_global_weights(oofs: List[np.ndarray], y_true: np.ndarray, n_samples: int = 6000) -> np.ndarray:
    """Random-search one weight vector shared by all targets.

    ``n_samples`` candidate weight vectors are drawn on the simplex; each one
    blends the models' OOF predictions and is scored with the weighted RMSE
    against ``y_true``. The candidate with the lowest score is printed and
    returned (shape ``(len(oofs),)``). The draw is unseeded.
    """
    n_models = len(oofs)
    candidates = _dirichlet_weights(n_models, n_samples=n_samples)
    # Shape (n_rows, n_targets, n_models): models on the last axis.
    stacked = np.stack(oofs, axis=-1)

    best_w, best_score = None, float("inf")
    for cand in candidates:
        blended = np.tensordot(stacked, cand, axes=([2], [0]))  # (n_rows, n_targets)
        cand_score = weighted_rmse(y_true, blended)
        if cand_score < best_score:
            best_w, best_score = cand, cand_score

    print(f"‚úÖ Best global blend (OOF): {best_score:.6f} | weights={np.round(best_w,3)}")
    return best_w


def optimize_per_target_weights(oofs: List[np.ndarray], y_true: np.ndarray, n_samples: int = 3000) -> np.ndarray:
    """Random-search a separate weight vector for every target column.

    For each target a fresh set of ``n_samples`` simplex weights is drawn
    (unseeded); the candidate that minimises the weighted RMSE of that single
    column (predictions clipped to [0, 1], weight 1.2 where the truth is
    >= 0.5) is kept.

    Returns an array of shape (n_targets, len(oofs)).
    """
    n_models = len(oofs)
    n_targets = y_true.shape[1]
    # Shape (n_rows, n_targets, n_models): models on the last axis.
    stacked = np.stack(oofs, axis=-1)
    W_out = np.zeros((n_targets, n_models), dtype=float)

    for j in range(n_targets):
        candidates = _dirichlet_weights(n_models, n_samples=n_samples)
        truth_j = y_true[:, j]
        preds_j = stacked[:, j, :]  # (n_rows, n_models)
        # Metric weights depend on the truth only, so compute them once per target.
        penalty = np.where(truth_j >= 0.5, 1.2, 1.0)

        best_w, best = None, float("inf")
        for cand in candidates:
            error = np.clip(preds_j @ cand, 0, 1) - truth_j
            rmse = np.sqrt(np.mean(penalty * error ** 2))
            if rmse < best:
                best, best_w = rmse, cand
        W_out[j] = best_w
    return W_out


def apply_global_weights(oofs: List[np.ndarray], tests: List[np.ndarray], w: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Blend OOF and test predictions with one weight per model.

    ``w[i]`` multiplies the predictions of model ``i`` for every target; the
    weighted sums are clipped to [0, 1].

    Returns ``(oof_blend, test_blend)`` with the shapes of the individual
    input matrices.
    """
    def _blend(per_model: List[np.ndarray]) -> np.ndarray:
        # Stack models on a new last axis, then contract that axis with w.
        stacked = np.stack(per_model, axis=-1)
        mixed = np.tensordot(stacked, w, axes=([2], [0]))
        return np.clip(mixed, 0, 1)

    return _blend(oofs), _blend(tests)


def apply_per_target_weights(oofs: List[np.ndarray], tests: List[np.ndarray], Wt: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Blend OOF and test predictions with target-specific model weights.

    ``Wt`` has shape (n_targets, n_models): ``Wt[t, i]`` is the weight of
    model ``i`` for target ``t``. The weighted sums are clipped to [0, 1].

    Returns ``(oof_blend, test_blend)`` with the shapes of the individual
    input matrices.
    """
    def _blend(per_model: List[np.ndarray]) -> np.ndarray:
        # (rows, targets, models) x (targets, models) -> (rows, targets)
        stacked = np.stack(per_model, axis=-1)
        mixed = np.einsum('ijk,jk->ij', stacked, Wt)
        return np.clip(mixed, 0, 1)

    return _blend(oofs), _blend(tests)





## 🧪 5. Pipeline : diversité + blend + calibration + shrink
On entraîne toutes les variantes, on optimise les poids de blend sur OOF (global et par cible),
puis on applique une calibration linéaire et un shrinkage vers la moyenne. On retourne les
prédictions OOF/Test finales et les métadonnées (poids, alpha, scores).


In [7]:


def run_step1_diversity_blend(cv_splits: int = 5, seed: int = 42, do_per_target_weights: bool = True,
                              shrink_grid=(0.92,0.95,0.97,0.99)):
    """Train every ExtraTrees variant, blend them, then calibrate and shrink.

    Reads the module-level ``X_train``, ``X_test`` and ``y_train`` frames.

    Steps:
      1. Clean the features (quantile clipping, 'Humidity' dropped) and build
         the row-wise feature frames.
      2. Build the target frame without 'ID' and 'c15'.
      3. Cross-validate each entry of ``VARIANTS`` (OOF + test predictions).
      4. Optimise blend weights on the OOF predictions: one global weight
         vector and, if ``do_per_target_weights`` is true, one per target.
      5. For each blend: linear calibration, then pick the shrink ``alpha``
         from ``shrink_grid`` with the lowest OOF weighted RMSE.
      6. Select the blend with the best post-calibration OOF score.

    Parameters:
        cv_splits: number of CV folds used for every variant.
        seed: random state for the fold shuffling.
        do_per_target_weights: also evaluate the per-target blend.
        shrink_grid: candidate alpha values for the shrinkage step.

    Returns:
        dict with keys ``"variant_scores"`` (list of (name, OOF score)),
        ``"blends"`` (per blend: weights, scores, alpha, OOF and test
        predictions), ``"best"`` (key of the selected blend) and
        ``"y_columns"`` (fitted target names).

    NOTE(review): ``seed`` only reaches the fold construction. The blend
    weight search draws from an unseeded generator, so the selected weights
    can differ between runs.
    """


    # 1) Cleaning & features
    X_base_tr, X_base_te = clean_features(X_train, X_test, drop_humidity=True)
    X_rw_tr  = add_rowwise_features(X_base_tr, exclude_cols=("ID",))
    X_rw_te  = add_rowwise_features(X_base_te, exclude_cols=("ID",))

    # 2) Targets: drop ID and c15, then align rows on the training feature index
    y_fit = y_train.drop(columns=["ID", "c15"], errors="ignore").copy()
    y_fit = y_fit.loc[X_base_tr.index]

    # 3) Train the variants
    oofs: List[np.ndarray] = []
    tests: List[np.ndarray] = []
    scores: List[Tuple[str, float]] = []
    for cfg in VARIANTS:
        oof, te, sc = run_et_variant(cfg, X_base_tr, X_base_te, X_rw_tr, X_rw_te, y_fit, n_splits=cv_splits, seed=seed)
        oofs.append(oof); tests.append(te); scores.append((cfg.name, sc))

    print("=== R√©sum√© variantes (OOF) ===")
    for name, sc in scores:
        print(f"{name:<24} : {sc:.6f}")

    # 4) Blends (weights are optimised on OOF predictions only)
    y_true = y_fit.values
    w_glob = optimize_global_weights(oofs, y_true, n_samples=6000)
    oof_g, test_g = apply_global_weights(oofs, tests, w_glob)
    score_g = weighted_rmse(y_true, oof_g)
    print(f"üèÅ Blend GLOBAL ‚Äî OOF: {score_g:.6f}")

    # Each entry: (OOF blend, test blend, weights, OOF score before calibration)
    blends = {"global": (oof_g, test_g, w_glob, score_g)}

    if do_per_target_weights:
        Wt = optimize_per_target_weights(oofs, y_true, n_samples=3000)
        oof_t, test_t = apply_per_target_weights(oofs, tests, Wt)
        score_t = weighted_rmse(y_true, oof_t)
        print(f"üèÅ Blend PAR CIBLE ‚Äî OOF: {score_t:.6f}")
        blends["per_target"] = (oof_t, test_t, Wt, score_t)

    # 5) Calibration + shrink
    results = {}
    for k, (oof_b, test_b, W, sc_b) in blends.items():
        oof_df  = pd.DataFrame(oof_b,  columns=y_fit.columns)
        test_df = pd.DataFrame(test_b, columns=y_fit.columns)
        oof_cal, test_cal = calibrate_predictions_linear(y_fit, oof_df, test_df)

        # Grid search on alpha, scored on the calibrated OOF predictions
        best_alpha, best_score = None, float("inf")
        for a in shrink_grid:
            tmp = shrink_to_mean(oof_cal, y_fit, alpha=a)
            s = weighted_rmse(y_true, tmp.values)
            if s < best_score:
                best_alpha, best_score = a, s
        print(f"üîß Blend={k}: meilleur alpha={best_alpha} ‚Üí OOF apr√®s calib+shrink: {best_score:.6f}")

        # Apply the selected alpha to both OOF and test predictions
        oof_final  = shrink_to_mean(oof_cal,  y_fit, alpha=best_alpha).values
        test_final = shrink_to_mean(test_cal, y_fit, alpha=best_alpha).values
        oof_final  = np.clip(oof_final,  0, 1)
        test_final = np.clip(test_final, 0, 1)

        results[k] = {
            "weights": W,
            "oof_pre_calib": sc_b,
            "oof_post": best_score,
            "alpha": best_alpha,
            "oof_preds": oof_final,
            "test_preds": test_final,
        }



    # 6) Final selection: blend with the lowest OOF score after calibration + shrink
    best_key = min(results.keys(), key=lambda k: results[k]["oof_post"])
    print(f"‚úÖ S√©lection finale: {best_key} ‚Äî OOF={results[best_key]['oof_post']:.6f} (alpha={results[best_key]['alpha']})")


    return {
        "variant_scores": scores,
        "blends": results,
        "best": best_key,
        "y_columns": y_fit.columns.tolist(),
    }




## 🏁 6. Exécution du pipeline & génération de la soumission
On exécute le pipeline complet (variantes → blend → calibration → shrink), on sélectionne le meilleur
blend, puis on génère le fichier de soumission final.


In [None]:


# Run the pipeline (CV=3 for the demo; use 5 for the final run)
out = run_step1_diversity_blend(cv_splits=3, seed=42, do_per_target_weights=True)

# Fetch the key of the best blend and its test predictions
best_key = out["best"]
y_pred_blend = out["blends"][best_key]["test_preds"]  # (n_test, n_targets without c15)



In [None]:

# Target columns: every y_train column except 'ID'; 'c15' is excluded from the fit
all_targets = [c for c in y_train.columns if c != "ID"]
cols_fit = [c for c in all_targets if c != "c15"]

# Prediction frame (clipped so that values stay within [0, 1])
pred_df = pd.DataFrame(np.clip(y_pred_blend, 0, 1), columns=cols_fit)

# Re-insert the unfitted target 'c15' as a constant 0.0 at its original position
pred_df.insert(all_targets.index("c15"), "c15", 0.0)

# Assemble the submission with ID first and the exact target column order
submission = pd.concat([X_test["ID"], pred_df[all_targets]], axis=1)

# Sanity checks run BEFORE the file is written, so an invalid submission is
# never left on disk. Explicit raises are used because `assert` statements are
# stripped when Python runs with -O.
vals = submission.drop(columns=["ID"]).to_numpy()
if not np.isfinite(vals).all():
    raise ValueError("NaN/Inf d√©tect√©s dans la soumission")
if not ((vals >= -1e-9).all() and (vals <= 1+1e-9).all()):
    raise ValueError("Valeurs hors [0,1]")
if not submission.index.equals(X_test.index):
    raise ValueError("Submission index does not match X_test index")

submission.to_csv("submission_ET_BLEND_v1.csv", index=False)
print("‚úÖ Submission pr√™te :", submission.shape)
