In [1]:
#!/usr/bin/env python3
"""
Data Fusion Contest 2026 Task 2: quality-first training pipeline + EDA (FULL-TRAIN + BEST_ITER).

Fixes vs previous FULL-TRAIN block:
1) Best iteration is now selected correctly:
   - We train on (train_split) with eval_set=(holdout) + early stopping
   - Read model.get_best_iteration()  (best on holdout metric)
   - Retrain on FULL TRAIN with iterations = best_iter (no eval_set)
2) Blend weights are tuned on holdout (global or per-target), then applied to FULL-TRAIN test preds.
3) Feature hygiene stats are corrected (no confusing counts).
4) Safer merging by customer_id (instead of assuming row order).

Outputs:
- submission.parquet (raw margins/logits; schema matches sample)
- eda_report.md
- blend_weights.csv (if per-target tuning enabled)
- best_iters.csv (best iterations for Multi + OvR per target)
- patch_stage_report.csv (per-target patch results, if ENABLE_PATCH_STAGE)
"""

from __future__ import annotations

from pathlib import Path
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit


# =========================
# CONFIG (edit on Kaggle)
# =========================
# Input directory and the artifacts written to the working directory.
DATA_DIR = Path("/kaggle/input/data-fusion-contest-2026")
OUTPUT_PATH = Path("submission.parquet")
EDA_REPORT_PATH = Path("eda_report.md")
BLEND_WEIGHTS_REPORT_PATH = Path("blend_weights.csv")
BEST_ITERS_REPORT_PATH = Path("best_iters.csv")

# If DATA_DIR does not exist, try kagglehub download.
USE_KAGGLEHUB_DOWNLOAD = False
KAGGLE_DATASET_ID = "hatab123/data-fusion-contest-2026"

# FULL TRAIN seeds (one model per seed; predictions are averaged across seeds)
MULTI_SEEDS = [42]
OVR_SEEDS = [2026]

# Holdout used ONLY for:
# - early stopping (best iteration selection)
# - blend weight tuning
TUNE_HOLDOUT_SIZE = 0.05
TUNE_SEED = 42

# Blend options (public-LB robust mode)
AUTO_TUNE_BLEND_WEIGHT = True
USE_PER_TARGET_BLEND = True  # recommended for macro AUC
APPLY_RANK_BLEND = True      # blend on rank-normalized probs for robustness
# OvR is only blended into a target when its holdout AUC beats Multi by more than this margin.
MIN_HOLDOUT_GAIN_FOR_OVR = 0.0008
# Candidate weights for the Multi model in the per-target search (OvR gets 1 - w).
BLEND_GRID = np.array([0.60, 0.75, 0.85, 0.92, 1.00], dtype=np.float64)
MIN_MULTI_WEIGHT = 0.75      # shrink per-target weights toward Multi backbone
# Multi weight used when AUTO_TUNE_BLEND_WEIGHT is False.
DEFAULT_BLEND_WEIGHT_MULTI = 0.90

# Optional targeted patch stage (for worst holdout targets)
ENABLE_PATCH_STAGE = True
PATCH_TOP_K = 20                 # number of lowest-AUC targets that get a patch model
PATCH_SEEDS = [42, 1337]
PATCH_MIN_GAIN = 0.0008          # minimum holdout AUC gain required to apply a patch
# Candidate weights for the patch model in the rank blend (base gets 1 - w).
PATCH_BLEND_GRID = np.array([0.0, 0.15, 0.30, 0.45], dtype=np.float64)
PATCH_VAL_SHARE = 0.05           # holdout share used when the patch stage makes its own split
PATCH_MAX_ITERS = 2200
PATCH_OD_WAIT = 180

# Feature hygiene
DROP_CONST_FEATURES = True
DROP_NEAR_CONST_FEATURES = True
# A column is "near-constant" when its most frequent value (NaN included) covers at least this share of rows.
NEAR_CONST_DOMINANCE_THRESHOLD = 0.9995
MISSING_RATE_THRESHOLD = 0.997  # drop columns with >99.7% missing

# Training caps (upper bounds; early stopping will usually stop earlier)
MULTI_MAX_ITERS = 10000
MULTI_OD_WAIT = 350

OVR_MAX_ITERS = 6000
OVR_OD_WAIT = 300

# For binary on GPU, use Logloss for early stopping (AUC isn't properly supported on GPU)
# NOTE(review): the GPU/AUC limitation is the author's claim; confirm against the installed CatBoost version.
OVR_EVAL_METRIC = "Logloss"


# =========================
# Utils
# =========================
def sigmoid(x: np.ndarray) -> np.ndarray:
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    The naive formula evaluates ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative margins. Here the exponent is always
    non-positive, so no intermediate value can overflow.

    Args:
        x: Raw margins / logits (any shape). Converted to float64.

    Returns:
        Array of the same shape with values in [0, 1].
    """
    x = np.asarray(x, dtype=np.float64)
    # exp(-|x|) lies in (0, 1] for every finite x.
    e = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- algebraically identical.
    return np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))


def logit(p: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    """Inverse of the logistic function, log(p / (1 - p)).

    Probabilities are first clipped to [eps, 1 - eps] so that exact 0 or 1
    map to large finite margins instead of +/-inf.
    """
    bounded = np.clip(p, eps, 1.0 - eps)
    odds = bounded / (1.0 - bounded)
    return np.log(odds)


def safe_auc(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """ROC AUC that falls back to 0.5 when ``y_true`` contains fewer than two distinct values."""
    has_two_classes = np.unique(y_true).shape[0] >= 2
    if has_two_classes:
        return float(roc_auc_score(y_true, y_pred))
    # AUC is undefined with a single class; report the chance level instead.
    return 0.5


def rank_normalize_2d(arr: np.ndarray) -> np.ndarray:
    """Replace every column of a 2-D array by its scaled ordinal rank.

    Within each column the smallest value gets rank 0 and the largest rank
    n - 1; ties are broken by row order (stable sort). Ranks are divided by
    n - 1 and clipped to [1e-6, 1 - 1e-6]. With at most one row, every entry
    is 0.5.
    """
    n_rows = arr.shape[0]
    if n_rows <= 1:
        return np.full_like(arr, 0.5, dtype=np.float64)

    scaled = np.zeros_like(arr, dtype=np.float64)
    denom = n_rows - 1.0
    for col in range(arr.shape[1]):
        stable_order = np.argsort(arr[:, col], kind="mergesort")
        # argsort of a permutation is its inverse, i.e. the rank of each row.
        scaled[:, col] = np.argsort(stable_order).astype(np.float64) / denom

    tiny = 1e-6
    return np.clip(scaled, tiny, 1.0 - tiny)


def resolve_data_dir() -> Path:
    """Return the directory holding the contest files.

    Uses ``DATA_DIR`` when it exists. Otherwise, if ``USE_KAGGLEHUB_DOWNLOAD``
    is enabled, downloads ``KAGGLE_DATASET_ID`` through kagglehub and returns
    the ``data`` subfolder of the download when present, else the download
    root.

    Raises:
        FileNotFoundError: ``DATA_DIR`` is missing and downloading is disabled.
    """
    if not DATA_DIR.exists():
        if not USE_KAGGLEHUB_DOWNLOAD:
            raise FileNotFoundError(
                f"DATA_DIR not found: {DATA_DIR}. Set correct path or enable USE_KAGGLEHUB_DOWNLOAD."
            )

        # Imported lazily so kagglehub is only required when a download is needed.
        import kagglehub

        downloaded_root = Path(kagglehub.dataset_download(KAGGLE_DATASET_ID))
        print(f"Downloaded dataset root: {downloaded_root}")

        nested_data = downloaded_root / "data"
        return nested_data if nested_data.exists() else downloaded_root

    return DATA_DIR


def load_data(data_dir: Path) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read the six contest parquet files from ``data_dir``.

    Returns:
        (train_main, train_extra, test_main, test_extra, target, sample_submit)
    """
    # Order matters: it defines the positions in the returned tuple.
    file_names = (
        "train_main_features.parquet",
        "train_extra_features.parquet",
        "test_main_features.parquet",
        "test_extra_features.parquet",
        "train_target.parquet",
        "sample_submit.parquet",
    )
    return tuple(pd.read_parquet(data_dir / fname) for fname in file_names)


def merge_features(main_df: pd.DataFrame, extra_df: pd.DataFrame) -> pd.DataFrame:
    """Attach the extra feature columns to the main feature frame.

    When both frames carry ``customer_id`` the extra columns are left-joined
    on it; columns of ``extra_df`` that already exist in ``main_df`` are
    dropped so the main version wins. Otherwise the new columns are
    concatenated side by side (aligned on the frame index).

    Args:
        main_df: Main feature table.
        extra_df: Additional feature table.

    Returns:
        A frame with one row per row of ``main_df`` (join path).

    Raises:
        pandas.errors.MergeError: ``extra_df`` has duplicate ``customer_id``
            values. Such duplicates would silently multiply rows and break
            the row alignment with targets and the sample submission.
    """
    if "customer_id" in main_df.columns and "customer_id" in extra_df.columns:
        # Drop columns that would otherwise come back as *_x / *_y duplicates.
        overlap = [c for c in extra_df.columns if c in main_df.columns and c != "customer_id"]
        extra_use = extra_df.drop(columns=overlap) if overlap else extra_df
        # many_to_one: the key must be unique on the right side, so the
        # number and order of main_df rows are preserved.
        return main_df.merge(extra_use, on="customer_id", how="left", validate="many_to_one")
    # Fallback: no shared key, rely on index alignment.
    cols_to_add = [c for c in extra_df.columns if c not in main_df.columns]
    return pd.concat([main_df, extra_df[cols_to_add]], axis=1)


def run_eda(train_full: pd.DataFrame, test_full: pd.DataFrame, target: pd.DataFrame, report_path: Path) -> None:
    """Write a short markdown EDA report (shapes, missingness, target prevalence).

    Every column except ``customer_id`` is treated as a feature / target.
    The report is written to ``report_path`` as UTF-8 text.
    """
    feature_cols = [c for c in train_full.columns if c != "customer_id"]
    n_cat = sum(1 for c in feature_cols if c.startswith("cat_feature"))
    n_num = sum(1 for c in feature_cols if c.startswith("num_feature"))

    # Per-column share of missing values, highest first.
    train_missing = train_full[feature_cols].isna().mean().sort_values(ascending=False)
    test_missing = test_full[feature_cols].isna().mean().sort_values(ascending=False)

    target_cols = [c for c in target.columns if c != "customer_id"]
    prevalence = target[target_cols].mean().sort_values(ascending=False)

    def _bullets(series: pd.Series) -> List[str]:
        # One indented "name: value" bullet per entry.
        return [f"  - {name}: {val:.4f}" for name, val in series.items()]

    report = [
        "# EDA report\n",
        f"- train_full shape: {train_full.shape}",
        f"- test_full shape: {test_full.shape}",
        f"- target shape: {target.shape}",
        f"- feature count: {len(feature_cols)}",
        f"- categorical features: {n_cat}",
        f"- numerical features: {n_num}\n",
        "## Missingness",
        f"- mean missing rate (train): {train_missing.mean():.4f}",
        f"- mean missing rate (test): {test_missing.mean():.4f}",
        "- top-20 missing features (train):",
        *_bullets(train_missing.head(20)),
        "\n## Target prevalence",
        f"- mean positive rate across targets: {prevalence.mean():.4f}",
        "- top-10 most frequent targets:",
        *_bullets(prevalence.head(10)),
        "- top-10 rarest targets:",
        *_bullets(prevalence.tail(10)),
    ]

    report_path.write_text("\n".join(report), encoding="utf-8")
    print(f"EDA report saved to: {report_path}")


def apply_feature_hygiene(train_df: pd.DataFrame, test_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, int]]:
    """Drop uninformative feature columns, decided on the train frame only.

    Three (possibly overlapping) groups are removed from both frames:
      * constant columns (if ``DROP_CONST_FEATURES``),
      * near-constant columns whose most frequent value, NaN included, covers
        at least ``NEAR_CONST_DOMINANCE_THRESHOLD`` of rows
        (if ``DROP_NEAR_CONST_FEATURES``),
      * columns whose missing rate exceeds ``MISSING_RATE_THRESHOLD`` (always).

    ``customer_id`` is never considered for removal.

    Returns:
        (train copy, test copy, stats dict with per-group counts).
    """
    feature_cols = [c for c in train_df.columns if c != "customer_id"]

    constant_cols = set()
    if DROP_CONST_FEATURES:
        for name in feature_cols:
            if train_df[name].nunique(dropna=False) <= 1:
                constant_cols.add(name)

    dominated_cols = set()
    if DROP_NEAR_CONST_FEATURES:
        for name in feature_cols:
            shares = train_df[name].value_counts(dropna=False, normalize=True)
            if len(shares) == 0:
                continue
            # value_counts sorts descending, so the first entry is the top share.
            if float(shares.iloc[0]) >= NEAR_CONST_DOMINANCE_THRESHOLD:
                dominated_cols.add(name)

    na_share = train_df[feature_cols].isna().mean()
    too_sparse = na_share > MISSING_RATE_THRESHOLD
    sparse_cols = set(na_share[too_sparse].index.tolist())

    to_drop = constant_cols | dominated_cols | sparse_cols
    kept = [c for c in train_df.columns if c not in to_drop]

    stats = {
        "dropped_total_unique": len(to_drop),
        "dropped_const": len(constant_cols),
        "dropped_near_const": len(dominated_cols),
        "dropped_ultra_missing": len(sparse_cols),
        "overlap_nearconst_ultramissing": len(dominated_cols & sparse_cols),
    }
    print(f"Feature hygiene stats: {stats}")
    return train_df[kept].copy(), test_df[kept].copy(), stats


def preprocess_cats(train_df: pd.DataFrame, test_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, List[int], List[str]]:
    """Convert ``cat_feature*`` columns to strings, in place, in both frames.

    Missing values become the literal ``"__MISSING__"``. Note that the input
    frames themselves are modified and returned.

    Returns:
        (train_df, test_df, positions of categorical columns within
        ``feature_cols``, feature_cols) where ``feature_cols`` is every train
        column except ``customer_id``.
    """
    feature_cols = [c for c in train_df.columns if c != "customer_id"]
    cat_indices = [pos for pos, name in enumerate(feature_cols) if name.startswith("cat_feature")]

    for pos in cat_indices:
        name = feature_cols[pos]
        for frame in (train_df, test_df):
            frame[name] = frame[name].fillna("__MISSING__").astype(str)

    return train_df, test_df, cat_indices, feature_cols


# =========================
# Training helpers
# =========================
def _best_iter_from_model(model: CatBoostClassifier, fallback_iters: int) -> int:
    """Translate ``model.get_best_iteration()`` into an iteration *count*.

    The best iteration is treated as 0-based, so one is added to obtain a
    value usable as the ``iterations`` parameter. When no best iteration is
    reported (``None`` or negative), ``fallback_iters`` is returned.
    """
    best = model.get_best_iteration()
    is_known = best is not None and best >= 0
    return int(best) + 1 if is_known else int(fallback_iters)


def train_multilabel_tune_best_iter(
    x_tr: pd.DataFrame,
    y_tr: pd.DataFrame,
    x_va: pd.DataFrame,
    y_va: pd.DataFrame,
    cat_indices: List[int],
    seeds: List[int],
) -> Tuple[int, np.ndarray]:
    """
    Fit one MultiLogloss model per seed on the train split, early-stopped on the holdout.

    Returns:
      best_iter_multi: median over seeds of the best iteration count
      holdout raw margins (holdout rows x targets), averaged over seeds
    """
    n_seeds = len(seeds)
    seed_divisor = max(n_seeds, 1)
    holdout_margins = np.zeros((x_va.shape[0], y_tr.shape[1]), dtype=np.float64)
    per_seed_best = []

    fit_pool = Pool(x_tr, label=y_tr, cat_features=cat_indices)
    eval_pool = Pool(x_va, label=y_va, cat_features=cat_indices)

    # Everything except the seed is shared across runs.
    shared_params = dict(
        loss_function="MultiLogloss",
        eval_metric="MultiLogloss",
        iterations=MULTI_MAX_ITERS,
        learning_rate=0.028,
        depth=8,
        l2_leaf_reg=9.0,
        random_strength=1.2,
        bagging_temperature=0.8,
        border_count=254,
        bootstrap_type="Bayesian",
        leaf_estimation_iterations=5,
        od_type="Iter",
        od_wait=MULTI_OD_WAIT,
        task_type="GPU",
        devices="0",
        verbose=300,
        allow_writing_files=False,
    )

    for run_no, seed in enumerate(seeds, start=1):
        print(f"[Multi-TUNE] seed={seed} ({run_no}/{n_seeds})")
        clf = CatBoostClassifier(random_seed=seed, **shared_params)
        clf.fit(fit_pool, eval_set=eval_pool, use_best_model=True)

        per_seed_best.append(_best_iter_from_model(clf, MULTI_MAX_ITERS))
        holdout_margins += clf.predict(eval_pool, prediction_type="RawFormulaVal") / seed_divisor

    best_iter_multi = int(np.median(per_seed_best))
    print(f"[Multi-TUNE] best_iter candidates={per_seed_best} -> median={best_iter_multi}")
    return best_iter_multi, holdout_margins


def train_multilabel_fulltrain(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_test: pd.DataFrame,
    cat_indices: List[int],
    seeds: List[int],
    iterations: int,
) -> np.ndarray:
    """
    Fit MultiLogloss on the FULL train set for a fixed number of iterations
    (no eval set, no early stopping), once per seed.

    Returns test raw margins (test rows x targets) averaged over seeds.
    """
    n_seeds = len(seeds)
    seed_divisor = max(n_seeds, 1)

    shared_params = dict(
        loss_function="MultiLogloss",
        eval_metric="MultiLogloss",
        iterations=int(iterations),
        learning_rate=0.028,
        depth=8,
        l2_leaf_reg=9.0,
        random_strength=1.2,
        bagging_temperature=0.8,
        border_count=254,
        bootstrap_type="Bayesian",
        leaf_estimation_iterations=5,
        task_type="GPU",
        devices="0",
        verbose=300,
        allow_writing_files=False,
    )

    full_pool = Pool(x_train, label=y_train, cat_features=cat_indices)
    score_pool = Pool(x_test, cat_features=cat_indices)
    test_margins = np.zeros((x_test.shape[0], y_train.shape[1]), dtype=np.float64)

    for run_no, seed in enumerate(seeds, start=1):
        print(f"[Multi-FULL] seed={seed} ({run_no}/{n_seeds}) iters={iterations}")
        clf = CatBoostClassifier(random_seed=seed, **shared_params)
        clf.fit(full_pool)
        test_margins += clf.predict(score_pool, prediction_type="RawFormulaVal") / seed_divisor

    return test_margins


def train_ovr_tune_best_iters(
    x_tr: pd.DataFrame,
    y_tr: pd.DataFrame,
    x_va: pd.DataFrame,
    y_va: pd.DataFrame,
    cat_indices: List[int],
    seeds: List[int],
    target_names: List[str],
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Tune best iterations per target on train-split with eval_set=holdout and early stopping.

    One binary Logloss model is fitted per (target, seed). Targets whose
    train or holdout labels contain a single class are skipped: they get a
    fixed 200 iterations and their holdout margins stay at 0.

    Returns:
      best_iters_ovr: int array shape (n_targets,), median over seeds
      pred_va_raw: raw margins on holdout averaged across seeds (shape holdout x n_targets)
    """
    n_targets = y_tr.shape[1]
    n_seeds = len(seeds)
    seed_divisor = max(n_seeds, 1)
    best_iters = np.zeros(n_targets, dtype=np.int32)
    pred_va = np.zeros((x_va.shape[0], n_targets), dtype=np.float64)

    for j, tname in enumerate(target_names):
        ytr = y_tr.iloc[:, j].values
        yva = y_va.iloc[:, j].values

        if np.unique(ytr).shape[0] < 2 or np.unique(yva).shape[0] < 2:
            # degenerate -> keep small iters
            best_iters[j] = 200
            continue

        # The pools depend only on the target, not on the seed, so build them
        # once per target instead of once per (target, seed).
        tr_pool = Pool(x_tr, label=ytr, cat_features=cat_indices)
        va_pool = Pool(x_va, label=yva, cat_features=cat_indices)

        iters_list = []
        pv = np.zeros(x_va.shape[0], dtype=np.float64)

        for i, seed in enumerate(seeds, start=1):
            print(f"[OvR-TUNE] {tname} seed={seed} ({i}/{n_seeds})")

            model = CatBoostClassifier(
                loss_function="Logloss",
                eval_metric=OVR_EVAL_METRIC,   # Logloss for GPU early stop
                iterations=OVR_MAX_ITERS,
                learning_rate=0.03,
                depth=7,
                l2_leaf_reg=8.0,
                random_strength=1.0,
                bagging_temperature=0.7,
                border_count=254,
                bootstrap_type="Bayesian",
                leaf_estimation_iterations=5,
                od_type="Iter",
                od_wait=OVR_OD_WAIT,
                auto_class_weights="Balanced",
                random_seed=seed,
                task_type="GPU",
                devices="0",
                verbose=0,   # too verbose otherwise
                allow_writing_files=False,
            )
            model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

            iters_list.append(_best_iter_from_model(model, OVR_MAX_ITERS))
            pv += model.predict(va_pool, prediction_type="RawFormulaVal").reshape(-1) / seed_divisor

        best_iters[j] = int(np.median(iters_list))
        pred_va[:, j] = pv

        # optional: quick AUC sanity on holdout for this target
        auc_j = safe_auc(yva, sigmoid(pv))
        print(f"[OvR-TUNE] {tname} best_iters={iters_list} median={best_iters[j]} holdout_auc={auc_j:.4f}")

    return best_iters, pred_va


def train_ovr_fulltrain(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_test: pd.DataFrame,
    cat_indices: List[int],
    seeds: List[int],
    target_names: List[str],
    best_iters: np.ndarray,
) -> np.ndarray:
    """
    Fit one binary model per (target, seed) on the FULL train set, each for
    the fixed iteration count given in ``best_iters``.

    Targets with a single class in train get a constant margin equal to the
    logit of their label mean. Returns test raw margins (test rows x targets),
    averaged over seeds.
    """
    n_test = x_test.shape[0]
    n_seeds = len(seeds)
    seed_divisor = max(n_seeds, 1)
    test_margins = np.zeros((n_test, y_train.shape[1]), dtype=np.float64)
    score_pool = Pool(x_test, cat_features=cat_indices)

    for col, tname in enumerate(target_names):
        n_iters = int(best_iters[col])
        labels = y_train.iloc[:, col].values

        if np.unique(labels).shape[0] < 2:
            # Nothing to learn: emit the same margin for every test row.
            test_margins[:, col] = logit(np.full(n_test, labels.mean(), dtype=np.float64))
            continue

        fit_pool = Pool(x_train, label=labels, cat_features=cat_indices)
        target_margin = np.zeros(n_test, dtype=np.float64)

        for run_no, seed in enumerate(seeds, start=1):
            print(f"[OvR-FULL] {tname} seed={seed} ({run_no}/{n_seeds}) iters={n_iters}")
            clf = CatBoostClassifier(
                loss_function="Logloss",
                eval_metric="Logloss",
                iterations=n_iters,
                learning_rate=0.03,
                depth=7,
                l2_leaf_reg=8.0,
                random_strength=1.0,
                bagging_temperature=0.7,
                border_count=254,
                bootstrap_type="Bayesian",
                leaf_estimation_iterations=5,
                auto_class_weights="Balanced",
                random_seed=seed,
                task_type="GPU",
                devices="0",
                verbose=0,
                allow_writing_files=False,
            )
            clf.fit(fit_pool)
            raw = clf.predict(score_pool, prediction_type="RawFormulaVal")
            target_margin += raw.reshape(-1) / seed_divisor

        test_margins[:, col] = target_margin

    return test_margins


# =========================
# Blend tuning (holdout)
# =========================
def find_best_blend_weight_global(y_true_arr: np.ndarray, p_multi: np.ndarray, p_ovr: np.ndarray) -> Tuple[float, float]:
    """Grid-search one Multi weight shared by all targets.

    Evaluates ``w * p_multi + (1 - w) * p_ovr`` for 21 evenly spaced weights
    in [0, 1] and keeps the weight with the highest holdout macro AUC.

    Macro AUC is the plain mean of per-target ``safe_auc``. A target whose
    holdout labels contain a single class therefore contributes a constant
    0.5 for every weight instead of making ``roc_auc_score`` fail (or return
    NaN, depending on the scikit-learn version) for the whole matrix.

    Args:
        y_true_arr: Holdout labels, shape (rows, targets).
        p_multi: Multi-model scores, same shape.
        p_ovr: OvR-model scores, same shape.

    Returns:
        (best weight for the Multi model, macro AUC at that weight).
    """
    weights = np.linspace(0.0, 1.0, 21)
    n_targets = y_true_arr.shape[1]
    best_w, best_auc = 0.5, -1.0
    for w in weights:
        p_blend = w * p_multi + (1.0 - w) * p_ovr
        auc = float(np.mean([safe_auc(y_true_arr[:, j], p_blend[:, j]) for j in range(n_targets)]))
        print(f"Global blend weight={w:.2f} | holdout macro AUC={auc:.6f}")
        if auc > best_auc:
            best_auc = auc
            best_w = float(w)
    return best_w, best_auc


def find_best_blend_weight_per_target(
    y_true_arr: np.ndarray,
    p_multi: np.ndarray,
    p_ovr: np.ndarray,
    target_names: List[str],
) -> Tuple[np.ndarray, float, pd.DataFrame]:
    """Choose a Multi/OvR blend weight separately for every target.

    For each target:
      1. If OvR does not beat Multi on the holdout by more than
         ``MIN_HOLDOUT_GAIN_FOR_OVR``, Multi is used alone (weight 1.0).
      2. Otherwise the weight from ``BLEND_GRID`` with the best holdout AUC is
         taken and then raised to at least ``MIN_MULTI_WEIGHT``.

    The reported ``holdout_auc`` always belongs to the weight that is
    actually returned (it is re-evaluated when the floor changed the weight).
    The macro AUC is the mean of per-target ``safe_auc``, so single-class
    holdout columns count as 0.5 instead of breaking the computation.

    Args:
        y_true_arr: Holdout labels, shape (rows, targets).
        p_multi: Multi-model scores, same shape.
        p_ovr: OvR-model scores, same shape.
        target_names: Target names, one per column.

    Returns:
        (weights for the Multi model per target, holdout macro AUC of the
        blended scores, per-target report frame).
    """
    n_targets = y_true_arr.shape[1]

    best_weights = np.ones(n_targets, dtype=np.float64)
    rows = []

    for j in range(n_targets):
        yj = y_true_arr[:, j]
        auc_multi = safe_auc(yj, p_multi[:, j])
        auc_ovr = safe_auc(yj, p_ovr[:, j])

        if auc_ovr <= auc_multi + MIN_HOLDOUT_GAIN_FOR_OVR:
            best_weights[j] = 1.0
            rows.append({
                "target": target_names[j],
                "best_weight_multi": 1.0,
                "holdout_auc": auc_multi,
                "holdout_auc_multi": auc_multi,
                "holdout_auc_ovr": auc_ovr,
                "ovr_used": 0,
            })
            continue

        best_w_j = 1.0
        best_auc_j = auc_multi
        for w in BLEND_GRID:
            pj = w * p_multi[:, j] + (1.0 - w) * p_ovr[:, j]
            auc_j = safe_auc(yj, pj)
            if auc_j > best_auc_j:
                best_auc_j = auc_j
                best_w_j = float(w)

        floored_w = max(best_w_j, MIN_MULTI_WEIGHT)
        if floored_w != best_w_j:
            # The floor moved the weight: report the AUC of the weight in use,
            # not the AUC of the discarded grid optimum.
            best_w_j = floored_w
            best_auc_j = safe_auc(yj, best_w_j * p_multi[:, j] + (1.0 - best_w_j) * p_ovr[:, j])

        best_weights[j] = best_w_j
        rows.append({
            "target": target_names[j],
            "best_weight_multi": best_w_j,
            "holdout_auc": best_auc_j,
            "holdout_auc_multi": auc_multi,
            "holdout_auc_ovr": auc_ovr,
            "ovr_used": int(best_w_j < 1.0),
        })

    w_row = best_weights.reshape(1, -1)
    p_blend = p_multi * w_row + p_ovr * (1.0 - w_row)
    macro_auc = float(np.mean([safe_auc(y_true_arr[:, j], p_blend[:, j]) for j in range(n_targets)]))

    report_df = pd.DataFrame(rows)
    return best_weights, macro_auc, report_df


def per_target_auc_frame(y_true: np.ndarray, y_pred: np.ndarray, target_names: List[str], score_col: str) -> pd.DataFrame:
    """Build a frame with one row per target: its name and its ``safe_auc`` stored under ``score_col``."""
    records = [
        {"target": name, score_col: safe_auc(y_true[:, col], y_pred[:, col])}
        for col, name in enumerate(target_names)
    ]
    return pd.DataFrame(records)


def patch_params_for_target(pos_rate: float) -> Dict[str, float]:
    """Pick patch-model hyperparameters from the target's positive rate.

    Three tiers: below 0.5%, below 5%, and everything else. Rarer targets
    get a lower iteration cap, a higher learning rate, stronger L2
    regularisation and a shorter early-stopping wait.
    """
    if pos_rate < 0.005:
        iters, depth, lr, l2, wait = min(PATCH_MAX_ITERS, 1600), 7, 0.07, 28.0, 120
    elif pos_rate < 0.05:
        iters, depth, lr, l2, wait = min(PATCH_MAX_ITERS, 1900), 7, 0.06, 24.0, 140
    else:
        iters, depth, lr, l2, wait = PATCH_MAX_ITERS, 8, 0.05, 20.0, PATCH_OD_WAIT

    return {
        "iterations": iters,
        "depth": depth,
        "learning_rate": lr,
        "l2_leaf_reg": l2,
        "od_wait": wait,
    }


def rank_blend_1d(base_scores: np.ndarray, patch_scores: np.ndarray, patch_weight: float) -> np.ndarray:
    """Blend two 1-D score vectors in rank space and return the result as logits.

    Both vectors are rank-normalised independently, mixed as
    ``(1 - patch_weight) * base + patch_weight * patch`` and passed through
    ``logit``.
    """
    def _ranks(scores: np.ndarray) -> np.ndarray:
        # rank_normalize_2d expects a column; flatten back afterwards.
        return rank_normalize_2d(scores.reshape(-1, 1)).reshape(-1)

    mixed = (1.0 - patch_weight) * _ranks(base_scores) + patch_weight * _ranks(patch_scores)
    return logit(mixed)


def _stratified_holdout_indices(y: pd.DataFrame, test_size: float, seed: int) -> Tuple[np.ndarray, np.ndarray]:
    """Split row positions into (train, holdout), stratified by the number of positive labels per row."""
    y_sum = y.sum(axis=1).values
    # Rows are bucketed by label count: 0, 1, 2, 3, 4 and "5 or more".
    bins = pd.cut(y_sum, bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 100], labels=False).astype(int)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
    return next(sss.split(np.zeros((len(y), 1)), bins))


def apply_targeted_patch_stage(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_test: pd.DataFrame,
    cat_indices: List[int],
    target_names: List[str],
    base_va_logits: np.ndarray,
    base_test_logits: np.ndarray,
    tune_seed: int,
    holdout_idx: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, pd.DataFrame]:
    """Train extra binary models for the weakest targets and rank-blend them in.

    The ``PATCH_TOP_K`` targets with the lowest base holdout AUC each get
    ``len(PATCH_SEEDS)`` early-stopped models. A patch is applied to the test
    logits only when the best weight from ``PATCH_BLEND_GRID`` improves the
    holdout AUC by at least ``PATCH_MIN_GAIN``.

    Args:
        x_train, y_train: Full training features and labels.
        x_test: Test features.
        cat_indices: Positions of categorical features.
        target_names: Target names, in column order of ``y_train``.
        base_va_logits: Base (blended) logits for the holdout rows, shape
            (holdout rows, targets). Row i MUST belong to the i-th holdout row.
        base_test_logits: Base logits for the test rows.
        tune_seed: Seed for the internal split; used only when
            ``holdout_idx`` is None.
        holdout_idx: Row positions (into ``x_train``) that ``base_va_logits``
            was computed on. Pass the caller's holdout indices here. When
            None, the legacy behaviour is kept: a fresh split is drawn with
            ``PATCH_VAL_SHARE`` / ``tune_seed``, which is only valid if that
            reproduces the split ``base_va_logits`` came from.

    Returns:
        (patched test logits, per-target report frame).

    Raises:
        ValueError: ``base_va_logits`` has a different number of rows than
            the holdout.
    """
    if holdout_idx is None:
        tr_idx, va_idx = _stratified_holdout_indices(y_train, PATCH_VAL_SHARE, tune_seed)
    else:
        va_idx = np.asarray(holdout_idx, dtype=np.int64)
        # Everything that is not in the holdout is used for fitting.
        in_train = np.ones(len(y_train), dtype=bool)
        in_train[va_idx] = False
        tr_idx = np.flatnonzero(in_train)

    base_va_logits = np.asarray(base_va_logits, dtype=np.float64)
    if base_va_logits.shape[0] != len(va_idx):
        raise ValueError(
            f"base_va_logits has {base_va_logits.shape[0]} rows but the holdout has {len(va_idx)} rows"
        )

    x_tr, x_va = x_train.iloc[tr_idx], x_train.iloc[va_idx]
    y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]

    base_va_prob = sigmoid(base_va_logits)
    y_va_arr = y_va.values

    auc_df = per_target_auc_frame(y_va_arr, base_va_prob, target_names, "base_holdout_auc")
    worst_targets = auc_df.sort_values("base_holdout_auc").head(PATCH_TOP_K)["target"].tolist()
    print(f"[PATCH] selected worst targets: {len(worst_targets)}")

    patched_logits = np.asarray(base_test_logits, dtype=np.float64).copy()
    reports = []
    n_patch_seeds = max(len(PATCH_SEEDS), 1)
    # The test pool does not depend on the target or the seed.
    te_pool = Pool(x_test, cat_features=cat_indices)

    for tname in worst_targets:
        j = target_names.index(tname)
        ytr_col = y_tr.iloc[:, j].values
        yva_col = y_va.iloc[:, j].values
        if np.unique(ytr_col).shape[0] < 2 or np.unique(yva_col).shape[0] < 2:
            reports.append({"target": tname, "patched": 0, "reason": "degenerate"})
            continue

        pos_rate = float(y_train.iloc[:, j].mean())
        pp = patch_params_for_target(pos_rate)
        va_patch_logits = np.zeros(len(x_va), dtype=np.float64)
        test_patch_logits = np.zeros(len(x_test), dtype=np.float64)

        # Label-dependent pools are shared by all seeds of this target.
        tr_pool = Pool(x_tr, label=ytr_col, cat_features=cat_indices)
        va_pool = Pool(x_va, label=yva_col, cat_features=cat_indices)

        for seed in PATCH_SEEDS:
            model = CatBoostClassifier(
                loss_function="Logloss",
                eval_metric="Logloss",
                iterations=int(pp["iterations"]),
                learning_rate=float(pp["learning_rate"]),
                depth=int(pp["depth"]),
                l2_leaf_reg=float(pp["l2_leaf_reg"]),
                random_strength=1.0,
                bootstrap_type="Bernoulli",
                subsample=0.88,
                od_type="Iter",
                od_wait=int(pp["od_wait"]),
                auto_class_weights="Balanced",
                random_seed=int(seed),
                task_type="GPU",
                devices="0",
                verbose=0,
                allow_writing_files=False,
            )
            model.fit(tr_pool, eval_set=va_pool, use_best_model=True)
            va_patch_logits += model.predict(va_pool, prediction_type="RawFormulaVal").reshape(-1) / n_patch_seeds
            test_patch_logits += model.predict(te_pool, prediction_type="RawFormulaVal").reshape(-1) / n_patch_seeds

        base_va_target_logits = base_va_logits[:, j]
        base_auc = safe_auc(yva_col, sigmoid(base_va_target_logits))
        best_w = 0.0
        best_auc = base_auc

        for w in PATCH_BLEND_GRID:
            trial_logits = rank_blend_1d(base_va_target_logits, va_patch_logits, patch_weight=float(w))
            trial_auc = safe_auc(yva_col, sigmoid(trial_logits))
            if trial_auc > best_auc:
                best_auc = trial_auc
                best_w = float(w)

        gain = best_auc - base_auc
        if gain >= PATCH_MIN_GAIN and best_w > 0:
            patched_logits[:, j] = rank_blend_1d(patched_logits[:, j], test_patch_logits, patch_weight=best_w)
            reports.append({
                "target": tname,
                "patched": 1,
                "patch_weight": best_w,
                "base_holdout_auc": base_auc,
                "patched_holdout_auc": best_auc,
                "gain": gain,
                "pos_rate": pos_rate,
            })
            print(f"[PATCH] {tname}: gain={gain:.6f}, w={best_w:.2f}")
        else:
            reports.append({
                "target": tname,
                "patched": 0,
                "patch_weight": 0.0,
                "base_holdout_auc": base_auc,
                "patched_holdout_auc": best_auc,
                "gain": gain,
                "pos_rate": pos_rate,
            })

    report_df = pd.DataFrame(reports)
    return patched_logits, report_df


def build_submission(sample_submit: pd.DataFrame, preds: np.ndarray, out_path: Path) -> None:
    """Write ``preds`` into a copy of the sample submission and save it as parquet.

    The first column of ``sample_submit`` is kept as is; every following
    column is overwritten, by position, with the matching column of ``preds``.
    Rows of ``preds`` must already be in the row order of ``sample_submit``.

    Raises:
        ValueError: ``preds`` does not have one row per sample row and one
            column per prediction column.
    """
    preds = np.asarray(preds, dtype=np.float64)
    expected_shape = (sample_submit.shape[0], sample_submit.shape[1] - 1)
    if preds.shape != expected_shape:
        raise ValueError(f"preds shape {preds.shape} does not match submission schema {expected_shape}")

    # ensure schema (order) is exactly sample
    sub = sample_submit.copy()
    sub.iloc[:, 1:] = preds
    # keep exact dtype for customer_id as sample
    if sub["customer_id"].dtype != sample_submit["customer_id"].dtype:
        sub["customer_id"] = sub["customer_id"].astype(sample_submit["customer_id"].dtype)
    sub.to_parquet(out_path, index=False)
    print(f"Saved submission: {out_path} | shape={sub.shape}")


def _rank_for_blend(probs: np.ndarray) -> np.ndarray:
    """Rank-normalise probabilities when ``APPLY_RANK_BLEND`` is on, else return them unchanged."""
    return rank_normalize_2d(probs) if APPLY_RANK_BLEND else probs


def _blend(p_multi: np.ndarray, p_ovr: np.ndarray, weight_multi) -> np.ndarray:
    """Mix Multi and OvR scores; ``weight_multi`` is a scalar or one weight per target column."""
    w = np.asarray(weight_multi, dtype=np.float64)
    if w.ndim == 1:
        w = w.reshape(1, -1)  # broadcast per-target weights over rows
    return p_multi * w + p_ovr * (1.0 - w)


def _align_preds_to_sample(preds: np.ndarray, test_ids: np.ndarray, sample_ids: np.ndarray) -> np.ndarray:
    """Reorder prediction rows from test-frame order into sample-submission order.

    Falls back to the unchanged (positional) order, with a printed warning,
    when the ids cannot be matched one to one.
    """
    if len(test_ids) != len(sample_ids):
        # build_submission reports the shape mismatch.
        return preds
    if np.array_equal(test_ids, sample_ids):
        return preds

    test_index = pd.Index(test_ids)
    if not test_index.is_unique:
        print("WARNING: duplicate customer_id in test features; keeping positional row order.")
        return preds
    positions = test_index.get_indexer(pd.Index(sample_ids))
    if (positions < 0).any():
        print("WARNING: sample_submit customer_id values not found in test features; keeping positional row order.")
        return preds

    print("Reordering predictions to match sample_submit customer_id order.")
    return preds[positions]


# =========================
# Main
# =========================
def main() -> None:
    """Run the whole pipeline: load, clean, tune on a holdout, retrain on full train, blend, patch, save.

    Steps:
      1. Load and merge features, write the EDA report, apply feature hygiene.
      2. Draw a stratified holdout; find best iterations (Multi + OvR) with
         early stopping and tune blend weights on it.
      3. Retrain both model families on the full train set with the tuned
         iteration counts and blend their test predictions.
      4. Optionally patch the weakest targets, evaluated on the SAME holdout
         that produced the base holdout predictions.
      5. Write the submission in the sample schema.
    """
    print("CONFIG:")
    print(f"  DATA_DIR={DATA_DIR}")
    print(f"  OUTPUT_PATH={OUTPUT_PATH}")
    print(f"  MULTI_SEEDS={MULTI_SEEDS}, OVR_SEEDS={OVR_SEEDS}")
    print(f"  HOLDOUT size={TUNE_HOLDOUT_SIZE} seed={TUNE_SEED}")
    print(f"  AUTO_TUNE_BLEND_WEIGHT={AUTO_TUNE_BLEND_WEIGHT}")
    print(f"  USE_PER_TARGET_BLEND={USE_PER_TARGET_BLEND}")
    print(f"  APPLY_RANK_BLEND={APPLY_RANK_BLEND}")
    print(f"  MIN_HOLDOUT_GAIN_FOR_OVR={MIN_HOLDOUT_GAIN_FOR_OVR}")
    print(f"  ENABLE_PATCH_STAGE={ENABLE_PATCH_STAGE}, PATCH_TOP_K={PATCH_TOP_K}")

    data_dir = resolve_data_dir()
    print(f"Using data dir: {data_dir}")

    print("Loading data...")
    train_main, train_extra, test_main, test_extra, target, sample_submit = load_data(data_dir)

    print("Merging features...")
    train_full = merge_features(train_main, train_extra)
    test_full = merge_features(test_main, test_extra)

    print("Running EDA...")
    run_eda(train_full, test_full, target, EDA_REPORT_PATH)

    print("Applying feature hygiene...")
    train_full, test_full, _ = apply_feature_hygiene(train_full, test_full)

    print("Preprocessing categorical features...")
    train_full, test_full, cat_indices, feature_cols = preprocess_cats(train_full, test_full)

    # align target to train_full by customer_id
    target_cols = [c for c in target.columns if c != "customer_id"]
    if "customer_id" in train_full.columns and "customer_id" in target.columns:
        target = target.sort_values("customer_id").reset_index(drop=True)
        train_full = train_full.sort_values("customer_id").reset_index(drop=True)
        # array_equal also covers differing lengths; an explicit raise survives `python -O`.
        if not np.array_equal(train_full["customer_id"].values, target["customer_id"].values):
            raise ValueError("customer_id mismatch after sort")
    y_train = target[target_cols]

    x_train = train_full[feature_cols]
    x_test = test_full[feature_cols]

    # ---------- Holdout split for tuning ----------
    tr_idx, va_idx = _stratified_holdout_indices(y_train, TUNE_HOLDOUT_SIZE, TUNE_SEED)

    x_tr, x_va = x_train.iloc[tr_idx], x_train.iloc[va_idx]
    y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]
    y_va_arr = y_va.values

    print("\n[TUNE] Finding best iterations with early stopping on holdout...")

    best_iter_multi, pred_multi_va_raw = train_multilabel_tune_best_iter(
        x_tr=x_tr, y_tr=y_tr, x_va=x_va, y_va=y_va,
        cat_indices=cat_indices, seeds=MULTI_SEEDS
    )

    best_iters_ovr, pred_ovr_va_raw = train_ovr_tune_best_iters(
        x_tr=x_tr, y_tr=y_tr, x_va=x_va, y_va=y_va,
        cat_indices=cat_indices, seeds=OVR_SEEDS, target_names=target_cols
    )

    # Save best iters report
    iters_df = pd.DataFrame({
        "target": ["__MULTI__"] + target_cols,
        "best_iter": [best_iter_multi] + best_iters_ovr.tolist()
    })
    iters_df.to_csv(BEST_ITERS_REPORT_PATH, index=False)
    print(f"Saved best iters: {BEST_ITERS_REPORT_PATH}")

    # ---------- Tune blend weights on holdout ----------
    p_multi_va_for_blend = _rank_for_blend(sigmoid(pred_multi_va_raw))
    p_ovr_va_for_blend = _rank_for_blend(sigmoid(pred_ovr_va_raw))

    # blend_weight is either one scalar or one weight per target; _blend handles both.
    if AUTO_TUNE_BLEND_WEIGHT and USE_PER_TARGET_BLEND:
        blend_weight, macro_auc, report_df = find_best_blend_weight_per_target(
            y_true_arr=y_va_arr,
            p_multi=p_multi_va_for_blend,
            p_ovr=p_ovr_va_for_blend,
            target_names=target_cols,
        )
        report_df.to_csv(BLEND_WEIGHTS_REPORT_PATH, index=False)
        print(f"[BLEND] Per-target tuned on holdout. macro AUC={macro_auc:.6f}")
        print(f"Saved blend weights report: {BLEND_WEIGHTS_REPORT_PATH}")
    elif AUTO_TUNE_BLEND_WEIGHT:
        blend_weight, best_auc = find_best_blend_weight_global(y_va_arr, p_multi_va_for_blend, p_ovr_va_for_blend)
        print(f"[BLEND] Best global w={blend_weight:.3f} macro AUC={best_auc:.6f}")
    else:
        blend_weight = DEFAULT_BLEND_WEIGHT_MULTI
        print(f"[BLEND] Using default w={blend_weight:.3f}")

    p_blend_va = _blend(p_multi_va_for_blend, p_ovr_va_for_blend, blend_weight)
    base_va_logits_for_patch = logit(p_blend_va)

    # ---------- FINAL full-train models with tuned iterations ----------
    print("\n[FINAL] Training MultiLogloss on FULL train with best_iter...")
    test_multi_raw = train_multilabel_fulltrain(
        x_train=x_train, y_train=y_train, x_test=x_test,
        cat_indices=cat_indices, seeds=MULTI_SEEDS, iterations=best_iter_multi
    )

    print("\n[FINAL] Training OvR on FULL train with per-target best_iters...")
    test_ovr_raw = train_ovr_fulltrain(
        x_train=x_train, y_train=y_train, x_test=x_test,
        cat_indices=cat_indices, seeds=OVR_SEEDS, target_names=target_cols, best_iters=best_iters_ovr
    )

    p_multi_test_for_blend = _rank_for_blend(sigmoid(test_multi_raw))
    p_ovr_test_for_blend = _rank_for_blend(sigmoid(test_ovr_raw))
    p_blend_test = _blend(p_multi_test_for_blend, p_ovr_test_for_blend, blend_weight)

    pred_final = logit(p_blend_test)

    if ENABLE_PATCH_STAGE:
        print("[PATCH] Running targeted patch stage...")
        patched_logits, patch_report = apply_targeted_patch_stage(
            x_train=x_train,
            y_train=y_train,
            x_test=x_test,
            cat_indices=cat_indices,
            target_names=target_cols,
            base_va_logits=base_va_logits_for_patch,
            base_test_logits=pred_final,
            tune_seed=TUNE_SEED + 17,
            # The base holdout logits were computed on va_idx, so the patch
            # stage must score them against exactly those rows.
            holdout_idx=va_idx,
        )
        patch_report.to_csv("patch_stage_report.csv", index=False)
        print("[PATCH] Saved patch report: patch_stage_report.csv")
        pred_final = patched_logits

    # Predictions are in test-frame row order; the submission is written by position.
    if "customer_id" in test_full.columns and "customer_id" in sample_submit.columns:
        pred_final = _align_preds_to_sample(
            pred_final,
            test_full["customer_id"].values,
            sample_submit["customer_id"].values,
        )

    build_submission(sample_submit, pred_final, OUTPUT_PATH)


if __name__ == "__main__":
    main()

CONFIG:
  DATA_DIR=/kaggle/input/data-fusion-contest-2026
  OUTPUT_PATH=submission.parquet
  MULTI_SEEDS=[42], OVR_SEEDS=[2026]
  HOLDOUT size=0.05 seed=42
  AUTO_TUNE_BLEND_WEIGHT=True
  USE_PER_TARGET_BLEND=True
Using data dir: /kaggle/input/data-fusion-contest-2026
Loading data...
Merging features...
Running EDA...
EDA report saved to: eda_report.md
Applying feature hygiene...
Feature hygiene stats: {'dropped_total_unique': 177, 'dropped_const': 0, 'dropped_near_const': 65, 'dropped_ultra_missing': 177, 'overlap_nearconst_ultramissing': 65}
Preprocessing categorical features...

[TUNE] Finding best iterations with early stopping on holdout...
[Multi-TUNE] seed=42 (1/1)
0:	learn: 0.6336614	test: 0.6336422	best: 0.6336422 (0)	total: 859ms	remaining: 2h 23m 9s
300:	learn: 0.0846191	test: 0.0852740	best: 0.0852740 (300)	total: 4m 11s	remaining: 2h 15m 7s
600:	learn: 0.0817943	test: 0.0835613	best: 0.0835613 (600)	total: 8m 20s	remaining: 2h 10m 33s
900:	learn: 0.0799346	test: 0.0827614	

In [2]:
# ============================================================
# BLOCK 1 (FIXED): Build /kaggle/working/per_target_auc.csv
# Priority:
#   1) use OOF (if available in memory or .npy)
#   2) fallback to blend_weights.csv (holdout_auc / oof_auc)
#
# Writes:
#   /kaggle/working/per_target_auc.csv      (all targets, worst->best)
#   /kaggle/working/worst_targets_25.txt    (top-25 worst targets)
# ============================================================

import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

# -----------------------------
# helpers
# -----------------------------
def autodiscover_data_dir():
    """Locate the competition data directory under /kaggle/input.

    A directory qualifies when it holds both ``sample_submit.parquet`` and
    ``train_target.parquet``. Two known mount points are probed first; if
    neither qualifies, the ``/kaggle/input`` tree is walked and the first
    qualifying directory is returned.

    Raises:
        FileNotFoundError: no directory contains both files.
    """
    required = ("sample_submit.parquet", "train_target.parquet")

    known_locations = (
        "/kaggle/input/data-fusion-contest-2026",
        "/kaggle/input/datasets/hatab123/data-fusion-contest-2026",
    )
    for location in known_locations:
        if all(os.path.exists(os.path.join(location, fname)) for fname in required):
            return location

    for dirpath, _dirnames, filenames in os.walk("/kaggle/input"):
        if all(fname in filenames for fname in required):
            return dirpath

    raise FileNotFoundError("Не удалось найти sample_submit.parquet + train_target.parquet в /kaggle/input")

def safe_auc(y_true, y_score):
    """ROC AUC that yields NaN instead of raising when fewer than two classes are present."""
    n_classes = np.unique(y_true).size
    return roc_auc_score(y_true, y_score) if n_classes >= 2 else np.nan

def rank_pct_1d(x: np.ndarray) -> np.ndarray:
    """Map a 1-D score vector to ordinal rank fractions strictly inside (0, 1).

    The smallest value maps to 1/(n+1) and the largest to n/(n+1). The sort is
    stable, so equal values get distinct ranks in order of appearance.
    The result is float32.
    """
    values = np.asarray(x)
    size = values.shape[0]
    sorted_positions = np.argsort(values, kind="mergesort")
    # Scatter 1-based ranks back to the original positions.
    one_based = np.empty(size, dtype=np.int32)
    one_based[sorted_positions] = np.arange(1, size + 1, dtype=np.int32)
    return (one_based / (size + 1)).astype(np.float32)

# -----------------------------
# paths + schema
# -----------------------------
# Resolve both input files relative to the discovered competition directory.
DATA_DIR = autodiscover_data_dir()
SAMPLE_SUBMIT_PATH, TRAIN_TARGET_PATH = (
    os.path.join(DATA_DIR, fname)
    for fname in ("sample_submit.parquet", "train_target.parquet")
)

# The sample submission defines the schema: customer_id + one predict_* column per target.
sample = pd.read_parquet(SAMPLE_SUBMIT_PATH)
submit_cols = list(sample.columns)
pred_cols = [col for col in submit_cols if col != "customer_id"]
target_cols = [col.replace("predict_", "target_") for col in pred_cols]

# Label matrix ordered by customer_id, plus the per-target positive rate.
y_df = pd.read_parquet(TRAIN_TARGET_PATH)
y_df = y_df[["customer_id", *target_cols]]
y_df = y_df.sort_values("customer_id").reset_index(drop=True)
Y = y_df[target_cols].astype(np.uint8).to_numpy()
pos_rate = Y.mean(axis=0)

# -----------------------------
# try OOF first
# -----------------------------
# OOF lookup: the first candidate that is 2-D and has exactly the shape of Y wins.
P = None
oof_source = None

# in-memory candidates (variables left in the kernel namespace, if any)
for nm in ["oof", "oof_raw", "oof_cat", "oof_pred", "base_oof", "stack_oof"]:
    if nm in globals():
        arr = np.asarray(globals()[nm])
        if arr.ndim == 2 and arr.shape == Y.shape:
            P = arr
            oof_source = f"globals()['{nm}']"
            break

# file candidates
if P is None:
    for cand in [
        "/kaggle/working/oof.npy",
        "/kaggle/working/oof_cat.npy",
        "/kaggle/working/oof_logits.npy",
        "/kaggle/working/stack_oof.npy",
    ]:
        if not os.path.exists(cand):
            continue
        try:
            arr = np.load(cand)
        except (OSError, ValueError, EOFError) as exc:
            # A truncated, pickled or otherwise unreadable file must not abort the cell:
            # the blend_weights.csv fallback further down handles the "no usable OOF" case.
            print(f"[OOF] skip unreadable {cand}: {type(exc).__name__}: {exc}")
            continue
        if arr.ndim == 2 and arr.shape == Y.shape:
            P = arr
            oof_source = cand
            break
        # Say why an existing file was rejected, so a stale OOF dump is easy to spot.
        print(f"[OOF] skip {cand}: shape {arr.shape} != expected {Y.shape}")

# -----------------------------
# build per_target_auc.csv
# -----------------------------
rows = []  # not used below; kept so the cell still defines the same names

if P is not None:
    # TRUE OOF mode (best): per-target AUC of the rank-normalised OOF scores over all train rows.
    # NOTE(review): rank_pct_1d gives tied scores distinct ranks by row order, so for columns
    # with many ties this value can differ from roc_auc_score on the raw scores.
    auc_rank = np.array([safe_auc(Y[:, j], rank_pct_1d(P[:, j])) for j in range(Y.shape[1])], dtype=np.float64)
    source_name = f"OOF ranks ({oof_source})"
    n_eval_rows = int(Y.shape[0])

    df_full = pd.DataFrame({
        "target": target_cols,
        "pos_rate": pos_rate,
        "auc": auc_rank,        # for old patch blocks expecting column "auc"
        "auc_rank": auc_rank,   # for new blocks
        "metric_source": source_name,
        "n_eval_rows": n_eval_rows,
    })

else:
    # FALLBACK: use holdout metrics from blend_weights.csv (written by the FULL-TRAIN block)
    bw_candidates = [
        "/kaggle/working/blend_weights.csv",
        "blend_weights.csv",
    ]
    bw_path = None
    for p in bw_candidates:
        if os.path.exists(p):
            bw_path = p
            break

    if bw_path is None:
        raise ValueError(
            "Не найден OOF И не найден blend_weights.csv.\n"
            "Сделай одно из двух:\n"
            "1) сохрани OOF (oof.npy), ИЛИ\n"
            "2) используй FULL-TRAIN блок с holdout (он создает blend_weights.csv)."
        )

    bw = pd.read_csv(bw_path)

    # detect metric column (first match in priority order)
    metric_col = None
    for c in ["holdout_auc", "oof_auc", "auc", "auc_rank"]:
        if c in bw.columns:
            metric_col = c
            break

    if metric_col is None or "target" not in bw.columns:
        raise ValueError(f"В {bw_path} нет нужных колонок (ожидал target + holdout_auc/oof_auc/auc).")

    # One metric row per target: a repeated target in the CSV would otherwise multiply
    # rows in the left merge and break the "one row per target" layout of the output.
    bw_metric = (
        bw[["target", metric_col]]
        .drop_duplicates(subset="target", keep="first")
        .rename(columns={metric_col: "auc"})
    )

    # left-merge onto the full target list so every target is present exactly once
    df_full = pd.DataFrame({
        "target": target_cols,
        "pos_rate": pos_rate,
    }).merge(
        bw_metric,
        on="target",
        how="left"
    )

    # Targets absent from the CSV end up with NaN AUC; make that visible.
    n_missing_metric = int(df_full["auc"].isna().sum())
    if n_missing_metric:
        print(f"[WARN] {n_missing_metric} target(s) have no {metric_col} in {bw_path}; their AUC is NaN.")

    # duplicate for compatibility
    df_full["auc_rank"] = df_full["auc"]
    df_full["metric_source"] = f"{os.path.basename(bw_path)}::{metric_col} (HOLDOUT proxy)"
    # approximate eval rows: 5% of the train rows (the holdout size is not stored in the CSV)
    df_full["n_eval_rows"] = int(round(len(Y) * 0.05))

# ranking / gaps
# Rank targets (1 = lowest AUC) and measure how far each sits below 0.82 / 0.85,
# then order the table worst -> best (ties broken by target name).
df_full = (
    df_full
    .assign(
        rank_worst_to_best=lambda d: d["auc"].rank(method="min", ascending=True).astype("Int64"),
        gap_to_0_82=lambda d: 0.82 - d["auc"],
        gap_to_0_85=lambda d: 0.85 - d["auc"],
    )
    .sort_values(["auc", "target"], ascending=[True, True])
    .reset_index(drop=True)
)

# Persist the full table and the names of the 25 weakest targets.
out_csv = "/kaggle/working/per_target_auc.csv"
out_txt = "/kaggle/working/worst_targets_25.txt"

df_full.to_csv(out_csv, index=False)
worst25 = df_full["target"].head(25).tolist()
with open(out_txt, "w", encoding="utf-8") as f:
    f.write("\n".join(worst25))

# Console summary.
print("Saved:", out_csv)
print("Saved:", out_txt)
print("Metric source:", df_full["metric_source"].iloc[0])

valid_auc = df_full["auc"].dropna()
if valid_auc.empty:
    print("No valid AUC values found.")
else:
    print("Macro (mean over available targets):", float(valid_auc.mean()))

print("\nTOP-10 worst:")
print(df_full.loc[:, ["target", "pos_rate", "auc", "gap_to_0_82", "gap_to_0_85"]].head(10).to_string(index=False))

print("\nTOP-10 best:")
print(df_full.loc[:, ["target", "pos_rate", "auc"]].tail(10).to_string(index=False))

Saved: /kaggle/working/per_target_auc.csv
Saved: /kaggle/working/worst_targets_25.txt
Metric source: blend_weights.csv::holdout_auc (HOLDOUT proxy)
Macro (mean over available targets): 0.8418818664615445

TOP-10 worst:
     target  pos_rate      auc  gap_to_0_82  gap_to_0_85
 target_9_3  0.018679 0.689375     0.130625     0.160625
 target_9_6  0.223072 0.700661     0.119339     0.149339
 target_3_1  0.098373 0.706182     0.113818     0.143818
 target_5_2  0.002559 0.729861     0.090139     0.120139
 target_6_2  0.007388 0.730065     0.089935     0.119935
 target_6_1  0.008831 0.746307     0.073693     0.103693
 target_3_3  0.001187 0.749762     0.070238     0.100238
 target_2_4  0.007569 0.760313     0.059687     0.089687
 target_5_1  0.009344 0.763876     0.056124     0.086124
target_10_1  0.315052 0.766047     0.053953     0.083953

TOP-10 best:
    target  pos_rate      auc
target_3_2  0.097409 0.914935
target_1_1  0.010396 0.916774
target_9_4  0.001940 0.925289
target_2_2  0.025345

In [6]:
# ============================================================
# BLOCK 2 (FIXED + SAFE): Patch worst-25 targets on GPU and continue notebook on error
# - reads:  /kaggle/working/per_target_auc.csv
# - base:   latest patched submit or submission.parquet
# - writes: /kaggle/working/submit_patched25.parquet
#           /kaggle/working/patch25_report.csv
#           /kaggle/working/patch25_error.log (if error)
#
# Особенности:
# - Работает даже если в globals() нет X/X_test/y_mat/cat_features:
#   сам загрузит и соберёт матрицы.
# - Если OOF нет, НЕ падает: патчит в conservative режиме (fixed weight).
# - Весь блок обёрнут в try/except, чтобы следующие ячейки запускались.
# ============================================================

try:
    import os, gc, time, warnings, traceback
    warnings.filterwarnings("ignore")

    import numpy as np
    import pandas as pd
    from catboost import CatBoostClassifier, Pool
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.metrics import roc_auc_score

    try:
        from tqdm.auto import tqdm
    except Exception:
        # Fallback used when tqdm is not installed. The cell uses tqdm in two ways:
        #   * as an iterator wrapper: `for c in tqdm(cols, desc=...)`
        #   * as a manual bar:        `pbar = tqdm(total=...); pbar.update(n); pbar.close()`
        # The previous one-line stub returned None for the second form, so the first
        # `pbar.update(...)` raised AttributeError. This stub supports both forms.
        class _TQDMStub:
            """No-op progress bar exposing only the methods this cell calls."""

            def __init__(self, iterable=None, **kwargs):
                self._iterable = iterable

            def __iter__(self):
                return iter(self._iterable if self._iterable is not None else ())

            def update(self, n=1):
                pass

            def close(self):
                pass

            @staticmethod
            def write(msg):
                # Keeps the START/DONE log lines visible without tqdm.
                print(msg)

        tqdm = _TQDMStub

    # ----------------------------
    # CONFIG
    # ----------------------------
    SPLIT_SEED = 42                 # random_state of the validation split (must be an int)
    PATCH_SEEDS = [42, 1337]        # one patch model is trained per seed; predictions are averaged
    VAL_SIZE = 0.05                 # fraction of train rows held out for validation
    TOP_N = 25                      # number of lowest-AUC targets to patch

    # With OOF available: apply a patch only if the validation gain is >= MIN_GAIN;
    # W_GRID lists the candidate patch weights tried in the rank blend.
    MIN_GAIN = 0.0015
    W_GRID = [0.0, 0.3, 0.6, 0.8, 1.0]

    # Without OOF: conservative fixed patch weight
    FIXED_W_NO_OOF = 0.30

    # GPU iteration caps per prevalence group (quality/time trade-off)
    CAP_RARE = 1200   # pos_rate < 0.5%
    CAP_MID  = 1800   # 0.5%..5%
    CAP_COM  = 3200   # >=5% (the slowest)
    # Early-stopping patience (od_wait) for the rare / mid / common groups.
    OD_RARE, OD_MID, OD_COM = 120, 150, 220

    # I/O
    AUC_FILE = "/kaggle/working/per_target_auc.csv"
    OUT_PATH = "/kaggle/working/submit_patched25.parquet"
    REPORT_PATH = "/kaggle/working/patch25_report.csv"
    # Not referenced anywhere in this cell: the except handler at the bottom writes to the same literal path.
    ERR_LOG_PATH = "/kaggle/working/patch25_error.log"

    # BLOCK 1 must have produced the per-target AUC table before this cell runs.
    assert os.path.exists(AUC_FILE), "Сначала запусти BLOCK 1 -> /kaggle/working/per_target_auc.csv"

    # ----------------------------
    # Helpers
    # ----------------------------
    def autodiscover_data_dir():
        """Locate the competition data directory under /kaggle/input.

        Directories holding BOTH ``sample_submit.parquet`` and ``train_target.parquet``
        are preferred, matching the check BLOCK 1 uses; this cell reads the target file
        from the same directory when it has to rebuild the matrices. If no such directory
        exists, the first directory that has only ``sample_submit.parquet`` is returned,
        which is what the previous implementation did.

        Known mount points are probed before walking the whole ``/kaggle/input`` tree.

        Raises:
            FileNotFoundError: no directory contains ``sample_submit.parquet``.
        """
        sample_name = "sample_submit.parquet"
        target_name = "train_target.parquet"
        candidates = [
            "/kaggle/input/data-fusion-contest-2026",
            "/kaggle/input/datasets/hatab123/data-fusion-contest-2026",
        ]

        partial = None  # first directory seen with the sample file but no target file

        for c in candidates:
            if os.path.exists(os.path.join(c, sample_name)):
                if os.path.exists(os.path.join(c, target_name)):
                    return c
                if partial is None:
                    partial = c

        for root, _, files in os.walk("/kaggle/input"):
            if sample_name in files:
                if target_name in files:
                    return root
                if partial is None:
                    partial = root

        if partial is not None:
            return partial
        raise FileNotFoundError("Не удалось найти sample_submit.parquet в /kaggle/input")

    def safe_auc(y_true, y_score):
        """Return ROC AUC as a Python float, or NaN when the labels hold fewer than two classes."""
        labels = np.asarray(y_true)
        if len(np.unique(labels)) >= 2:
            return float(roc_auc_score(labels, y_score))
        return np.nan

    def rank_pct_1d(x):
        """Convert a 1-D score vector to ordinal rank fractions strictly inside (0, 1).

        Lowest value -> 1/(n+1), highest -> n/(n+1). The sort is stable, so ties get
        distinct ranks in order of appearance. Returns float32.
        """
        values = np.asarray(x)
        size = values.shape[0]
        sorted_positions = np.argsort(values, kind="mergesort")
        # Scatter 1-based ranks back to the original positions.
        one_based = np.empty(size, dtype=np.int32)
        one_based[sorted_positions] = np.arange(1, size + 1, dtype=np.int32)
        return (one_based / (size + 1)).astype(np.float32)

    def logit_from_rank(r):
        """Log-odds of rank fractions, clipped to [1e-6, 1 - 1e-6] so the result stays finite (float32)."""
        eps = 1e-6
        clipped = np.clip(r, eps, 1 - eps)
        odds = clipped / (1 - clipped)
        return np.log(odds).astype(np.float32)

    def rank_blend_logits(base_scores, patch_scores, w):
        """Blend two score vectors in rank space and return the blend as logits.

        Each input is cast to float32 and rank-normalised on its own; the ranks are mixed
        as ``(1 - w) * base + w * patch`` and the mix is converted to log-odds.
        """
        base_rank, patch_rank = (
            rank_pct_1d(np.asarray(scores, dtype=np.float32))
            for scores in (base_scores, patch_scores)
        )
        blended = (1.0 - w) * base_rank + w * patch_rank
        return logit_from_rank(blended)

    def gpu_params(seed, group, spw):
        """Build the CatBoostClassifier keyword arguments for one patch model.

        Args:
            seed: random seed (cast to int).
            group: "rare" or "mid" select their own settings; any other value gets the
                "common" settings.
            spw: positive-class weight; used (as float) only for "rare" and "mid".

        Returns:
            dict of parameters: shared GPU/Logloss settings plus the per-group
            iterations, depth, learning rate, L2 regularisation and od_wait.
        """
        weighted_groups = {
            "rare": dict(iterations=CAP_RARE, depth=7, learning_rate=0.10, l2_leaf_reg=35.0, od_wait=OD_RARE),
            "mid": dict(iterations=CAP_MID, depth=8, learning_rate=0.08, l2_leaf_reg=25.0, od_wait=OD_MID),
        }
        common_group = dict(iterations=CAP_COM, depth=10, learning_rate=0.05, l2_leaf_reg=12.0, od_wait=OD_COM)

        params = {
            "loss_function": "Logloss",
            "eval_metric": "Logloss",
            "random_strength": 1.0,
            "bootstrap_type": "Bernoulli",
            "subsample": 0.88,
            "allow_writing_files": False,
            "task_type": "GPU",
            "devices": "0",
            "verbose": 0,
            "random_seed": int(seed),
            "boosting_type": "Plain",
            "od_type": "Iter",
        }
        params.update(weighted_groups.get(group, common_group))
        if group in weighted_groups:
            # Only the rare/mid groups re-weight the positive class.
            params["scale_pos_weight"] = float(spw)
        return params

    def load_oof_if_any(expected_shape=None):
        """Find an out-of-fold prediction matrix in memory or on disk.

        Candidates are tried in order: a fixed list of global variable names, then a
        fixed list of ``.npy`` files under /kaggle/working. A candidate is accepted when
        it is 2-D and, if ``expected_shape`` is given, has exactly that shape. Rejected
        or unreadable candidates are skipped so a later valid one can still be found;
        previously the first loadable file was returned even if it was not 2-D.

        Args:
            expected_shape: optional ``(rows, cols)`` the matrix must have. ``None``
                (the default) accepts any 2-D array.

        Returns:
            ``(array, source_description)``, or ``(None, None)`` when nothing qualifies.
        """
        def _usable(arr):
            if getattr(arr, "ndim", None) != 2:
                return False
            return expected_shape is None or tuple(arr.shape) == tuple(expected_shape)

        # 1) memory
        for name in ["oof", "oof_raw", "oof_cat", "base_oof", "stack_oof"]:
            if name not in globals():
                continue
            try:
                arr = np.asarray(globals()[name])
            except Exception as exc:
                print(f"[OOF] cannot convert globals()['{name}'] to an array: {type(exc).__name__}: {exc}")
                continue
            if _usable(arr):
                return arr, f"globals()['{name}']"

        # 2) disk
        for cand in [
            "/kaggle/working/oof.npy",
            "/kaggle/working/oof_cat.npy",
            "/kaggle/working/oof_logits.npy",
            "/kaggle/working/oof_raw.npy",
        ]:
            if not os.path.exists(cand):
                continue
            try:
                arr = np.load(cand)
            except Exception as exc:
                # Best effort: report the unreadable file and try the next candidate.
                print(f"[OOF] cannot read {cand}: {type(exc).__name__}: {exc}")
                continue
            if _usable(arr):
                return arr, cand
        return None, None

    def pick_base_submit():
        """Return the first existing submission file from a fixed priority list, or None."""
        priority = (
            "/kaggle/working/submit_patched20.parquet",
            "/kaggle/working/submit_patched15.parquet",
            "/kaggle/working/submit_patched.parquet",
            "/kaggle/working/submit.parquet",
            "/kaggle/working/submission.parquet",
        )
        for path in priority:
            if os.path.exists(path):
                return path
        return None

    def save_error_fallback(msg, sample_submit_path=None):
        """Write a one-row error report and copy the base submission to OUT_PATH unchanged.

        Args:
            msg: error description; its first 1000 characters go into the report.
            sample_submit_path: accepted but not used.
        """
        error_row = {
            "target": "__ALL__",
            "pred_col": "__ALL__",
            "pos_rate": np.nan,
            "group": "n/a",
            "mode": "ERROR_FALLBACK",
            "base_auc_val": np.nan,
            "best_auc_val": np.nan,
            "gain": np.nan,
            "best_w": np.nan,
            "mins": 0.0,
            "status": "ERROR_SKIP",
            "reason": str(msg)[:1000],
        }
        pd.DataFrame([error_row]).to_csv(REPORT_PATH, index=False)

        base_path = pick_base_submit()
        if base_path is None:
            # No submission at all: the report is the only output.
            print("[FALLBACK] Base submit not found, saved only report.")
        else:
            sub = pd.read_parquet(base_path)
            # Prediction columns are stored as float64 in the output file.
            float_cols = {c: np.float64 for c in sub.columns if c.startswith("predict_")}
            sub = sub.astype(float_cols)
            sub.to_parquet(OUT_PATH, index=False)
            print(f"[FALLBACK] Saved unchanged submit -> {OUT_PATH} (from {base_path})")

        print(f"[FALLBACK] Saved report -> {REPORT_PATH}")

    def ensure_preprocessed_matrices(data_dir, target_cols):
        """
        Return ``(X, X_test, Y, cat_features)`` for the patch models.

        1) Reuses ``X``, ``X_test``, ``y_mat`` and ``cat_features`` from globals() when all
           four exist and ``y_mat`` has one column per entry of ``target_cols``.
        2) Otherwise loads the main/extra feature parquet files and ``train_target.parquet``
           from ``data_dir`` and preprocesses them.

        In the parquet path, train rows and ``Y`` are aligned on ``customer_id`` and
        ``X_test`` is sorted by ``customer_id``. Columns named ``cat_feature*`` become
        strings with "__MISSING__" for nulls; every other feature is coerced to float32.

        NOTE(review): the caller writes test predictions into the base submission by
        position, so that file must list customers in the same order as ``X_test`` -
        confirm.

        Args:
            data_dir: directory holding the competition parquet files.
            target_cols: names of the target columns, in submission order.

        Returns:
            X: train feature DataFrame (no ``customer_id`` column).
            X_test: test feature DataFrame with the same columns as ``X``.
            Y: label matrix, rows x len(target_cols) (uint8 in the parquet path).
            cat_features: positions of the categorical columns within ``X``.
        """
        # ----- try globals first -----
        have = all(v in globals() for v in ["X", "X_test", "y_mat", "cat_features"])
        if have:
            X = globals()["X"]
            X_test = globals()["X_test"]
            Y = np.asarray(globals()["y_mat"])
            cat_features = globals()["cat_features"]
            # sanity
            if Y.shape[1] == len(target_cols):
                print("Using X/X_test/y_mat/cat_features from globals().")
                return X, X_test, Y, cat_features
            else:
                print("Globals found but target dimension mismatch -> rebuilding matrices...")

        # ----- build from parquet -----
        print("Building X/X_test/y_mat/cat_features from parquet (no globals found)...")
        t0 = time.time()

        train_main_path  = os.path.join(data_dir, "train_main_features.parquet")
        train_extra_path = os.path.join(data_dir, "train_extra_features.parquet")
        test_main_path   = os.path.join(data_dir, "test_main_features.parquet")
        test_extra_path  = os.path.join(data_dir, "test_extra_features.parquet")
        train_tgt_path   = os.path.join(data_dir, "train_target.parquet")

        train_main = pd.read_parquet(train_main_path)
        train_extra = pd.read_parquet(train_extra_path)
        test_main = pd.read_parquet(test_main_path)
        test_extra = pd.read_parquet(test_extra_path)

        # merge by customer_id (safe)
        train_df = train_main.merge(train_extra, on="customer_id", how="left")
        test_df = test_main.merge(test_extra, on="customer_id", how="left")
        del train_main, train_extra, test_main, test_extra
        gc.collect()

        y_df = pd.read_parquet(train_tgt_path)[["customer_id"] + target_cols].copy()

        # align by customer_id
        train_df = train_df.sort_values("customer_id").reset_index(drop=True)
        test_df = test_df.sort_values("customer_id").reset_index(drop=True)
        y_df = y_df.sort_values("customer_id").reset_index(drop=True)

        if not np.array_equal(train_df["customer_id"].values, y_df["customer_id"].values):
            # Stricter alignment: keep only customers present on BOTH sides. Filtering just
            # the feature frame (as before) left extra target-only ids in y_df, which made
            # the check below fail even though the two frames could be aligned.
            n_feat_rows, n_tgt_rows = len(train_df), len(y_df)
            common_ids = np.intersect1d(train_df["customer_id"].values, y_df["customer_id"].values)
            train_df = train_df[train_df["customer_id"].isin(common_ids)]
            y_df = y_df[y_df["customer_id"].isin(common_ids)]
            train_df = train_df.sort_values("customer_id").reset_index(drop=True)
            y_df = y_df.sort_values("customer_id").reset_index(drop=True)
            print(
                f"customer_id alignment: features {n_feat_rows}->{len(train_df)} rows, "
                f"targets {n_tgt_rows}->{len(y_df)} rows"
            )
            # Can still differ if an id is repeated on one side only.
            assert np.array_equal(train_df["customer_id"].values, y_df["customer_id"].values), \
                "train features and train_target customer_id mismatch"

        Y = y_df[target_cols].astype(np.uint8).values

        # Features
        feature_cols = [c for c in train_df.columns if c != "customer_id"]
        cat_cols = [c for c in feature_cols if c.startswith("cat_feature")]
        num_cols = [c for c in feature_cols if c not in cat_cols]

        # CatBoost can ingest strings as cats
        for c in tqdm(cat_cols, desc="Patch-prep: categorical stringify"):
            train_df[c] = train_df[c].fillna("__MISSING__").astype(str)
            test_df[c] = test_df[c].fillna("__MISSING__").astype(str)

        # Downcast numerics to float32 (memory); unparseable values become NaN
        for c in tqdm(num_cols, desc="Patch-prep: numeric downcast"):
            train_df[c] = pd.to_numeric(train_df[c], errors="coerce").astype(np.float32)
            test_df[c] = pd.to_numeric(test_df[c], errors="coerce").astype(np.float32)

        X = train_df[feature_cols].copy()
        X_test = test_df[feature_cols].copy()
        cat_features = [feature_cols.index(c) for c in cat_cols]

        print(f"Built matrices in {(time.time()-t0)/60:.1f} min | X={X.shape} X_test={X_test.shape} cats={len(cat_features)}")
        return X, X_test, Y, cat_features

    # ----------------------------
    # Discover data + schema
    # ----------------------------
    DATA_DIR = autodiscover_data_dir()
    sample_path = os.path.join(DATA_DIR, "sample_submit.parquet")
    sample = pd.read_parquet(sample_path)

    # Submission schema: customer_id plus one predict_* column per target_* label.
    submit_cols = sample.columns.tolist()
    pred_cols = [c for c in submit_cols if c != "customer_id"]
    target_cols = [c.replace("predict_", "target_") for c in pred_cols]

    # ----------------------------
    # Load matrices (globals or build)
    # ----------------------------
    X, X_test, Y, cat_features = ensure_preprocessed_matrices(DATA_DIR, target_cols)

    if Y.shape[1] != len(target_cols):
        raise ValueError(f"Shape mismatch: y_mat has {Y.shape[1]} targets, sample_submit implies {len(target_cols)}")
    # Fail fast: the split below indexes X and Y with the same row positions.
    if len(X) != len(Y):
        raise ValueError(f"Row mismatch: X has {len(X)} rows, y_mat has {len(Y)}")

    # ----------------------------
    # Load OOF if present (optional)
    # ----------------------------
    OOF, OOF_SRC = load_oof_if_any()
    HAS_OOF = isinstance(OOF, np.ndarray) and OOF.shape == Y.shape
    if HAS_OOF:
        print(f"OOF found: {OOF_SRC} | shape={OOF.shape}")
    else:
        print("OOF not found (or wrong shape) -> conservative mode (fixed weight, no gain gating).")
        OOF = None

    # ----------------------------
    # Base submission to improve
    # ----------------------------
    BASE_SUB_PATH = pick_base_submit()
    if BASE_SUB_PATH is None:
        raise FileNotFoundError("Не найден базовый сабмит в /kaggle/working/ (submit*.parquet)")
    base_sub = pd.read_parquet(BASE_SUB_PATH)[submit_cols].copy()
    # Fail fast: patch predictions are written into the submission by position, so a
    # row-count mismatch would otherwise surface only after the first target has trained.
    # NOTE(review): equal length does not prove equal customer order - confirm that the
    # base submission and X_test list customers in the same order.
    if len(base_sub) != len(X_test):
        raise ValueError(
            f"Row mismatch: base submit {BASE_SUB_PATH} has {len(base_sub)} rows, X_test has {len(X_test)}"
        )
    patched = base_sub.copy()
    print("Base submit:", BASE_SUB_PATH, patched.shape)

    # ----------------------------
    # Worst targets from per_target_auc.csv
    # ----------------------------
    auc_df = pd.read_csv(AUC_FILE).copy()

    # normalize metric column
    if "auc_rank" not in auc_df.columns:
        if "auc" in auc_df.columns:
            auc_df = auc_df.rename(columns={"auc": "auc_rank"})
        else:
            raise ValueError("В per_target_auc.csv нет колонки auc_rank/auc")

    # keep only valid targets
    auc_df = auc_df[auc_df["target"].isin(target_cols)].copy()
    if auc_df.empty:
        raise ValueError("per_target_auc.csv не содержит валидных target_* колонок")
    auc_df = auc_df.sort_values("auc_rank").reset_index(drop=True)

    targets_to_try = auc_df.head(TOP_N)["target"].tolist()
    print(f"Targets to patch (TOP-{TOP_N} worst):")
    print(targets_to_try)

    # ----------------------------
    # Validation split, stratified by the number of positive targets per row
    # (0, 1, 2, 3, 4, 5+)
    # ----------------------------
    y_sum = Y.sum(axis=1)
    bins = pd.cut(y_sum, bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 100], labels=False).astype(int)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=VAL_SIZE, random_state=SPLIT_SEED)
    tr_idx, va_idx = next(sss.split(np.zeros((len(Y), 1)), bins))

    X_tr = X.iloc[tr_idx]
    X_va = X.iloc[va_idx]
    pool_test = Pool(X_test, cat_features=cat_features)

    print(f"Val split: train={len(tr_idx)} val={len(va_idx)} (~{VAL_SIZE*100:.1f}%)")

    # ----------------------------
    # Patch loop
    # ----------------------------
    # Per-target patch loop: for each selected target, train one CatBoost model per seed
    # on the train split, average the raw predictions, choose a blend weight and, if the
    # patch is applied, overwrite that target's column in `patched` with the rank blend.
    report = []
    applied = 0
    skipped = 0

    # One progress step per (target, seed) model fit.
    total_steps = len(targets_to_try) * len(PATCH_SEEDS)
    pbar = tqdm(total=total_steps, desc="Patch 25 (GPU, ETA)", mininterval=1.0)

    for tname in targets_to_try:
        j = target_cols.index(tname)
        pred_col = tname.replace("target_", "predict_")

        # Positive rate comes from the AUC table when it has that column, else from Y.
        row = auc_df.loc[auc_df["target"] == tname].iloc[0]
        pr = float(row["pos_rate"]) if "pos_rate" in auc_df.columns else float(Y[:, j].mean())

        # prevalence group (selects the hyper-parameters in gpu_params)
        if pr < 0.005:
            group = "rare"
        elif pr < 0.05:
            group = "mid"
        else:
            group = "common"

        y = Y[:, j].astype(np.uint8)
        y_tr = y[tr_idx]
        y_va = y[va_idx]

        # A split with a single class cannot be trained on or scored with AUC: skip the
        # target and advance the bar by the fits that will not happen.
        if np.unique(y_tr).size < 2 or np.unique(y_va).size < 2:
            skipped += 1
            pbar.update(len(PATCH_SEEDS))
            report.append({
                "target": tname, "pred_col": pred_col, "pos_rate": pr, "group": group,
                "mode": "SKIP_DEGENERATE",
                "base_auc_val": np.nan, "best_auc_val": np.nan, "gain": np.nan,
                "best_w": np.nan, "mins": 0.0, "status": "SKIP_DEGENERATE"
            })
            continue

        # Safe base validation metric only if OOF exists
        if HAS_OOF:
            base_val = OOF[va_idx, j].astype(np.float32)
            base_auc = safe_auc(y_va, rank_pct_1d(base_val))
        else:
            base_val = None
            base_auc = np.nan

        # class weights: neg/pos ratio of the train split, capped at 6 (rare) or 12 (mid);
        # the common group is not re-weighted
        if group == "common":
            spw = None
        else:
            pos = float(y_tr.sum())
            neg = float(len(y_tr) - pos)
            spw_cap = 6.0 if group == "rare" else 12.0
            spw = min(neg / max(1.0, pos), spw_cap)

        if hasattr(tqdm, "write"):
            tqdm.write(f"START {tname} | group={group} | pos_rate={pr:.4%} | spw={spw}")

        t0 = time.time()
        # Accumulators for the seed-averaged raw predictions.
        pred_patch_val = np.zeros(len(va_idx), dtype=np.float32)
        pred_patch_test = np.zeros(len(X_test), dtype=np.float32)

        # Pools once per target
        tr_pool = Pool(X_tr, label=y_tr, cat_features=cat_features)
        va_pool = Pool(X_va, label=y_va, cat_features=cat_features)

        for seed in PATCH_SEEDS:
            model = CatBoostClassifier(**gpu_params(seed, group, spw))
            # The validation pool doubles as the eval set for best-iteration selection.
            model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

            pred_patch_val += (
                model.predict(va_pool, prediction_type="RawFormulaVal").reshape(-1).astype(np.float32)
                / len(PATCH_SEEDS)
            )
            pred_patch_test += (
                model.predict(pool_test, prediction_type="RawFormulaVal").reshape(-1).astype(np.float32)
                / len(PATCH_SEEDS)
            )

            del model
            gc.collect()
            pbar.update(1)

        del tr_pool, va_pool
        gc.collect()

        # Weight selection
        if HAS_OOF and np.isfinite(base_auc):
            # Grid-search the patch weight on the validation rows; the base OOF alone
            # (w = 0) is the score to beat.
            best_w, best_auc = 0.0, base_auc
            for w in W_GRID:
                rblend = (1.0 - w) * rank_pct_1d(base_val) + w * rank_pct_1d(pred_patch_val)
                a = safe_auc(y_va, rblend)
                if np.isfinite(a) and a > best_auc:
                    best_auc, best_w = float(a), float(w)

            gain = float(best_auc - base_auc)
            apply_patch = (best_w > 0) and (gain >= MIN_GAIN)
            mode = "OOF_TUNED"
        else:
            # No usable base score: fixed weight, no gain gating. With FIXED_W_NO_OOF > 0
            # the patch is always applied in this mode.
            best_w = float(FIXED_W_NO_OOF)
            best_auc = safe_auc(y_va, rank_pct_1d(pred_patch_val))  # info only
            gain = np.nan
            apply_patch = best_w > 0
            mode = "NO_OOF_FIXED"

        elapsed_min = (time.time() - t0) / 60.0

        if apply_patch:
            # Blend against the current column of `patched` and store the result as float64.
            base_scores_test = patched[pred_col].to_numpy(dtype=np.float32)
            patched[pred_col] = rank_blend_logits(base_scores_test, pred_patch_test, w=best_w).astype(np.float64)
            applied += 1
            status = "APPLY"
        else:
            skipped += 1
            status = "SKIP_LOW_GAIN"

        if hasattr(tqdm, "write"):
            tqdm.write(
                f"DONE  {tname} | status={status} | mode={mode} | w={best_w:.2f} | "
                f"base_auc={base_auc if np.isfinite(base_auc) else np.nan:.4f} | "
                f"best_auc={best_auc if np.isfinite(best_auc) else np.nan:.4f} | "
                f"mins={elapsed_min:.1f}"
            )

        report.append({
            "target": tname,
            "pred_col": pred_col,
            "pos_rate": pr,
            "group": group,
            "mode": mode,
            "base_auc_val": float(base_auc) if np.isfinite(base_auc) else np.nan,
            "best_auc_val": float(best_auc) if np.isfinite(best_auc) else np.nan,
            "gain": float(gain) if np.isfinite(gain) else np.nan,
            "best_w": float(best_w),
            "mins": float(elapsed_min),
            "status": status,
        })

    if pbar is not None:
        pbar.close()

    # ----------------------------
    # Save outputs
    # ----------------------------
    rep_df = pd.DataFrame(report)

    # Report order: applied patches first (largest gain on top), then the skipped ones.
    if not rep_df.empty:
        status_order = {"APPLY": 0, "SKIP_LOW_GAIN": 1, "SKIP_DEGENERATE": 2}
        rep_df = (
            rep_df
            .assign(_order=rep_df["status"].map(status_order).fillna(99).astype(int))
            .sort_values(["_order", "gain"], ascending=[True, False])
            .drop(columns=["_order"])
        )

    rep_df.to_csv(REPORT_PATH, index=False)

    # Submission: schema column order, prediction columns stored as float64.
    patched = patched[submit_cols].copy()
    patched = patched.astype({c: np.float64 for c in pred_cols})
    patched.to_parquet(OUT_PATH, index=False)

    print("\nApplied:", applied, "| Skipped:", skipped)
    print("Saved:", OUT_PATH, patched.shape)
    print("Saved report:", REPORT_PATH)

    if not rep_df.empty:
        print("\nTop applied patches:")
        wanted = ["target","group","mode","best_w","base_auc_val","best_auc_val","gain","mins"]
        show_cols = [c for c in wanted if c in rep_df.columns]
        applied_rows = rep_df.loc[rep_df["status"] == "APPLY"]
        print(applied_rows.head(10)[show_cols].to_string(index=False))

except Exception as e:
    # -------- SAFE fallback so next blocks still run --------
    import traceback, os
    err_text = f"{type(e).__name__}: {e}"
    tb = traceback.format_exc()

    try:
        with open("/kaggle/working/patch25_error.log", "w", encoding="utf-8") as f:
            f.write(tb)
    except Exception:
        pass

    print("\n[PATCH25 SAFE MODE] Блок упал, но ноутбук продолжит выполнение.")
    print("[PATCH25 SAFE MODE] Ошибка:", err_text)
    print("[PATCH25 SAFE MODE] Трейсбек сохранён в /kaggle/working/patch25_error.log")

    # fallback report + fallback submit (copy latest available)
    try:
        import numpy as np
        import pandas as pd

        pd.DataFrame([{
            "target": "__ALL__",
            "pred_col": "__ALL__",
            "pos_rate": np.nan,
            "group": "n/a",
            "mode": "ERROR_FALLBACK",
            "base_auc_val": np.nan,
            "best_auc_val": np.nan,
            "gain": np.nan,
            "best_w": np.nan,
            "mins": 0.0,
            "status": "ERROR_SKIP",
            "reason": err_text[:1000],
        }]).to_csv("/kaggle/working/patch25_report.csv", index=False)

        base_candidates = [
            "/kaggle/working/submit_patched20.parquet",
            "/kaggle/working/submit_patched15.parquet",
            "/kaggle/working/submit_patched.parquet",
            "/kaggle/working/submit.parquet",
            "/kaggle/working/submission.parquet",
        ]
        base_path = next((p for p in base_candidates if os.path.exists(p)), None)
        if base_path is not None:
            sub = pd.read_parquet(base_path)
            pred_cols_local = [c for c in sub.columns if c.startswith("predict_")]
            for c in pred_cols_local:
                sub[c] = sub[c].astype(np.float64)
            sub.to_parquet("/kaggle/working/submit_patched25.parquet", index=False)
            print(f"[PATCH25 SAFE MODE] Saved fallback submit -> /kaggle/working/submit_patched25.parquet (from {base_path})")
        else:
            print("[PATCH25 SAFE MODE] Базовый сабмит не найден, сохранён только report.")
    except Exception as e2:
        print("[PATCH25 SAFE MODE] Не удалось записать fallback outputs:", repr(e2))

Building X/X_test/y_mat/cat_features from parquet (no globals found)...


Patch-prep: categorical stringify:   0%|          | 0/67 [00:00<?, ?it/s]

Patch-prep: numeric downcast:   0%|          | 0/2373 [00:00<?, ?it/s]

Built matrices in 1.3 min | X=(750000, 2440) X_test=(250000, 2440) cats=67
OOF not found (or wrong shape) -> conservative mode (fixed weight, no gain gating).
Base submit: /kaggle/working/submission.parquet (250000, 42)
Targets to patch (TOP-25 worst):
['target_9_3', 'target_9_6', 'target_3_1', 'target_5_2', 'target_6_2', 'target_6_1', 'target_3_3', 'target_2_4', 'target_5_1', 'target_10_1', 'target_9_1', 'target_6_3', 'target_9_7', 'target_2_6', 'target_2_5', 'target_1_2', 'target_7_1', 'target_7_3', 'target_2_1', 'target_2_3', 'target_9_2', 'target_1_4', 'target_4_1', 'target_7_2', 'target_9_5']
Val split: train=712500 val=37500 (~5.0%)


Patch 25 (GPU, ETA):   0%|          | 0/50 [00:00<?, ?it/s]

START target_9_3 | group=mid | pos_rate=1.8679% | spw=12.0
DONE  target_9_3 | status=APPLY | mode=NO_OOF_FIXED | w=0.30 | base_auc=nan | best_auc=0.6808 | mins=1.5
START target_9_6 | group=common | pos_rate=22.3072% | spw=None
DONE  target_9_6 | status=APPLY | mode=NO_OOF_FIXED | w=0.30 | base_auc=nan | best_auc=0.6981 | mins=13.2
START target_3_1 | group=common | pos_rate=9.8373% | spw=None
DONE  target_3_1 | status=APPLY | mode=NO_OOF_FIXED | w=0.30 | base_auc=nan | best_auc=0.7051 | mins=11.0
START target_5_2 | group=rare | pos_rate=0.2559% | spw=6.0
DONE  target_5_2 | status=APPLY | mode=NO_OOF_FIXED | w=0.30 | base_auc=nan | best_auc=0.7180 | mins=1.3
START target_6_2 | group=mid | pos_rate=0.7388% | spw=12.0
DONE  target_6_2 | status=APPLY | mode=NO_OOF_FIXED | w=0.30 | base_auc=nan | best_auc=0.7133 | mins=1.9
START target_6_1 | group=mid | pos_rate=0.8831% | spw=12.0
DONE  target_6_1 | status=APPLY | mode=NO_OOF_FIXED | w=0.30 | base_auc=nan | best_auc=0.7411 | mins=1.6
START t

In [None]:
# ============================================================
# BLOCK 3 (FIXED + SAFE): Second patch pass for worst-30 targets (GPU)
# - reads:  /kaggle/working/per_target_auc.csv
# - base:   prefer /kaggle/working/submit_patched25.parquet (then fallback)
# - writes: /kaggle/working/submit_patched30.parquet
#           /kaggle/working/patch30_report.csv
#           /kaggle/working/patch30_error.log (if error)
#
# Notes:
# - Works even if X/X_test/y_mat/cat_features are missing from globals(): it builds them itself.
# - If no OOF predictions are available it does NOT fail: it patches in conservative mode (fixed weight).
# - Fully wrapped in try/except so that the following cells still run.
# - By default it MAY re-patch targets that were already patched, in addition to new ones.
#   To patch only targets that are "new" after patch25, set EXCLUDE_ALREADY_APPLIED_FROM_PATCH25=True
# ============================================================

try:
    import os, gc, time, warnings, traceback
    warnings.filterwarnings("ignore")

    import numpy as np
    import pandas as pd
    from catboost import CatBoostClassifier, Pool
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.metrics import roc_auc_score

    try:
        from tqdm.auto import tqdm
    except Exception:
        class _TQDMStub:
            """No-op progress bar used when tqdm cannot be imported."""

            def __init__(self, total=None, desc=None, mininterval=1.0, **_ignored):
                # Any other tqdm keyword argument is accepted and ignored so a
                # call written for the real tqdm never raises TypeError here.
                self.total = total

            def update(self, n=1):
                pass

            def close(self):
                pass

            @staticmethod
            def write(x):
                print(x)

        def tqdm(*args, **kwargs):  # noqa
            """Return the wrapped iterable unchanged, or a no-op bar when none is given."""
            if args:
                return args[0]
            return _TQDMStub(**kwargs)

        # Expose tqdm.write on the fallback as well; without it every call site
        # guarded by hasattr(tqdm, "write") silently drops its log line.
        tqdm.write = _TQDMStub.write

    # ----------------------------
    # CONFIG
    # ----------------------------
    SPLIT_SEED = 2026
    PATCH_SEEDS = [42, 1337, 7777]         # 3 seeds; per-target predictions are averaged over them
    VAL_SIZE = 0.05
    TOP_N = 30

    # Set to True to leave targets already applied in patch25 untouched
    EXCLUDE_ALREADY_APPLIED_FROM_PATCH25 = False
    PATCH25_REPORT_PATH = "/kaggle/working/patch25_report.csv"

    # When OOF is available: apply a patch only if gain >= MIN_GAIN
    MIN_GAIN = 0.0010
    W_GRID = [0.0, 0.15, 0.30, 0.45, 0.60]

    # When OOF is missing: conservative fixed weights (a second pass should be more careful)
    FIXED_W_NO_OOF_RARE = 0.20
    FIXED_W_NO_OOF_MID = 0.20
    FIXED_W_NO_OOF_COM = 0.15

    # GPU iteration caps (second pass: slightly reduced to save time)
    CAP_RARE = 1000   # pos_rate < 0.5%
    CAP_MID  = 1500   # 0.5%..5%
    CAP_COM  = 2600   # >=5%
    # Early-stopping patience (od_wait) for the rare / mid / common groups
    OD_RARE, OD_MID, OD_COM = 100, 120, 180

    # I/O
    AUC_FILE = "/kaggle/working/per_target_auc.csv"
    OUT_PATH = "/kaggle/working/submit_patched30.parquet"
    REPORT_PATH = "/kaggle/working/patch30_report.csv"
    ERR_LOG_PATH = "/kaggle/working/patch30_error.log"

    # The per-target AUC file is a hard prerequisite; fail fast if it is missing
    assert os.path.exists(AUC_FILE), "Сначала запусти BLOCK 1 -> /kaggle/working/per_target_auc.csv"

    # ----------------------------
    # Helpers
    # ----------------------------
    def autodiscover_data_dir():
        """Locate the directory under /kaggle/input that holds sample_submit.parquet.

        The two known dataset mount points are probed first; if neither contains
        the file, /kaggle/input is walked and the first directory that does wins.

        Raises:
            FileNotFoundError: when no directory contains sample_submit.parquet.
        """
        known_roots = (
            "/kaggle/input/data-fusion-contest-2026",
            "/kaggle/input/datasets/hatab123/data-fusion-contest-2026",
        )
        preferred = next(
            (d for d in known_roots if os.path.exists(os.path.join(d, "sample_submit.parquet"))),
            None,
        )
        if preferred is not None:
            return preferred

        for dirpath, _dirnames, filenames in os.walk("/kaggle/input"):
            if "sample_submit.parquet" in filenames:
                return dirpath
        raise FileNotFoundError("Не удалось найти sample_submit.parquet в /kaggle/input")

    def safe_auc(y_true, y_score):
        """ROC AUC as a float, or NaN when y_true has fewer than two distinct labels."""
        labels = np.asarray(y_true)
        has_both_classes = np.unique(labels).size >= 2
        if has_both_classes:
            return float(roc_auc_score(labels, y_score))
        return np.nan

    def rank_pct_1d(x):
        """Map a 1-D score vector to ordinal rank fractions strictly inside (0, 1).

        The smallest value maps to 1/(n+1) and the largest to n/(n+1). The sort is
        stable, so equal values are ranked by their position. Returns float32.
        """
        values = np.asarray(x)
        size = values.shape[0]
        sorted_pos = np.argsort(values, kind="mergesort")
        # Scatter the 1-based ranks back to each element's original position
        one_based = np.empty(size, dtype=np.int32)
        one_based[sorted_pos] = np.arange(1, size + 1, dtype=np.int32)
        return (one_based / (size + 1)).astype(np.float32)

    def logit_from_rank(r):
        """Convert rank fractions to float32 logits, clipping to [1e-6, 1 - 1e-6] first."""
        clipped = np.clip(r, 1e-6, 1 - 1e-6)
        odds = clipped / (1 - clipped)
        return np.log(odds).astype(np.float32)

    def rank_blend_logits(base_scores, patch_scores, w):
        """Blend two score vectors in rank space and return the mix as logits.

        Both inputs are cast to float32 and rank-normalised with rank_pct_1d; the
        ranks are combined as (1 - w) * base + w * patch and then passed through
        logit_from_rank.
        """
        base_rank, patch_rank = (
            rank_pct_1d(np.asarray(scores, dtype=np.float32))
            for scores in (base_scores, patch_scores)
        )
        mixed = (1.0 - w) * base_rank + w * patch_rank
        return logit_from_rank(mixed)

    def gpu_params(seed, group, spw):
        """Build the CatBoostClassifier keyword arguments for one GPU run.

        Args:
            seed: random seed, stored as int.
            group: "rare" and "mid" select their own settings; any other value
                gets the settings used for common targets.
            spw: scale_pos_weight; only applied (as float) for "rare" and "mid".

        Returns:
            dict of constructor parameters.
        """
        weighted_groups = {
            "rare": dict(iterations=CAP_RARE, depth=7, learning_rate=0.10, l2_leaf_reg=35.0, od_wait=OD_RARE),
            "mid": dict(iterations=CAP_MID, depth=8, learning_rate=0.08, l2_leaf_reg=25.0, od_wait=OD_MID),
        }

        params = {
            "loss_function": "Logloss",
            "eval_metric": "Logloss",
            "random_strength": 1.0,
            "bootstrap_type": "Bernoulli",
            "subsample": 0.88,
            "allow_writing_files": False,
            "task_type": "GPU",
            "devices": "0",
            "verbose": 0,
            "random_seed": int(seed),
            "boosting_type": "Plain",
            "od_type": "Iter",
        }

        if group in ("rare", "mid"):
            params.update(weighted_groups[group])
            # Only the imbalanced groups re-weight the positive class
            params["scale_pos_weight"] = float(spw)
        else:
            params.update(
                iterations=CAP_COM, depth=9, learning_rate=0.055, l2_leaf_reg=12.0, od_wait=OD_COM
            )
        return params

    def load_oof_if_any(disk_candidates=None):
        """Find out-of-fold predictions in memory or on disk.

        Lookup order: a fixed list of global variable names, then .npy files.
        The first candidate that yields a 2-D array wins. Candidates that fail to
        load, or that are not 2-D, are reported and skipped so a later valid
        candidate can still be used.

        Args:
            disk_candidates: optional iterable of .npy paths to probe instead of
                the default files under /kaggle/working.

        Returns:
            (array, source description), or (None, None) when nothing usable exists.
        """
        # memory
        for name in ["oof", "oof_raw", "oof_cat", "base_oof", "stack_oof"]:
            if name not in globals():
                continue
            try:
                arr = np.asarray(globals()[name])
            except Exception as exc:
                print(f"[load_oof_if_any] skip globals()['{name}']: {type(exc).__name__}: {exc}")
                continue
            if arr.ndim == 2:
                return arr, f"globals()['{name}']"

        # disk
        if disk_candidates is None:
            disk_candidates = [
                "/kaggle/working/oof.npy",
                "/kaggle/working/oof_cat.npy",
                "/kaggle/working/oof_logits.npy",
                "/kaggle/working/oof_raw.npy",
            ]
        for cand in disk_candidates:
            if not os.path.exists(cand):
                continue
            try:
                arr = np.asarray(np.load(cand))
            except Exception as exc:
                print(f"[load_oof_if_any] skip {cand}: {type(exc).__name__}: {exc}")
                continue
            # Same 2-D requirement as the in-memory branch; keep looking otherwise
            if arr.ndim == 2:
                return arr, cand
            print(f"[load_oof_if_any] skip {cand}: expected a 2-D array, got ndim={arr.ndim}")
        return None, None

    def pick_base_submit():
        """Return the first submission file that exists, in priority order, or None."""
        for path in (
            "/kaggle/working/submit_patched25.parquet",  # preferred starting point for this pass
            "/kaggle/working/submit_patched20.parquet",
            "/kaggle/working/submit_patched15.parquet",
            "/kaggle/working/submit_patched.parquet",
            "/kaggle/working/submit.parquet",
            "/kaggle/working/submission.parquet",
        ):
            if os.path.exists(path):
                return path
        return None

    def ensure_preprocessed_matrices(data_dir, target_cols):
        """Return (X, X_test, Y, cat_features), reusing notebook globals when possible.

        1) If X, X_test, y_mat and cat_features all exist in globals() and y_mat is
           2-D with one column per entry of target_cols, they are returned as-is.
        2) Otherwise the matrices are rebuilt from the parquet files in data_dir:
           main and extra features are left-joined on customer_id, rows are sorted
           by customer_id, cat_feature* columns become strings with "__MISSING__"
           for nulls, and every other feature is coerced to float32.

        Args:
            data_dir: directory holding the train/test feature and target parquet files.
            target_cols: target column names, in the order wanted for Y's columns.

        Returns:
            X (train feature DataFrame), X_test (test feature DataFrame),
            Y (uint8 array whose rows are aligned with X), cat_features (positions
            of the categorical columns within X).

        Raises:
            AssertionError: if train features and targets cannot be aligned by customer_id.
        """
        have = all(v in globals() for v in ["X", "X_test", "y_mat", "cat_features"])
        if have:
            X_ = globals()["X"]
            X_test_ = globals()["X_test"]
            Y_ = np.asarray(globals()["y_mat"])
            catf_ = globals()["cat_features"]
            # ndim guard: a 1-D y_mat would otherwise raise IndexError on shape[1]
            if Y_.ndim == 2 and Y_.shape[1] == len(target_cols):
                print("Using X/X_test/y_mat/cat_features from globals().")
                return X_, X_test_, Y_, catf_
            print("Globals found but target dim mismatch -> rebuilding matrices...")
            print("Building X/X_test/y_mat/cat_features from parquet...")
        else:
            print("Building X/X_test/y_mat/cat_features from parquet (no globals found)...")
        t0 = time.time()

        train_main = pd.read_parquet(os.path.join(data_dir, "train_main_features.parquet"))
        train_extra = pd.read_parquet(os.path.join(data_dir, "train_extra_features.parquet"))
        test_main = pd.read_parquet(os.path.join(data_dir, "test_main_features.parquet"))
        test_extra = pd.read_parquet(os.path.join(data_dir, "test_extra_features.parquet"))
        y_df = pd.read_parquet(os.path.join(data_dir, "train_target.parquet"))[["customer_id"] + target_cols].copy()

        # merge safely by id
        train_df = train_main.merge(train_extra, on="customer_id", how="left")
        test_df = test_main.merge(test_extra, on="customer_id", how="left")
        del train_main, train_extra, test_main, test_extra
        gc.collect()

        # align: every frame is put in ascending customer_id order.
        # X_test therefore comes back sorted by customer_id; a caller that writes
        # predictions into a submission frame must use the same row order.
        train_df = train_df.sort_values("customer_id").reset_index(drop=True)
        test_df = test_df.sort_values("customer_id").reset_index(drop=True)
        y_df = y_df.sort_values("customer_id").reset_index(drop=True)

        if not np.array_equal(train_df["customer_id"].values, y_df["customer_id"].values):
            # Restrict BOTH sides to the customers they share. Filtering only the
            # features leaves targets without a feature row in y_df, and the
            # alignment assert below could then never pass.
            train_df = train_df.merge(y_df[["customer_id"]], on="customer_id", how="inner")
            train_df = train_df.sort_values("customer_id").reset_index(drop=True)
            y_df = y_df[y_df["customer_id"].isin(train_df["customer_id"])]
            y_df = y_df.sort_values("customer_id").reset_index(drop=True)
            assert np.array_equal(train_df["customer_id"].values, y_df["customer_id"].values), \
                "Mismatch customer_id between train features and train_target"

        Y_ = y_df[target_cols].astype(np.uint8).values

        feature_cols = [c for c in train_df.columns if c != "customer_id"]
        cat_cols = [c for c in feature_cols if c.startswith("cat_feature")]
        num_cols = [c for c in feature_cols if c not in cat_cols]

        # Categorical columns: nulls become an explicit token, everything is a string
        for c in tqdm(cat_cols, desc="Patch30-prep: categorical stringify"):
            train_df[c] = train_df[c].fillna("__MISSING__").astype(str)
            test_df[c] = test_df[c].fillna("__MISSING__").astype(str)

        # Numeric columns: unparseable values become NaN, storage is float32
        for c in tqdm(num_cols, desc="Patch30-prep: numeric downcast"):
            train_df[c] = pd.to_numeric(train_df[c], errors="coerce").astype(np.float32)
            test_df[c] = pd.to_numeric(test_df[c], errors="coerce").astype(np.float32)

        X_ = train_df[feature_cols].copy()
        X_test_ = test_df[feature_cols].copy()
        catf_ = [feature_cols.index(c) for c in cat_cols]

        print(f"Built matrices in {(time.time()-t0)/60:.1f} min | X={X_.shape} X_test={X_test_.shape} cats={len(catf_)}")
        return X_, X_test_, Y_, catf_

    def save_fallback_outputs(err_msg: str):
        """Write an error-only report and, when a base submission exists, copy it to OUT_PATH.

        The report is a single ERROR_SKIP row whose "reason" is err_msg truncated
        to 1000 characters. The copied submission has its predict_* columns cast
        to float64 and is otherwise unchanged.

        NOTE(review): no call to this helper is visible in this cell; the except
        handler at the bottom repeats the same steps inline — confirm whether
        this definition is still needed.
        """
        error_row = {
            "target": "__ALL__",
            "pred_col": "__ALL__",
            "pos_rate": np.nan,
            "group": "n/a",
            "mode": "ERROR_FALLBACK",
            "base_auc_val": np.nan,
            "best_auc_val": np.nan,
            "gain": np.nan,
            "best_w": np.nan,
            "mins": 0.0,
            "status": "ERROR_SKIP",
            "reason": str(err_msg)[:1000],
        }
        pd.DataFrame([error_row]).to_csv(REPORT_PATH, index=False)

        base_path = pick_base_submit()
        if base_path is None:
            print("[FALLBACK] Base submit not found, saved only report.")
        else:
            sub = pd.read_parquet(base_path)
            for col in [name for name in sub.columns if name.startswith("predict_")]:
                sub[col] = sub[col].astype(np.float64)
            sub.to_parquet(OUT_PATH, index=False)
            print(f"[FALLBACK] Saved unchanged submit -> {OUT_PATH} (from {base_path})")

        print(f"[FALLBACK] Saved report -> {REPORT_PATH}")

    # ----------------------------
    # Discover data + schema
    # ----------------------------
    DATA_DIR = autodiscover_data_dir()
    sample = pd.read_parquet(os.path.join(DATA_DIR, "sample_submit.parquet"))

    # Submission schema: every column except customer_id is a prediction column,
    # and each one maps to a target column by swapping the predict_/target_ prefix.
    submit_cols = list(sample.columns)
    pred_cols = [name for name in submit_cols if name != "customer_id"]
    target_cols = [name.replace("predict_", "target_") for name in pred_cols]

    # ----------------------------
    # Feature / label matrices (from globals, or rebuilt from parquet)
    # ----------------------------
    X, X_test, Y, cat_features = ensure_preprocessed_matrices(DATA_DIR, target_cols)

    if Y.shape[1] != len(target_cols):
        raise ValueError(f"Shape mismatch: y_mat has {Y.shape[1]} targets, sample_submit implies {len(target_cols)}")

    # ----------------------------
    # OOF predictions are optional; they are used only if they match Y's shape
    # ----------------------------
    OOF, OOF_SRC = load_oof_if_any()
    HAS_OOF = isinstance(OOF, np.ndarray) and OOF.shape == Y.shape
    if not HAS_OOF:
        OOF = None
        print("OOF not found (or wrong shape) -> conservative mode (fixed weight, no gain gating).")
    else:
        print(f"OOF found: {OOF_SRC} | shape={OOF.shape}")

    # ----------------------------
    # Base submission that the patches are blended into
    # ----------------------------
    BASE_SUB_PATH = pick_base_submit()
    if BASE_SUB_PATH is None:
        raise FileNotFoundError("Не найден базовый сабмит в /kaggle/working/")
    base_sub = pd.read_parquet(BASE_SUB_PATH)[submit_cols].copy()
    patched = base_sub.copy()
    print("Base submit:", BASE_SUB_PATH, patched.shape)

    # ----------------------------
    # Load targets from per_target_auc.csv
    # ----------------------------
    auc_df = pd.read_csv(AUC_FILE).copy()

    # The score column may be called either auc_rank or auc; normalise to auc_rank
    if "auc_rank" not in auc_df.columns:
        if "auc" not in auc_df.columns:
            raise ValueError("В per_target_auc.csv нет колонки auc_rank/auc")
        auc_df = auc_df.rename(columns={"auc": "auc_rank"})

    # Keep only rows whose target is part of the submission schema
    auc_df = auc_df[auc_df["target"].isin(target_cols)].copy()
    if auc_df.empty:
        raise ValueError("per_target_auc.csv не содержит валидных target_* колонок")

    # Lowest score first
    auc_df = auc_df.sort_values("auc_rank").reset_index(drop=True)

    # Optionally drop targets that the patch25 report marks as APPLY
    if EXCLUDE_ALREADY_APPLIED_FROM_PATCH25 and os.path.exists(PATCH25_REPORT_PATH):
        rep25 = pd.read_csv(PATCH25_REPORT_PATH)
        if {"target", "status"}.issubset(rep25.columns):
            was_applied = rep25["status"].astype(str).str.upper() == "APPLY"
            already = set(rep25.loc[was_applied, "target"].astype(str))
            before_n = len(auc_df)
            auc_df = auc_df[~auc_df["target"].isin(already)].reset_index(drop=True)
            print(f"Excluded already applied from patch25: {before_n - len(auc_df)} targets")
        else:
            print("patch25_report.csv exists but no target/status cols -> skip exclusion")

    targets_to_try = auc_df["target"].head(TOP_N).tolist()
    if not targets_to_try:
        raise ValueError("Список targets_to_try пуст (после фильтров/exclude).")

    print(f"Targets to patch (TOP-{min(TOP_N, len(targets_to_try))} worst):")
    print(targets_to_try)

    # ----------------------------
    # Validation split
    # ----------------------------
    # Stratify on the number of positive targets per row (5 or more share one bin)
    y_sum = Y.sum(axis=1)
    bin_edges = [-0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 100]
    bins = pd.cut(y_sum, bins=bin_edges, labels=False).astype(int)

    # The splitter only needs the row count and the strata, so features are a dummy array
    sss = StratifiedShuffleSplit(n_splits=1, test_size=VAL_SIZE, random_state=SPLIT_SEED)
    split_iter = sss.split(np.zeros((len(Y), 1)), bins)
    tr_idx, va_idx = next(split_iter)

    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    pool_test = Pool(X_test, cat_features=cat_features)
    print(f"Val split: train={len(tr_idx)} val={len(va_idx)} (~{VAL_SIZE*100:.1f}%)")

    # ----------------------------
    # Patch loop
    # ----------------------------
    report = []
    applied = 0
    skipped = 0

    # NOTE(review): test predictions are blended into `patched` by row position,
    # so X_test and the base submission must list customers in the same order —
    # confirm against whatever produced the base file.
    # The row counts at least must agree; fail here, before any training is spent.
    if len(patched) != len(X_test):
        raise ValueError(
            f"Row count mismatch: base submit has {len(patched)} rows, X_test has {len(X_test)}"
        )

    total_steps = len(targets_to_try) * len(PATCH_SEEDS)
    pbar = tqdm(total=total_steps, desc="Patch 30 (GPU, ETA)", mininterval=1.0)

    for tname in targets_to_try:
        j = target_cols.index(tname)
        pred_col = tname.replace("target_", "predict_")

        y = Y[:, j].astype(np.uint8)
        y_tr = y[tr_idx]
        y_va = y[va_idx]

        row = auc_df.loc[auc_df["target"] == tname].iloc[0]
        pr = float(row["pos_rate"]) if "pos_rate" in auc_df.columns else np.nan
        if not np.isfinite(pr):
            # A missing/NaN pos_rate fails both threshold tests below and would be
            # treated as "common"; fall back to the empirical positive rate.
            pr = float(y.mean())

        # prevalence group
        if pr < 0.005:
            group = "rare"
        elif pr < 0.05:
            group = "mid"
        else:
            group = "common"

        # Both splits need both classes, otherwise neither training nor AUC is meaningful
        if np.unique(y_tr).size < 2 or np.unique(y_va).size < 2:
            skipped += 1
            pbar.update(len(PATCH_SEEDS))
            report.append({
                "target": tname, "pred_col": pred_col, "pos_rate": pr, "group": group,
                "mode": "SKIP_DEGENERATE",
                "base_auc_val": np.nan, "best_auc_val": np.nan, "gain": np.nan,
                "best_w": np.nan, "mins": 0.0, "status": "SKIP_DEGENERATE"
            })
            continue

        # Baseline on the validation rows is only available when OOF exists
        if HAS_OOF:
            base_val = OOF[va_idx, j].astype(np.float32)
            base_rank_val = rank_pct_1d(base_val)
            base_auc = safe_auc(y_va, base_rank_val)
        else:
            base_val = None
            base_rank_val = None
            base_auc = np.nan

        # class weights: capped negative/positive ratio for the imbalanced groups
        if group == "common":
            spw = None
        else:
            pos = float(y_tr.sum())
            neg = float(len(y_tr) - pos)
            spw_cap = 6.0 if group == "rare" else 12.0
            spw = min(neg / max(1.0, pos), spw_cap)

        if hasattr(tqdm, "write"):
            tqdm.write(f"START {tname} | group={group} | pos_rate={pr:.4%} | spw={spw}")

        t0 = time.time()
        pred_patch_val = np.zeros(len(va_idx), dtype=np.float32)
        pred_patch_test = np.zeros(len(X_test), dtype=np.float32)

        tr_pool = Pool(X_tr, label=y_tr, cat_features=cat_features)
        va_pool = Pool(X_va, label=y_va, cat_features=cat_features)

        # Average raw margins over the seeds
        for seed in PATCH_SEEDS:
            model = CatBoostClassifier(**gpu_params(seed, group, spw))
            model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

            pred_patch_val += (
                model.predict(va_pool, prediction_type="RawFormulaVal").reshape(-1).astype(np.float32)
                / len(PATCH_SEEDS)
            )
            pred_patch_test += (
                model.predict(pool_test, prediction_type="RawFormulaVal").reshape(-1).astype(np.float32)
                / len(PATCH_SEEDS)
            )

            del model
            gc.collect()
            pbar.update(1)

        del tr_pool, va_pool
        gc.collect()

        # Rank vectors do not depend on the blend weight, so compute them once
        patch_rank_val = rank_pct_1d(pred_patch_val)

        # weight selection
        if HAS_OOF and np.isfinite(base_auc):
            best_w, best_auc = 0.0, float(base_auc)
            for w in W_GRID:
                rblend = (1.0 - w) * base_rank_val + w * patch_rank_val
                a = safe_auc(y_va, rblend)
                if np.isfinite(a) and a > best_auc:
                    best_auc, best_w = float(a), float(w)

            gain = float(best_auc - base_auc)
            apply_patch = (best_w > 0) and (gain >= MIN_GAIN)
            mode = "OOF_TUNED"
        else:
            if group == "rare":
                best_w = float(FIXED_W_NO_OOF_RARE)
            elif group == "mid":
                best_w = float(FIXED_W_NO_OOF_MID)
            else:
                best_w = float(FIXED_W_NO_OOF_COM)
            best_auc = safe_auc(y_va, patch_rank_val)  # info only
            gain = np.nan
            apply_patch = best_w > 0
            mode = "NO_OOF_FIXED"

        elapsed_min = (time.time() - t0) / 60.0

        if apply_patch:
            # Blend against the current column, so earlier patches are kept
            base_scores_test = patched[pred_col].to_numpy(dtype=np.float32)
            patched[pred_col] = rank_blend_logits(base_scores_test, pred_patch_test, w=best_w).astype(np.float64)
            applied += 1
            status = "APPLY"
        else:
            skipped += 1
            status = "SKIP_LOW_GAIN"

        if hasattr(tqdm, "write"):
            tqdm.write(
                f"DONE  {tname} | status={status} | mode={mode} | w={best_w:.2f} | "
                f"base_auc={base_auc if np.isfinite(base_auc) else np.nan:.4f} | "
                f"best_auc={best_auc if np.isfinite(best_auc) else np.nan:.4f} | "
                f"mins={elapsed_min:.1f}"
            )

        report.append({
            "target": tname,
            "pred_col": pred_col,
            "pos_rate": pr,
            "group": group,
            "mode": mode,
            "base_auc_val": float(base_auc) if np.isfinite(base_auc) else np.nan,
            "best_auc_val": float(best_auc) if np.isfinite(best_auc) else np.nan,
            "gain": float(gain) if np.isfinite(gain) else np.nan,
            "best_w": float(best_w),
            "mins": float(elapsed_min),
            "status": status,
        })

    pbar.close()

    # ----------------------------
    # Save outputs
    # ----------------------------
    rep_df = pd.DataFrame(report)
    if not rep_df.empty:
        # Applied patches first, then low-gain skips, then degenerate targets;
        # within a status the largest gain comes first.
        status_order = {"APPLY": 0, "SKIP_LOW_GAIN": 1, "SKIP_DEGENERATE": 2}
        rep_df = (
            rep_df
            .assign(_order=rep_df["status"].map(status_order).fillna(99).astype(int))
            .sort_values(["_order", "gain"], ascending=[True, False])
            .drop(columns=["_order"])
        )

    rep_df.to_csv(REPORT_PATH, index=False)

    # Restore the submission column order and force float64 predictions (schema safety)
    patched = patched[submit_cols].copy()
    for col in pred_cols:
        patched[col] = patched[col].astype(np.float64)
    patched.to_parquet(OUT_PATH, index=False)

    print("\nApplied:", applied, "| Skipped:", skipped)
    print("Saved:", OUT_PATH, patched.shape)
    print("Saved report:", REPORT_PATH)

    if not rep_df.empty:
        print("\nTop applied patches:")
        wanted = ["target", "group", "mode", "best_w", "base_auc_val", "best_auc_val", "gain", "mins"]
        show_cols = [name for name in wanted if name in rep_df.columns]
        applied_rows = rep_df[rep_df["status"] == "APPLY"]
        print(applied_rows.head(10)[show_cols].to_string(index=False))

except Exception as e:
    # --------------------------------------------------------
    # SAFE fallback: don't stop notebook
    # --------------------------------------------------------
    err_text = f"{type(e).__name__}: {e}"
    tb = traceback.format_exc()

    try:
        with open("/kaggle/working/patch30_error.log", "w", encoding="utf-8") as f:
            f.write(tb)
    except Exception:
        pass

    print("\n[PATCH30 SAFE MODE] Блок упал, но ноутбук продолжит выполнение.")
    print("[PATCH30 SAFE MODE] Ошибка:", err_text)
    print("[PATCH30 SAFE MODE] Трейсбек сохранён в /kaggle/working/patch30_error.log")

    try:
        import numpy as np
        import pandas as pd
        pd.DataFrame([{
            "target": "__ALL__",
            "pred_col": "__ALL__",
            "pos_rate": np.nan,
            "group": "n/a",
            "mode": "ERROR_FALLBACK",
            "base_auc_val": np.nan,
            "best_auc_val": np.nan,
            "gain": np.nan,
            "best_w": np.nan,
            "mins": 0.0,
            "status": "ERROR_SKIP",
            "reason": err_text[:1000],
        }]).to_csv("/kaggle/working/patch30_report.csv", index=False)

        base_candidates = [
            "/kaggle/working/submit_patched25.parquet",
            "/kaggle/working/submit_patched20.parquet",
            "/kaggle/working/submit_patched15.parquet",
            "/kaggle/working/submit_patched.parquet",
            "/kaggle/working/submit.parquet",
            "/kaggle/working/submission.parquet",
        ]
        base_path = next((p for p in base_candidates if os.path.exists(p)), None)
        if base_path is not None:
            sub = pd.read_parquet(base_path)
            pred_cols_local = [c for c in sub.columns if c.startswith("predict_")]
            for c in pred_cols_local:
                sub[c] = sub[c].astype(np.float64)
            sub.to_parquet("/kaggle/working/submit_patched30.parquet", index=False)
            print(f"[PATCH30 SAFE MODE] Saved fallback submit -> /kaggle/working/submit_patched30.parquet (from {base_path})")
        else:
            print("[PATCH30 SAFE MODE] Базовый сабмит не найден, сохранён только report.")
    except Exception as e2:
        print("[PATCH30 SAFE MODE] Не удалось записать fallback outputs:", repr(e2))

Building X/X_test/y_mat/cat_features from parquet (no globals found)...


Patch30-prep: categorical stringify:   0%|          | 0/67 [00:00<?, ?it/s]

Patch30-prep: numeric downcast:   0%|          | 0/2373 [00:00<?, ?it/s]

Built matrices in 1.0 min | X=(750000, 2440) X_test=(250000, 2440) cats=67
OOF not found (or wrong shape) -> conservative mode (fixed weight, no gain gating).
Base submit: /kaggle/working/submit_patched25.parquet (250000, 42)
Targets to patch (TOP-30 worst):
['target_9_3', 'target_9_6', 'target_3_1', 'target_5_2', 'target_6_2', 'target_6_1', 'target_3_3', 'target_2_4', 'target_5_1', 'target_10_1', 'target_9_1', 'target_6_3', 'target_9_7', 'target_2_6', 'target_2_5', 'target_1_2', 'target_7_1', 'target_7_3', 'target_2_1', 'target_2_3', 'target_9_2', 'target_1_4', 'target_4_1', 'target_7_2', 'target_9_5', 'target_1_3', 'target_8_2', 'target_2_7', 'target_6_4', 'target_8_3']
Val split: train=712500 val=37500 (~5.0%)


Patch 30 (GPU, ETA):   0%|          | 0/90 [00:00<?, ?it/s]

START target_9_3 | group=mid | pos_rate=1.8679% | spw=12.0
DONE  target_9_3 | status=APPLY | mode=NO_OOF_FIXED | w=0.20 | base_auc=nan | best_auc=0.7194 | mins=2.3
START target_9_6 | group=common | pos_rate=22.3072% | spw=None
DONE  target_9_6 | status=APPLY | mode=NO_OOF_FIXED | w=0.15 | base_auc=nan | best_auc=0.7012 | mins=13.7
START target_3_1 | group=common | pos_rate=9.8373% | spw=None


In [9]:
!cd /kaggle/working
FileLink("submit_patched30.parquet")

In [None]:
import time

# Blocks forever (interrupt the kernel to stop). Sleeping between wake-ups,
# instead of spinning on `continue`, keeps the loop from using a full CPU core.
while True:
    time.sleep(60)