# Data Fusion 2026 — quality-first notebook (v2)

Цель: поднять `macro ROC-AUC` выше базового `0.845` за счёт более устойчивого ансамбля:
- `CatBoost MultiClassOneVsAll` (ловит межтаргетные зависимости);
- `CatBoost OvR` по каждому таргету (лучше для редких классов);
- тюнинг blend-весов по holdout в пространстве рангов;
- дополнительный patch для худших таргетов с отдельными гиперпараметрами.


In [None]:
from __future__ import annotations

import os
import gc
import json
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit


# ----------------------------------------------------------------------
# CONFIG: paths, holdout split, per-model budgets, feature thresholds
# ----------------------------------------------------------------------

# Directories probed in order by find_data_dir(); the first match wins.
DATA_DIR_CANDIDATES = [
    Path(location)
    for location in (
        '/kaggle/input/data-fusion-contest-2026',
        '/workspace/test_fusion',
    )
]

# Every report, weight table and the submission file is written here.
OUT_DIR = Path('./artifacts_v2')
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Holdout fraction and seed; the holdout serves model selection / blend tuning only.
HOLDOUT_SIZE, SPLIT_SEED = 0.05, 42

# Joint multi-target model: seeds to average, iteration cap, early-stopping patience.
MULTI_SEEDS = [42, 2026]
MULTI_MAX_ITERS, MULTI_OD_WAIT = 10_000, 400

# Per-target (one-vs-rest) models: same three knobs.
OVR_SEEDS = [42]
OVR_MAX_ITERS, OVR_OD_WAIT = 7_000, 350

# Patching: how many of the weakest targets to revisit, with which seeds,
# and the minimum holdout AUC gain required to keep a patch.
PATCH_TOP_K = 14
PATCH_SEEDS = [3407, 7777]
PATCH_GAIN_MIN = 0.0005

# Feature cleaning: drop a column when its missing share exceeds the first
# threshold, or when its most frequent value reaches the second.
MISSING_RATE_DROP = 0.998
NEAR_CONST_DROP = 0.9997


def find_data_dir() -> Path:
    """Return the first entry of DATA_DIR_CANDIDATES that holds the training files.

    A candidate qualifies when both ``train_main_features.parquet`` and
    ``train_target.parquet`` exist inside it.

    Raises:
        FileNotFoundError: if no candidate contains both files.
    """
    required = ('train_main_features.parquet', 'train_target.parquet')
    match = next(
        (
            candidate
            for candidate in DATA_DIR_CANDIDATES
            if all((candidate / file_name).exists() for file_name in required)
        ),
        None,
    )
    if match is None:
        raise FileNotFoundError('Cannot locate dataset directory')
    return match


def safe_auc(y_true: np.ndarray, y_score: np.ndarray) -> float:
    """ROC-AUC that falls back to 0.5 when it is undefined.

    Args:
        y_true: label vector.
        y_score: score vector aligned with ``y_true``.

    Returns:
        0.5 when ``y_true`` has fewer than two distinct values, otherwise the
        value of ``roc_auc_score`` as a Python float.
    """
    has_two_classes = np.unique(y_true).size >= 2
    return float(roc_auc_score(y_true, y_score)) if has_two_classes else 0.5


def rank_pct_1d(x: np.ndarray) -> np.ndarray:
    """Convert a 1-D score vector to percentile ranks strictly inside (0, 1).

    Each value receives its 1-based ascending rank divided by ``n + 1``.
    Equal values share the average of the ranks they span, so the output
    depends only on the scores themselves and not on the order of the rows.
    (Breaking ties by position would hand tied predictions different ranks
    and make any AUC computed on the ranks depend on row order.)

    Args:
        x: 1-D array-like of scores.

    Returns:
        float32 array with one percentile rank per input element; an empty
        input yields an empty array.
    """
    x = np.asarray(x)
    n = x.shape[0]
    if n == 0:
        return np.empty(0, dtype=np.float32)

    order = np.argsort(x, kind='mergesort')
    sorted_x = x[order]

    # True at the first position of every run of equal values in sorted order.
    run_start = np.empty(n, dtype=bool)
    run_start[0] = True
    run_start[1:] = sorted_x[1:] != sorted_x[:-1]

    first = np.flatnonzero(run_start)        # 0-based first position of each run
    last = np.append(first[1:], n) - 1       # 0-based last position of each run
    mean_rank = (first + last) / 2.0 + 1.0   # average 1-based rank inside each run

    ranks = np.empty(n, dtype=np.float64)
    ranks[order] = mean_rank[np.cumsum(run_start) - 1]
    return (ranks / (n + 1)).astype(np.float32)


def logit_from_rank(r: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    """Apply the logit transform ``log(r / (1 - r))`` to values in [0, 1].

    Args:
        r: array of values to transform.
        eps: inputs are clipped to ``[eps, 1 - eps]`` first so that 0 and 1
            map to finite numbers.

    Returns:
        float32 array of logits with the same shape as ``r``.
    """
    bounded = np.clip(r, eps, 1 - eps)
    odds = bounded / (1 - bounded)
    return np.log(odds).astype(np.float32)


def macro_auc(y_true: np.ndarray, y_score: np.ndarray) -> float:
    """Unweighted mean of the per-column ``safe_auc`` values.

    Args:
        y_true: 2-D label matrix, one column per target.
        y_score: 2-D score matrix with the same column layout as ``y_true``.

    Returns:
        The mean over columns as a Python float.
    """
    n_targets = y_true.shape[1]
    column_aucs = np.fromiter(
        (safe_auc(y_true[:, col], y_score[:, col]) for col in range(n_targets)),
        dtype=np.float64,
        count=n_targets,
    )
    return float(np.mean(column_aucs))


In [None]:
# ======================
# DATA + FEATURES
# ======================
# Load the parquet tables, join main and extra features, prune uninformative
# columns, cast categoricals for CatBoost and carve out the holdout split.
DATA_DIR = find_data_dir()
print('DATA_DIR =', DATA_DIR)


def _load_table(file_name):
    """Read one parquet file from DATA_DIR."""
    return pd.read_parquet(DATA_DIR / file_name)


def _dominant_share(column):
    """Share of rows taken by the most frequent value (NaN counts as a value)."""
    shares = column.value_counts(dropna=False, normalize=True)
    if len(shares) == 0:
        return 1.0
    # value_counts sorts by frequency, so the first entry is the dominant one
    return float(shares.iloc[0])


train_main = _load_table('train_main_features.parquet')
train_extra = _load_table('train_extra_features.parquet')
test_main = _load_table('test_main_features.parquet')
test_extra = _load_table('test_extra_features.parquet')
train_target = _load_table('train_target.parquet')
sample_submit = _load_table('sample_submit.parquet')

# Left-join the extra features; name clashes on the extra side get an '_extra' suffix.
train_df = pd.merge(train_main, train_extra, how='left', on='customer_id', suffixes=('', '_extra'))
test_df = pd.merge(test_main, test_extra, how='left', on='customer_id', suffixes=('', '_extra'))

id_col = 'customer_id'
target_cols = [name for name in train_target.columns if name != id_col]

# Only customers present in the target table are kept for training.
full_train = pd.merge(train_df, train_target, how='inner', on=id_col)

feature_cols = [name for name in train_df.columns if name != id_col]
cat_cols = [name for name in feature_cols if name.startswith('cat_feature_')]
num_cols = [name for name in feature_cols if name.startswith('num_feature_')]

# Step 1: drop columns whose missing share exceeds MISSING_RATE_DROP.
missing_rate = full_train[feature_cols].isna().mean()
keep_cols = [name for name, rate in missing_rate.items() if rate <= MISSING_RATE_DROP]

# Step 2: drop columns whose most frequent value covers at least NEAR_CONST_DROP of rows.
final_keep = [name for name in keep_cols if _dominant_share(full_train[name]) < NEAR_CONST_DROP]

feature_cols = final_keep
_position = {name: pos for pos, name in enumerate(feature_cols)}
cat_cols = [name for name in cat_cols if name in _position]
cat_idx = [_position[name] for name in cat_cols]

X = full_train[feature_cols].copy()
X_test = test_df[feature_cols].copy()
y = full_train[target_cols].values.astype(np.int32)

# CatBoost requires categorical features to be string/int (not float),
# so every categorical column becomes text with an explicit NaN token.
for frame in (X, X_test):
    for name in cat_cols:
        frame[name] = frame[name].astype('string').fillna('__NaN__').astype(str)

print('train shape:', X.shape, 'test shape:', X_test.shape)
print('targets:', len(target_cols), 'cat feats:', len(cat_cols))

# Stratification proxy: number of active labels per row, clipped to [0, 5].
strat = y.sum(axis=1).clip(0, 5)
sss = StratifiedShuffleSplit(n_splits=1, test_size=HOLDOUT_SIZE, random_state=SPLIT_SEED)
tr_idx, va_idx = next(iter(sss.split(np.zeros(len(strat)), strat)))

X_tr, y_tr = X.iloc[tr_idx], y[tr_idx]
X_va, y_va = X.iloc[va_idx], y[va_idx]

print('split:', X_tr.shape, X_va.shape)


In [None]:
# ======================
# MODEL 1: MULTI
# ======================
# One joint CatBoost model over all targets, averaged over MULTI_SEEDS.
#
# `y` carries one binary column per target (multi-label), so the objective has
# to be a multi-label one. MultiClass / MultiClassOneVsAll expect a single
# column of class ids and do not accept a multi-column label; MultiLogloss
# does, and its predict_proba yields one probability column per target.
# NOTE(review): confirm that the installed CatBoost version supports
# MultiLogloss with task_type='GPU'; if it does not, switch this cell to CPU.
n_targets = y.shape[1]
va_multi_probs = np.zeros((X_va.shape[0], n_targets), dtype=np.float32)
test_multi_probs = np.zeros((X_test.shape[0], n_targets), dtype=np.float32)
best_iters_multi = []

for seed in MULTI_SEEDS:
    # Stage 1: fit on the training part, early-stopped on the holdout.
    m = CatBoostClassifier(
        loss_function='MultiLogloss',
        eval_metric='MultiLogloss',
        task_type='GPU',
        devices='0',
        iterations=MULTI_MAX_ITERS,
        learning_rate=0.03,
        depth=8,
        l2_leaf_reg=6.0,
        random_strength=0.5,
        bootstrap_type='Bernoulli',
        subsample=0.9,
        od_type='Iter',
        od_wait=MULTI_OD_WAIT,
        random_seed=seed,
        verbose=200,
        allow_writing_files=False,
    )
    m.fit(
        Pool(X_tr, y_tr, cat_features=cat_idx),
        eval_set=Pool(X_va, y_va, cat_features=cat_idx),
        use_best_model=True,
    )

    raw_best = m.get_best_iteration()
    best_it = int(raw_best) if raw_best is not None else MULTI_MAX_ITERS
    best_iters_multi.append(best_it)
    # get_best_iteration() is a 0-based index: the selected model holds
    # best_it + 1 trees, which is the count the refit has to reproduce.
    n_trees = best_it + 1 if raw_best is not None else MULTI_MAX_ITERS

    va_multi_probs += m.predict_proba(Pool(X_va, cat_features=cat_idx)) / len(MULTI_SEEDS)

    # Stage 2: refit on all labelled rows with the tree count found above
    # (at least 200) and score the test set.
    m_full = CatBoostClassifier(
        **{k: v for k, v in m.get_params().items() if k not in ['iterations', 'od_wait']},
        iterations=max(n_trees, 200),
        od_wait=None,
    )
    m_full.fit(Pool(X, y, cat_features=cat_idx), verbose=0)
    test_multi_probs += m_full.predict_proba(Pool(X_test, cat_features=cat_idx)) / len(MULTI_SEEDS)

print('mean best_iter multi =', np.mean(best_iters_multi))
print('holdout macro AUC multi =', macro_auc(y_va, va_multi_probs))


In [None]:
# ======================
# MODEL 2: OVR
# ======================
# One binary CatBoost model per target. Each target is fitted on the training
# part with early stopping on the holdout, then refitted on all labelled rows
# with the tree count found in the first stage and used to score the test set.
va_ovr_probs = np.zeros((X_va.shape[0], y.shape[1]), dtype=np.float32)
test_ovr_probs = np.zeros((X_test.shape[0], y.shape[1]), dtype=np.float32)
per_target_auc = []

for t_idx, t in enumerate(target_cols):
    y_tr_t = y_tr[:, t_idx]
    y_va_t = y_va[:, t_idx]
    pos_rate = float(y_tr_t.mean())

    # Rarer targets get deeper trees, a lower learning rate and stronger L2.
    if pos_rate < 0.003:
        depth, lr, l2 = 10, 0.025, 10.0
    elif pos_rate < 0.02:
        depth, lr, l2 = 9, 0.03, 8.0
    else:
        depth, lr, l2 = 8, 0.035, 6.0

    # Positive-class weight = negative/positive ratio, limited to [1, 150].
    spw = float((1 - pos_rate) / max(pos_rate, 1e-6))
    spw = float(np.clip(spw, 1.0, 150.0))

    va_pred_seed = np.zeros(X_va.shape[0], dtype=np.float32)
    test_pred_seed = np.zeros(X_test.shape[0], dtype=np.float32)
    best_its = []

    for seed in OVR_SEEDS:
        clf = CatBoostClassifier(
            loss_function='Logloss',
            eval_metric='Logloss',
            task_type='GPU',
            devices='0',
            iterations=OVR_MAX_ITERS,
            learning_rate=lr,
            depth=depth,
            l2_leaf_reg=l2,
            bootstrap_type='Bernoulli',
            subsample=0.9,
            od_type='Iter',
            od_wait=OVR_OD_WAIT,
            random_seed=seed,
            scale_pos_weight=spw,
            verbose=0,
            allow_writing_files=False,
        )
        clf.fit(
            Pool(X_tr, y_tr_t, cat_features=cat_idx),
            eval_set=Pool(X_va, y_va_t, cat_features=cat_idx),
            use_best_model=True,
        )

        bi = clf.get_best_iteration()
        # get_best_iteration() is a 0-based index, so the selected model holds
        # bi + 1 trees; the refit uses that count with a floor of 150.
        n_trees = int(bi) + 1 if bi is not None else OVR_MAX_ITERS
        n_trees = max(n_trees, 150)
        best_its.append(n_trees)

        va_pred_seed += clf.predict_proba(Pool(X_va, cat_features=cat_idx))[:, 1] / len(OVR_SEEDS)

        clf_full = CatBoostClassifier(
            **{k: v for k, v in clf.get_params().items() if k not in ['iterations', 'od_wait']},
            iterations=n_trees,
            od_wait=None,
        )
        clf_full.fit(Pool(X, y[:, t_idx], cat_features=cat_idx), verbose=0)
        test_pred_seed += clf_full.predict_proba(Pool(X_test, cat_features=cat_idx))[:, 1] / len(OVR_SEEDS)

    va_ovr_probs[:, t_idx] = va_pred_seed
    test_ovr_probs[:, t_idx] = test_pred_seed
    # 'best_iter' in the report is the mean tree count used for the refits.
    per_target_auc.append((t, safe_auc(y_va_t, va_pred_seed), pos_rate, int(np.mean(best_its))))

auc_df = pd.DataFrame(per_target_auc, columns=['target', 'auc_ovr_holdout', 'pos_rate', 'best_iter'])
auc_df = auc_df.sort_values('auc_ovr_holdout')
auc_df.to_csv(OUT_DIR / 'per_target_auc_ovr.csv', index=False)

print('holdout macro AUC ovr =', macro_auc(y_va, va_ovr_probs))


In [None]:
# ======================
# BLENDING + PATCHING
# ======================
# Target-wise blend of the multi and OvR models in rank space. For every
# target the OvR weight is picked on the holdout from a 21-point grid over
# [0, 1]; the first grid point reaching the highest AUC wins.
va_final = np.zeros_like(va_ovr_probs)
test_final = np.zeros_like(test_ovr_probs)
blend_rows = []
weight_grid = np.linspace(0.0, 1.0, 21)


def _mix(base_ranks, other_ranks, weight):
    """Convex combination of two rank vectors; `weight` belongs to `other_ranks`."""
    return (1 - weight) * base_ranks + weight * other_ranks


for j, t in enumerate(target_cols):
    holdout_labels = y_va[:, j]
    r_multi = rank_pct_1d(va_multi_probs[:, j])
    r_ovr = rank_pct_1d(va_ovr_probs[:, j])

    # Score every grid weight; max() keeps the earliest entry among equal AUCs.
    scored = [(safe_auc(holdout_labels, _mix(r_multi, r_ovr, w)), float(w)) for w in weight_grid]
    best_auc, best_w = max(scored, key=lambda pair: pair[0])

    blend_rows.append((t, best_w, best_auc))
    va_final[:, j] = _mix(r_multi, r_ovr, best_w)

    # Same weight on the test ranks; the logit -> sigmoid round trip keeps the
    # blended rank value, clipped away from 0 and 1.
    rt = _mix(rank_pct_1d(test_multi_probs[:, j]), rank_pct_1d(test_ovr_probs[:, j]), best_w)
    test_final[:, j] = 1 / (1 + np.exp(-logit_from_rank(rt)))

blend_df = pd.DataFrame(blend_rows, columns=['target', 'w_ovr', 'auc_holdout'])
blend_df.to_csv(OUT_DIR / 'blend_weights.csv', index=False)

print('holdout macro AUC blended =', macro_auc(y_va, va_final))

# optional patch for worst targets
# For the PATCH_TOP_K targets with the lowest blended holdout AUC, train a
# stronger binary model and rank-blend it into the current prediction when the
# holdout AUC improves by at least PATCH_GAIN_MIN.
#
# Full-data refits happen only for accepted patches, so rejected candidates
# cost no extra training. patch_report.csv records the weight and AUC that are
# actually in effect: a rejected patch is logged with weight 0 and the base AUC.
PATCH_MAX_ITERS = 9000
PATCH_MIN_TREES = 200
PATCH_WEIGHTS = [0.15, 0.30, 0.45, 0.60]

worst_targets = blend_df.sort_values('auc_holdout').head(PATCH_TOP_K)['target'].tolist()
print('patch targets:', worst_targets)

patch_log = []
for t in worst_targets:
    j = target_cols.index(t)
    y_tr_t, y_va_t = y_tr[:, j], y_va[:, j]
    base_auc = safe_auc(y_va_t, va_final[:, j])

    # slightly more aggressive model for hard targets
    pos_rate = float(y_tr_t.mean())
    spw = float(np.clip((1 - pos_rate) / max(pos_rate, 1e-6), 1.0, 200.0))

    # Settings shared by the holdout fit and the full-data refit.
    patch_params = dict(
        loss_function='Logloss', eval_metric='Logloss',
        task_type='GPU', devices='0',
        learning_rate=0.022,
        depth=10, l2_leaf_reg=12.0,
        random_strength=1.2,
        bootstrap_type='Bernoulli', subsample=0.85,
        od_type='Iter',
        scale_pos_weight=spw,
        verbose=0,
        allow_writing_files=False,
    )

    # Stage 1: holdout predictions and the tree count selected for each seed.
    va_patch = np.zeros(X_va.shape[0], dtype=np.float32)
    trees_per_seed = []
    for seed in PATCH_SEEDS:
        patch = CatBoostClassifier(
            **patch_params,
            iterations=PATCH_MAX_ITERS, od_wait=450,
            random_seed=seed,
        )
        patch.fit(
            Pool(X_tr, y_tr_t, cat_features=cat_idx),
            eval_set=Pool(X_va, y_va_t, cat_features=cat_idx),
            use_best_model=True,
        )
        bi = patch.get_best_iteration()
        # get_best_iteration() is a 0-based index -> the model holds bi + 1 trees.
        n_trees = int(bi) + 1 if bi is not None else PATCH_MAX_ITERS
        trees_per_seed.append(max(n_trees, PATCH_MIN_TREES))

        va_patch += patch.predict_proba(Pool(X_va, cat_features=cat_idx))[:, 1] / len(PATCH_SEEDS)

    # Stage 2: rank blend base + patch; weight 0 (keep the base) is the default.
    rb = rank_pct_1d(va_final[:, j])
    rp = rank_pct_1d(va_patch)
    best_auc, best_w = base_auc, 0.0
    for w in PATCH_WEIGHTS:
        auc = safe_auc(y_va_t, (1 - w) * rb + w * rp)
        if auc > best_auc:
            best_auc, best_w = auc, w

    accepted = best_w > 0.0 and best_auc >= base_auc + PATCH_GAIN_MIN
    if accepted:
        va_final[:, j] = (1 - best_w) * rb + best_w * rp

        # Stage 3: refit on all labelled rows and patch the test prediction.
        te_patch = np.zeros(X_test.shape[0], dtype=np.float32)
        for seed, n_trees in zip(PATCH_SEEDS, trees_per_seed):
            patch_full = CatBoostClassifier(**patch_params, iterations=n_trees, random_seed=seed)
            patch_full.fit(Pool(X, y[:, j], cat_features=cat_idx))
            te_patch += patch_full.predict_proba(Pool(X_test, cat_features=cat_idx))[:, 1] / len(PATCH_SEEDS)

        rtb = rank_pct_1d(test_final[:, j])
        rtp = rank_pct_1d(te_patch)
        rt = (1 - best_w) * rtb + best_w * rtp
        test_final[:, j] = 1 / (1 + np.exp(-logit_from_rank(rt)))
    else:
        # Nothing was changed for this target: report the base state.
        best_auc, best_w = base_auc, 0.0

    patch_log.append((t, base_auc, best_auc, best_w))

patch_df = pd.DataFrame(patch_log, columns=['target', 'base_auc', 'patched_auc', 'patch_w'])
patch_df.to_csv(OUT_DIR / 'patch_report.csv', index=False)

print('holdout macro AUC final =', macro_auc(y_va, va_final))


In [None]:
# ======================
# SAVE SUBMISSION
# ======================
# Rows of test_final follow test_df, while the submission must follow
# sample_submit. Align the two by customer id instead of assuming that both
# tables list customers in the same order.
test_ids = test_df[id_col].to_numpy()
submit_ids = sample_submit[id_col].to_numpy()

if np.array_equal(test_ids, submit_ids):
    # Same order already: positional assignment is safe.
    row_order = np.arange(len(submit_ids))
else:
    id_lookup = pd.Index(test_ids)
    if not id_lookup.is_unique:
        raise ValueError(f'Duplicate {id_col} values in test features; cannot align predictions')
    row_order = id_lookup.get_indexer(submit_ids)
    if (row_order < 0).any():
        n_missing = int((row_order < 0).sum())
        raise ValueError(f'{n_missing} {id_col} values from sample_submit have no test prediction')

submission = sample_submit[[id_col]].copy()
for j, t in enumerate(target_cols):
    # Scores are written as logits of the final values; the transform is
    # monotone, so the ordering of the predictions is unchanged.
    p = np.clip(test_final[row_order, j], 1e-6, 1 - 1e-6)
    submission[t] = np.log(p / (1 - p)).astype(np.float32)

sub_path = OUT_DIR / 'submission_v2.parquet'
submission.to_parquet(sub_path, index=False)

# Small run summary stored next to the submission.
meta = {
    'data_dir': str(DATA_DIR),
    'n_features': int(len(feature_cols)),
    'n_cat_features': int(len(cat_cols)),
    'holdout_macro_auc_final': float(macro_auc(y_va, va_final)),
}
with open(OUT_DIR / 'run_meta.json', 'w', encoding='utf-8') as f:
    json.dump(meta, f, indent=2)

print('Saved:', sub_path)
print(json.dumps(meta, indent=2))
