# Pip Install Libraries for Metric

In [1]:
!pip install /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

Processing /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
Installing collected packages: autograd
Successfully installed autograd-1.7.0
Processing /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l- \ done
[?25h  Created wheel for autograd-gamma: filename=autograd_gamma-0.5.0-py3-none-any.whl size=4030 sha256=5dfd8eeee54708c144fed68edd845485bd55e95601d19e3c724d89adf66c790b
  Stored in directory: /root/.cache/pip/wheels/6b/b5/e0/4c79e15c0b5f2c15ecf613c720bb20daab20a666eb67135155
Successfully built autograd-gamma
Installing collected packages: autograd-gamma
Successfully installed autograd-gamma-0.5.0
Processing /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
Installing collected packages: interface-meta
Successfully installed interface-meta-1.3.0


In [2]:
!pip install /kaggle/input/pip-install-ray-tun-hyperopt/output/ray-2.42.1-cp310-cp310-manylinux2014_x86_64.whl
!pip install /kaggle/input/pip-install-ray-tun-hyperopt/output/hyperopt-0.2.7-py2.py3-none-any.whl

Processing /kaggle/input/pip-install-ray-tun-hyperopt/output/ray-2.42.1-cp310-cp310-manylinux2014_x86_64.whl
Installing collected packages: ray
  Attempting uninstall: ray
    Found existing installation: ray 2.24.0
    Uninstalling ray-2.24.0:
      Successfully uninstalled ray-2.24.0
Successfully installed ray-2.42.1
Processing /kaggle/input/pip-install-ray-tun-hyperopt/output/hyperopt-0.2.7-py2.py3-none-any.whl
hyperopt is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


# Imports

In [3]:
import os
import matplotlib.pyplot as plt


import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from lifelines.utils import concordance_index
from lifelines import KaplanMeierFitter

# Model libraries
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
from metric import score

In [4]:
test = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/test.csv")
print("Test shape:", test.shape )

train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")
print("Train shape:",train.shape)

Test shape: (3, 58)
Train shape: (28800, 60)


## efs und efs_time mit KaplanMeier in einem Ziel umwandeln
In diesem Wettbewerb müssen wir den Risk Score vorhersagen. Daher werden wir ein Ziel erstellen, das den Risk Score nachahmt, um unser Modell zu trainieren.

In [5]:
from lifelines import KaplanMeierFitter
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    kmf = KaplanMeierFitter()
    kmf.fit(df[time_col], df[event_col])
    y = kmf.survival_function_at_times(df[time_col]).values
    return y
train["y"] = transform_survival_probability(train, time_col='efs_time', event_col='efs')

## Features
Es gibt insgesamt 57 Features. Davon sind 35 kategorisch und 22 numerisch. Wir werden die kategorialen Features mit Label Encoding kodieren. Das Modell wird diese dann als kategoriale Features akzeptieren und intern speziell verarbeiten. Die fehlenden Werte (NANs) bei den numerischen Features lassen wir unverändert, da das Modell, das wir trainieren, mit NANs umgehen kann und diese Information nutzen wird.

1. Auswahl der Features

In [6]:
RMV = ["ID","efs","efs_time","y"]
FEATURES = [c for c in train.columns if not c in RMV]
print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")

There are 57 FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10']


2. Identifikation und Bearbeitung von kategorialen Feature

In [7]:
CATS = []
for c in FEATURES:
    if train[c].dtype=="object":
        CATS.append(c)
        train[c] = train[c].fillna("NAN")
        test[c] = test[c].fillna("NAN")
print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")

In these features, there are 35 CATEGORICAL FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate']


3.Dieser Code kombiniert die train- und test-Datensätze, kodiert kategoriale Features mit Label Encoding und konvertiert sie in den category-Typ, reduziert die Präzision numerischer Features auf float32 und int32 zur Speicheroptimierung und teilt die Datensätze anschließend wieder in train und test auf.

In [8]:
combined = pd.concat([train,test],axis=0,ignore_index=True)
#print("Combined data shape:", combined.shape )

# LABEL ENCODE CATEGORICAL FEATURES
print("We LABEL ENCODE the CATEGORICAL FEATURES: ",end="")
for c in FEATURES:

    # LABEL ENCODE CATEGORICAL AND CONVERT TO INT32 CATEGORY
    if c in CATS:
        print(f"{c}, ",end="")
        combined[c],_ = combined[c].factorize()
        combined[c] -= combined[c].min()
        combined[c] = combined[c].astype("int32")
        combined[c] = combined[c].astype("category")
        
    # REDUCE PRECISION OF NUMERICAL TO 32BIT TO SAVE MEMORY
    else:
        if combined[c].dtype=="float64":
            combined[c] = combined[c].astype("float32")
        if combined[c].dtype=="int64":
            combined[c] = combined[c].astype("int32")
    
train = combined.iloc[:len(train)].copy()
test = combined.iloc[len(train):].reset_index(drop=True).copy()

We LABEL ENCODE the CATEGORICAL FEATURES: dri_score, psych_disturb, cyto_score, diabetes, tbi_status, arrhythmia, graft_type, vent_hist, renal_issue, pulm_severe, prim_disease_hct, cmv_status, tce_imm_match, rituximab, prod_type, cyto_score_detail, conditioning_intensity, ethnicity, obesity, mrd_hct, in_vivo_tcd, tce_match, hepatic_severe, prior_tumor, peptic_ulcer, gvhd_proph, rheum_issue, sex_match, race_group, hepatic_mild, tce_div_match, donor_related, melphalan_dose, cardiac, pulm_moderate, 

## Survival:Cox braucht dieses Ziel (um efs und efs_time zu verarbeiten)

In [9]:
train["efs_time2"] = train.efs_time.copy()
train.loc[train.efs==0,"efs_time2"] *= -1

## Hier wird ein Ray-Cluster initialisiert, der ermittelt die verfügbaren Systemressourcen und legt die Ressourcen pro Durchlauf basierend auf den verfügbaren Ressourcen oder den Systemstandards fest.

In [10]:
ray.init(ignore_reinit_error=True)

2025-05-24 16:07:08,242	INFO worker.py:1841 -- Started a local Ray instance.


0,1
Python version:,3.10.14
Ray version:,2.42.1


In [11]:
available_resources = ray.cluster_resources()


In [12]:
resources_per_trial = {
    "cpu": int(available_resources.get("CPU", os.cpu_count())),
    "gpu": int(available_resources.get("GPU", 0))
}

# CatBoost Survival:Cox Hyperparameter Tunning

In [13]:
import numpy as np
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor
from ray import tune
from metric import score  # Import your custom metric function

def train_catboost(config):
    FOLDS = 10
    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    
    # Initialize out-of-fold predictions and test predictions if a test set exists.
    oof_preds = np.zeros(len(train))
    test_preds = np.zeros(len(test)) if "test" in globals() else None

    for fold, (train_idx, valid_idx) in enumerate(kf.split(train)):
        print("#" * 25)
        print(f"### Fold {fold+1}")
        print("#" * 25)
        
        # Extract training and validation sets.
        x_train = train.loc[train_idx, FEATURES].copy()
        y_train = train.loc[train_idx, "efs_time2"]
        x_valid = train.loc[valid_idx, FEATURES].copy()
        y_valid = train.loc[valid_idx, "efs_time2"]
        
       
        model = CatBoostRegressor(
            depth=config["max_depth"],
            learning_rate=config["learning_rate"],
            min_data_in_leaf=config["min_child_weight"],
            #subsample=config["subsample"],
            rsm=config["colsample_bytree"],
            l2_leaf_reg=config["reg_lambda"],
            iterations=config["n_estimators"],
            random_strength=config["random_strength"],
            grow_policy="Lossguide",
            loss_function="Cox",
            bootstrap_type="Bernoulli",
            verbose=False
        )
        
        # Fit the model with early stopping.
        model.fit(
            x_train, y_train,
            cat_features=CATS,
            eval_set=(x_valid, y_valid),
            #early_stopping_rounds=25,
            #verbose=-1
        )
        
        # Save predictions for the validation fold.
        oof_preds[valid_idx] = model.predict(x_valid)
        
        # If test data exists, aggregate predictions over folds.
        if test_preds is not None:
            x_test = test[FEATURES].copy()
            test_preds += model.predict(x_test)
    
    # Average test predictions over folds (if applicable).
    if test_preds is not None:
        test_preds /= FOLDS

    # Prepare data for the overall CV metric computation.
    y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
    y_pred = train[["ID"]].copy()
    y_pred["prediction"] = oof_preds
    
    # Evaluate the overall performance using the custom metric.
    cv_metric = score(y_true.copy(), y_pred.copy(), "ID")
    
    # Report the metric to Ray Tune.
    tune.report({"cv_metric": cv_metric})

# Define the hyperparameter search space.
config = {
    "max_depth": tune.randint(2, 11),
    "learning_rate": tune.loguniform(0.001, 0.2),
    "min_child_weight": tune.randint(1, 51),
    #"subsample": tune.uniform(0.4, 1.0),
    "colsample_bytree": tune.uniform(0.2, 1.0),
    "reg_lambda": tune.loguniform(1e-4, 100),
    "n_estimators": tune.randint(50, 3001),
    "random_strength": tune.randint(1, 11)
}

# Create a HyperOptSearch instance specifying that we wish to maximize the cv_metric.
hyperopt_search = HyperOptSearch(metric="cv_metric", mode="max")

# Run tuning with Ray Tune using HyperOpt as the search algorithm.
analysis = tune.run(
    train_catboost,
    search_alg=hyperopt_search,
    config=config,
    num_samples=10,
    resources_per_trial=resources_per_trial,
    trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
    verbose=1
)

# Retrieve the best configuration based on the overall CV metric (cv_metric).
best_config = analysis.get_best_config(metric="cv_metric", mode="max")
print("Best Hyperparameters:", best_config)

0,1
Current time:,2025-05-24 18:41:09
Running for:,02:33:46.18
Memory:,1.9/31.4 GiB

Trial name,status,loc,colsample_bytree,learning_rate,max_depth,min_child_weight,n_estimators,random_strength,reg_lambda,iter,total time (s),cv_metric
train_catboost_9f3f9ffc,TERMINATED,172.19.2.2:427,0.927694,0.0106775,5,12,2637,7,16.6851,1,1483.75,0.667492
train_catboost_3cfecf64,TERMINATED,172.19.2.2:753,0.350482,0.00311516,10,17,2264,3,2.95259,1,1086.75,0.652761
train_catboost_bcf25914,TERMINATED,172.19.2.2:1034,0.302115,0.0289093,8,4,1088,9,2.35609,1,496.091,0.669814
train_catboost_10345ee4,TERMINATED,172.19.2.2:1255,0.933664,0.0018288,8,9,1294,10,7.49344,1,920.515,0.638538
train_catboost_d308b064,TERMINATED,172.19.2.2:1520,0.929356,0.00883841,7,31,2872,2,1.03361,1,2031.9,0.670152
train_catboost_ec279a13,TERMINATED,172.19.2.2:1894,0.464593,0.111666,9,39,422,5,13.2285,1,230.129,0.670519
train_catboost_f9771ba1,TERMINATED,172.19.2.2:2090,0.484065,0.00366877,3,38,1783,4,0.0135244,1,566.922,0.647438
train_catboost_f648759f,TERMINATED,172.19.2.2:2319,0.543219,0.0185863,9,37,762,6,14.6124,1,449.811,0.66016
train_catboost_901e1172,TERMINATED,172.19.2.2:2536,0.694967,0.00141203,5,37,2550,2,0.0461372,1,1319.01,0.650194
train_catboost_b2090773,TERMINATED,172.19.2.2:2840,0.362812,0.00830557,5,30,1516,7,0.00338279,1,593.925,0.654207


2025-05-24 18:41:09,006	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/train_catboost_2025-05-24_16-07-10' in 0.0114s.
2025-05-24 18:41:09,018	INFO tune.py:1041 -- Total run time: 9238.85 seconds (9226.17 seconds for the tuning loop).


Best Hyperparameters: {'max_depth': 9, 'learning_rate': 0.11166617741801442, 'min_child_weight': 39, 'colsample_bytree': 0.4645925617070444, 'reg_lambda': 13.22846453228072, 'n_estimators': 422, 'random_strength': 5}
