# Pip Install Libraries for Metric

In [1]:
!pip install /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

Processing /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
Installing collected packages: autograd
Successfully installed autograd-1.7.0
Processing /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l- \ done
[?25h  Created wheel for autograd-gamma: filename=autograd_gamma-0.5.0-py3-none-any.whl size=4030 sha256=f54a6787d5454cfd2eba23c17cc5710d50561cd3a01e05b407befbc83eab4c23
  Stored in directory: /root/.cache/pip/wheels/6b/b5/e0/4c79e15c0b5f2c15ecf613c720bb20daab20a666eb67135155
Successfully built autograd-gamma
Installing collected packages: autograd-gamma
Successfully installed autograd-gamma-0.5.0
Processing /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
Installing collected packages: interface-meta
Successfully installed interface-meta-1.3.0


In [2]:
!pip install /kaggle/input/pip-install-ray-tun-hyperopt/output/ray-2.42.1-cp310-cp310-manylinux2014_x86_64.whl
!pip install /kaggle/input/pip-install-ray-tun-hyperopt/output/hyperopt-0.2.7-py2.py3-none-any.whl

Processing /kaggle/input/pip-install-ray-tun-hyperopt/output/ray-2.42.1-cp310-cp310-manylinux2014_x86_64.whl
Installing collected packages: ray
  Attempting uninstall: ray
    Found existing installation: ray 2.24.0
    Uninstalling ray-2.24.0:
      Successfully uninstalled ray-2.24.0
Successfully installed ray-2.42.1
Processing /kaggle/input/pip-install-ray-tun-hyperopt/output/hyperopt-0.2.7-py2.py3-none-any.whl
hyperopt is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


# Imports

In [3]:
import os
import matplotlib.pyplot as plt


import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from lifelines.utils import concordance_index
from lifelines import KaplanMeierFitter


from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
from metric import score

In [4]:
test = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/test.csv")
print("Test shape:", test.shape )

train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")
print("Train shape:",train.shape)

Test shape: (3, 58)
Train shape: (28800, 60)


## efs und efs_time mit KaplanMeier in einem Ziel umwandeln
In diesem Wettbewerb müssen wir den Risk Score vorhersagen. Daher werden wir ein Ziel erstellen, das den Risk Score nachahmt, um unser Modell zu trainieren.

In [5]:
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    kmf = KaplanMeierFitter()
    kmf.fit(df[time_col], df[event_col])
    y = kmf.survival_function_at_times(df[time_col]).values
    return y
train["y"] = transform_survival_probability(train, time_col='efs_time', event_col='efs')

## Features
Es gibt insgesamt 57 Features. Davon sind 35 kategorisch und 22 numerisch. Wir werden die kategorialen Features mit Label Encoding kodieren. Das Modell wird diese dann als kategoriale Features akzeptieren und intern speziell verarbeiten. Die fehlenden Werte (NANs) bei den numerischen Features lassen wir unverändert, da das Modell, das wir trainieren, mit NANs umgehen kann und diese Information nutzen wird.

1. Auswahl der Features

In [6]:
RMV = ["ID","efs","efs_time","y"]
FEATURES = [c for c in train.columns if not c in RMV]
print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")

There are 57 FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10']


2. Identifikation und Bearbeitung von kategorialen Features

In [7]:
CATS = []
for c in FEATURES:
    if train[c].dtype=="object":
        CATS.append(c)
        train[c] = train[c].fillna("NAN")
        test[c] = test[c].fillna("NAN")
print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")

In these features, there are 35 CATEGORICAL FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate']


3.Dieser Code kombiniert die train- und test-Datensätze, kodiert kategoriale Features mit Label Encoding und konvertiert sie in den category-Typ, reduziert die Präzision numerischer Features auf float32 und int32 zur Speicheroptimierung und teilt die Datensätze anschließend wieder in train und test auf.

In [8]:
combined = pd.concat([train,test],axis=0,ignore_index=True)
#print("Combined data shape:", combined.shape )

# LABEL ENCODE CATEGORICAL FEATURES
print("We LABEL ENCODE the CATEGORICAL FEATURES: ",end="")
for c in FEATURES:

    # LABEL ENCODE CATEGORICAL AND CONVERT TO INT32 CATEGORY
    if c in CATS:
        print(f"{c}, ",end="")
        combined[c],_ = combined[c].factorize()
        combined[c] -= combined[c].min()
        combined[c] = combined[c].astype("int32")
        combined[c] = combined[c].astype("category")
        
    # REDUCE PRECISION OF NUMERICAL TO 32BIT TO SAVE MEMORY
    else:
        if combined[c].dtype=="float64":
            combined[c] = combined[c].astype("float32")
        if combined[c].dtype=="int64":
            combined[c] = combined[c].astype("int32")
    
train = combined.iloc[:len(train)].copy()
test = combined.iloc[len(train):].reset_index(drop=True).copy()

We LABEL ENCODE the CATEGORICAL FEATURES: dri_score, psych_disturb, cyto_score, diabetes, tbi_status, arrhythmia, graft_type, vent_hist, renal_issue, pulm_severe, prim_disease_hct, cmv_status, tce_imm_match, rituximab, prod_type, cyto_score_detail, conditioning_intensity, ethnicity, obesity, mrd_hct, in_vivo_tcd, tce_match, hepatic_severe, prior_tumor, peptic_ulcer, gvhd_proph, rheum_issue, sex_match, race_group, hepatic_mild, tce_div_match, donor_related, melphalan_dose, cardiac, pulm_moderate, 

## Survival:Cox braucht dieses Ziel (um efs und efs_time zu verarbeiten)

In [9]:
train["efs_time2"] = train.efs_time.copy()
train.loc[train.efs==0,"efs_time2"] *= -1

## Hier wird ein Ray-Cluster initialisiert, der ermittelt die verfügbaren Systemressourcen und legt die Ressourcen pro Durchlauf basierend auf den verfügbaren Ressourcen oder den Systemstandards fest.

In [10]:
ray.init(ignore_reinit_error=True)

2025-05-24 16:06:03,504	INFO worker.py:1841 -- Started a local Ray instance.


0,1
Python version:,3.10.14
Ray version:,2.42.1


In [11]:
available_resources = ray.cluster_resources()

In [12]:
resources_per_trial = {
    "cpu": int(available_resources.get("CPU", os.cpu_count())),
    "gpu": int(available_resources.get("GPU", 0))
}

# XGBBoost Survival:Cox Hyperparameter Tunning

In [13]:
def train_xgb(config):
    from metric import score  # Import your custom metric function

    FOLDS = 10
    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    
    # Arrays to store out-of-fold predictions and test predictions (if test set is available)
    oof_preds = np.zeros(len(train))
    test_preds = np.zeros(len(test)) if "test" in globals() else None
    
    for i, (train_index, valid_index) in enumerate(kf.split(train)):
        print("#" * 25)
        print(f"### Fold {i+1}")
        print("#" * 25)
        
        x_train = train.loc[train_index, FEATURES].copy()
        y_train = train.loc[train_index, "efs_time2"]
        x_valid = train.loc[valid_index, FEATURES].copy()
        y_valid = train.loc[valid_index, "efs_time2"]

        model = XGBRegressor(
            device="cpu",  # Use "cpu" if a GPU is not available
            max_depth=config["max_depth"],
            learning_rate=config["learning_rate"],
            min_child_weight=config["min_child_weight"],
            subsample=config["subsample"],
            colsample_bytree=config["colsample_bytree"],
            gamma=config["gamma"],
            reg_lambda=config["reg_lambda"],
            reg_alpha=config["reg_alpha"],
            n_estimators=config["n_estimators"],
            enable_categorical=True,
            max_delta_step=config["max_delta_step"],
            objective='survival:cox',
            eval_metric='cox-nloglik',
        )
        # Set early stopping rounds
        model.set_params(early_stopping_rounds=25)
        
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            verbose=False
        )
        
        # Store out-of-fold predictions for this fold
        oof_preds[valid_index] = model.predict(x_valid)
        
        # If a test set is available, aggregate its predictions
        if test_preds is not None:
            x_test = test[FEATURES].copy()
            test_preds += model.predict(x_test)
    
    # If test predictions are available, average them over the folds
    if test_preds is not None:
        test_preds /= FOLDS

    # Prepare data for the overall CV metric computation
    y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
    y_pred = train[["ID"]].copy()
    y_pred["prediction"] = oof_preds

    # Compute overall CV metric using the custom 'score' function
    cv_metric = score(y_true.copy(), y_pred.copy(), "ID")
    
    # Report the overall CV metric to Ray Tune for optimization
    tune.report({"cv_metric" : cv_metric})

# Define hyperparameter search space using Ray Tune's API
config = {
    "max_depth": tune.randint(2, 11),
    "learning_rate": tune.loguniform(0.001, 0.2),
    "min_child_weight": tune.randint(1, 51),
    "subsample": tune.uniform(0.4, 1.0),
    "colsample_bytree": tune.uniform(0.2, 1.0),
    "gamma": tune.uniform(0, 2),
    "reg_lambda": tune.loguniform(1e-4, 100),
    "reg_alpha": tune.loguniform(1e-4, 1),
    "n_estimators": tune.randint(50, 3001),
    "max_delta_step": tune.randint(0, 4)
}

# Create a HyperOptSearch instance specifying that we wish to maximize the cv_metric
hyperopt_search = HyperOptSearch(metric="cv_metric", mode="max")

# Run tuning with Ray Tune using HyperOpt as the search algorithm
analysis = tune.run(
    train_xgb,
    search_alg=hyperopt_search,
    config=config,
    num_samples=10,
    resources_per_trial=resources_per_trial,
    trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
    verbose=1
)

# Retrieve the best configuration based on the overall CV metric (cv_metric)
best_config = analysis.get_best_config(metric="cv_metric", mode="max")
print("Best Hyperparameters:", best_config)

# Inspect all trial results
results_df = analysis.results_df
print(results_df)


0,1
Current time:,2025-05-24 16:20:35
Running for:,00:14:17.23
Memory:,1.9/31.4 GiB

Trial name,status,loc,colsample_bytree,gamma,learning_rate,max_delta_step,max_depth,min_child_weight,n_estimators,reg_alpha,reg_lambda,subsample,iter,total time (s),cv_metric
train_xgb_accca0f4,TERMINATED,172.19.2.2:423,0.453931,1.48542,0.107992,3,4,9,526,0.0182924,13.9606,0.850625,1,26.2311,0.67046
train_xgb_eb607a57,TERMINATED,172.19.2.2:514,0.206159,0.322513,0.00666937,2,3,44,580,0.00328115,0.000873082,0.417457,1,53.4342,0.651062
train_xgb_071f3017,TERMINATED,172.19.2.2:601,0.749267,1.16853,0.0633918,0,3,21,537,0.00011853,0.00494412,0.672712,1,37.0053,0.667994
train_xgb_6d799c7c,TERMINATED,172.19.2.2:690,0.707295,1.65358,0.00235422,2,9,35,2573,0.0144503,16.2235,0.751359,1,476.743,0.668686
train_xgb_9da28518,TERMINATED,172.19.2.2:819,0.896878,1.24891,0.125141,2,7,37,1728,0.0484808,0.00170708,0.618542,1,15.3056,0.663967
train_xgb_fa63d302,TERMINATED,172.19.2.2:903,0.88536,0.672458,0.0161645,1,4,32,454,0.0283521,42.3497,0.737136,1,49.8289,0.661264
train_xgb_e4eead07,TERMINATED,172.19.2.2:991,0.963023,0.358626,0.0213292,3,5,6,749,0.00032068,0.152642,0.707669,1,65.0476,0.668435
train_xgb_82a9ffaa,TERMINATED,172.19.2.2:1080,0.282401,0.287937,0.00853887,1,8,21,54,0.000980359,0.315105,0.58452,1,14.2637,0.648293
train_xgb_40df955d,TERMINATED,172.19.2.2:1164,0.345848,0.143753,0.00887403,1,3,29,89,0.0052815,0.00063331,0.48897,1,11.1619,0.640519
train_xgb_a2920c0b,TERMINATED,172.19.2.2:1247,0.882997,1.7406,0.00396346,2,10,32,251,0.610784,0.656297,0.749715,1,61.9401,0.656884


2025-05-24 16:20:35,770	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/train_xgb_2025-05-24_16-06-05' in 0.0162s.
2025-05-24 16:20:35,784	INFO tune.py:1041 -- Total run time: 870.49 seconds (857.21 seconds for the tuning loop).


Best Hyperparameters: {'max_depth': 4, 'learning_rate': 0.10799201436131363, 'min_child_weight': 9, 'subsample': 0.8506245898832192, 'colsample_bytree': 0.45393128679854133, 'gamma': 1.4854204121880117, 'reg_lambda': 13.96058322373917, 'reg_alpha': 0.01829243576580757, 'n_estimators': 526, 'max_delta_step': 3}
          cv_metric   timestamp checkpoint_dir_name  done  training_iteration  \
trial_id                                                                        
accca0f4   0.670460  1748102809                None  True                   1   
eb607a57   0.651062  1748102866                None  True                   1   
071f3017   0.667994  1748102908                None  True                   1   
6d799c7c   0.668686  1748103390                None  True                   1   
9da28518   0.663967  1748103409                None  True                   1   
fa63d302   0.661264  1748103464                None  True                   1   
e4eead07   0.668435  1748103533         