# Pip Install Libraries for Metric

In [1]:
!pip install /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

Processing /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
Installing collected packages: autograd
Successfully installed autograd-1.7.0
Processing /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l- \ done
[?25h  Created wheel for autograd-gamma: filename=autograd_gamma-0.5.0-py3-none-any.whl size=4030 sha256=22ddc5651712467fbe79cb2fdb15a85a8f9afb9633a20c0e2dfa6430d09a586d
  Stored in directory: /root/.cache/pip/wheels/6b/b5/e0/4c79e15c0b5f2c15ecf613c720bb20daab20a666eb67135155
Successfully built autograd-gamma
Installing collected packages: autograd-gamma
Successfully installed autograd-gamma-0.5.0
Processing /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
Installing collected packages: interface-meta
Successfully installed interface-meta-1.3.0


In [2]:
!pip install /kaggle/input/pip-install-ray-tun-hyperopt/output/ray-2.42.1-cp310-cp310-manylinux2014_x86_64.whl
!pip install /kaggle/input/pip-install-ray-tun-hyperopt/output/hyperopt-0.2.7-py2.py3-none-any.whl

Processing /kaggle/input/pip-install-ray-tun-hyperopt/output/ray-2.42.1-cp310-cp310-manylinux2014_x86_64.whl
Installing collected packages: ray
  Attempting uninstall: ray
    Found existing installation: ray 2.24.0
    Uninstalling ray-2.24.0:
      Successfully uninstalled ray-2.24.0
Successfully installed ray-2.42.1
Processing /kaggle/input/pip-install-ray-tun-hyperopt/output/hyperopt-0.2.7-py2.py3-none-any.whl
hyperopt is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


# Imports

In [3]:
import os
import matplotlib.pyplot as plt


import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from lifelines.utils import concordance_index
from lifelines import KaplanMeierFitter

# Model libraries
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
from metric import score

In [4]:
test = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/test.csv")
print("Test shape:", test.shape )

train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")
print("Train shape:",train.shape)

Test shape: (3, 58)
Train shape: (28800, 60)


## efs und efs_time mit KaplanMeier in einem Ziel umwandeln
In diesem Wettbewerb müssen wir den Risk Score vorhersagen. Daher werden wir ein Ziel erstellen, das den Risk Score nachahmt, um unser Modell zu trainieren.

In [5]:
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    kmf = KaplanMeierFitter()
    kmf.fit(df[time_col], df[event_col])
    y = kmf.survival_function_at_times(df[time_col]).values
    return y
train["y"] = transform_survival_probability(train, time_col='efs_time', event_col='efs')

## Features
Es gibt insgesamt 57 Features. Davon sind 35 kategorisch und 22 numerisch. Wir werden die kategorialen Features mit Label Encoding kodieren. Das Modell wird diese dann als kategoriale Features akzeptieren und intern speziell verarbeiten. Die fehlenden Werte (NANs) bei den numerischen Features lassen wir unverändert, da das Modell, das wir trainieren, mit NANs umgehen kann und diese Information nutzen wird.

1. Auswahl der Features

In [6]:
RMV = ["ID","efs","efs_time","y"]
FEATURES = [c for c in train.columns if not c in RMV]
print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")

There are 57 FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10']


2. Identifikation und Bearbeitung von kategorialen Features

In [7]:
CATS = []
for c in FEATURES:
    if train[c].dtype=="object":
        CATS.append(c)
        train[c] = train[c].fillna("NAN")
        test[c] = test[c].fillna("NAN")
print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")

In these features, there are 35 CATEGORICAL FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate']


3.Dieser Code kombiniert die train- und test-Datensätze, kodiert kategoriale Features mit Label Encoding und konvertiert sie in den category-Typ, reduziert die Präzision numerischer Features auf float32 und int32 zur Speicheroptimierung und teilt die Datensätze anschließend wieder in train und test auf.

In [8]:
combined = pd.concat([train,test],axis=0,ignore_index=True)
#print("Combined data shape:", combined.shape )

# LABEL ENCODE CATEGORICAL FEATURES
print("We LABEL ENCODE the CATEGORICAL FEATURES: ",end="")
for c in FEATURES:

    # LABEL ENCODE CATEGORICAL AND CONVERT TO INT32 CATEGORY
    if c in CATS:
        print(f"{c}, ",end="")
        combined[c],_ = combined[c].factorize()
        combined[c] -= combined[c].min()
        combined[c] = combined[c].astype("int32")
        combined[c] = combined[c].astype("category")
        
    # REDUCE PRECISION OF NUMERICAL TO 32BIT TO SAVE MEMORY
    else:
        if combined[c].dtype=="float64":
            combined[c] = combined[c].astype("float32")
        if combined[c].dtype=="int64":
            combined[c] = combined[c].astype("int32")
    
train = combined.iloc[:len(train)].copy()
test = combined.iloc[len(train):].reset_index(drop=True).copy()

We LABEL ENCODE the CATEGORICAL FEATURES: dri_score, psych_disturb, cyto_score, diabetes, tbi_status, arrhythmia, graft_type, vent_hist, renal_issue, pulm_severe, prim_disease_hct, cmv_status, tce_imm_match, rituximab, prod_type, cyto_score_detail, conditioning_intensity, ethnicity, obesity, mrd_hct, in_vivo_tcd, tce_match, hepatic_severe, prior_tumor, peptic_ulcer, gvhd_proph, rheum_issue, sex_match, race_group, hepatic_mild, tce_div_match, donor_related, melphalan_dose, cardiac, pulm_moderate, 

## Survival:Cox braucht dieses Ziel (um efs und efs_time zu verarbeiten)

In [9]:
train["efs_time2"] = train.efs_time.copy()
train.loc[train.efs==0,"efs_time2"] *= -1

## Hier wird ein Ray-Cluster initialisiert, der ermittelt die verfügbaren Systemressourcen und legt die Ressourcen pro Durchlauf basierend auf den verfügbaren Ressourcen oder den Systemstandards fest.

In [10]:
ray.init(ignore_reinit_error=True)

2025-05-24 16:06:29,050	INFO worker.py:1841 -- Started a local Ray instance.


0,1
Python version:,3.10.14
Ray version:,2.42.1


In [11]:
available_resources = ray.cluster_resources()


In [12]:
resources_per_trial = {
    "cpu": int(available_resources.get("CPU", os.cpu_count())),
    "gpu": int(available_resources.get("GPU", 0))
}

# LightGBM with Kaplan Meier Hyperparameter Tunning

In [13]:
def train_lgbm(config):
    FOLDS = 10
    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    
    # Initialize out-of-fold predictions and test predictions if a test set exists.
    oof_preds = np.zeros(len(train))
    test_preds = np.zeros(len(test)) if "test" in globals() else None

    for fold, (train_idx, valid_idx) in enumerate(kf.split(train)):
        
        
        # Use the features from the training set and target column "y"
        x_train = train.loc[train_idx, FEATURES].copy()
        y_train = train.loc[train_idx, "y"]
        x_valid = train.loc[valid_idx, FEATURES].copy()
        y_valid = train.loc[valid_idx, "y"]
        
        # Create LightGBM datasets.
        train_data = lgb.Dataset(x_train, label=y_train)
        valid_data = lgb.Dataset(x_valid, label=y_valid, reference=train_data)
        
        # Build the model with hyperparameters from config.
        # "min_split_gain" is added as a hyperparameter.
        
        
        params = {
            "objective": config["objective"],
            "max_depth": config["max_depth"],
            "learning_rate": config["learning_rate"],
            "min_child_weight": config["min_child_weight"],
            "subsample": config["subsample"],
            "colsample_bytree": config["colsample_bytree"],
            "reg_lambda": config["reg_lambda"],
            "reg_alpha": config["reg_alpha"],
            "min_split_gain": config["min_split_gain"],
            "num_leaves": config["num_leaves"],
            "force_row_wise": True,
            "verbose": -1,
            "verbosity": -1
            
        }
        
        model = lgb.train(
            params,
            train_data,
            num_boost_round=config["n_estimators"],
            valid_sets=[valid_data],
            #early_stopping_rounds=25,
            #verbose_eval=False
        )
        
        # Save predictions for the validation fold.
        preds = model.predict(x_valid, num_iteration=model.best_iteration)
        oof_preds[valid_idx] = preds
        
        # If test data exists, aggregate predictions over the folds.
        if test_preds is not None:
            x_test = test[FEATURES].copy()
            test_preds += model.predict(x_test, num_iteration=model.best_iteration)
    
    # Average test predictions over folds (if applicable).
    if test_preds is not None:
        test_preds /= FOLDS

    # Prepare data for the overall CV metric computation.
    y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
    y_pred = train[["ID"]].copy()
    y_pred["prediction"] = oof_preds
    # Evaluate the overall performance using the custom metric.
    cv_metric = score(y_true.copy(), y_pred.copy(), "ID")
    
    # Report the metric to Ray Tune.
    tune.report({"cv_metric": cv_metric})

# Define the hyperparameter search space.
config = {
    "max_depth": tune.randint(2, 11),
    "learning_rate": tune.loguniform(0.001, 0.2),
    "min_child_weight": tune.randint(1, 51),
    "subsample": tune.uniform(0.4, 1.0),
    "colsample_bytree": tune.uniform(0.2, 1.0),
    "reg_lambda": tune.loguniform(1e-4, 100),
    "reg_alpha": tune.loguniform(1e-4, 1),
    "n_estimators": tune.randint(50, 3001),
    "min_split_gain": tune.uniform(0, 1.0),
    "num_leaves": tune.randint(31, 128),
    "objective": "regression"
}

# Create a HyperOptSearch instance specifying that we wish to maximize the cv_metric.
hyperopt_search = HyperOptSearch(metric="cv_metric", mode="max")

# Run tuning with Ray Tune using HyperOpt as the search algorithm.
analysis = tune.run(
    train_lgbm,
    search_alg=hyperopt_search,
    config=config,
    num_samples=10,
    resources_per_trial=resources_per_trial,
    trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
    verbose=1
)

# Retrieve the best configuration based on the overall CV metric (cv_metric).
best_config = analysis.get_best_config(metric="cv_metric", mode="max")
print("Best Hyperparameters:", best_config)


0,1
Current time:,2025-05-24 16:17:48
Running for:,00:11:07.87
Memory:,1.8/31.4 GiB

Trial name,status,loc,colsample_bytree,learning_rate,max_depth,min_child_weight,min_split_gain,n_estimators,num_leaves,objective,reg_alpha,reg_lambda,subsample,iter,total time (s),cv_metric
train_lgbm_09e72660,TERMINATED,172.19.2.2:424,0.761279,0.00276836,7,24,0.326787,516,113,regression,0.018172,0.0180253,0.495065,1,50.8073,0.656042
train_lgbm_0ef23251,TERMINATED,172.19.2.2:516,0.969496,0.00330362,8,31,0.241286,242,103,regression,0.000317369,26.5464,0.468744,1,31.5144,0.650104
train_lgbm_92794138,TERMINATED,172.19.2.2:603,0.561343,0.00746307,3,37,0.0443647,1147,126,regression,0.000160161,13.6432,0.63964,1,57.5183,0.661793
train_lgbm_651182a5,TERMINATED,172.19.2.2:691,0.241623,0.0112709,3,41,0.280754,2598,91,regression,0.00787268,0.0105446,0.458739,1,84.2113,0.661722
train_lgbm_f1ec83c0,TERMINATED,172.19.2.2:782,0.437664,0.00689179,4,2,0.183442,1236,87,regression,0.151058,0.772191,0.590208,1,75.1917,0.664445
train_lgbm_a5437b4b,TERMINATED,172.19.2.2:872,0.978491,0.0485388,10,14,0.826243,1954,44,regression,0.00157973,0.511682,0.875826,1,29.3186,0.656919
train_lgbm_90cc908c,TERMINATED,172.19.2.2:957,0.601324,0.147672,2,48,0.466966,2989,93,regression,0.00653223,0.000467078,0.990253,1,71.3849,0.658401
train_lgbm_5955c0bc,TERMINATED,172.19.2.2:1047,0.435364,0.0522807,9,32,0.600464,2998,93,regression,0.000443353,0.00178489,0.438544,1,92.2064,0.659784
train_lgbm_9055e507,TERMINATED,172.19.2.2:1138,0.785977,0.0422654,7,33,0.132804,2913,52,regression,0.000430208,3.83114,0.49823,1,50.8324,0.664655
train_lgbm_0a489c4e,TERMINATED,172.19.2.2:1226,0.463866,0.105484,5,15,0.291183,2495,35,regression,0.0166819,45.6953,0.821994,1,77.573,0.662545


2025-05-24 16:17:48,417	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/train_lgbm_2025-05-24_16-06-30' in 0.0102s.
2025-05-24 16:17:48,428	INFO tune.py:1041 -- Total run time: 677.56 seconds (667.86 seconds for the tuning loop).


Best Hyperparameters: {'max_depth': 7, 'learning_rate': 0.04226542576307475, 'min_child_weight': 33, 'subsample': 0.4982299813447037, 'colsample_bytree': 0.7859774296288189, 'reg_lambda': 3.831135676795729, 'reg_alpha': 0.0004302075996004448, 'n_estimators': 2913, 'min_split_gain': 0.13280424604279573, 'num_leaves': 52, 'objective': 'regression'}
