In [1]:
!pip install /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

Processing /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
autograd is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
Processing /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l[?25hdone
  Created wheel for autograd-gamma: filename=autograd_gamma-0.5.0-py3-none-any.whl size=4031 sha256=a2f1a0b7590a11059afc8f1fff55c9027092c50a018b6b1fc2e16c0939fbf4b0
  Stored in directory: /root/.cache/pip/wheels/6b/b5/e0/4c79e15c0b5f2c15ecf613c720bb20daab20a666eb67135155
Successfully built autograd-gamma
Installing collected packages: autograd-gamma
Successfully installed autograd-gamma-0.5.0
Processing /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
Installing collected packages: interface-

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from lifelines.utils import concordance_index
from lifelines import KaplanMeierFitter
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the data
data = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')

In [4]:
data.shape

(28800, 60)

In [5]:
data.head(2)

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,...,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672


In [6]:
def kaplan(data, time_col = 'efs_time', event_col='efs'):
    
    kmf = KaplanMeierFitter()
    kmf.fit(data[time_col], event_observed=data[event_col])
    return kmf.survival_function_at_times(data[time_col]).values.flatten()


data['target'] = kaplan(data=data)

In [7]:
data.shape

(28800, 61)

In [8]:
new_data = data.drop(columns=['efs', 'efs_time'], errors='ignore', index=False)

In [9]:
new_data.shape

(28800, 59)

In [10]:
new_data.head(1)

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,target
0,0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,...,No,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.458687


In [11]:
# Cleaning data: Dropping duplicate rows and irrelevant columns
data_cleaned = new_data.drop(columns=["ID"], errors='ignore').drop_duplicates()

# Replacing common null values in text fields and converting to lowercase
for col in data_cleaned.select_dtypes(include='object').columns:
    data_cleaned[col] = data_cleaned[col].str.strip().str.lower().replace(
        {'n/a': None, 'na': None, 'nan': None, '-': None})

# Filling missing values for numerical columns with their median
numerical_cols = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
data_cleaned[numerical_cols] = data_cleaned[numerical_cols].fillna(data_cleaned[numerical_cols].median())

# Filling missing values for categorical columns with their mode
categorical_cols = data_cleaned.select_dtypes(include='object').columns
data_cleaned[categorical_cols] = data_cleaned[categorical_cols].apply(
    lambda col: col.fillna(col.mode()[0] if not col.mode().empty else 'unknown'))

for i in data_cleaned[categorical_cols]:
    data_cleaned[i], _ = pd.factorize(data_cleaned[i])
    


In [12]:
data_cleaned.shape

(28800, 58)

In [13]:
data_cleaned.head(3)

Unnamed: 0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,target
0,0,0,0,0,2.0,8.0,0,0,6.0,0,...,0,0,0,0,8.0,0,2.0,0,10.0,0.458687
1,1,0,1,0,2.0,8.0,1,0,6.0,1,...,0,0,1,0,8.0,0,2.0,1,10.0,0.847759
2,0,0,0,0,2.0,8.0,0,0,6.0,0,...,0,0,1,0,8.0,0,2.0,0,10.0,0.462424


In [14]:
X = data_cleaned.drop(columns=['target', 'rituximab'], errors='ignore')
y = data_cleaned["target"]

In [15]:
scalare = StandardScaler()
X = scalare.fit_transform(X)


In [16]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Define the CatBoostRegressor model
model = CatBoostRegressor(verbose=0)

In [18]:
# Define the hyperparameter grid
param_grid = {
    'iterations': [100, 200],
    'learning_rate': [0.01, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5]
}

In [19]:
# Grid search for hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='r2', verbose=2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   0.8s
[CV] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   0.7s
[CV] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   0.7s
[CV] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.1; total time=   0.7s
[CV] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.1; total time=   0.7s
[CV] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.1; total time=   0.7s
[CV] END depth=4, iterations=100, l2_leaf_reg=3, learning_rate=0.01; total time=   0.7s
[CV] END depth=4, iterations=100, l2_leaf_reg=3, learning_rate=0.01; total time=   0.8s
[CV] END depth=4, iterations=100, l2_leaf_reg=3, learning_rate=0.01; total time=   0.8s
[CV] END depth=4, iterations=100, l2_leaf_reg=3, learning_rate=0.1; total time=   0.7s
[CV] END depth=4, iterations=100, l2_leaf_reg=3, learning_rate

In [20]:
# Best model and hyperparameters
best_model = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'depth': 6, 'iterations': 200, 'l2_leaf_reg': 3, 'learning_rate': 0.1}


In [21]:
# Predict and calculate metrics
y_pred = best_model.predict(X_test)

# Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Concordance Index
c_index = concordance_index(y_test, y_pred)
print("Concordance Index:", c_index)

Mean Squared Error: 0.024701981551305426
Concordance Index: 0.644945429264315


In [22]:
best_model.fit(X, y)

<catboost.core.CatBoostRegressor at 0x78e7604bf850>

In [23]:
test = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')

In [24]:
# Cleaning data: Dropping duplicate rows and irrelevant columns
test_cleaned = test.drop(columns=["ID"], errors='ignore').drop_duplicates()
test_cleaned.drop(columns=['rituximab'], errors='ignore')
# Replacing common null values in text fields and converting to lowercase
for col in test_cleaned.select_dtypes(include='object').columns:
    test_cleaned[col] = test_cleaned[col].str.strip().str.lower().replace(
        {'n/a': None, 'na': None, 'nan': None, '-': None})

# Filling missing values for numerical columns with their median
numerical_cols = test_cleaned.select_dtypes(include=['float64', 'int64']).columns
test_cleaned[numerical_cols] = test_cleaned[numerical_cols].fillna(test_cleaned[numerical_cols].median())

# Filling missing values for categorical columns with their mode
categorical_cols = test_cleaned.select_dtypes(include='object').columns
test_cleaned[categorical_cols] = test_cleaned[categorical_cols].apply(
    lambda col: col.fillna(col.mode()[0] if not col.mode().empty else 'unknown'))

for i in test_cleaned[categorical_cols]:
    test_cleaned[i], _ = pd.factorize(test_cleaned[i])

In [25]:
test_cleaned.head()

Unnamed: 0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10
0,0,0,0,0,2.0,8.0,0,0,6.0,0,...,90.0,0,0,0,0,8.0,0,2.0,0,10.0
1,1,0,0,0,2.0,8.0,1,0,6.0,1,...,90.0,0,0,1,0,8.0,0,2.0,1,10.0
2,0,0,0,0,2.0,8.0,0,0,6.0,0,...,90.0,0,0,1,0,8.0,0,2.0,0,10.0


In [26]:
scalare = StandardScaler()
test_cleaned = scalare.fit_transform(test_cleaned)

In [27]:
# Predict and calculate metrics
y_pred = best_model.predict(test_cleaned)

In [28]:
y_pred

array([0.5581278 , 0.65406207, 0.55940234])

In [29]:
# Load the test data
test_file_path = '/kaggle/input/equity-post-HCT-survival-predictions/test.csv'
test_data = pd.read_csv(test_file_path)

# Predict probabilities for the test dataset
#prediction = model.predict(test_data.drop(columns=['ID', 'rituximab'], errors='ignore'))

# Add predictions to the test dataset
test_data['prediction'] = y_pred

# Save predictions to a new CSV file
output_file_path = 'submission.csv'
test_data[['ID', 'prediction']].to_csv(output_file_path, index=False)

print(f"Predictions saved to {output_file_path}")

Predictions saved to submission.csv


===========================================================================================================