In [1]:
#pip install /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
#from lifelines.utils import concordance_index


In [3]:
# Read the competition's train and test CSV files into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/equity-post-HCT-survival-predictions/train.csv',
        '/kaggle/input/equity-post-HCT-survival-predictions/test.csv',
    )
)

# Report (rows, columns) for each frame, then preview the first training rows
train_shape, test_shape = train_data.shape, test_data.shape
print("Train Data Shape:", train_shape)
print("Test Data Shape:", test_shape)
train_data.head()


Train Data Shape: (28800, 60)
Test Data Shape: (3, 58)


Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,...,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793
3,3,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,102.349
4,4,High,No,,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Related,MEL,8.0,No,2.0,No,10.0,0.0,16.223


In [4]:
# Per-column count of missing values, printed before anything is imputed
missing_per_column = train_data.isnull().sum()
print(missing_per_column)

# Replace every missing value (numeric or categorical) with the -999 sentinel,
# then one-hot encode the non-numeric columns, dropping the first level of each.
# train_data is rebound to the encoded frame that the following cells consume.
train_data = pd.get_dummies(train_data.fillna(-999), drop_first=True)


ID                            0
dri_score                   154
psych_disturb              2062
cyto_score                 8068
diabetes                   2119
hla_match_c_high           4620
hla_high_res_8             5829
tbi_status                    0
arrhythmia                 2202
hla_low_res_6              3270
graft_type                    0
vent_hist                   259
renal_issue                1915
pulm_severe                2135
prim_disease_hct              0
hla_high_res_6             5284
cmv_status                  634
hla_high_res_10            7163
hla_match_dqb1_high        5199
tce_imm_match             11133
hla_nmdp_6                 4197
hla_match_c_low            2800
rituximab                  2148
hla_match_drb1_low         2643
hla_match_dqb1_low         4194
prod_type                     0
cyto_score_detail         11923
conditioning_intensity     4789
ethnicity                   587
year_hct                      0
obesity                    1760
mrd_hct 

In [5]:
# Target is the event indicator `efs`; both outcome columns are removed from the features.
# NOTE(review): the ID column is not dropped here, so it is fed to the model as a
# feature -- confirm that is intended, since it looks like a row identifier.
outcome_columns = ['efs', 'efs_time']
y = train_data['efs']
X = train_data.drop(columns=outcome_columns)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Baseline: a default-parameter random forest with a fixed seed, fitted on the training split
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Validation outputs: the probability from column index 1 of predict_proba
# (the class listed second in model.classes_) and the hard class labels
val_probabilities = model.predict_proba(X_val)
y_prob = val_probabilities[:, 1]
y_pred = model.predict(X_val)


In [7]:
# Compute the validation metrics first: accuracy and the per-class report use the
# hard labels, ROC AUC uses the predicted probabilities
val_accuracy = accuracy_score(y_val, y_pred)
val_auc = roc_auc_score(y_val, y_prob)
val_report = classification_report(y_val, y_pred)

# Then print them in a fixed order
print("Accuracy:", val_accuracy)
print("AUC-ROC Score:", val_auc)
print("Classification Report:\n", val_report)


Accuracy: 0.6746527777777778
AUC-ROC Score: 0.7326236244019356
Classification Report:
               precision    recall  f1-score   support

         0.0       0.69      0.55      0.61      2683
         1.0       0.67      0.78      0.72      3077

    accuracy                           0.67      5760
   macro avg       0.68      0.67      0.67      5760
weighted avg       0.68      0.67      0.67      5760



In [8]:
# Score the test set with the fitted model.
unwanted_columns = ['efs_time', 'efs']

# Feature layout of the training frame. This is derived into a new variable instead of
# rebinding train_data, so the earlier cell that reads train_data['efs'] can still be
# re-run after this one, and this cell gives the same result when executed repeatedly.
train_data_encoded = pd.get_dummies(
    train_data.drop(columns=unwanted_columns, errors='ignore').fillna(-999),
    drop_first=True,
)

# Encode the test frame keeping ALL category levels (no drop_first). With drop_first the
# test frame would drop *its own* first level per column, which can differ from the level
# dropped on the training side; after reindexing, those rows would show all-zero
# indicators and be read as the training baseline level. Keeping every level and letting
# reindex select the training columns reproduces the training encoding exactly.
# fillna returns a copy, so test_data itself is left untouched.
test_data_encoded = pd.get_dummies(test_data.fillna(-999))

# Align to the training columns: indicator columns absent from the test frame are added
# as 0, and columns the training frame does not have are discarded.
test_data_aligned = test_data_encoded.reindex(
    columns=train_data_encoded.columns,
    fill_value=0,
)

# NOTE(review): predict() yields hard class labels; if the submission is meant to carry a
# continuous risk score, predict_proba(...)[:, 1] may be the better output -- confirm.
test_predictions = model.predict(test_data_aligned)

print(test_predictions[:10])


[0. 1. 0.]


In [9]:
# Assemble the submission table: the test IDs paired with the model's predictions
submission = test_data[['ID']].assign(prediction=test_predictions)

# Write it to the working directory without the DataFrame index column
submission.to_csv('submission.csv', index=False)


In [10]:
# Display the submission table as a final visual check of what was written to submission.csv
submission

Unnamed: 0,ID,prediction
0,28800,0.0
1,28801,1.0
2,28802,0.0
