In [None]:
# Install required packages (run this cell first)
!pip install -q numpy pandas scikit-learn xgboost shap matplotlib joblib


In [None]:
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import HTML, display
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix, roc_curve)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

# Output directory for the saved model, ROC plot and prediction tables.
# Defaults to Colab's working directory; set the HCT_SAVE_DIR environment
# variable to write somewhere else (e.g. when '/content' is not writable).
BASE_SAVE_DIR = os.environ.get('HCT_SAVE_DIR') or '/content'
os.makedirs(BASE_SAVE_DIR, exist_ok=True)
print('Save directory:', BASE_SAVE_DIR)


In [None]:
def generate_synthetic_hct_data(n=2000, random_state=42, missing_rate=0.03):
    """Generate a synthetic HCT patient table with a binary ``survival`` label.

    Features are drawn independently from fixed distributions. A latent
    ``risk`` score is a linear combination of age, comorbidity score, prior
    transplants, donor type and conditioning intensity plus Gaussian noise;
    ``survival`` is a Bernoulli draw with probability ``1 / (1 + exp(risk))``.
    Finally, values in ``donor_type``, ``comorbidity_score`` and
    ``conditioning_intensity`` are independently replaced by NaN.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    random_state : int
        Seed for ``numpy.random.RandomState``; the same seed and arguments
        reproduce the same frame.
    missing_rate : float
        Per-cell probability (in ``[0, 1]``) of blanking a value in each of
        the three columns listed above.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with the nine feature columns and ``survival`` (0/1).
        ``comorbidity_score`` is always float so that it can hold NaN.

    Raises
    ------
    ValueError
        If ``missing_rate`` is outside ``[0, 1]``.
    """
    if not 0.0 <= missing_rate <= 1.0:
        raise ValueError(f"missing_rate must be in [0, 1], got {missing_rate!r}")

    rng = np.random.RandomState(random_state)
    # The order of the draws below determines the output for a given seed;
    # keep it stable when editing.
    age = rng.normal(loc=45, scale=12, size=n).clip(18, 80).astype(int)
    gender = rng.choice(['Male','Female'], size=n, p=[0.55,0.45])
    donor_type = rng.choice(['Matched sibling','Matched unrelated','Haploidentical','Cord blood'],
                            p=[0.35,0.4,0.2,0.05], size=n)
    comorbidity_score = rng.poisson(lam=2.0, size=n)
    disease_type = rng.choice(['AML','ALL','MDS','Lymphoma','Other'],
                              p=[0.25,0.15,0.2,0.25,0.15], size=n)
    conditioning_intensity = rng.choice(['Myeloablative','Reduced-intensity'], p=[0.4,0.6], size=n)
    prior_transplants = rng.binomial(1, 0.12, size=n)
    time_from_diagnosis_days = rng.exponential(scale=365, size=n).astype(int).clip(10, 5000)
    treatment_days = (30 + (comorbidity_score * 5) + rng.normal(0,10,size=n)).astype(int).clip(7, 365)

    # Higher risk -> lower survival probability (logistic link below).
    risk = (
        0.03 * (age - 40) +
        0.4 * (comorbidity_score) +
        0.6 * (prior_transplants) +
        (donor_type == 'Matched sibling') * -0.6 +
        (donor_type == 'Haploidentical') * 0.4 +
        (conditioning_intensity == 'Myeloablative') * 0.3 +
        rng.normal(0, 0.8, size=n)
    )
    prob_survival = 1 / (1 + np.exp(risk))
    survival = (rng.rand(n) < prob_survival).astype(int)
    df = pd.DataFrame({
        'age': age,
        'gender': gender,
        'donor_type': donor_type,
        'comorbidity_score': comorbidity_score,
        'disease_type': disease_type,
        'conditioning_intensity': conditioning_intensity,
        'prior_transplants': prior_transplants,
        'time_from_diagnosis_days': time_from_diagnosis_days,
        'treatment_days': treatment_days,
        'survival': survival
    })
    for col in ['donor_type','comorbidity_score','conditioning_intensity']:
        mask = rng.rand(n) < missing_rate
        if pd.api.types.is_integer_dtype(df[col]):
            # An integer column cannot hold NaN; cast explicitly instead of
            # relying on pandas' deprecated silent upcasting on assignment.
            df[col] = df[col].astype(float)
        df.loc[mask, col] = np.nan
    return df

def make_preprocessor(categorical_features, numeric_features):
    """Build the (unfitted) column-wise preprocessing transformer.

    Numeric columns are median-imputed and then standardised; categorical
    columns are imputed with the most frequent value and then one-hot encoded
    into a dense array, with categories unseen during fitting ignored.

    Parameters
    ----------
    categorical_features : list of str
        Column names routed through the categorical branch.
    numeric_features : list of str
        Column names routed through the numeric branch.

    Returns
    -------
    ColumnTransformer
        Transformer with a ``'num'`` and a ``'cat'`` branch.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
    return ColumnTransformer(transformers=[
        ('num', Pipeline(steps=numeric_steps), numeric_features),
        ('cat', Pipeline(steps=categorical_steps), categorical_features),
    ])


In [None]:
def _save_roc_plot(y_true, y_score, roc_path):
    """Draw the ROC curve of ``y_score`` against ``y_true`` and save it to ``roc_path``."""
    fpr, tpr, _ = roc_curve(y_true, y_score)
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        ax.plot(fpr, tpr)
        ax.plot([0, 1], [0, 1], '--', linewidth=0.8)
        ax.set_xlabel('FPR')
        ax.set_ylabel('TPR')
        ax.set_title('ROC curve')
        ax.grid(True)
        fig.savefig(roc_path, bbox_inches='tight', dpi=150)
    finally:
        # Release the figure even when saving fails.
        plt.close(fig)


def train_and_save(df, save_dir=BASE_SAVE_DIR, random_state=42):
    """Train the preprocessing + XGBoost pipeline, evaluate it and persist it.

    Steps:
      1. Split ``df`` 80/20 into train/test, stratified on ``survival``.
      2. Hold out 15% of the training split as a validation set and fit the
         pipeline with early stopping (20 rounds) monitored on it. The
         validation features are transformed by a preprocessor fitted on the
         training part only, so they match what the classifier is trained on.
      3. If that fit fails, report why and refit on the whole training split
         without early stopping.
      4. Compute test metrics, dump the pipeline with joblib to
         ``xgb_hct_pipeline.joblib`` and write ``roc_curve.png``, both under
         ``save_dir``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``survival`` target and the four categorical columns
        ``gender``, ``donor_type``, ``disease_type``,
        ``conditioning_intensity``; every other column is treated as numeric.
    save_dir : str
        Output directory; created if it does not exist.
    random_state : int
        Seed used for both splits and for the classifier.

    Returns
    -------
    tuple
        ``(pipeline, metrics, cm, (X_test, y_test, y_proba))`` where
        ``metrics`` maps accuracy/precision/recall/f1/roc_auc to floats,
        ``cm`` is the test confusion matrix and ``y_proba`` is the predicted
        probability of class 1 on the test split.
    """
    target = 'survival'
    X = df.drop(columns=[target])
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        stratify=y, random_state=random_state)

    categorical_features = ['gender','donor_type','disease_type','conditioning_intensity']
    numeric_features = [c for c in X.columns if c not in categorical_features]

    # `use_label_encoder` is intentionally not passed: the targets are already
    # 0/1 integers and current XGBoost releases no longer accept the option.
    xgb_params = dict(
        n_estimators=200,
        max_depth=4,
        learning_rate=0.05,
        eval_metric='logloss',
        random_state=random_state,
        n_jobs=2,
    )

    def build_pipeline(**extra_xgb_params):
        # Fresh, unfitted pipeline; extra kwargs go to the classifier constructor.
        return Pipeline(steps=[
            ('preproc', make_preprocessor(categorical_features, numeric_features)),
            ('xgb', XGBClassifier(**xgb_params, **extra_xgb_params)),
        ])

    # small validation split for early stopping
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.15,
                                                stratify=y_train, random_state=random_state)
    try:
        # The eval set bypasses the pipeline's own preprocessing, so transform
        # it with a separate preprocessor fitted on X_tr only (never on X_val).
        val_preprocessor = make_preprocessor(categorical_features, numeric_features).fit(X_tr)
        X_val_transformed = val_preprocessor.transform(X_val)
        # Early stopping is configured on the estimator, not passed to fit().
        pipeline = build_pipeline(early_stopping_rounds=20)
        pipeline.fit(
            X_tr, y_tr,
            xgb__eval_set=[(X_val_transformed, y_val)],
            xgb__verbose=False
        )
    except Exception as exc:
        # Best-effort fallback, but say so instead of hiding the reason.
        print(f"Early-stopping fit failed ({type(exc).__name__}: {exc}); "
              "refitting on the full training split without early stopping.")
        pipeline = build_pipeline()
        pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1': f1_score(y_test, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_test, y_proba)
    }
    cm = confusion_matrix(y_test, y_pred)

    # save pipeline
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, 'xgb_hct_pipeline.joblib')
    joblib.dump(pipeline, save_path)
    print(f"Saved pipeline to: {save_path}")

    # save ROC image (a plotting failure must not discard the trained model)
    try:
        roc_path = os.path.join(save_dir, 'roc_curve.png')
        _save_roc_plot(y_test, y_proba, roc_path)
        print(f"Saved ROC plot to: {roc_path}")
    except Exception as e:
        print("Could not save ROC plot:", e)

    return pipeline, metrics, cm, (X_test, y_test, y_proba)

# Example run: build a 3000-row synthetic cohort and train on it
# (run this cell to start training). The same seed drives data and model.
df = generate_synthetic_hct_data(n=3000, random_state=123)
pipeline, metrics, cm, test_data = train_and_save(
    df,
    save_dir=BASE_SAVE_DIR,
    random_state=123,
)

# Report the held-out test metrics, one per line, then the confusion matrix.
print("Evaluation metrics:")
for metric_name, metric_value in metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")
print(f"Confusion matrix:\n {cm}")


In [None]:
# Create sample patients and output table.
# Each record holds the model features plus two display-only fields
# ('Patient ID' and 'Key Risk Factors') that are dropped before prediction.
patients = [
    {'Patient ID': 'P001', 'age': 45, 'gender': 'Female', 'donor_type': 'Matched sibling',
     'comorbidity_score': 1, 'disease_type': 'AML', 'conditioning_intensity': 'Reduced-intensity',
     'prior_transplants': 0, 'time_from_diagnosis_days': 180, 'treatment_days': 30,
     'Key Risk Factors': 'Low comorbidity, related donor'},

    {'Patient ID': 'P002', 'age': 62, 'gender': 'Male', 'donor_type': 'Matched unrelated',
     'comorbidity_score': 4, 'disease_type': 'MDS', 'conditioning_intensity': 'Myeloablative',
     'prior_transplants': 1, 'time_from_diagnosis_days': 450, 'treatment_days': 70,
     'Key Risk Factors': 'Older age, unrelated donor'},

    {'Patient ID': 'P003', 'age': 35, 'gender': 'Female', 'donor_type': 'Matched sibling',
     'comorbidity_score': 2, 'disease_type': 'ALL', 'conditioning_intensity': 'Reduced-intensity',
     'prior_transplants': 0, 'time_from_diagnosis_days': 220, 'treatment_days': 40,
     'Key Risk Factors': 'Young age, mild comorbidity'},

    {'Patient ID': 'P004', 'age': 58, 'gender': 'Male', 'donor_type': 'Haploidentical',
     'comorbidity_score': 5, 'disease_type': 'Lymphoma', 'conditioning_intensity': 'Myeloablative',
     'prior_transplants': 1, 'time_from_diagnosis_days': 300, 'treatment_days': 90,
     'Key Risk Factors': 'Multiple comorbidities, high regimen intensity'}
]

patient_df = pd.DataFrame(patients)
X_patient = patient_df.drop(columns=['Patient ID', 'Key Risk Factors'])
# Column 1 of predict_proba is the probability of class 1 (survival == 1).
probs = pipeline.predict_proba(X_patient)[:, 1]

output_df = pd.DataFrame({
    'Patient ID': patient_df['Patient ID'],
    # Both renderings round the same way, so the decimal and the percentage
    # always agree (truncating with int() could yield e.g. "0.30 (29%)").
    'Predicted Survival Probability': [f"{p:.2f} ({p:.0%})" for p in probs],
    'Key Risk Factors': patient_df['Key Risk Factors']
})

csv_path = os.path.join(BASE_SAVE_DIR, 'patient_predictions.csv')
output_df.to_csv(csv_path, index=False)
print(f"Saved predictions CSV to: {csv_path}")

html_path = os.path.join(BASE_SAVE_DIR, 'predictions_table.html')
html_style = """
<style>
body { background:#0f1720; color:#e6eef6; font-family: Arial, Helvetica, sans-serif; padding:20px; }
table { border-collapse: collapse; width: 100%; max-width:900px; }
th, td { padding:12px 14px; border-bottom:1px solid #263238; text-align:left; }
th { background: #0b1220; color: #fff; font-weight:600; }
tr { background: #0f1720; }
td { color: #dbeaf3; }
</style>
"""
# escape=True keeps any markup characters in the cell text from being
# interpreted as HTML.
output_df_html = output_df.to_html(index=False, escape=True)
# Explicit encoding so the file does not depend on the platform default.
with open(html_path, 'w', encoding='utf-8') as f:
    f.write(html_style + output_df_html)
print(f"Saved HTML table to: {html_path}")

# Inline preview of the table (the stylesheet is only written to the file).
display(HTML(output_df_html))
