In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# -------------------------
# 1. Load the tables
# -------------------------

from pathlib import Path

cwd = Path.cwd()

# Walk from the working directory up through its ancestors and keep the first
# directory that contains data/raw/train; if none does, use the working
# directory itself (the existence checks below then report what is missing).
project_root = next(
    (candidate for candidate in (cwd, *cwd.parents)
     if (candidate / 'data' / 'raw' / 'train').is_dir()),
    cwd,
)

data_raw_train = project_root / 'data' / 'raw' / 'train'
target_path = data_raw_train / 'target_train.csv'
clinical_path = data_raw_train / 'clinical_train.csv'
molecular_path = data_raw_train / 'molecular_train.csv'

print(f"Project root: {project_root}")
print(f"Looking for data at: {data_raw_train}")

# Fail early, naming the offending path, before pandas is asked to read anything.
if not target_path.exists():
    raise FileNotFoundError(f'Could not find target file at {target_path}')
if not clinical_path.exists():
    raise FileNotFoundError(f'Could not find clinical file at {clinical_path}')
if not molecular_path.exists():
    raise FileNotFoundError(f'Could not find molecular file at {molecular_path}')

# Column name -> dtype for each table, in file column order. The keys are also
# passed as `names`, which (with header=0) replaces the header row of the file.
target_dtypes = {'ID': str, 'OS_YEARS': float, 'OS_STATUS': float}
clinical_dtypes = {
    'ID': str, 'CENTER': str, 'BM_BLAST': float, 'WBC': float,
    'ANC': float, 'MONOCYTES': float, 'HB': float, 'PLT': float,
    'CYTOGENETICS': str,
}
molecular_dtypes = {
    'ID': str, 'CHR': str, 'START': float, 'END': float,
    'REF': str, 'ALT': str, 'GENE': str, 'PROTEIN_CHANGE': str,
    'EFFECT': str, 'VAF': float, 'DEPTH': float,
}

target = pd.read_csv(
    target_path, sep=",", header=0,
    names=list(target_dtypes), dtype=target_dtypes,
)

clinical = pd.read_csv(
    clinical_path, sep=",", header=0,
    names=list(clinical_dtypes), dtype=clinical_dtypes,
)

molecular = pd.read_csv(
    molecular_path, sep=",", header=0,
    names=list(molecular_dtypes), dtype=molecular_dtypes,
)


Project root: c:\Users\alexb\Documents\EI3\APST1\Data Challenge\project_root
Looking for data at: c:\Users\alexb\Documents\EI3\APST1\Data Challenge\project_root\data\raw\train


In [6]:
# Sanity check after loading: report the (rows, columns) shape of each table,
# one per line.
print(
    f"Target shape: {target.shape}",
    f"Clinical shape: {clinical.shape}",
    f"Molecular shape: {molecular.shape}",
    sep="\n",
)

Target shape: (3323, 3)
Clinical shape: (3323, 9)
Molecular shape: (10935, 11)


In [14]:
# -------------------------
# 2. Convert molecular table into per-patient mutation features
# -------------------------

# Per-patient mutation counts: one column per gene holding the number of
# molecular rows recorded for that (patient, gene) pair, 0 when there is none.
# These are counts, not binary indicators. groupby drops rows whose GENE is
# missing, so such rows contribute to no gene column.
gene_counts = molecular.groupby(['ID', 'GENE']).size().unstack(fill_value=0)

# Per-patient VAF summary statistics, computed over all of the patient's rows
# (including rows without a GENE).
vaf_stats = molecular.groupby("ID")["VAF"].agg(['mean','max','min']).add_prefix("VAF_")

# Combine molecular features. The join is an outer join so that a patient whose
# rows all lack a GENE (and who is therefore absent from gene_counts) still
# keeps their VAF statistics; their gene counts are filled with 0.
mol_features = gene_counts.join(vaf_stats, how="outer").fillna(0)

# -------------------------
# 3. Merge everything into a single training table
# -------------------------

# Left merge keeps every clinical row. Patients without molecular rows get 0
# for all molecular features; fillna(0) also replaces missing clinical values
# (numeric and categorical alike) with 0.
X = clinical.merge(mol_features, how="left", on="ID").fillna(0)

# Target is OS_YEARS, looked up in clinical row order. reindex (rather than
# .loc) gives NaN for an ID that has no row in the target table instead of
# raising KeyError; those patients are removed by the filter below.
y = target.set_index("ID")["OS_YEARS"].reindex(X["ID"])

# Move ID out of the feature columns and into the index
X = X.set_index("ID")

# Force categorical fields to string (a missing value filled above becomes "0")
categorical_cols = ["CENTER", "CYTOGENETICS"]
X = X.astype({col: str for col in categorical_cols})

# Remove patients with missing survival time
valid_idx = y.notna()
X = X[valid_idx]
y = y[valid_idx]



In [16]:
# We check the integrity of the merged data: features and target must describe
# the same patients in the same order, and no missing values may remain after
# the fill/filter steps that built X and y.
assert X.index.equals(y.index), "X and y are not aligned on patient ID"
assert not X.isna().any().any(), "X still contains missing values"
assert not y.isna().any(), "y still contains missing survival times"

print(X)

        CENTER  BM_BLAST     WBC   ANC  MONOCYTES    HB    PLT  \
ID                                                               
P132697    MSK      14.0    2.80  0.20       0.70   7.6  119.0   
P132698    MSK       1.0    7.40  2.40       0.10  11.6   42.0   
P116889    MSK      15.0    3.70  2.10       0.10  14.2   81.0   
P132699    MSK       1.0    3.90  1.90       0.10   8.9   77.0   
P132700    MSK       6.0  128.00  9.70       0.90  11.1  195.0   
...        ...       ...     ...   ...        ...   ...    ...   
P121826     VU       1.0    2.50  1.02       0.20  10.2   78.0   
P121827     VU       1.5    8.10  2.66       0.45  11.3   40.0   
P121830     VU       0.0    1.80  0.55       0.29   9.4   86.0   
P121853     VU       5.0    1.37  0.37       0.11  11.4  102.0   
P121834     VU       0.0    2.70  0.72       0.23   8.2  239.0   

                                              CYTOGENETICS  ABL1  ARID1A  ...  \
ID                                                          

In [None]:
# -------------------------
# 4. Build preprocessing + model pipeline
# -------------------------

# Categorical columns from clinical features; every other column of X is
# treated as numeric and passed through unchanged.
categorical_cols = ["CENTER", "CYTOGENETICS"]
numeric_cols = [c for c in X.columns if c not in categorical_cols]

preprocess = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category unseen during fit is encoded as
        # all zeros at predict time instead of raising.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

# Gradient-boosted regression trees. subsample < 1 makes each boosting stage
# fit on a random subset of the rows, so random_state is fixed to keep the
# fitted model (and every metric derived from it) reproducible across runs.
model = GradientBoostingRegressor(
    n_estimators=400,
    learning_rate=0.03,
    max_depth=3,
    subsample=0.8,
    random_state=42
)

pipeline = Pipeline([
    ("preprocess", preprocess),
    ("model", model)
])


# -------------------------
# 5. Train/test split and training
# -------------------------

# Hold out 20% of the patients; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

# Predicted OS_YEARS for the held-out patients
pred = pipeline.predict(X_test)

# Coefficient of determination on the held-out patients
print("R² on test:", r2_score(y_test, pred))

R² on test: 0.17303938312621658


In [None]:
from lifelines.utils import concordance_index
from sksurv.metrics import concordance_index_ipcw
from sksurv.util import Surv

# -------------------------
# 6. Evaluating model performance
# -------------------------

# OS_STATUS per patient, looked up by ID so it lines up with each split.
# NOTE(review): astype(bool) turns a missing OS_STATUS into True; confirm that
# no patient kept in X has a missing status.
status_by_id = target.set_index("ID")["OS_STATUS"]

time_test = y_test.values
status_test = status_by_id.loc[X_test.index].astype(bool).values

# lifelines treats predicted_scores like predicted survival times (a larger
# score should go with a longer observed time), so the predicted OS_YEARS are
# passed as they are.
c_index = concordance_index(
    event_times=time_test,
    predicted_scores=pred,
    event_observed=status_test
)

print("C-index:", c_index)

# Need train data for censoring distribution
time_train = y_train.values
status_train = status_by_id.loc[X_train.index].astype(bool).values

# scikit-survival requires its own structured array format:
y_train_struct = Surv.from_arrays(status_train, time_train)
y_test_struct  = Surv.from_arrays(status_test, time_test)

# scikit-survival expects a RISK estimate (a larger value means an earlier
# event), the opposite orientation to lifelines. pred is a predicted survival
# time, so it is negated; passing it unchanged reports an inverted C-index.
c_index_ipcw = concordance_index_ipcw(
    y_train_struct,
    y_test_struct,
    -pred
)[0]   # first entry is the IPCW-c-index

print("IPCW C-index:", c_index_ipcw)

In [None]:
# -------------------------
# 7. Predict survivability score for all patients
# -------------------------

# One prediction per row of X, in row order.
survivability_score = pipeline.predict(X)

# Pair every patient ID (the index of X) with its score.
score_columns = {
    "ID": X.index,
    "SURVIVABILITY_SCORE": survivability_score,
}
output = pd.DataFrame(score_columns)

# Show the first rows as a quick visual check.
preview = output.head()
print(preview)

# Write the table to the working directory, without the positional row index.
output.to_csv("survivability_predictions.csv", index=False)