In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.util import Surv

# -------------------------
# 1. Load the tables
# -------------------------

from pathlib import Path

cwd = Path.cwd()
project_root = None

# Fallback: search for the data/raw/train directory structure
if project_root is None:
    for p in [cwd] + list(cwd.parents):
        if (p / 'data' / 'raw' / 'train').is_dir():
            project_root = p
            break

if project_root is None:
    project_root = cwd

data_raw_train = project_root / 'data' / 'raw' / 'train'
clinical_path = data_raw_train / 'clinical_train.csv'
molecular_path = data_raw_train / 'molecular_train.csv'
target_path = data_raw_train / 'target_train.csv'
code
python
# -------------------------
# 6.5. Hyperparameter optimization (Optuna) with K-Fold IPCW objective
# - Uses K-Fold cross-validation on the training split and maximizes IPCW C-index
# - After tuning, computes feature importances, selects top features, and retrains
# -------------------------
import optuna
from sklearn.model_selection import KFold, train_test_split
from sksurv.metrics import concordance_index_ipcw
import numpy as np

# K-fold for inner CV during optimization
N_SPLITS = 5

def objective(trial):
    # Hyperparameter search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.5),
        'max_depth': trial.suggest_int('max_depth', 1, 6),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    }

    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
    scores = []

    # Iterate folds on the training split (X_train/y_train defined earlier)
    for train_idx, val_idx in kf.split(range(len(X_train))):
        X_tr = X_train.iloc[train_idx]
        X_val = X_train.iloc[val_idx]
        y_tr = y_train[train_idx]
        y_val = y_train[val_idx]

        # Build model for this trial
        gb = GradientBoostingSurvivalAnalysis(
            n_estimators=params['n_estimators'],
            learning_rate=params['learning_rate'],
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split'],
            min_samples_leaf=params['min_samples_leaf'],
            subsample=params['subsample'],
            max_features=params['max_features'],
            random_state=42
        )

        pipe = Pipeline([("pre", preprocess), ("model", gb)])
        # Fit on fold train
        pipe.fit(X_tr, y_tr)
        # Predict risk on validation fold
        pred_val = pipe.predict(X_val)
        # IPCW measured on this fold (train used to estimate censoring weights)
        try:
            ipcw_fold = concordance_index_ipcw(y_tr, y_val, pred_val)[0]
        except Exception:
            ipcw_fold = 0.0
        scores.append(ipcw_fold)

    # Return mean IPCW across folds
    return float(np.mean(scores))

# Create and run study (use a moderate number of trials to start)
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(), pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=40, n_jobs=1)

print("Best IPCW (CV):", study.best_value)
print("Best params:", study.best_params)

# Save best params into best_params variable for downstream cells
best_params = study.best_params

# -------------------------
# Train final pipeline on full X_train using best params
# -------------------------
gb_final = GradientBoostingSurvivalAnalysis(
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    subsample=best_params['subsample'],
    max_features=best_params['max_features'],
    random_state=42
)
pipe_full = Pipeline([('pre', preprocess), ('model', gb_final)])
pipe_full.fit(X_train, y_train)

# Extract fitted preprocessor and model
pre = pipe_full.named_steps['pre']
model_fitted = pipe_full.named_steps['model']

# Get transformed feature names (best-effort) and importances
def get_feature_names(preprocessor, X_ref):
    # Try sklearn's get_feature_names_out, fall back to building names
    try:
        return preprocessor.get_feature_names_out()
    except Exception:
        names = []
        # categorical columns used earlier
        categorical_cols = [c for c in ['CENTER','CYTOGENETICS'] if c in X_ref.columns]
        numeric_cols = [c for c in X_ref.columns if c not in categorical_cols]
        if 'cat' in preprocessor.named_transformers_:
            ohe = preprocessor.named_transformers_['cat']
            try:
                cat_names = ohe.get_feature_names_out(categorical_cols)
            except Exception:
                cat_names = []
                for i, col in enumerate(categorical_cols):
                    cats = ohe.categories_[i]
                    cat_names.extend([f"{col}_%s" % str(x) for x in cats])
            names.extend(list(cat_names))
        names.extend(numeric_cols)
        return names

feature_names = get_feature_names(pre, X_train)
importances = model_fitted.feature_importances_

# Create a sorted DataFrame of importances
import pandas as pd
feat_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
feat_df = feat_df.sort_values('importance', ascending=False).reset_index(drop=True)
print(feat_df.head(30))
feat_df.to_csv('feature_importances.csv', index=False)

# Select top features (by preprocessed feature importance)
TOP_K = min(50, len(feature_names))
top_idx = np.argsort(importances)[::-1][:TOP_K]
selected_feature_names = [feature_names[i] for i in top_idx]
pd.Series(selected_feature_names).to_csv('selected_features_preprocessed.csv', index=False, header=['feature'])
print("Selected top features (preprocessed):")
print(selected_feature_names[:30])

# Transform X_train/X_test with preprocessor and select the top preprocessed columns
X_train_arr = pre.transform(X_train)
X_test_arr = pre.transform(X_test)
# If sparse, convert to dense
if hasattr(X_train_arr, 'toarray'):
    X_train_arr = X_train_arr.toarray()
    X_test_arr = X_test_arr.toarray()

X_train_sel = X_train_arr[:, top_idx]
X_test_sel = X_test_arr[:, top_idx]

# Retrain a survival model on selected transformed features (no preprocess step needed now)
gb_sel = GradientBoostingSurvivalAnalysis(
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    subsample=best_params['subsample'],
    max_features=best_params['max_features'],
    random_state=42
)
gb_sel.fit(X_train_sel, y_train)
pred_sel = gb_sel.predict(X_test_sel)

# Evaluate IPCW for the selected-features model
c_index_ipcw_selected = concordance_index_ipcw(y_train, y_test, pred_sel)[0]
print("IPCW (selected features) on test: ", c_index_ipcw_selected)

# Save selected feature names (preprocessed) for reference
with open('selected_features_preprocessed.txt', 'w') as f:
    for s in selected_feature_names:
        f.write(s + '
































           'EFFECT': str, 'VAF': float, 'DEPTH': float})           'REF': str, 'ALT': str, 'GENE': str, 'PROTEIN_CHANGE': str,    dtype={'ID': str, 'CHR': str, 'START': float, 'END': float,           'EFFECT','VAF','DEPTH'],    names=['ID','CHR','START','END','REF','ALT','GENE','PROTEIN_CHANGE',molecular = pd.read_csv(molecular_path, sep=",", header=0,           'ANC': float, 'MONOCYTES': float, 'HB': float, 'PLT': float, 'CYTOGENETICS': str})    dtype={'ID': str, 'CENTER': str, 'BM_BLAST': float, 'WBC': float,    names=['ID','CENTER','BM_BLAST','WBC','ANC','MONOCYTES','HB','PLT','CYTOGENETICS'],clinical = pd.read_csv(clinical_path, sep=",", header=0,                     dtype={'ID': str, 'OS_YEARS': float, 'OS_STATUS': float})                     names=['ID','OS_YEARS','OS_STATUS'],target = pd.read_csv(target_path, sep=",", header=0,    raise FileNotFoundError(f'Could not find molecular file at {molecular_path}')if not molecular_path.exists():    raise FileNotFoundError(f'Could not find clinical file at {clinical_path}')if not clinical_path.exists():    raise FileNotFoundError(f'Could not find target file at {target_path}')if not target_path.exists():print(f"Looking for data at: {data_raw_train}")print(f"Project root: {project_root}")# best_params variable is already set above# Also keep best_params persisted (useful for later cells)')

Project root: c:\Users\alexb\Documents\EI3\APST1\Data Challenge\project_root
Looking for data at: c:\Users\alexb\Documents\EI3\APST1\Data Challenge\project_root\data\raw\train


In [18]:
# We check that the data is loaded correctly
print(f"Target shape: {target.shape}")
print(f"Clinical shape: {clinical.shape}")
print(f"Molecular shape: {molecular.shape}")

Target shape: (3323, 3)
Clinical shape: (3323, 9)
Molecular shape: (10935, 11)


In [23]:
# -------------------------
# 2. Convert molecular table into per-patient mutation features
# -------------------------

# FLAG: Mutation feature engineering mode
#   0 = mutation count only
#   1 = lethal mutation flags (statistically significant)
#   2 = all mutations as binary indicators
mutation_feature_mode = 0

# List of genes to exclude from features (manual exclusions)
EXCLUDE_GENES = ["WT1", "ZBTB33"]

# Binary indicator for each gene mutated
gene_counts = molecular.groupby(['ID', 'GENE']).size().unstack(fill_value=0)

# Drop explicitly excluded genes if present
for g in EXCLUDE_GENES:
    if g in gene_counts.columns:
        gene_counts = gene_counts.drop(columns=g)

if mutation_feature_mode == 0:
    # Simple mutation count per patient: keep only TOTAL_MUTATED_GENES
    gene_counts["TOTAL_MUTATED_GENES"] = (gene_counts > 0).sum(axis=1)
    # Reduce to single column so no individual gene flags remain
    gene_counts = gene_counts[["TOTAL_MUTATED_GENES"]]
    print(f"Feature engineering with mutation count only:")
    print(f"  - Only TOTAL_MUTATED_GENES retained")

elif mutation_feature_mode == 1:
    # Define lethal mutations: top 10 genes by mortality rate from statistically significant mutations
    lethal_mutations = [g for g in mutation_df_filtered.head(10).index.tolist() if g not in EXCLUDE_GENES]
    print(f"Lethal mutations selected (statistically significant, p < 0.05) excluding manual list:")
    for gene in lethal_mutations:
        row = mutation_df_filtered.loc[gene]
        print(f"  - {gene}: {row['mortality_rate']:.1f}% ({int(row['deaths'])}/{int(row['patient_count'])} patients, p={row['p_value']:.4f})")
    
    # Ensure columns for lethal mutations exist and are binary
    for gene in lethal_mutations:
        if gene in gene_counts.columns:
            gene_counts[gene] = (gene_counts[gene] > 0).astype(int)
        else:
            gene_counts[gene] = 0

    # Aggregate all other (non-lethal) genes into OTHER_MUTATIONS_COUNT
    existing_cols = [c for c in gene_counts.columns if c not in lethal_mutations]
    if existing_cols:
        gene_counts["OTHER_MUTATIONS_COUNT"] = gene_counts[existing_cols].sum(axis=1).astype(int)
        # Drop the non-lethal individual gene columns
        gene_counts = gene_counts.drop(columns=existing_cols)

    # Now explicitly keep only lethal flags + OTHER_MUTATIONS_COUNT
    final_cols = lethal_mutations.copy()
    if 'OTHER_MUTATIONS_COUNT' in gene_counts.columns:
        final_cols.append('OTHER_MUTATIONS_COUNT')

    # Add TOTAL_MUTATED_GENES as sum of lethal flags + other count
    gene_counts["TOTAL_MUTATED_GENES"] = gene_counts[final_cols].sum(axis=1).astype(int)
    final_cols.append('TOTAL_MUTATED_GENES')

    # Reduce gene_counts to final columns only (enforces no stray gene flags)
    gene_counts = gene_counts[final_cols]

    print(f"\nFeature engineering with lethal mutation flags (strict):")
    print(f"  - Kept lethal flags: {len(lethal_mutations)}")
    print(f"  - Kept OTHER_MUTATIONS_COUNT: {'OTHER_MUTATIONS_COUNT' in gene_counts.columns}")
    print(f"  - Kept TOTAL_MUTATED_GENES: {'TOTAL_MUTATED_GENES' in gene_counts.columns}")

elif mutation_feature_mode == 2:
    # Keep all genes as binary indicators
    for col in gene_counts.columns:
        gene_counts[col] = (gene_counts[col] > 0).astype(int)
    
    # Adding a column with a total number of mutated genes
    gene_counts["TOTAL_MUTATED_GENES"] = (gene_counts > 0).sum(axis=1)
    
    print(f"Feature engineering without lethal mutation flags:")
    print(f"  - All {len(gene_counts.columns) - 1} genes included as binary indicators")

# VAF summary statistics:
vaf_stats = molecular.groupby("ID")["VAF"].agg(['mean','max','min']).add_prefix("VAF_")

# Combine molecular features
mol_features = gene_counts.join(vaf_stats, how="left").fillna(0)

# -------------------------
# 3. Merge everything into a single training table
# -------------------------

X = clinical.merge(mol_features, how="left", on="ID").fillna(0)

# Remove ID column
X = X.set_index("ID")

# Drop complex/unwanted clinical columns
if 'CYTOGENETICS' in X.columns:
    X = X.drop(columns=['CYTOGENETICS'])

# Force categorical fields to string (only keep CENTER if present)
if 'CENTER' in X.columns:
    X["CENTER"] = X["CENTER"].astype(str)

# Prepare survival data: need both OS_YEARS (time) and OS_STATUS (event indicator)
survival_data = target.set_index("ID").loc[X.index][["OS_YEARS", "OS_STATUS"]].copy()
survival_data["OS_STATUS"] = survival_data["OS_STATUS"].astype(bool)

# Remove patients with missing survival time
valid_idx = ~survival_data["OS_YEARS"].isna()
X = X[valid_idx]
survival_data = survival_data[valid_idx]

# Create structured array for survival analysis
y = Surv.from_arrays(
    event=survival_data["OS_STATUS"].values,
    time=survival_data["OS_YEARS"].values
)


Feature engineering with mutation count only:
  - Only TOTAL_MUTATED_GENES retained


In [24]:
# We check the integrity of the merged data
print(X)

        CENTER  BM_BLAST     WBC   ANC  MONOCYTES    HB    PLT  \
ID                                                               
P132697    MSK      14.0    2.80  0.20       0.70   7.6  119.0   
P132698    MSK       1.0    7.40  2.40       0.10  11.6   42.0   
P116889    MSK      15.0    3.70  2.10       0.10  14.2   81.0   
P132699    MSK       1.0    3.90  1.90       0.10   8.9   77.0   
P132700    MSK       6.0  128.00  9.70       0.90  11.1  195.0   
...        ...       ...     ...   ...        ...   ...    ...   
P121826     VU       1.0    2.50  1.02       0.20  10.2   78.0   
P121827     VU       1.5    8.10  2.66       0.45  11.3   40.0   
P121830     VU       0.0    1.80  0.55       0.29   9.4   86.0   
P121853     VU       5.0    1.37  0.37       0.11  11.4  102.0   
P121834     VU       0.0    2.70  0.72       0.23   8.2  239.0   

         TOTAL_MUTATED_GENES  VAF_mean  VAF_max  VAF_min  
ID                                                        
P132697                

In [25]:
# -------------------------
# 4. Build preprocessing + model pipeline
# -------------------------

# Categorical columns from clinical features (compute dynamically in case CYTOGENETICS was dropped)
categorical_cols = [c for c in ["CENTER", "CYTOGENETICS"] if c in X.columns]
numeric_cols = [c for c in X.columns if c not in categorical_cols]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

# GradientBoostingSurvivalAnalysis - survival-aware model that properly handles censoring
gb = GradientBoostingSurvivalAnalysis(
    n_estimators=600,
    learning_rate=0.03,
    max_depth=3,
    subsample=0.8,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42
)

pipeline = Pipeline([
    ("pre", preprocess),
    ("model", gb)
])


# -------------------------
# 5. Train/test split and training
# -------------------------

# Split data (must preserve survival structure)
idx_train, idx_test = train_test_split(range(len(X)), test_size=0.2, random_state=42)

X_train = X.iloc[idx_train]
X_test = X.iloc[idx_test]
y_train = y[idx_train]
y_test = y[idx_test]

# Fit the survival pipeline
pipeline.fit(X_train, y_train)

# Predict risk scores (higher = worse survival)
pred = pipeline.predict(X_test)

In [None]:
from lifelines.utils import concordance_index

# lifelines expects higher scores => longer survival (predicted time),
# so invert risk scores for concordance.
c_index = concordance_index(
    event_times=y_test['time'],
    predicted_scores=-pred,               # <- invert
    event_observed=y_test['event'].astype(bool)
)
print("C-index:", c_index)

from sksurv.metrics import concordance_index_ipcw
from sksurv.util import Surv

# -------------------------
# 6. Evaluating model performance
# -------------------------

# IPCW C-index accounts for censoring in the evaluation set
c_index_ipcw = concordance_index_ipcw(
    y_train,
    y_test,
    pred
)[0]   # first entry is the IPCW-c-index

print("IPCW C-index:", c_index_ipcw)

C-index: 0.271486438886932
IPCW C-index: 0.6961526084937755


In [29]:
# -------------------------
# 7. Train Final Model with Best Hyperparameters
# -------------------------

# Create final model with best parameters (survival-aware)
gb_final = GradientBoostingSurvivalAnalysis(
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    subsample=best_params['subsample'],
    max_features=best_params['max_features'],
    random_state=42
)

# Create final pipeline
model = Pipeline([
    ("pre", preprocess),
    ("model", gb_final)
])

# Train final model on training data
model.fit(X_train, y_train)

# Make predictions on test data
pred = model.predict(X_test)

# Evaluate final model
c_index_final = concordance_index(
    event_times=y_test['time'],
    predicted_scores=pred,
    event_observed=y_test['event'].astype(bool)
)

c_index_ipcw_final = concordance_index_ipcw(
    y_train,
    y_test,
    pred
)[0]

print("=" * 60)
print("Final Model Performance (with optimized hyperparameters):")
print("=" * 60)
print(f"C-index: {c_index_final:.4f}")
print(f"IPCW C-index: {c_index_ipcw_final:.4f}")
print("=" * 60)

Final Model Performance (with optimized hyperparameters):
C-index: 0.2841
IPCW C-index: 0.6853


In [None]:
# -------------------------
# 8. Predict survivability score for all patients (using optimized model)
# -------------------------

survivability_score = model.predict(X)

# Attach score to IDs
output = pd.DataFrame({
    "ID": X.index,
    "SURVIVABILITY_SCORE": survivability_score
})

print(output.head())

# Save to CSV
output.to_csv("survivability_predictions.csv", index=False)