# HSCT survival: XGBoost ensemble

## Notebook set-up

In [None]:
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import configuration as config
import functions.helper as helper_funcs

# Prefix used to tag the model and results files written by this notebook
notebook_num='03.3'
# GPU device index passed to XGBoost; the training cells only add the GPU
# options to the hyperparameters when this is not None
gpu=0

# Data files
# Pickled outputs of the 02.x notebooks, all read in the "Load data" section
datasets_file=f'{config.PROCESSED_DATA}/02.1-dataset_definitions.pkl'
coxph_survival_file=f'{config.PROCESSED_DATA}/02.1-coxPH_survival.pkl'
weibullaft_survival_file=f'{config.PROCESSED_DATA}/02.2-weibullAFT_survival.pkl'
learned_efs_file=f'{config.PROCESSED_DATA}/02.4-learned_efs.pkl'

# Model files
# NOTE(review): not read or written by any cell visible in this notebook - confirm it is still needed
tuned_model_file=f'{config.MODELS_PATH}/{notebook_num}-XGBoost_engineered_features_tuned.pkl'

# Experiment results
# NOTE(review): none of these three paths is used by the cells visible here - confirm they are still needed
hyperparameter_tuning_results=f'{config.DATA_PATH}/results/data/{notebook_num}-hyperparameter_tuning_results.csv'
training_scores_file=f'{config.DATA_PATH}/results/data/{notebook_num}-training_scores.csv'
testing_scores_file=f'{config.DATA_PATH}/results/data/{notebook_num}-testing_scores.csv'

# NOTE(review): presumably toggles hyperparameter re-tuning, but no visible cell reads it - confirm
retune_model=True

## 1. Load data

In [None]:
def load_pickle(file_path:str):
    '''Reads and returns the object stored in the pickle file at file_path.

    Only use this on files produced by this project: unpickling can execute
    arbitrary code, so it is not safe for untrusted input.'''

    with open(file_path, 'rb') as input_file:
        return pickle.load(input_file)


def describe_contents(heading:str, contents:dict) -> None:
    '''Prints heading followed by one line per item in contents, showing
    the key and the type of the value stored under it.'''

    print(heading)
    for key, value in contents.items():
        print(f' {key}: {type(value)}')


# Read the dataset metadata: maps a dataset description to its pickle file path
datasets=load_pickle(datasets_file)

# Load one of the datasets
data_dict=load_pickle(datasets['Nominal one-hot/ordinal encoded, NANs imputed'])
describe_contents('Data dictionary contains:\n', data_dict)

# Load Cox Proportional Hazard model features
coxph_features=load_pickle(coxph_survival_file)
describe_contents('\nCox PH features:\n', coxph_features)

# Load Weibull Accelerated Failure Time model features
weibullaft_features=load_pickle(weibullaft_survival_file)
describe_contents('\nWeibull AFT features:\n', weibullaft_features)

# Load learned efs features
learned_efs_features=load_pickle(learned_efs_file)
describe_contents('\nLearned EFS features:\n', learned_efs_features)

## 2. Data preparation

### 2.1. Add survival model features

In [None]:
# Work on copies so the frames stored in data_dict stay as loaded; adding
# columns to them directly would silently change the source data for any
# other cell that reads data_dict
training_features_df=data_dict['Training features'].copy()
training_features_df['CoxPH survival']=coxph_features['Training survival']
training_features_df['CoxPH partial hazard']=coxph_features['Training partial hazard']
training_features_df['WeibullAFT survival']=weibullaft_features['Training survival']
training_features_df['WeibullAFT expectation']=weibullaft_features['Training expectation']

# Same four survival-model columns for the testing split
testing_features_df=data_dict['Testing features'].copy()
testing_features_df['CoxPH survival']=coxph_features['Testing survival']
testing_features_df['CoxPH partial hazard']=coxph_features['Testing partial hazard']
testing_features_df['WeibullAFT survival']=weibullaft_features['Testing survival']
testing_features_df['WeibullAFT expectation']=weibullaft_features['Testing expectation']

# Transposed preview: one row per feature, one column per example
training_features_df.head().transpose()

### 2.2. Add learned EFS features

In [None]:
# training_features_df['learned_efs']=learned_efs_features['Training efs probability']
# testing_features_df['learned_efs']=learned_efs_features['Testing efs probability']

# training_features_df.head().transpose()

### 2.3. Load labels, race group and ID

In [None]:
# Copy the label frames before transforming them: taking the log directly on
# the frames held in data_dict would log the already-logged times again every
# time this cell is re-run
training_labels_df=data_dict['Training labels'].copy()
training_labels_df['efs_time']=np.log(training_labels_df['efs_time'])
training_race_groups=data_dict['Training race group']
training_ids=data_dict['Training IDs']

# Same treatment for the testing split; from here on efs_time is on a natural-log scale
testing_labels_df=data_dict['Testing labels'].copy()
testing_labels_df['efs_time']=np.log(testing_labels_df['efs_time'])
testing_race_groups=data_dict['Testing race group']
testing_ids=data_dict['Testing IDs']

## 3. XGBoost regression ensemble model

### 3.1. Training

In [None]:
def train_ensemble(
        training_features_df:pd.DataFrame,
        training_labels_df:pd.DataFrame,
        label:str,
        n:int,
        hyperparameters:dict,
        random_state=None,
        verbose_eval=1
) -> list:
    '''Trains a chain of n XGBoost models where each model receives the
    predictions of every earlier model as additional features.

    In each round a fresh random 75/25 training/validation split is drawn,
    a booster is trained on the training part with early stopping against
    the validation part, and its predictions for all rows are appended to
    the working feature set as column predictions_<round>.

    Args:
        training_features_df: feature columns; not modified (a copy is used).
        training_labels_df: label columns; must contain both 'efs' and
            'efs_time', since both are dropped from the combined frame.
        label: name of the label column to fit ('efs' or 'efs_time').
        n: number of models to train.
        hyperparameters: parameter dictionary passed to xgb.train.
        random_state: None (default) draws unseeded splits as before; an
            int or numpy RandomState makes the sequence of splits repeatable.
        verbose_eval: passed through to xgb.train; the default of 1 logs
            the validation metric every boosting round.

    Returns:
        List of trained boosters in training order. At prediction time they
        must be applied in the same order, appending each model's output as
        predictions_<round> before calling the next model.
    '''

    working_training_features_df=training_features_df.copy()
    working_training_labels_df=training_labels_df.copy()

    # An integer seed given straight to ShuffleSplit would reproduce the very
    # same split in every round; a single RandomState instance instead advances
    # from round to round, giving different but repeatable splits
    if isinstance(random_state, (int, np.integer)):
        random_state=np.random.RandomState(random_state)

    splitter=ShuffleSplit(n_splits=1, test_size=.25, random_state=random_state)
    models=[]

    for i in range(n):
        print(f'Fold {i}')

        # Rebuilt every round because the feature frame gains a column per round
        training_df=pd.concat([working_training_features_df, working_training_labels_df], axis=1)

        # n_splits=1, so the splitter yields exactly one split per call
        training_idx, validation_idx=next(splitter.split(training_df))

        # Get the features for this round
        training_features=training_df.iloc[training_idx].drop(['efs', 'efs_time'], axis=1)
        validation_features=training_df.iloc[validation_idx].drop(['efs', 'efs_time'], axis=1)

        # Get the labels
        training_labels=training_df.iloc[training_idx][label]
        validation_labels=training_df.iloc[validation_idx][label]

        dtraining=xgb.DMatrix(
            training_features,
            label=training_labels
        )

        dvalidation=xgb.DMatrix(
            validation_features,
            label=validation_labels
        )

        # NOTE(review): with early stopping, confirm which boosting round
        # predict() uses for the installed XGBoost version (last vs. best)
        tree_model=xgb.train(
            hyperparameters,
            dtraining,
            num_boost_round=10000,
            evals=[(dvalidation, 'validation')],
            early_stopping_rounds=500,
            verbose_eval=verbose_eval
        )

        models.append(tree_model)

        # Predict for every row (including the rows this model was fitted on)
        # and feed the result to the following rounds as a new feature
        dall_rows=xgb.DMatrix(
            working_training_features_df
        )

        predictions=tree_model.predict(dall_rows)
        working_training_features_df[f'predictions_{i}']=predictions

    return models

In [None]:
%%time

hyperparameters={
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.01,
    'max_depth': 6,
    'gamma': 0.01,
    'subsample': 0.5
}

if gpu != None:
    hyperparameters['gpu_id']=gpu
    hyperparameters['tree_method']='gpu_hist'

models=train_ensemble(training_features_df, training_labels_df, 'efs_time', 10, hyperparameters)
print()

### 3.2. Predictions

In [None]:
# Fresh copy so the stacked prediction columns never reach testing_features_df
testing_features=testing_features_df.copy()

# Replay the chain in training order: each model predicts from the features
# plus the outputs of all earlier models, then contributes its own column
for i, model in enumerate(models):

    dtesting=xgb.DMatrix(testing_features)
    predictions=model.predict(dtesting)
    testing_features[f'predictions_{i}']=predictions

# After the loop, predictions holds the output of the last model in the chain

# Calculate fit residuals (both terms are log-scale EFS times)
testing_residuals=predictions - testing_labels_df['efs_time']

fig, axs=plt.subplots(1,2, figsize=(9,4))
axs=axs.flatten()

axs[0].set_title('Testing predictions')
axs[0].scatter(testing_labels_df['efs_time'], predictions, color='black', s=0.2)
axs[0].set_xlabel('True EFS time')
axs[0].set_ylabel('Predicted EFS time')

axs[1].set_title('Testing residuals')
axs[1].scatter(predictions, testing_residuals, color='black', s=0.2)
axs[1].set_xlabel('Predicted EFS time')
axs[1].set_ylabel('EFS time residual')

fig.tight_layout()

# plt.show() rather than fig.show(): Figure.show() needs a GUI backend and
# only emits a warning under the non-GUI inline notebook backend
plt.show()

### 3.3. Scoring

In [None]:
# Arguments shared by both scoring calls: the (log-scale) true EFS times and
# the EFS event labels of the testing split
true_efs_time=testing_labels_df['efs_time'].values
true_efs=testing_labels_df['efs'].values

# Score the final model's testing predictions
score_results=helper_funcs.score_predictions(
    'Testing predictions', predictions, true_efs_time, true_efs, testing_race_groups, testing_ids
)

# Score the true times as if they were predictions, passing the results so far
# back in through results= (presumably a best-case reference row - confirm
# against helper_funcs.score_predictions)
score_results=helper_funcs.score_predictions(
    'Labels', true_efs_time, true_efs_time, true_efs, testing_race_groups, testing_ids,
    results=score_results
)

score_results_df=pd.DataFrame(score_results)
score_results_df.head()

## 4. XGBoost classification ensemble

### 4.1. Training

In [None]:
%%time

# Calculated class weighting
class_weight=(len(training_labels_df) - sum(training_labels_df['efs'])) / sum(training_labels_df['efs'])

hyperparameters={
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'scale_pos_weight': class_weight,
    'learning_rate': 0.01,
    'max_depth': 6,
    'gamma': 0.01,
    'subsample': 0.5
}

if gpu != None:
    hyperparameters['gpu_id']=gpu
    hyperparameters['tree_method']='gpu_hist'

models=train_ensemble(training_features_df, training_labels_df, 'efs', 10, hyperparameters)
print()

In [None]:
# Fresh copy so the stacked prediction columns never reach testing_features_df
testing_features=testing_features_df.copy()

# Replay the chain in training order: each model predicts from the features
# plus the outputs of all earlier models, then contributes its own column.
# When the loop ends, predictions holds the output of the last model.
for i, model in enumerate(models):

    dtesting=xgb.DMatrix(testing_features)
    predictions=model.predict(dtesting)
    testing_features[f'predictions_{i}']=predictions

In [None]:
fig, axs=plt.subplots(1,2, figsize=(9,3.5))
axs=axs.flatten()

# Make calls with threshold: outputs below 0.5 become class 0, the rest class 1
calls=np.where(predictions < 0.5, 0, 1)

# Plot the confusion matrix, normalized over the true labels so each row sums to 1
cm=confusion_matrix(testing_labels_df['efs'], calls, normalize='true')
cm_disp=ConfusionMatrixDisplay(confusion_matrix=cm)
_=cm_disp.plot(ax=axs[0])

axs[0].set_title('Classification performance')
axs[0].set_xlabel('Predicted EFS')
axs[0].set_ylabel('True EFS')

# Distribution of the last model's raw outputs before thresholding
axs[1].set_title('Classification probability')
axs[1].hist(predictions, bins=30, color='black')
axs[1].set_xlabel('Probability')
axs[1].set_ylabel('Count')

fig.tight_layout()

# plt.show() rather than fig.show(): Figure.show() needs a GUI backend and
# only emits a warning under the non-GUI inline notebook backend
plt.show()