# HSCT survival: scikit-learn gradient boosting model tests across datasets

## 1. Notebook set-up

### 1.1. Imports & options

In [None]:
# PyPI imports
import os
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import ShuffleSplit, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier

# Internal imports
import configuration as config
import functions.helper as helper_funcs

# Identifier embedded in the names of the result and plot files written by this notebook
notebook_num='01.4'

# Allow pandas to display up to 500 rows before truncating output
pd.options.display.max_rows=500

# Cap the OpenMP thread count via the environment.
# NOTE(review): this is set after the numeric libraries above were imported;
# whether it still takes effect in this process depends on when the OpenMP
# runtime reads the variable - confirm, or move it above the imports.
os.environ.update({'OMP_NUM_THREADS': '2'})

### 1.2. Parameters

In [None]:
# Run options. When a *_test flag is False the corresponding section loads
# its previously pickled results instead of re-running the search.
regression_test=True
classification_test=True

# Number of parameter settings sampled by each randomized search
randomsearch_depth=5000

# Number of parallel jobs handed to the search
cpus=2

# Hyperparameter search space shared by the gradient boosting regression and
# classification searches: frozen scipy distributions for the continuous
# parameters, explicit candidate lists for the discrete ones
distributions=dict(
    learning_rate=stats.uniform(loc=0.0, scale=1.0),
    max_iter=[*range(10, 5000)],
    max_leaf_nodes=[*range(2, 1000)],
    max_depth=[*range(2, 1000)],
    min_samples_leaf=[*range(1, 1000)],
    l2_regularization=stats.uniform(loc=0.0, scale=1.0),
    max_features=stats.uniform(loc=0.1, scale=0.9),
    max_bins=[*range(2, 255)],
    interaction_cst=['pairwise', 'no_interactions']
)

### 1.3. Files

In [None]:
# Pickle holding the dataset definitions
datasets_file=f'{config.PROCESSED_DATA}/02.1-dataset_definitions.pkl'

# Load the dataset definitions. Later cells iterate over this mapping as
# (dataset name, data file path) pairs.
# NOTE(review): pickle.load can execute arbitrary code - only use with trusted files.
with open(datasets_file, 'rb') as input_file:
    datasets=pickle.load(input_file)

# Every output name starts with the notebook number inside its target directory
results_prefix=f'{config.RESULTS}/{notebook_num}'
plots_prefix=f'{config.PLOTS}/{notebook_num}'

# Regression outputs
regression_test_results_file=f'{results_prefix}-regression_test_results.pkl'
regression_training_performance_plots=f'{plots_prefix}-regression_training_performance.jpg'
regression_test_performance_plots=f'{plots_prefix}-regression_test_performance.jpg'
regression_test_residuals_plots=f'{plots_prefix}-regression_test_residuals.jpg'

# Classification outputs
classification_test_results_file=f'{results_prefix}-classification_test_results.pkl'
classification_test_performance_plots=f'{plots_prefix}-classification_test_performance.jpg'
classification_test_probability_plots=f'{plots_prefix}-classification_test_probability.jpg'

## 2. SciKit-learn regression model

### 2.1. Hyperparameter optimization

In [None]:
%%time

if regression_test == True:

    regression_predictions={
        'Training':{},
        'Testing':{}
    }

    # Define cross-validation strategy
    cross_validation=ShuffleSplit(n_splits=10, test_size=0.3, random_state=315)

    # Loop on the datasets
    for dataset, data_file in datasets.items():

        # Load the data
        with open(data_file, 'rb') as input_file:
            data_dict=pickle.load(input_file)

        training_labels_df=data_dict['Training labels']
        training_features_df=data_dict['Training features']

        # Take log of efs time
        training_labels_df['efs_time']=np.log(training_labels_df['efs_time'])

        # Instantiate the model
        tree_model=HistGradientBoostingRegressor(random_state=315)

        # Set-up the search
        search=RandomizedSearchCV(
            tree_model,
            distributions,
            scoring='neg_root_mean_squared_error',
            n_jobs=cpus,
            cv=cross_validation,
            n_iter=randomsearch_depth,
            random_state=315,
            return_train_score=True
        )

        search_results=search.fit(training_features_df, training_labels_df['efs'])
        
        print(f'\n{dataset}:')
        for parameter, value in search_results.best_params_.items():
            print(f' {parameter}: {value}')

        # Train classifier with best hyperparameters on complete training set
        tree_model=HistGradientBoostingRegressor(**search_results.best_params_, random_state=315)
        result=tree_model.fit(training_features_df, training_labels_df['efs'])

        # Make training predictions
        predictions=tree_model.predict(training_features_df)
        regression_predictions['Training'][dataset]=predictions

        # Make testing predictions
        testing_features_df=data_dict['Testing features']
        predictions=tree_model.predict(testing_features_df)
        regression_predictions['Testing'][dataset]=predictions

    with open(regression_test_results_file, 'wb') as output_file:
        pickle.dump(regression_predictions, output_file)

else:

    # Load last result
    with open(regression_test_results_file, 'rb') as input_file:
        regression_predictions=pickle.load(input_file)

print()

### 2.2. Results

#### 2.2.1. Scores

##### 2.2.1.1. Training data

In [None]:
# Empty score table: one list per reported column, passed to and returned by
# the scoring helper on every iteration
scoring_results={
    column: [] for column in ('Model', 'RMSE', 'C-index', 'Stratified C-index')
}

for dataset, training_predictions in regression_predictions['Training'].items():

    # Re-read this dataset's pickle to recover the true training labels
    with open(datasets[dataset], 'rb') as input_file:
        data_dict=pickle.load(input_file)

    training_labels=data_dict['Training labels']

    # Predictions are scored against log EFS time; how each metric is computed
    # is defined in functions.helper, which is not visible from this notebook
    scoring_results=helper_funcs.score_predictions(
        dataset,
        training_predictions,
        np.log(training_labels['efs_time'].values),
        training_labels['efs'].values,
        data_dict['Training race group'],
        data_dict['Training IDs'],
        results=scoring_results
    )

# Tabulate the scores and display every row
scoring_results_df=pd.DataFrame(scoring_results)
scoring_results_df.head(len(scoring_results_df))

##### 2.2.1.2. Testing data

In [None]:
# Empty score table: one list per reported column, passed to and returned by
# the scoring helper on every iteration
scoring_results={
    column: [] for column in ('Model', 'RMSE', 'C-index', 'Stratified C-index')
}

for dataset, testing_predictions in regression_predictions['Testing'].items():

    # Re-read this dataset's pickle to recover the true hold-out labels
    with open(datasets[dataset], 'rb') as input_file:
        data_dict=pickle.load(input_file)

    testing_labels=data_dict['Testing labels']

    # Predictions are scored against log EFS time; how each metric is computed
    # is defined in functions.helper, which is not visible from this notebook
    scoring_results=helper_funcs.score_predictions(
        dataset,
        testing_predictions,
        np.log(testing_labels['efs_time'].values),
        testing_labels['efs'].values,
        data_dict['Testing race group'],
        data_dict['Testing IDs'],
        results=scoring_results
    )

# Tabulate the scores and display every row
scoring_results_df=pd.DataFrame(scoring_results)
scoring_results_df.head(len(scoring_results_df))

#### 2.2.2. Prediction plots

##### 2.2.2.1. Training data

In [None]:
# 2 x 3 grid of panels with shared axes, one panel per dataset
fig, axs=plt.subplots(nrows=2, ncols=3, figsize=(8,5.5), sharex=True, sharey=True)
axs=axs.ravel()

fig.suptitle('SciKit-learn gradient boosting\nregression performance: training set')

for i, (dataset, predicted) in enumerate(regression_predictions['Training'].items()):

    # Re-read this dataset's pickle to recover the true training labels
    with open(datasets[dataset], 'rb') as input_file:
        data_dict=pickle.load(input_file)

    # True values are plotted as log EFS time
    observed=np.log(data_dict['Training labels']['efs_time'].values)

    # Break long dataset names over several lines so titles fit the panel
    panel=axs[i]
    panel.set_title(dataset.replace(', ', '\n').replace('/', '\n'))
    panel.scatter(observed, predicted, color='black', s=0.2)
    panel.set_xlabel('True EFS time')
    panel.set_ylabel('Predicted EFS time')

# Tidy the layout, write the figure to disk and display it
fig.tight_layout()
fig.savefig(regression_training_performance_plots, dpi=300, bbox_inches='tight')
fig.show()

##### 2.2.2.2. Testing data

In [None]:
# 2 x 3 grid of panels with shared axes, one panel per dataset
fig, axs=plt.subplots(nrows=2, ncols=3, figsize=(8,5.5), sharex=True, sharey=True)
axs=axs.ravel()

fig.suptitle('SciKit-learn gradient boosting\nregression performance: hold-out test set')

for i, (dataset, predicted) in enumerate(regression_predictions['Testing'].items()):

    # Re-read this dataset's pickle to recover the true hold-out labels
    with open(datasets[dataset], 'rb') as input_file:
        data_dict=pickle.load(input_file)

    # True values are plotted as log EFS time
    observed=np.log(data_dict['Testing labels']['efs_time'].values)

    # Break long dataset names over several lines so titles fit the panel
    panel=axs[i]
    panel.set_title(dataset.replace(', ', '\n').replace('/', '\n'))
    panel.scatter(observed, predicted, color='black', s=0.2)
    panel.set_xlabel('True EFS time')
    panel.set_ylabel('Predicted EFS time')

# Tidy the layout, write the figure to disk and display it
fig.tight_layout()
fig.savefig(regression_test_performance_plots, dpi=300, bbox_inches='tight')
fig.show()

#### 2.2.3. Residual plots

##### 2.2.3.1. Training data

In [None]:
# Residuals (true log EFS time minus prediction) against the prediction for
# the training set, one panel per dataset.

# Dedicated output path for the training residuals. This cell previously
# saved to regression_test_residuals_plots, which the hold-out residuals
# cell also writes to, so the training figure was overwritten on disk.
regression_training_residuals_plots=f'{config.PLOTS}/{notebook_num}-regression_training_residuals.jpg'

fig, axs=plt.subplots(2,3, figsize=(8,5.5), sharex=True, sharey=True)
axs=axs.flatten()

fig.suptitle('SciKit-learn gradient boosting\nregression fit residuals: training set')

for i, dataset in enumerate(regression_predictions['Training'].keys()):

    # Load the data
    data_file=datasets[dataset]

    with open(data_file, 'rb') as input_file:
        data_dict=pickle.load(input_file)

    training_predictions=regression_predictions['Training'][dataset]

    # Residual is taken on the log scale, the same scale the predictions are scored on
    residuals=np.log(data_dict['Training labels']['efs_time'].values) - training_predictions

    axs[i].set_title(dataset.replace(', ', '\n').replace('/', '\n'))
    axs[i].scatter(
        training_predictions,
        residuals,
        color='black',
        s=0.2
    )
    axs[i].set_xlabel('Predicted EFS time')
    axs[i].set_ylabel('EFS time residual')

fig.tight_layout()
fig.savefig(regression_training_residuals_plots, dpi=300, bbox_inches='tight')
fig.show()

##### 2.2.3.2. Testing data

In [None]:
# 2 x 3 grid of panels with shared axes, one panel per dataset
fig, axs=plt.subplots(nrows=2, ncols=3, figsize=(8,5.5), sharex=True, sharey=True)
axs=axs.ravel()

fig.suptitle('SciKit-learn gradient boosting\nregression fit residuals: hold-out test set')

for i, (dataset, predicted) in enumerate(regression_predictions['Testing'].items()):

    # Re-read this dataset's pickle to recover the true hold-out labels
    with open(datasets[dataset], 'rb') as input_file:
        data_dict=pickle.load(input_file)

    # Residual = true log EFS time minus the prediction
    residual=np.log(data_dict['Testing labels']['efs_time'].values) - predicted

    # Break long dataset names over several lines so titles fit the panel
    panel=axs[i]
    panel.set_title(dataset.replace(', ', '\n').replace('/', '\n'))
    panel.scatter(predicted, residual, color='black', s=0.2)
    panel.set_xlabel('Predicted EFS time')
    panel.set_ylabel('EFS time residual')

# Tidy the layout, write the figure to disk and display it
fig.tight_layout()
fig.savefig(regression_test_residuals_plots, dpi=300, bbox_inches='tight')
fig.show()

## 3. SciKit-learn classification model

### 3.1. Hyperparameter optimization

In [None]:
%%time

if classification_test == True:

    classification_predictions={
        'Training':{},
        'Testing':{}
    }
    
    classification_models={}

    # Define cross-validation strategy
    cross_validation=ShuffleSplit(n_splits=10, test_size=0.3, random_state=315)

    # Loop on the datasets
    for dataset, data_file in datasets.items():

        # Load the data
        with open(data_file, 'rb') as input_file:
            data_dict=pickle.load(input_file)

        training_labels_df=data_dict['Training labels']
        training_features_df=data_dict['Training features']

        # Instantiate the model
        classification_model=HistGradientBoostingClassifier(class_weight='balanced', random_state=315)

        # Set-up the search
        classification_search=RandomizedSearchCV(
            classification_model,
            distributions,
            scoring='neg_root_mean_squared_error',
            n_jobs=cpus,
            cv=cross_validation,
            n_iter=randomsearch_depth,
            random_state=315,
            return_train_score=True
        )

        classification_search_results=classification_search.fit(training_features_df, training_labels_df['efs'])
        
        print(f'\n{dataset}:')
        for parameter, value in classification_search_results.best_params_.items():
            print(f' {parameter}: {value}')

        # Train classifier with best hyperparameters on complete training set
        classification_model=HistGradientBoostingClassifier(**classification_search_results.best_params_, random_state=315)
        result=classification_model.fit(training_features_df, training_labels_df['efs'])
        classification_models[dataset]=classification_model

        # Make testing predictions
        testing_features_df=data_dict['Testing features']
        classification_testing_predictions=classification_model.predict(testing_features_df)
        classification_predictions['Testing'][dataset]=classification_testing_predictions

    classification_test_results={'Testing predictions': classification_predictions}
    classification_test_results['Tuned models']=classification_models

    with open(classification_test_results_file, 'wb') as output_file:
        pickle.dump(classification_test_results, output_file)

else:

    # Load last result
    with open(classification_test_results_file, 'rb') as input_file:
        classification_test_results=pickle.load(input_file)

    classification_predictions=classification_test_results['Testing predictions']
    classification_models=classification_test_results['Tuned models']

print()

### 3.2. Results

#### 3.2.1. Confusion matrices

In [None]:
# 2 x 3 grid of panels with shared axes, one confusion matrix per dataset
fig, axs=plt.subplots(nrows=2, ncols=3, figsize=(8,5.5), sharex=True, sharey=True)
axs=axs.ravel()

fig.suptitle('SciKit-learn gradient boosting\nclassifier performance: hold-out test set')

for i, (dataset, predicted_labels) in enumerate(classification_predictions['Testing'].items()):

    # Re-read this dataset's pickle to recover the true hold-out labels
    with open(datasets[dataset], 'rb') as input_file:
        data_dict=pickle.load(input_file)

    true_labels=data_dict['Testing labels']['efs']

    # Confusion matrix normalized over the true labels (each row sums to 1)
    cm=confusion_matrix(true_labels, predicted_labels, normalize='true')
    panel=axs[i]
    _=ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=panel)

    # Break long dataset names over several lines so titles fit the panel
    panel.set_title(dataset.replace(', ', '\n').replace('/', '\n'))
    panel.set_xlabel('Predicted EFS')
    panel.set_ylabel('True EFS')

# Tidy the layout, write the figure to disk and display it
fig.tight_layout()
fig.savefig(classification_test_performance_plots, dpi=300, bbox_inches='tight')
fig.show()

#### 3.2.2. Class probabilities

In [None]:
# Histogram of the predicted event probability on the hold-out test set,
# split by the true EFS label, one panel per dataset.
fig, axs=plt.subplots(2,3, figsize=(8,5.5), sharex=True, sharey=True)
axs=axs.flatten()

fig.suptitle('SciKit-learn gradient boosting\nclassifier probabilities: hold-out test set')

for i, dataset in enumerate(classification_predictions['Testing'].keys()):

    # Load the data
    data_file=datasets[dataset]

    with open(data_file, 'rb') as input_file:
        data_dict=pickle.load(input_file)

    class_probabilities=classification_models[dataset].predict_proba(data_dict['Testing features'])

    # predict_proba columns follow the order of the model's classes_ attribute,
    # so column 1 belongs to the second class (presumably efs == 1 - confirm
    # the label encoding). Column 0, the first class, was plotted before
    # under the 'EFS probability' name.
    class_df=pd.DataFrame.from_dict({
        'EFS': data_dict['Testing labels']['efs'].values,
        'EFS probability': class_probabilities[:,1]
    })

    # Plot the predicted probability distribution for each true class
    sns.histplot(class_df, x='EFS probability', hue='EFS', ax=axs[i])

    # Axis labels describe the histogram: probability on x, number of samples on y
    axs[i].set_title(dataset.replace(', ', '\n').replace('/', '\n'))
    axs[i].set_xlabel('Predicted EFS probability')
    axs[i].set_ylabel('Count')

fig.tight_layout()
fig.savefig(classification_test_probability_plots, dpi=300, bbox_inches='tight')
fig.show()