In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import warnings
from encoding import *
from Survival_functions import *
from scipy.stats import ttest_ind
import scipy.stats as stats

# Show every column and every row when a DataFrame is displayed (no truncation)
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

### Imputed

In [None]:
# Location of the study spreadsheet; adjust it to your local layout
file_path = '../../../Both/new_study.xlsx'

# Read the table indexed by patient number and give the survival columns
# the short names used throughout the notebook
imputed_df = (
    pd.read_excel(file_path, index_col='PATNO')
    .rename(columns={'OS (days)': 'time'})
    .rename(columns={'Status': 'status'})
)

# Event indicator as a boolean: True = death observed, False = alive
imputed_df['status'] = imputed_df['status'].map({'Dead': True, 'Alive': False})

imputed_df.columns


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RepeatedStratifiedKFold
# Shared splitter used by every grid search below: 5 stratified folds repeated
# 5 times (25 train/test splits), with a fixed seed for the shuffling.
mcv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=173637)
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import (concordance_index_censored, 
                            integrated_brier_score)
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sksurv.linear_model import CoxnetSurvivalAnalysis


# Compare
- The dataset with KNN imputation vs. the dataset without imputation (samples and columns with missing values are removed instead)
- This comparison is done for each model: Coxnet, CoxPH, RSF and Componentwise Gradient Boosting
- For each model, the best configuration is selected from a predefined range of hyperparameters, and that configuration is used in the comparison.

# COXNET


### COXNET (Unimputed)

For the unimputed dataset, there are several ways to remove samples and columns. The `preprocess_data` function takes `NaN_threshold` as an argument that controls which columns are removed before incomplete samples are dropped. The threshold is a percentage: any column whose percentage of NaN values is higher than the threshold is removed. Afterwards, all samples that still contain NaN values are removed. `NaN_threshold` is treated as a hyperparameter and is tuned alongside the other two hyperparameters, the L1 ratio and alpha.

In [None]:
# Grid search with repeated stratified cross-validation for the Coxnet model on
# the unimputed data. NaN_threshold is tuned together with alpha and l1_ratio.
alphas = [0, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 3, 5, 10, 20, 50, 70, 100, 200, 500, 700, 1000]
l1_ratios = [0.0001, 0.001, 0.01, 0.1]

# Both dicts are keyed by NaN_threshold, then by (alpha, l1_ratio):
#   preds_coxnet_mean_by_dataset -> [mean test C-index, std test C-index, mean train C-index]
#   conc_coxnet_by_dataset       -> list of per-split test C-indices
preds_coxnet_mean_by_dataset = {}
conc_coxnet_by_dataset = {}

NaN_thresholds = [3, 4, 5]

# The spreadsheet does not depend on the threshold, so it is read and relabelled
# once; every threshold then starts from its own copy of the raw table.
file_path = '../../../Both/new_study.xlsx'
unimp_raw_df = pd.read_excel(file_path, index_col='PATNO')
unimp_raw_df = unimp_raw_df.rename(columns={'OS (days)': 'time'})
unimp_raw_df = unimp_raw_df.rename(columns={'Status': 'status'})
unimp_raw_df['status'] = unimp_raw_df['status'].map({'Dead': True, 'Alive': False})

for NaN_threshold in NaN_thresholds:
    print(NaN_threshold)

    df = preprocess_data(unimp_raw_df.copy(), NaN_threshold)

    X, y, tuple_y, target_columns = x_y_baseline(df)

    unimp_results_coxnet = {}
    unimp_conc_coxnet = {}

    for l1_ratio in l1_ratios:
        for alpha in alphas:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=UserWarning)

                # CoxnetSurvivalAnalysis takes a list of alphas, even for a single value
                coxnet = CoxnetSurvivalAnalysis(l1_ratio=l1_ratio, alphas=[alpha], fit_baseline_model=True)
                conc_train = []
                conc_test = []

                print(f'alpha: {alpha}, l1_ratio: {l1_ratio}')

                for train, test in mcv.split(X, tuple_y):
                    X_train, X_test = X.iloc[train], X.iloc[test]
                    y_train, y_test = y[train], y[test]

                    X_train, X_test = Preprocessing_without_imputing(X_train=X_train, X_test=X_test, y_train=y_train, target_columns=target_columns)

                    # Drop test samples whose follow-up time exceeds the longest
                    # time seen in the training fold
                    times_train_max = y_train['time'].max()
                    if y_test['time'].max() > times_train_max:
                        y_test_red_index = y_test['time'] <= times_train_max
                        y_test = y_test[y_test_red_index]
                        X_test = X_test[y_test_red_index]

                    coxnet.fit(X_train, y_train)

                    # C-index on the training and the test fold
                    conc_train.append(coxnet.score(X_train, y_train))
                    conc_test.append(coxnet.score(X_test, y_test))

                # Summarise this (alpha, l1_ratio) combination over all splits
                avg_conc_test = np.mean(conc_test)
                std_conc_test = np.std(conc_test)
                avg_conc_train = np.mean(conc_train)

                unimp_results_coxnet[(alpha, l1_ratio)] = [avg_conc_test, std_conc_test, avg_conc_train]

                unimp_conc_coxnet[(alpha, l1_ratio)] = conc_test

    preds_coxnet_mean_by_dataset[NaN_threshold] = unimp_results_coxnet
    conc_coxnet_by_dataset[NaN_threshold] = unimp_conc_coxnet


### Sort the result by the best Concordance (test)

In [None]:
# Flatten {NaN_threshold: {(alpha, l1_ratio): metrics}} into one row per configuration
rows = [
    {
        "NaN_threshold": NaN_threshold,
        "alpha": alpha,
        "l1_ratio": l1_ratio,
        "Conc test": metrics[0],
        "Std Conc test": metrics[1],
        "Conc train": metrics[2],
    }
    for NaN_threshold, results in preds_coxnet_mean_by_dataset.items()
    for (alpha, l1_ratio), metrics in results.items()
]

# Best mean test concordance first
unimp_scores_coxnet = (
    pd.DataFrame(rows)
    .sort_values(by='Conc test', ascending=False)
    .reset_index(drop=True)
)
unimp_scores_coxnet.head(10)

### Concordance for each fold using the best hyperparameters

In [None]:
# Hyperparameters of the top-ranked row (highest mean test concordance)
unimp_alpha = unimp_scores_coxnet['alpha'].iloc[0]
unimp_l1_ratio = unimp_scores_coxnet['l1_ratio'].iloc[0]
unimp_NaN_threshold = unimp_scores_coxnet['NaN_threshold'].iloc[0]

# Index directly so that a missing configuration raises a KeyError here instead
# of silently handing the string "Value not found" to the plots and the t-test.
unimp_best_conc_coxnet = conc_coxnet_by_dataset[unimp_NaN_threshold][(unimp_alpha, unimp_l1_ratio)]
unimp_best_conc_coxnet

### COXNET (KNN Imputed)

In [None]:
# Rebind X / y / tuple_y / target_columns to the full study table; the unimputed
# grid search above left them pointing at its last NaN_threshold subset.
# NOTE(review): imputed_df is the table as read from the spreadsheet; the imputation
# presumably happens per fold inside Preprocessing — confirm.
X, y, tuple_y, target_columns = x_y_baseline(imputed_df)

In [None]:
# Grid search with repeated stratified cross-validation for the Coxnet model on
# the imputed data.
alphas = [0, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 3, 5, 10, 20, 50, 70, 100, 200, 500, 700, 1000]
l1_ratios = [0.0001, 0.001, 0.01, 0.1]

# All dicts are keyed by (alpha, l1_ratio)
imp_results_coxnet = {}             # [mean test C-index, mean train C-index, mean Brier, mean permutation importance]
imp_feature_importance_coxnet = {}  # permutation_importance result of every split
imp_coefficients_coxnet = {}        # fitted coefficients of every split
imp_conc_coxnet = {}                # per-split test C-indices

for l1_ratio in l1_ratios:
    for alpha in alphas:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)

            # CoxnetSurvivalAnalysis takes a list of alphas, even for a single value
            coxnet = CoxnetSurvivalAnalysis(l1_ratio=l1_ratio, alphas=[alpha], fit_baseline_model=True)
            conc_train = []
            conc_test = []
            brier = []
            permut = []
            coef = []
            feature_importance = []

            print(f'alpha: {alpha}, l1_ratio: {l1_ratio}')

            for train, test in mcv.split(X, tuple_y):
                X_train, X_test = X.iloc[train], X.iloc[test]
                y_train, y_test = y[train], y[test]

                X_train, X_test = Preprocessing(X_train=X_train, X_test=X_test, y_train=y_train, target_columns=target_columns)

                # Drop test samples whose follow-up time exceeds the longest time
                # seen in the training fold, then build the evaluation time grid
                times_train_max = y_train['time'].max()
                times_test_min = y_test['time'].min()
                times_test_max = y_test['time'].max()
                if times_test_max > times_train_max:
                    y_test_red_index = y_test['time'] <= times_train_max
                    y_test = y_test[y_test_red_index]
                    X_test = X_test[y_test_red_index]
                    times_test_max = y_test['time'].max()
                times_test = np.arange(times_test_min, times_test_max)

                coxnet.fit(X_train, y_train)

                # C-index on the training and the test fold
                conc_train.append(coxnet.score(X_train, y_train))
                conc_test.append(coxnet.score(X_test, y_test))

                # Integrated Brier score; np.vstack replaces np.row_stack, which is
                # a deprecated alias of it in NumPy 2.0
                surv_prob_test = np.vstack([fn(times_test) for fn in coxnet.predict_survival_function(X_test)])
                brier.append(integrated_brier_score(y_train, y_test, surv_prob_test, times_test))

                importance = permutation_importance(coxnet,
                                                    X_test,
                                                    y_test,
                                                    n_repeats=10,
                                                    random_state=1)
                permut.append(importance.importances_mean)

                feature_importance.append(importance)
                coef.append(coxnet.coef_)

            imp_feature_importance_coxnet[(alpha, l1_ratio)] = feature_importance
            imp_coefficients_coxnet[(alpha, l1_ratio)] = coef

            # Summarise this (alpha, l1_ratio) combination over all splits
            avg_conc_test = np.mean(conc_test)
            avg_conc_train = np.mean(conc_train)
            avg_brier = np.mean(brier)
            avg_permut = np.mean(permut)

            imp_results_coxnet[(alpha, l1_ratio)] = [avg_conc_test, avg_conc_train, avg_brier, avg_permut]

            imp_conc_coxnet[(alpha, l1_ratio)] = conc_test

result = [{
    'Alpha': alpha,
    'L1 Ratio': l1_ratio,
    'Conc test': avg_conc_test,
    'Conc train': avg_conc_train,
    'Brier Score': avg_brier,
    'Permut': avg_permut
} for (alpha, l1_ratio), (avg_conc_test, avg_conc_train, avg_brier, avg_permut) in imp_results_coxnet.items()]

# From here on imp_results_coxnet is a DataFrame (one row per configuration),
# no longer the dict filled in the loop above
imp_results_coxnet = pd.DataFrame(result)


### Sort the result by the best Concordance (test)

In [None]:
# Rank the configurations from highest to lowest mean test concordance
imp_scores_coxnet = (
    imp_results_coxnet
    .sort_values(by='Conc test', ascending=False)
    .reset_index(drop=True)
)

# Show the ten best configurations
imp_scores_coxnet.head(10)

### Concordance for each fold using the best hyperparameters

In [None]:
# Hyperparameters of the top-ranked configuration
imp_alpha = imp_scores_coxnet['Alpha'].iat[0]
imp_l1_ratio = imp_scores_coxnet['L1 Ratio'].iat[0]

# Per-split test C-indices recorded for that configuration
best_coxnet_key = (imp_alpha, imp_l1_ratio)
imp_best_conc_coxnet = imp_conc_coxnet[best_coxnet_key]
imp_best_conc_coxnet

### Check for normal distribution

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(8, 6))

# One row of panels per dataset variant: Q-Q plot on the left, histogram on the right
panel_rows = (
    (imp_best_conc_coxnet, 'Q-Q plot of KNN-Imputation', 'Histogram of KNN-Imputation'),
    (unimp_best_conc_coxnet, 'Q-Q plot of Baseline (Unimputed)', 'Histogram of Baseline (Unimputed)'),
)

for (scores, qq_title, hist_title), (qq_ax, hist_ax) in zip(panel_rows, axs):
    # Q-Q plot of the per-split C-indices against a normal distribution
    stats.probplot(scores, dist="norm", plot=qq_ax)
    qq_ax.set_title(qq_title)
    qq_ax.set_xlabel('Theoretical quantiles')
    qq_ax.set_ylabel('Ordered Values')

    # Histogram of the same scores
    hist_ax.hist(scores, bins=10, color='salmon', edgecolor='black')
    hist_ax.set_title(hist_title)
    hist_ax.set_xlabel('Value')
    hist_ax.set_ylabel('Frequency')

# Figure title and layout
plt.suptitle('Q-Q plot and Histogram of C-index to KNN-Imputation and baseline (Unimputed) - Coxnet', fontsize=20)
plt.subplots_adjust(top=0.88)
plt.tight_layout()

plt.show()


## Comparison

In [None]:
# Mean and standard deviation of the per-split test C-index, rounded to 3 decimals
unimp_mean_coxnet = round(np.mean(unimp_best_conc_coxnet), 3)
unimp_std_coxnet = round(np.std(unimp_best_conc_coxnet), 3)
imp_mean_coxnet = round(np.mean(imp_best_conc_coxnet), 3)
imp_std_coxnet = round(np.std(imp_best_conc_coxnet), 3)

print(f"Means Baseline (Unimputed): {unimp_mean_coxnet} +/-{unimp_std_coxnet} \nMeans KNN Imputation: {imp_mean_coxnet} +/-{imp_std_coxnet}")

In [None]:
# Two-sided independent-samples t-test on the per-split test C-indices.
# The raw scores are compared: the built-in round() cannot be applied to a list,
# and rounding the two samples to different precisions would distort the test.
ttest_ind(unimp_best_conc_coxnet, imp_best_conc_coxnet, alternative="two-sided")

# COX PH

### CoxPH (unimputed)

In [None]:
# Grid search with repeated stratified cross-validation for the CoxPH model on
# the unimputed data. NaN_threshold is tuned together with alpha.
alphas = [0, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 3, 5, 10, 20, 50, 70, 100, 200, 500, 700, 1000]

# Both dicts are keyed by NaN_threshold, then by alpha:
#   preds_coxph_mean_by_dataset -> [mean test C-index, mean train C-index]
#   conc_coxph_by_dataset       -> list of per-split test C-indices
preds_coxph_mean_by_dataset = {}
conc_coxph_by_dataset = {}

NaN_thresholds = [3, 4, 5]

# The spreadsheet does not depend on the threshold, so it is read and relabelled
# once; every threshold then starts from its own copy of the raw table.
file_path = '../../../Both/new_study.xlsx'
unimp_raw_df = pd.read_excel(file_path, index_col='PATNO')
unimp_raw_df = unimp_raw_df.rename(columns={'OS (days)': 'time'})
unimp_raw_df = unimp_raw_df.rename(columns={'Status': 'status'})
unimp_raw_df['status'] = unimp_raw_df['status'].map({'Dead': True, 'Alive': False})

for NaN_threshold in NaN_thresholds:
    print(NaN_threshold)

    df = preprocess_data(unimp_raw_df.copy(), NaN_threshold)

    X, y, tuple_y, target_columns = x_y_baseline(df)

    unimp_results_coxph = {}
    unimp_conc_coxph = {}

    for alpha in alphas:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)

            # Use the same estimator as the imputed CoxPH run so both result sets are
            # comparable. The previous version fitted CoxnetSurvivalAnalysis with an
            # l1_ratio left over from an earlier cell.
            coxph = CoxPHSurvivalAnalysis(alpha=alpha, ties="efron")
            conc_train = []
            conc_test = []

            print(f'alpha: {alpha}')

            for train, test in mcv.split(X, tuple_y):
                X_train, X_test = X.iloc[train], X.iloc[test]
                y_train, y_test = y[train], y[test]

                X_train, X_test = Preprocessing_without_imputing(X_train=X_train, X_test=X_test, y_train=y_train, target_columns=target_columns)

                # Drop test samples whose follow-up time exceeds the longest
                # time seen in the training fold
                times_train_max = y_train['time'].max()
                if y_test['time'].max() > times_train_max:
                    y_test_red_index = y_test['time'] <= times_train_max
                    y_test = y_test[y_test_red_index]
                    X_test = X_test[y_test_red_index]

                coxph.fit(X_train, y_train)

                # C-index on the training and the test fold
                conc_train.append(coxph.score(X_train, y_train))
                conc_test.append(coxph.score(X_test, y_test))

            # Summarise this alpha over all splits
            avg_conc_test = np.mean(conc_test)
            avg_conc_train = np.mean(conc_train)

            unimp_results_coxph[alpha] = [avg_conc_test, avg_conc_train]

            unimp_conc_coxph[alpha] = conc_test

    preds_coxph_mean_by_dataset[NaN_threshold] = unimp_results_coxph
    conc_coxph_by_dataset[NaN_threshold] = unimp_conc_coxph





### Sort the result by the best Concordance (test)

In [None]:
# Flatten {NaN_threshold: {alpha: metrics}} into one row per configuration
rows = [
    {
        "NaN_threshold": NaN_threshold,
        "alpha": alpha,
        "Conc test": metrics[0],
        "Conc train": metrics[1],
    }
    for NaN_threshold, results in preds_coxph_mean_by_dataset.items()
    for alpha, metrics in results.items()
]

# Best mean test concordance first
unimp_scores_coxph = (
    pd.DataFrame(rows)
    .sort_values(by='Conc test', ascending=False)
    .reset_index(drop=True)
)
unimp_scores_coxph.head()

In [None]:
# Hyperparameters of the top-ranked row (highest mean test concordance)
unimp_alpha = unimp_scores_coxph['alpha'].iloc[0]
unimp_NaN_threshold = unimp_scores_coxph['NaN_threshold'].iloc[0]

# Index directly so that a missing configuration raises a KeyError here instead
# of silently handing the string "Value not found" to the plots and the t-test.
unimp_best_conc_coxph = conc_coxph_by_dataset[unimp_NaN_threshold][unimp_alpha]
unimp_best_conc_coxph

### CoxPH (Imputed)

In [None]:
# Rebind X / y / tuple_y / target_columns to the full study table; the unimputed
# grid search above left them pointing at its last NaN_threshold subset.
X, y, tuple_y, target_columns = x_y_baseline(imputed_df)

In [None]:
# Grid search with repeated stratified cross-validation for the CoxPH model on
# the imputed data.
alphas = [0, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 3, 5, 10, 20, 50, 70, 100, 200, 500, 700, 1000]

# All dicts are keyed by alpha
imp_results_coxph = {}          # [mean test C-index, mean train C-index, mean Brier, mean permutation importance]
imp_feature_importance_ph = {}  # permutation_importance result of every split
imp_coefficients_coxph = {}     # fitted coefficients of every split
imp_conc_coxph = {}             # per-split test C-indices

for alpha in alphas:
    conc_train = []
    conc_test = []
    brier = []
    permut = []
    feat_impor = []
    coef = []

    coxph = CoxPHSurvivalAnalysis(alpha=alpha, ties="efron")

    print(f'alpha: {alpha}')

    for train, test in mcv.split(X, tuple_y):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y[train], y[test]

        X_train, X_test = Preprocessing(X_train=X_train, X_test=X_test, y_train=y_train, target_columns=target_columns)

        # Drop test samples whose follow-up time exceeds the longest time seen
        # in the training fold, then build the evaluation time grid
        times_train_max = y_train['time'].max()
        times_test_min = y_test['time'].min()
        times_test_max = y_test['time'].max()
        if times_test_max > times_train_max:
            y_test_red_index = y_test['time'] <= times_train_max
            y_test = y_test[y_test_red_index]
            X_test = X_test[y_test_red_index]
            times_test_max = y_test['time'].max()
        times_test = np.arange(times_test_min, times_test_max)

        coxph.fit(X_train, y_train)

        # C-index on the training and the test fold
        conc_train.append(coxph.score(X_train, y_train))
        conc_test.append(coxph.score(X_test, y_test))

        # Integrated Brier score; np.vstack replaces np.row_stack, which is a
        # deprecated alias of it in NumPy 2.0
        surv_prob_test = np.vstack([fn(times_test) for fn in coxph.predict_survival_function(X_test)])
        brier.append(integrated_brier_score(y_train, y_test, surv_prob_test, times_test))

        importance = permutation_importance(coxph,
                                            X_test,
                                            y_test,
                                            n_repeats=10,
                                            random_state=1)
        permut.append(importance.importances_mean)

        feat_impor.append(importance)
        coef.append(coxph.coef_)

    imp_feature_importance_ph[alpha] = feat_impor
    imp_coefficients_coxph[alpha] = coef

    # Summarise this alpha over all splits
    avg_conc_test = np.mean(conc_test)
    avg_conc_train = np.mean(conc_train)
    avg_brier = np.mean(brier)
    avg_permut = np.mean(permut)

    imp_results_coxph[alpha] = [avg_conc_test, avg_conc_train, avg_brier, avg_permut]
    imp_conc_coxph[alpha] = conc_test

result = [{
    'Alpha': alpha,
    'Conc test': avg_conc_test,
    'Conc train': avg_conc_train,
    'Brier Score': avg_brier,
    'Permut': avg_permut
} for alpha, (avg_conc_test, avg_conc_train, avg_brier, avg_permut) in imp_results_coxph.items()]

# From here on imp_results_coxph is a DataFrame (one row per alpha),
# no longer the dict filled in the loop above
imp_results_coxph = pd.DataFrame(result)



### Sort the result by the best Concordance (test)

In [None]:
# Rank the configurations from highest to lowest mean test concordance
imp_scores_coxph = (
    imp_results_coxph
    .sort_values(by='Conc test', ascending=False)
    .reset_index(drop=True)
)

# Show the best configurations
imp_scores_coxph.head()

In [None]:
# Alpha of the top-ranked configuration and its per-split test C-indices
imp_alpha = imp_scores_coxph['Alpha'].iat[0]
imp_best_conc_coxph = imp_conc_coxph[imp_alpha]
imp_best_conc_coxph

### Check for normal distribution

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(8, 6))

# One row of panels per dataset variant: Q-Q plot on the left, histogram on the right
panel_rows = (
    (imp_best_conc_coxph, 'Q-Q plot of KNN-Imputation', 'Histogram of KNN-Imputation'),
    (unimp_best_conc_coxph, 'Q-Q plot of Baseline (Unimputed)', 'Histogram of Baseline (Unimputed)'),
)

for (scores, qq_title, hist_title), (qq_ax, hist_ax) in zip(panel_rows, axs):
    # Q-Q plot of the per-split C-indices against a normal distribution
    stats.probplot(scores, dist="norm", plot=qq_ax)
    qq_ax.set_title(qq_title)
    qq_ax.set_xlabel('Theoretical quantiles')
    qq_ax.set_ylabel('Ordered Values')

    # Histogram of the same scores
    hist_ax.hist(scores, bins=10, color='salmon', edgecolor='black')
    hist_ax.set_title(hist_title)
    hist_ax.set_xlabel('Value')
    hist_ax.set_ylabel('Frequency')

# Figure title and layout
plt.suptitle('Q-Q plot and Histogram of C-index to KNN-Imputation and baseline (Unimputed) - CoxPH', fontsize=20)
plt.subplots_adjust(top=0.88)
plt.tight_layout()

plt.show()



## Comparison

In [None]:
# Mean and standard deviation of the per-split test C-index, rounded to 3 decimals
unimp_mean_coxph = round(np.mean(unimp_best_conc_coxph), 3)
unimp_std_coxph = round(np.std(unimp_best_conc_coxph), 3)
imp_mean_coxph = round(np.mean(imp_best_conc_coxph), 3)
imp_std_coxph = round(np.std(imp_best_conc_coxph), 3)

print(f"Means Baseline (Unimputed): {unimp_mean_coxph} +/-{unimp_std_coxph} \nMeans KNN Imputation: {imp_mean_coxph} +/-{imp_std_coxph}")

In [None]:
# Two-sided independent-samples t-test comparing the per-split test C-indices of
# the best unimputed and the best imputed CoxPH configuration.
# NOTE(review): the scores come from repeated CV splits that share training data,
# so the independence assumption of the test is only approximate — confirm that
# this is acceptable for the analysis.
ttest_ind(unimp_best_conc_coxph, imp_best_conc_coxph, alternative="two-sided")


# RandomSurvivalForest (fewer hyperparameters used than in the Modeling notebook)

#### Model (Unimputed)

In [None]:
# Grid search with repeated stratified cross-validation for the Random Survival
# Forest on the unimputed data. NaN_threshold is tuned together with the forest
# hyperparameters.
n_estimators = [10, 20, 30, 40, 50]
max_depths = [2, 3, 4, 5]
min_samples_splits = [2, 6, 8]
min_samples_leafs = [1, 2, 3, 4, 5, 6]

# Both dicts are keyed by NaN_threshold, then by
# (n_estimator, max_depth, min_samples_split, min_samples_leaf):
#   preds_rsf_mean_by_dataset -> [mean test C-index, mean train C-index]
#   conc_rsf_by_dataset       -> list of per-split test C-indices
preds_rsf_mean_by_dataset = {}
conc_rsf_by_dataset = {}

NaN_thresholds = [3, 4, 5]

# The spreadsheet does not depend on the threshold, so it is read and relabelled
# once; every threshold then starts from its own copy of the raw table.
file_path = '../../../Both/new_study.xlsx'
unimp_raw_df = pd.read_excel(file_path, index_col='PATNO')
unimp_raw_df = unimp_raw_df.rename(columns={'OS (days)': 'time'})
unimp_raw_df = unimp_raw_df.rename(columns={'Status': 'status'})
unimp_raw_df['status'] = unimp_raw_df['status'].map({'Dead': True, 'Alive': False})

for NaN_threshold in NaN_thresholds:
    print(NaN_threshold)

    df = preprocess_data(unimp_raw_df.copy(), NaN_threshold)

    X, y, tuple_y, target_columns = x_y_baseline(df)

    unimp_results_rf = {}
    unimp_conc_rf = {}

    for n_estimator in n_estimators:
        for max_depth in max_depths:
            for min_samples_split in min_samples_splits:
                for min_samples_leaf in min_samples_leafs:
                    rf = RandomSurvivalForest(
                        n_estimators=n_estimator,
                        max_depth=max_depth,
                        min_samples_split=min_samples_split,
                        min_samples_leaf=min_samples_leaf,
                        random_state=173637)
                    conc_train = []
                    conc_test = []

                    print(f'n_estimator: {n_estimator}, max_depth: {max_depth}')

                    for train, test in mcv.split(X, tuple_y):
                        X_train, X_test = X.iloc[train], X.iloc[test]
                        y_train, y_test = y[train], y[test]

                        X_train, X_test = Preprocessing_without_imputing(X_train=X_train, X_test=X_test, y_train=y_train, target_columns=target_columns)

                        # Drop test samples whose follow-up time exceeds the
                        # longest time seen in the training fold
                        times_train_max = y_train['time'].max()
                        if y_test['time'].max() > times_train_max:
                            y_test_red_index = y_test['time'] <= times_train_max
                            y_test = y_test[y_test_red_index]
                            X_test = X_test[y_test_red_index]

                        rf.fit(X_train, y_train)

                        # C-index on the training and the test fold
                        conc_train.append(rf.score(X_train, y_train))
                        conc_test.append(rf.score(X_test, y_test))

                    # Summarise this hyperparameter combination over all splits
                    avg_conc_test = np.mean(conc_test)
                    avg_conc_train = np.mean(conc_train)

                    rf_key = (n_estimator, max_depth, min_samples_split, min_samples_leaf)
                    # Only the concordances are computed in this cell. The previous
                    # version also stored avg_brier / avg_permut, which are not
                    # calculated here and so were stale values from another cell
                    # (or a NameError on a fresh kernel).
                    unimp_results_rf[rf_key] = [avg_conc_test, avg_conc_train]
                    unimp_conc_rf[rf_key] = conc_test

    preds_rsf_mean_by_dataset[NaN_threshold] = unimp_results_rf
    conc_rsf_by_dataset[NaN_threshold] = unimp_conc_rf


### Sort the result by the best Concordance (test)

In [None]:
# Flatten {NaN_threshold: {hyperparameter tuple: metrics}} into one row per configuration
rows = [
    {
        "NaN_threshold": NaN_threshold,
        "n_estimator": n_estimator,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        "Conc test": metrics[0],
        "Conc train": metrics[1],
    }
    for NaN_threshold, results in preds_rsf_mean_by_dataset.items()
    for (n_estimator, max_depth, min_samples_split, min_samples_leaf), metrics in results.items()
]

# Best mean test concordance first
unimp_scores_rf = (
    pd.DataFrame(rows)
    .sort_values(by='Conc test', ascending=False)
    .reset_index(drop=True)
)
unimp_scores_rf.head()

In [None]:
# Hyperparameters of the top-ranked row (highest mean test concordance)
unimp_n_estimator = unimp_scores_rf['n_estimator'].iloc[0]
unimp_max_depth = unimp_scores_rf['max_depth'].iloc[0]
unimp_min_samples_split = unimp_scores_rf['min_samples_split'].iloc[0]
unimp_min_samples_leaf = unimp_scores_rf['min_samples_leaf'].iloc[0]
unimp_NaN_threshold = unimp_scores_rf['NaN_threshold'].iloc[0]

# Index directly so that a missing configuration raises a KeyError here instead
# of silently handing the string "Value not found" to the plots and the t-test.
unimp_best_conc_rf = conc_rsf_by_dataset[unimp_NaN_threshold][
    (unimp_n_estimator, unimp_max_depth, unimp_min_samples_split, unimp_min_samples_leaf)
]
unimp_best_conc_rf

### RSF (Imputed)

In [None]:
# Rebind X / y / tuple_y / target_columns to the full study table; the unimputed
# grid search above left them pointing at its last NaN_threshold subset.
X, y, tuple_y, target_columns = x_y_baseline(imputed_df)

In [None]:
# Grid search with repeated stratified cross-validation for the Random Survival
# Forest on the imputed data.
# All dicts are keyed by (n_estimator, max_depth, min_samples_split, min_samples_leaf)
imp_results_rf = {}             # [mean test C-index, mean train C-index, mean Brier, mean permutation importance]
imp_feature_importance_rf = {}  # permutation_importance result of every split
imp_conc_rf = {}                # per-split test C-indices

n_estimators = [10, 20, 30, 40, 50]
max_depths = [2, 3, 4, 5]
min_samples_splits = [2, 6, 8]
min_samples_leafs = [1, 2, 3, 4, 5, 6]

for n_estimator in n_estimators:
    for max_depth in max_depths:
        for min_samples_split in min_samples_splits:
            for min_samples_leaf in min_samples_leafs:
                rf = RandomSurvivalForest(
                    n_estimators=n_estimator,
                    max_depth=max_depth,
                    min_samples_split=min_samples_split,
                    min_samples_leaf=min_samples_leaf,
                    random_state=173637)

                conc_train = []
                conc_test = []
                brier = []
                permut = []
                feat_impor = []

                print(f'n_estimator: {n_estimator}, max_depth: {max_depth}')

                for train, test in mcv.split(X, tuple_y):
                    X_train, X_test = X.iloc[train], X.iloc[test]
                    y_train, y_test = y[train], y[test]

                    X_train, X_test = Preprocessing(X_train=X_train, X_test=X_test, y_train=y_train, target_columns=target_columns)

                    # Drop test samples whose follow-up time exceeds the longest
                    # time seen in the training fold, then build the evaluation
                    # time grid
                    times_train_max = y_train['time'].max()
                    times_test_min = y_test['time'].min()
                    times_test_max = y_test['time'].max()
                    if times_test_max > times_train_max:
                        y_test_red_index = y_test['time'] <= times_train_max
                        y_test = y_test[y_test_red_index]
                        X_test = X_test[y_test_red_index]
                        times_test_max = y_test['time'].max()
                    times_test = np.arange(times_test_min, times_test_max)

                    rf.fit(X_train, y_train)

                    # C-index on the training and the test fold
                    conc_train.append(rf.score(X_train, y_train))
                    conc_test.append(rf.score(X_test, y_test))

                    # Integrated Brier score; np.vstack replaces np.row_stack,
                    # which is a deprecated alias of it in NumPy 2.0
                    surv_prob_test = np.vstack([fn(times_test) for fn in rf.predict_survival_function(X_test)])
                    brier.append(integrated_brier_score(y_train, y_test, surv_prob_test, times_test))

                    importance = permutation_importance(rf,
                                                        X_test,
                                                        y_test,
                                                        n_repeats=10,
                                                        random_state=1)
                    permut.append(importance.importances_mean)

                    feat_impor.append(importance)

                rf_key = (n_estimator, max_depth, min_samples_split, min_samples_leaf)
                imp_feature_importance_rf[rf_key] = feat_impor

                # Summarise this hyperparameter combination over all splits
                avg_conc_test = np.mean(conc_test)
                avg_conc_train = np.mean(conc_train)
                avg_brier = np.mean(brier)
                avg_permut = np.mean(permut)

                imp_results_rf[rf_key] = [avg_conc_test, avg_conc_train, avg_brier, avg_permut]
                imp_conc_rf[rf_key] = conc_test



result = [{
    'n_estimator': n_estimator,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'Conc test': avg_conc_test,
    'Conc train': avg_conc_train,
    'Brier Score': avg_brier,
    'Permut': avg_permut
} for (n_estimator, max_depth, min_samples_split, min_samples_leaf), (avg_conc_test, avg_conc_train, avg_brier, avg_permut) in imp_results_rf.items()]

# From here on imp_results_rf is a DataFrame (one row per configuration),
# no longer the dict filled in the loop above
imp_results_rf = pd.DataFrame(result)


### Sort the result by the best Concordance (test)

In [None]:
# Rank the configurations from highest to lowest mean test concordance
imp_scores_rf = (
    imp_results_rf
    .sort_values(by='Conc test', ascending=False)
    .reset_index(drop=True)
)

# Show the ten best configurations
imp_scores_rf.head(10)

In [None]:
# Hyperparameters of the top-ranked configuration
imp_n_estimator = imp_scores_rf['n_estimator'].iat[0]
imp_max_depth = imp_scores_rf['max_depth'].iat[0]
imp_min_samples_split = imp_scores_rf['min_samples_split'].iat[0]
imp_min_samples_leaf = imp_scores_rf['min_samples_leaf'].iat[0]

# Per-split test C-indices recorded for that configuration
best_rf_key = (imp_n_estimator, imp_max_depth, imp_min_samples_split, imp_min_samples_leaf)
imp_best_conc_rf = imp_conc_rf[best_rf_key]
imp_best_conc_rf

### Normal Distribution

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(8, 6))

# One row of panels per dataset variant: Q-Q plot on the left, histogram on the right
panel_rows = (
    (imp_best_conc_rf, 'Q-Q plot of KNN-Imputation', 'Histogram of KNN-Imputation'),
    (unimp_best_conc_rf, 'Q-Q plot of Baseline (Unimputed)', 'Histogram of Baseline (Unimputed)'),
)

for (scores, qq_title, hist_title), (qq_ax, hist_ax) in zip(panel_rows, axs):
    # Q-Q plot of the per-split C-indices against a normal distribution
    stats.probplot(scores, dist="norm", plot=qq_ax)
    qq_ax.set_title(qq_title)
    qq_ax.set_xlabel('Theoretical quantiles')
    qq_ax.set_ylabel('Ordered Values')

    # Histogram of the same scores
    hist_ax.hist(scores, bins=10, color='salmon', edgecolor='black')
    hist_ax.set_title(hist_title)
    hist_ax.set_xlabel('Value')
    hist_ax.set_ylabel('Frequency')

# Figure title and layout
plt.suptitle('Q-Q plot and Histogram of C-index to KNN-Imputation and baseline (Unimputed) - RSF', fontsize=20)
plt.subplots_adjust(top=0.88)
plt.tight_layout()

plt.show()




## Comparison

In [None]:
# Mean and standard deviation of the per-split test C-index, rounded to 3 decimals
unimp_mean_rf = round(np.mean(unimp_best_conc_rf), 3)
unimp_std_rf = round(np.std(unimp_best_conc_rf), 3)
imp_mean_rf = round(np.mean(imp_best_conc_rf), 3)
imp_std_rf = round(np.std(imp_best_conc_rf), 3)

print(f"Means Baseline (Unimputed): {unimp_mean_rf} +/-{unimp_std_rf}\nMeans KNN Imputation: {imp_mean_rf} +/-{imp_std_rf}")

In [None]:
# Two-sided independent-samples t-test comparing the per-split test C-indices of
# the best unimputed and the best imputed Random Survival Forest configuration.
# NOTE(review): the scores come from repeated CV splits that share training data,
# so the independence assumption of the test is only approximate — confirm that
# this is acceptable for the analysis.
ttest_ind(unimp_best_conc_rf, imp_best_conc_rf, alternative="two-sided")

# Componentwise Gradient Boosting

#### Model (Unimputed)

In [None]:
from sksurv.ensemble import ComponentwiseGradientBoostingSurvivalAnalysis

# Grid search for componentwise gradient boosting (CGB) on the unimputed data.
# For every NaN_threshold the dataset is rebuilt with preprocess_data, and every
# (n_estimators, learning_rate, subsample) combination is evaluated with the
# repeated stratified CV splitter `mcv`.
n_estimators = [10, 20, 30, 40, 50]
learning_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
subsamples = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

# {NaN_threshold: {(n_estimator, learning_rate, subsample): [mean test C-index,
#                                                            mean train C-index,
#                                                            mean integrated Brier score]}}
preds_cgb_mean_by_dataset = {}
# {NaN_threshold: {(n_estimator, learning_rate, subsample): per-fold test C-indices}}
conc_cgb_by_dataset = {}

NaN_thresholds = [3, 4, 5]

# The workbook does not depend on the threshold, so read and normalise it once
# instead of re-reading the Excel file on every iteration.
file_path = '../../../Both/new_study.xlsx'
raw_df = pd.read_excel(file_path, index_col='PATNO')
raw_df = raw_df.rename(columns={'OS (days)': 'time', 'Status': 'status'})
raw_df['status'] = raw_df['status'].map({'Dead': True, 'Alive': False})

for NaN_threshold in NaN_thresholds:
    print(NaN_threshold)

    # Hand preprocess_data a fresh copy each time in case it modifies its input.
    df = preprocess_data(raw_df.copy(), NaN_threshold)

    X, y, tuple_y, target_columns = x_y_baseline(df)

    unimp_results_cgb = {}
    unimp_conc_cgb = {}

    for n_estimator in n_estimators:
        for learning_rate in learning_rates:
            for subsample in subsamples:
                cgb = ComponentwiseGradientBoostingSurvivalAnalysis(n_estimators=n_estimator,
                                                                    learning_rate=learning_rate,
                                                                    subsample=subsample,
                                                                    random_state=173637)
                conc_train = []
                conc_test = []
                brier = []

                print(f'n_estimator: {n_estimator}')

                for train, test in mcv.split(X, tuple_y):
                    X_train, X_test = X.iloc[train], X.iloc[test]
                    y_train, y_test = y[train], y[test]

                    X_train, X_test = Preprocessing_without_imputing(X_train=X_train, X_test=X_test, y_train=y_train, target_columns=target_columns)

                    # Restrict the test fold to the follow-up range seen in training:
                    # test samples whose time exceeds the largest training time are
                    # dropped before scoring.
                    times_train_max = y_train['time'].max()
                    times_test_min = y_test['time'].min()
                    times_test_max = y_test['time'].max()
                    if times_test_max > times_train_max:
                        y_test_red_index = y_test['time'] <= times_train_max
                        y_test = y_test[y_test_red_index]
                        X_test = X_test[y_test_red_index]
                        times_test_max = y_test['time'].max()
                    # Evaluation grid for the Brier score: [min, max) of the test times.
                    times_test = np.arange(times_test_min, times_test_max)

                    cgb.fit(X_train, y_train)

                    # C-index on train and test data
                    conc_train.append(cgb.score(X_train, y_train))
                    conc_test.append(cgb.score(X_test, y_test))

                    # Integrated Brier score on the test fold.
                    # np.vstack replaces np.row_stack, which is deprecated in NumPy 2.0.
                    surv_prob_test = np.vstack([fn(times_test) for fn in cgb.predict_survival_function(X_test)])
                    brier.append(integrated_brier_score(y_train, y_test, surv_prob_test, times_test))

                # Aggregate over folds for this (n_estimator, learning_rate, subsample) combination
                avg_conc_test = np.mean(conc_test)
                avg_conc_train = np.mean(conc_train)
                avg_brier = np.mean(brier)

                # No permutation importance is computed in this cell, so none is stored.
                # The previous code stored `avg_permut`, which is never assigned here
                # (NameError on a fresh kernel, stale value otherwise).
                unimp_results_cgb[(n_estimator, learning_rate, subsample)] = [avg_conc_test, avg_conc_train, avg_brier]
                unimp_conc_cgb[(n_estimator, learning_rate, subsample)] = conc_test

    preds_cgb_mean_by_dataset[NaN_threshold] = unimp_results_cgb
    conc_cgb_by_dataset[NaN_threshold] = unimp_conc_cgb


### Sort the result by the best Concordance (test)

In [None]:
# Flatten the nested {NaN_threshold: {(hyper-parameters): metrics}} results into
# one record per evaluated configuration.
rows = [
    {
        "NaN_threshold": NaN_threshold,
        "n_estimator": n_estimator,
        'learning_rate': learning_rate,
        'subsample': subsample,
        "Conc test": metrics[0],
        "Conc train": metrics[1],
        'Brier': metrics[2],
    }
    for NaN_threshold, results in preds_cgb_mean_by_dataset.items()
    for (n_estimator, learning_rate, subsample), metrics in results.items()
]

# Best configuration (highest mean test C-index) first.
unimp_scores_cgb = (
    pd.DataFrame(rows)
    .sort_values(by='Conc test', ascending=False)
    .reset_index(drop=True)
)
unimp_scores_cgb.head()

In [None]:
# Hyper-parameters of the top-ranked row (unimp_scores_cgb is sorted by
# 'Conc test', best first).
unimp_n_estimator = unimp_scores_cgb['n_estimator'].iloc[0]
unimp_learning_rate = unimp_scores_cgb['learning_rate'].iloc[0]
unimp_subsample = unimp_scores_cgb['subsample'].iloc[0]
unimp_NaN_threshold = unimp_scores_cgb['NaN_threshold'].iloc[0]

# Per-fold test C-indices of that configuration. Index directly so that a
# missing entry raises KeyError here, instead of silently yielding the string
# "Value not found" and failing later inside np.mean / ttest_ind.
unimp_best_conc_cgb = conc_cgb_by_dataset[unimp_NaN_threshold][(unimp_n_estimator, unimp_learning_rate, unimp_subsample)]
unimp_best_conc_cgb

### CGB (Imputed)

In [None]:
# Build the model inputs from imputed_df via x_y_baseline. This rebinds X, y,
# tuple_y and target_columns -- the same names the unimputed grid-search cell
# assigns -- so it must run before the imputed CGB grid search.
X, y, tuple_y, target_columns = x_y_baseline(imputed_df)

In [None]:
from sksurv.ensemble import ComponentwiseGradientBoostingSurvivalAnalysis

# Grid search for componentwise gradient boosting (CGB) on the imputed data.
# loss: coxph (the library default; it is not set explicitly below)
n_estimators = [10, 20, 30, 40, 50]
learning_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
subsamples = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

# All three dicts are keyed by (n_estimator, learning_rate, subsample).
imp_results_cgb = {}             # aggregated metrics (converted to a DataFrame at the end of the cell)
imp_feature_importance_cgb = {}  # per-fold permutation_importance results
imp_conc_cgb = {}                # per-fold test C-indices

for learning_rate in learning_rates:
    for n_estimator in n_estimators:
        for subsample in subsamples:
            with warnings.catch_warnings():
                # UserWarnings raised while evaluating this configuration are silenced.
                warnings.simplefilter("ignore", category=UserWarning)
                cgb = ComponentwiseGradientBoostingSurvivalAnalysis(n_estimators=n_estimator,
                                                                    learning_rate=learning_rate,
                                                                    subsample=subsample,
                                                                    random_state=173637)
                conc_train = []
                conc_test = []
                brier = []
                permut = []
                feat_impor = []

                print(f'learning_rate: {learning_rate}, n_estimator: {n_estimator}')

                for train, test in mcv.split(X, tuple_y):
                    X_train, X_test = X.iloc[train], X.iloc[test]
                    y_train, y_test = y[train], y[test]

                    # Project helper; presumably fits the imputation/encoding on the
                    # training fold and applies it to both folds -- TODO confirm.
                    X_train, X_test = Preprocessing(X_train=X_train, X_test=X_test, y_train=y_train, target_columns=target_columns)

                    # Restrict the test fold to the follow-up range seen in training:
                    # test samples whose time exceeds the largest training time are
                    # dropped before scoring.
                    times_train_max = y_train['time'].max()
                    times_test_min = y_test['time'].min()
                    times_test_max = y_test['time'].max()
                    if times_test_max > times_train_max:
                        y_test_red_index = y_test['time'] <= times_train_max
                        y_test = y_test[y_test_red_index]
                        X_test = X_test[y_test_red_index]
                        times_test_max = y_test['time'].max()
                    # Evaluation grid for the Brier score: [min, max) of the test times.
                    times_test = np.arange(times_test_min, times_test_max)

                    cgb.fit(X_train, y_train)

                    # C-index on train and test data
                    conc_train.append(cgb.score(X_train, y_train))
                    conc_test.append(cgb.score(X_test, y_test))

                    # Integrated Brier score on the test fold.
                    # np.vstack replaces np.row_stack, which is deprecated in NumPy 2.0.
                    surv_prob_test = np.vstack([fn(times_test) for fn in cgb.predict_survival_function(X_test)])
                    brier.append(integrated_brier_score(y_train, y_test, surv_prob_test, times_test))

                    # Permutation importance of every feature on the test fold
                    importance = permutation_importance(cgb,
                                                        X_test,
                                                        y_test,
                                                        n_repeats=10,
                                                        random_state=1)
                    permut.append(importance.importances_mean)

                    feat_impor.append(importance)

                imp_feature_importance_cgb[(n_estimator, learning_rate, subsample)] = feat_impor

                # Aggregate over folds for this (n_estimator, learning_rate, subsample) combination
                avg_conc_test = np.mean(conc_test)
                avg_conc_train = np.mean(conc_train)
                avg_brier = np.mean(brier)
                # Single scalar: mean importance over all folds and all features
                avg_permut = np.mean(permut)

                imp_results_cgb[(n_estimator, learning_rate, subsample)] = [avg_conc_test, avg_conc_train, avg_brier, avg_permut]
                imp_conc_cgb[(n_estimator, learning_rate, subsample)] = conc_test


# One record per evaluated configuration
result = [{
    'n_estimator': n_estimator,
    'learning_rate': learning_rate,
    'subsample': subsample,
    'Conc test': avg_conc_test,
    'Conc train': avg_conc_train,
    'Brier Score': avg_brier,
    'Permut': avg_permut
} for (n_estimator, learning_rate, subsample), (avg_conc_test, avg_conc_train, avg_brier, avg_permut) in imp_results_cgb.items()]

# Create the DataFrame (rebinds imp_results_cgb from the dict to a DataFrame)
imp_results_cgb = pd.DataFrame(result)


### Scores

In [None]:
# Rank the imputed CGB configurations by mean test C-index, best first.
imp_scores_cgb = (
    imp_results_cgb
    .sort_values(by='Conc test', ascending=False)
    .reset_index(drop=True)
)

# Show the ten best configurations
imp_scores_cgb.head(10)

### C-index for each fold

In [None]:
# Hyper-parameters of the top-ranked configuration (first row of the sorted scores).
imp_n_estimator, imp_learning_rate, imp_subsample = (
    imp_scores_cgb[column].iloc[0]
    for column in ('n_estimator', 'learning_rate', 'subsample')
)

# Per-fold test C-indices recorded for that configuration.
imp_best_conc_cgb = imp_conc_cgb[imp_n_estimator, imp_learning_rate, imp_subsample]
imp_best_conc_cgb

## Normal distribution check

In [None]:
# 2x2 diagnostic grid for the CGB C-index samples: one row per dataset, with a
# normal Q-Q plot in the left column and a histogram in the right column.
fig, axs = plt.subplots(2, 2, figsize=(8, 6))

panels = (
    ('KNN-Imputation', imp_best_conc_cgb),
    ('Baseline (Unimputed)', unimp_best_conc_cgb),
)

for (qq_ax, hist_ax), (panel_label, cv_scores) in zip(axs, panels):
    # Q-Q plot of the per-fold C-indices against a normal distribution
    stats.probplot(cv_scores, dist="norm", plot=qq_ax)
    qq_ax.set_title(f'Q-Q plot of {panel_label}')
    qq_ax.set_xlabel('Theoretical quantiles')
    qq_ax.set_ylabel('Ordered Values')

    # Histogram of the same per-fold C-indices
    hist_ax.hist(cv_scores, bins=10, color='salmon', edgecolor='black')
    hist_ax.set_title(f'Histogram of {panel_label}')
    hist_ax.set_xlabel('Value')
    hist_ax.set_ylabel('Frequency')

# Figure-level title and layout. The title previously said "RSF" (copied from
# the random survival forest section); this figure shows the CGB results.
plt.suptitle('Q-Q plot and Histogram of C-index to KNN-Imputation and baseline (Unimputed) - CGB', fontsize=20)
plt.subplots_adjust(top=0.88)
plt.tight_layout()

# Render the figure
plt.show()


## Comparison

In [None]:
# Mean and standard deviation of the per-fold CGB test C-indices, one line per dataset.
print(
    f"Means Baseline (Unimputed): {round(np.mean(unimp_best_conc_cgb), 3)} +/-{round(np.std(unimp_best_conc_cgb), 3)} ",
    f"Means KNN Imputation: {round(np.mean(imp_best_conc_cgb), 3)} +/-{round(np.std(imp_best_conc_cgb), 3)}",
    sep="\n",
)

In [None]:
# Two-sided independent two-sample t-test (scipy.stats.ttest_ind) between the
# per-fold test C-indices of the best unimputed and best imputed CGB models.
# equal_var is left at its scipy default (True), i.e. a pooled-variance test.
# NOTE(review): both samples come from repeated stratified CV (mcv), so the
# fold scores are not independent draws -- confirm that this test is appropriate.
ttest_ind(unimp_best_conc_cgb, imp_best_conc_cgb, alternative="two-sided")