In [None]:
import glob
import os
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import ttest_ind

from encoding import *
from rubins_rules import *


# Use the model that performed best in the Modeling notebook (Coxnet) for multiple imputation



In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RepeatedStratifiedKFold
# Shared cross-validation splitter: 5 stratified folds repeated 5 times (25 train/test splits).
# The fixed random_state makes the splits reproducible between runs.
mcv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=173637)
from sksurv.metrics import (concordance_index_censored, 
                            integrated_brier_score)
from sklearn.inspection import permutation_importance
from sksurv.linear_model import CoxnetSurvivalAnalysis


# MI

### Model

### Loop through all datasets produced by multiple imputation and evaluate a range of different alphas and l1_ratios on each of them.

In [None]:
import re

file_pattern = '../../R/datasets/MI/new_studyM*.csv'
# glob returns files in arbitrary order; sort so every run visits the datasets in the same order.
file_paths = sorted(glob.glob(file_pattern))
if not file_paths:
    # Fail early instead of producing empty result tables further down.
    raise FileNotFoundError(f"No multiply-imputed datasets found for pattern {file_pattern!r}")

# Hyperparameter grid evaluated on every imputed dataset.
alphas = [0.01, 0.05, 0.1, 0.5, 1, 3, 5, 10, 20, 50, 70, 100, 200, 500, 700, 1000]
l1_ratios = [0.0001, 0.001, 0.01, 0.1]

coef_by_dataset = {}
# dataset name -> {(alpha, l1_ratio): [avg_conc_test, std_conc_test, avg_conc_train, avg_brier]}
preds_coxnet_mean_by_dataset = {}

for file_path in file_paths:
    # Take the dataset label ('M1', 'M2', ...) from the file name only, so directory names cannot match.
    match = re.search(r'M\d+', os.path.basename(file_path))
    if match is None:
        # Without a label the results would be stored under the previous dataset's name.
        raise ValueError(f"Cannot extract a dataset name ('M<number>') from {file_path!r}")
    dataset_name = match.group()

    df = pd.read_csv(file_path, sep=',', index_col=0)
    df = df.drop('PATNO', axis=1)

    X, y, tuple_y, target_columns = x_y_multiple(df)  # a train_df could be passed here instead

    results_coxnet = {}
    print(f'dataset name: {dataset_name}')

    for l1_ratio in l1_ratios:
        for alpha in alphas:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=UserWarning)
                warnings.filterwarnings("ignore", message="overflow encountered in exp")

                coxnet = CoxnetSurvivalAnalysis(l1_ratio=l1_ratio, alphas=[alpha], fit_baseline_model=True)
                conc_train = []
                conc_test = []
                brier = []

                print(f'alpha: {alpha}, l1_ratio: {l1_ratio}')

                # Iterate the folds
                for train, test in mcv.split(X, tuple_y):  # X_train could be used here instead of X
                    X_train, X_test = X.iloc[train], X.iloc[test]
                    y_train, y_test = y[train], y[test]

                    X_train, X_test = Preprocessing_without_imputing(X_train=X_train, X_test=X_test, y_train=y_train, target_columns=target_columns)

                    # Restrict the test fold to the follow-up range seen in training:
                    # test samples with a time beyond the largest training time are dropped.
                    times_train_max = y_train['time'].max()
                    times_test_min = y_test['time'].min()
                    times_test_max = y_test['time'].max()
                    if times_test_max > times_train_max:
                        within_train_range = y_test['time'] <= times_train_max
                        y_test = y_test[within_train_range]
                        X_test = X_test[within_train_range]
                        times_test_max = y_test['time'].max()
                    # Evaluation grid for the Brier score: [min test time, max test time)
                    times_test = np.arange(times_test_min, times_test_max)

                    coxnet.fit(X_train, y_train)

                    conc_train.append(coxnet.score(X_train, y_train))
                    conc_test.append(coxnet.score(X_test, y_test))

                    # Brier Score (np.vstack replaces the deprecated alias np.row_stack)
                    surv_prob_test = np.vstack([fn(times_test) for fn in coxnet.predict_survival_function(X_test)])
                    brier.append(integrated_brier_score(y_train, y_test, surv_prob_test, times_test))

                # Evaluate and record the results after each alpha and l1_ratio combination
                avg_conc_test = np.mean(conc_test)
                std_conc_test = np.std(conc_test)
                avg_conc_train = np.mean(conc_train)
                avg_brier = np.mean(brier)

                results_coxnet[(alpha, l1_ratio)] = [avg_conc_test, std_conc_test, avg_conc_train, avg_brier]

    preds_coxnet_mean_by_dataset[dataset_name] = results_coxnet

### Best score and parameters for each dataset

This code finds the best hyperparameters for each dataset that yield the highest c-index (test).

In [None]:
best_models_data = []

# Iterate over each dataset
for dataset_name, results_coxnet in preds_coxnet_mean_by_dataset.items():
    # Reset every "best" value for each dataset so nothing can carry over from the
    # previous dataset when no combination beats the initial score (empty or NaN results).
    best_alpha = best_l1_ratio = None
    best_avg_conc_test = float('-inf')  # Initialize with the lowest possible value
    best_std_conc_test = best_avg_conc_train = best_avg_brier = None

    # Find the model with the highest avg_conc_test for the current dataset
    for (alpha, l1_ratio), values in results_coxnet.items():
        avg_conc_test, std_conc_test, avg_conc_train, avg_brier = values
        # Strict '>' keeps the first combination on ties; a NaN score never wins.
        if avg_conc_test > best_avg_conc_test:
            best_avg_conc_test = avg_conc_test
            best_alpha, best_l1_ratio = alpha, l1_ratio
            best_std_conc_test, best_avg_conc_train, best_avg_brier = std_conc_test, avg_conc_train, avg_brier

    # Append the best model data for the current dataset to the list
    best_models_data.append({
        'Dataset Name': dataset_name,
        'Alpha': best_alpha,
        'L1 Ratio': best_l1_ratio,
        'Best Avg Conc Test': best_avg_conc_test,
        'Best Std Conc Test': best_std_conc_test,
        'Best Avg Conc Train': best_avg_conc_train,
        'Best Avg Brier': best_avg_brier,
    })

# Create a DataFrame from the list of best models data
df_best_models = pd.DataFrame(best_models_data)

# Display the DataFrame
df_best_models

### Find the hyperparameter combination(s) that are selected most often.

In [None]:
# Tally how often each (Alpha, L1 Ratio) pair was the per-dataset winner
pair_sizes = df_best_models.groupby(['Alpha', 'L1 Ratio']).size()
alpha_l1_counts = pair_sizes.reset_index(name='Counts')

# Keep every pair that reaches the highest tally (ties give several rows)
highest_count = alpha_l1_counts['Counts'].max()
most_common = alpha_l1_counts[alpha_l1_counts['Counts'] == highest_count]

print("Most common combinations of Alpha and L1 Ratio:")
most_common


### Run the model with the best parameters

In [None]:
# Use the first of the most frequent (Alpha, L1 Ratio) pairs as the MI hyperparameters
MI_alpha, MI_l1_ratio = (most_common[column].iloc[0] for column in ('Alpha', 'L1 Ratio'))
print(MI_alpha, MI_l1_ratio, sep='\n')

### Run all the datasets again, but this time only using the best hyperparameters.

In [None]:
import re

file_pattern = '../../R/datasets/MI/new_studyM*.csv'
# glob returns files in arbitrary order; sort so every run visits the datasets in the same order.
file_paths = sorted(glob.glob(file_pattern))
if not file_paths:
    # Fail early instead of producing empty result tables further down.
    raise FileNotFoundError(f"No multiply-imputed datasets found for pattern {file_pattern!r}")

# dataset name -> {(MI_alpha, MI_l1_ratio): [avg_conc_test, std_conc_test, avg_conc_train, avg_brier]}
preds_coxnet_mean_by_dataset = {}
# dataset name -> list with the test C-index of every CV fold
MI_conc_coxnet_by_dataset = {}

for file_path in file_paths:
    # Take the dataset label ('M1', 'M2', ...) from the file name only, so directory names cannot match.
    match = re.search(r'M\d+', os.path.basename(file_path))
    if match is None:
        # Without a label the results would be stored under the previous dataset's name.
        raise ValueError(f"Cannot extract a dataset name ('M<number>') from {file_path!r}")
    dataset_name = match.group()

    df = pd.read_csv(file_path, sep=',', index_col=0)
    # Drop the patient identifier, exactly as the hyperparameter search on these files does,
    # so both runs are fitted on the same columns.
    df = df.drop('PATNO', axis=1)

    X, y, tuple_y, target_columns = x_y_multiple(df)

    results_coxnet = {}

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)

        coxnet = CoxnetSurvivalAnalysis(l1_ratio=MI_l1_ratio, alphas=[MI_alpha], fit_baseline_model=True)
        conc_train = []
        conc_test = []
        brier = []

        print(f'alpha: {MI_alpha}, l1_ratio: {MI_l1_ratio}')

        # Iterate the folds
        for train, test in mcv.split(X, tuple_y):
            X_train, X_test = X.iloc[train], X.iloc[test]
            y_train, y_test = y[train], y[test]

            X_train, X_test = Preprocessing_without_imputing(X_train=X_train, X_test=X_test, y_train=y_train, target_columns=target_columns)

            # Restrict the test fold to the follow-up range seen in training:
            # test samples with a time beyond the largest training time are dropped.
            times_train_max = y_train['time'].max()
            times_test_min = y_test['time'].min()
            times_test_max = y_test['time'].max()
            if times_test_max > times_train_max:
                within_train_range = y_test['time'] <= times_train_max
                y_test = y_test[within_train_range]
                X_test = X_test[within_train_range]
                times_test_max = y_test['time'].max()
            # Evaluation grid for the Brier score: [min test time, max test time)
            times_test = np.arange(times_test_min, times_test_max)

            coxnet.fit(X_train, y_train)

            conc_train.append(coxnet.score(X_train, y_train))
            conc_test.append(coxnet.score(X_test, y_test))

            # Brier Score (np.vstack replaces the deprecated alias np.row_stack)
            surv_prob_test = np.vstack([fn(times_test) for fn in coxnet.predict_survival_function(X_test)])
            brier.append(integrated_brier_score(y_train, y_test, surv_prob_test, times_test))

        # Summarise the folds for this dataset
        avg_conc_test = np.mean(conc_test)
        std_conc_test = np.std(conc_test)
        avg_conc_train = np.mean(conc_train)
        avg_brier = np.mean(brier)

        results_coxnet[(MI_alpha, MI_l1_ratio)] = [avg_conc_test, std_conc_test, avg_conc_train, avg_brier]

    preds_coxnet_mean_by_dataset[dataset_name] = results_coxnet
    MI_conc_coxnet_by_dataset[dataset_name] = conc_test

### Score for each dataset using the best hyperparameter

In [None]:
import pandas as pd

score_columns = ['Dataset Name', 'Alpha', 'L1 Ratio', 'Avg Conc Test', 'Std Conc Test', 'Avg Conc Train', 'Avg Brier']

# Flatten the nested results into one record per dataset; every dataset holds a
# single (alpha, l1_ratio) entry mapping to its four metrics.
data_for_df = [
    [dataset_name, alpha, l1_ratio, avg_conc_test, std_conc_test, avg_conc_train, avg_brier]
    for dataset_name, results_coxnet in preds_coxnet_mean_by_dataset.items()
    for (alpha, l1_ratio), (avg_conc_test, std_conc_test, avg_conc_train, avg_brier) in results_coxnet.items()
]

# Build the table and show it as the cell output
MI_score = pd.DataFrame(data_for_df, columns=score_columns)
MI_score


In [None]:
# Average the per-dataset mean C-index and the per-dataset standard deviation
mi_avg_conc = np.mean(MI_score['Avg Conc Test'])
mi_avg_std = np.mean(MI_score['Std Conc Test'])
print(f"Mean: {mi_avg_conc:.3f} +- {mi_avg_std:.3f}")

### C-index for each fold

`MI_conc_coxnet_by_dataset` is a dictionary where each dataset is a key, and the corresponding value is a list containing the c-index values of all folds.

Because we have many datasets, we take the mean of each fold across all the datasets.

In [None]:
# One column per dataset, one row per CV fold
df = pd.DataFrame.from_dict(MI_conc_coxnet_by_dataset, orient='index').T

# Average every fold's C-index over the imputed datasets
MI_mean_conc = df.mean(axis=1).tolist()
MI_mean_conc

# Baseline model (KNN-imputation)

Now do the same for the baseline model: find the concordance of each fold for the best hyperparameters.

In [None]:
# Location of the study spreadsheet (adjust to your own checkout)
file_path = '../../../Both/new_study.xlsx'

# Load with the patient number as index and rename the outcome columns in one step
baseline_df = (
    pd.read_excel(file_path, index_col='PATNO')
    .rename(columns={'OS (days)': 'time', 'Status': 'status'})
)
# Encode the event indicator as a boolean: 'Dead' -> True, 'Alive' -> False
baseline_df['status'] = baseline_df['status'].map({'Dead': True, 'Alive': False})

baseline_df.columns

### Find the best model for the baseline

In [None]:
# Split the baseline frame into the objects used by the CV loop below: features X, survival target y
# (indexed by 'time' later on), tuple_y (passed to the splitter for stratification) and target_columns.
# NOTE(review): x_y_baseline is defined outside this notebook (star import) — confirm the exact contents of each return value.
X, y, tuple_y, target_columns = x_y_baseline(baseline_df)

In [None]:
# Cross-validated grid search for the Coxnet model (sksurv) on the baseline data
alphas = [0, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 3, 5, 10, 20, 50, 70, 100, 200, 500, 700, 1000]
l1_ratios = [0.0001, 0.001, 0.01, 0.1]

results_coxnet = {}        # (alpha, l1_ratio) -> [avg_conc_test, avg_conc_train, avg_brier]
coefficients_coxnet = {}   # (alpha, l1_ratio) -> list with the fitted coefficient vector of every fold
conc_coxnet = {}           # (alpha, l1_ratio) -> list with the test C-index of every fold

for l1_ratio in l1_ratios:
    for alpha in alphas:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)

            coxnet = CoxnetSurvivalAnalysis(l1_ratio=l1_ratio, alphas=[alpha], fit_baseline_model=True)
            conc_train = []
            conc_test = []
            brier = []
            coef = []

            print(f'alpha: {alpha}, l1_ratio: {l1_ratio}')

            for train, test in mcv.split(X, tuple_y):
                X_train, X_test = X.iloc[train], X.iloc[test]
                y_train, y_test = y[train], y[test]

                X_train, X_test = Preprocessing(X_train=X_train, X_test=X_test, y_train=y_train, target_columns=target_columns)

                # Restrict the test fold to the follow-up range seen in training:
                # test samples with a time beyond the largest training time are dropped.
                times_train_max = y_train['time'].max()
                times_test_min = y_test['time'].min()
                times_test_max = y_test['time'].max()
                if times_test_max > times_train_max:
                    within_train_range = y_test['time'] <= times_train_max
                    y_test = y_test[within_train_range]
                    X_test = X_test[within_train_range]
                    times_test_max = y_test['time'].max()
                # Evaluation grid for the Brier score: [min test time, max test time)
                times_test = np.arange(times_test_min, times_test_max)

                coxnet.fit(X_train, y_train)

                # Keep this fold's coefficients; previously `coef` was never filled, so
                # coefficients_coxnet only ever contained empty lists.
                coef.append(np.ravel(coxnet.coef_).copy())

                # Compute the C-index for test data and train data
                conc_train.append(coxnet.score(X_train, y_train))
                conc_test.append(coxnet.score(X_test, y_test))

                # Brier Score (np.vstack replaces the deprecated alias np.row_stack)
                surv_prob_test = np.vstack([fn(times_test) for fn in coxnet.predict_survival_function(X_test)])
                brier.append(integrated_brier_score(y_train, y_test, surv_prob_test, times_test))

            coefficients_coxnet[(alpha, l1_ratio)] = coef

            # Evaluate and record the results after each alpha and l1_ratio combination
            avg_conc_test = np.mean(conc_test)
            avg_conc_train = np.mean(conc_train)
            avg_brier = np.mean(brier)

            results_coxnet[(alpha, l1_ratio)] = [avg_conc_test, avg_conc_train, avg_brier]

            conc_coxnet[(alpha, l1_ratio)] = conc_test

result = [{
    'Alpha': alpha,
    'L1 Ratio': l1_ratio,
    'Conc test': avg_conc_test,
    'Conc train': avg_conc_train,
    'Brier Score': avg_brier,
} for (alpha, l1_ratio), (avg_conc_test, avg_conc_train, avg_brier) in results_coxnet.items()]

# Create the DataFrame (from here on results_coxnet is a table, no longer a dict)
results_coxnet = pd.DataFrame(result)


### Score

In [None]:
# Rank the hyperparameter combinations by test C-index, best first, with a fresh 0..n-1 index
scores_coxnet = (
    results_coxnet
    .sort_values(by='Conc test', ascending=False)
    .reset_index(drop=True)
)

# Show the ten best combinations
scores_coxnet.head(10)

### Find the c-index of each fold for the best hyperparameters

In [None]:
# The top row of the ranked table holds the best baseline hyperparameters
baseline_alpha, baseline_l1_ratio = (scores_coxnet[column].iloc[0] for column in ('Alpha', 'L1 Ratio'))

# Per-fold test C-indices recorded for that combination
baseline_best_conc_coxnet = conc_coxnet[(baseline_alpha, baseline_l1_ratio)]
baseline_best_conc_coxnet

# Unimputed Coxnet

### Model

In [None]:
# Cross-validated grid search for the Coxnet model (sksurv) on the data without imputation,
# repeated for several NaN thresholds passed to preprocess_data.
alphas = [0, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 3, 5, 10, 20, 50, 70, 100, 200, 500, 700, 1000]
l1_ratios = [0.0001, 0.001, 0.01, 0.1]

preds_coxnet_mean_by_dataset = {}   # NaN_threshold -> {(alpha, l1_ratio): [avg_conc_test, avg_conc_train]}
conc_coxnet_by_dataset = {}         # NaN_threshold -> {(alpha, l1_ratio): per-fold test C-indices}

NaN_thresholds = [3, 4, 5]

# The spreadsheet does not depend on the threshold, so read and recode it only once.
file_path = '../../../Both/new_study.xlsx'
unimp_raw_df = pd.read_excel(file_path, index_col='PATNO')
unimp_raw_df = unimp_raw_df.rename(columns={'OS (days)': 'time'})
unimp_raw_df = unimp_raw_df.rename(columns={'Status': 'status'})
unimp_raw_df['status'] = unimp_raw_df['status'].map({'Dead': True, 'Alive': False})

for NaN_threshold in NaN_thresholds:
    print(NaN_threshold)

    # Hand over a copy so every threshold starts from the untouched raw data,
    # whether or not preprocess_data modifies its argument.
    df = preprocess_data(unimp_raw_df.copy(), NaN_threshold)

    X, y, tuple_y, target_columns = x_y_baseline(df)

    unimp_results_coxnet = {}
    unimp_conc_coxnet = {}

    for l1_ratio in l1_ratios:
        for alpha in alphas:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=UserWarning)

                coxnet = CoxnetSurvivalAnalysis(l1_ratio=l1_ratio, alphas=[alpha], fit_baseline_model=True)
                conc_train = []
                conc_test = []

                print(f'alpha: {alpha}, l1_ratio: {l1_ratio}')

                for train, test in mcv.split(X, tuple_y):
                    X_train, X_test = X.iloc[train], X.iloc[test]
                    y_train, y_test = y[train], y[test]

                    X_train, X_test = Preprocessing_without_imputing(X_train=X_train, X_test=X_test, y_train=y_train, target_columns=target_columns)

                    # Restrict the test fold to the follow-up range seen in training:
                    # test samples with a time beyond the largest training time are dropped.
                    times_train_max = y_train['time'].max()
                    if y_test['time'].max() > times_train_max:
                        within_train_range = y_test['time'] <= times_train_max
                        y_test = y_test[within_train_range]
                        X_test = X_test[within_train_range]

                    coxnet.fit(X_train, y_train)

                    # Compute the C-index for test data and train data
                    conc_train.append(coxnet.score(X_train, y_train))
                    conc_test.append(coxnet.score(X_test, y_test))

                # Evaluate and record the results after each alpha and l1_ratio combination
                avg_conc_test = np.mean(conc_test)
                avg_conc_train = np.mean(conc_train)

                unimp_results_coxnet[(alpha, l1_ratio)] = [avg_conc_test, avg_conc_train]

                unimp_conc_coxnet[(alpha, l1_ratio)] = conc_test

    preds_coxnet_mean_by_dataset[NaN_threshold] = unimp_results_coxnet
    conc_coxnet_by_dataset[NaN_threshold] = unimp_conc_coxnet


## Scores

In [None]:
# One record per (NaN threshold, alpha, l1_ratio); metrics holds [test C-index, train C-index]
rows = [
    {
        "NaN_threshold": NaN_threshold,
        "alpha": alpha,
        "l1_ratio": l1_ratio,
        "Conc test": metrics[0],
        "Conc train": metrics[1],
    }
    for NaN_threshold, results in preds_coxnet_mean_by_dataset.items()
    for (alpha, l1_ratio), metrics in results.items()
]

# Rank all combinations by test C-index, best first
unimp_scores_coxnet = (
    pd.DataFrame(rows)
    .sort_values(by='Conc test', ascending=False)
    .reset_index(drop=True)
)
unimp_scores_coxnet.head(10)

## C-index for each fold

In [None]:
# The top row of the ranked table holds the best unimputed configuration
unimp_alpha = unimp_scores_coxnet['alpha'].iloc[0]
unimp_l1_ratio = unimp_scores_coxnet['l1_ratio'].iloc[0]
unimp_NaN_threshold = unimp_scores_coxnet['NaN_threshold'].iloc[0]

# Index directly: a missing entry now raises a KeyError here instead of handing the
# string "Value not found" to the plots and t-tests that consume this list.
unimp_best_conc_coxnet = conc_coxnet_by_dataset[unimp_NaN_threshold][(unimp_alpha, unimp_l1_ratio)]
unimp_best_conc_coxnet

# Normal distribution
To use the t-test, we must first check that the data are approximately normally distributed.

In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt

fig, axs = plt.subplots(2, 2, figsize=(8, 6))

# One row of panels per sample: Q-Q plot on the left, histogram on the right.
# Row 0 shows unimp_best_conc_coxnet, row 1 shows MI_mean_conc.
panels = [
    (unimp_best_conc_coxnet, 'Q-Q plot of Baseline (Unimputed)', 'Histogram of Baseline (Unimputed)'),
    (MI_mean_conc, 'Q-Q plot of MI', 'Histogram of MI'),
]

for row, (conc_values, qq_title, hist_title) in enumerate(panels):
    qq_ax, hist_ax = axs[row, 0], axs[row, 1]

    # probplot writes its own title and labels, so ours are set afterwards
    stats.probplot(conc_values, dist="norm", plot=qq_ax)
    qq_ax.set_title(qq_title)
    qq_ax.set_xlabel('Theoretical quantiles')
    qq_ax.set_ylabel('Ordered Values')

    hist_ax.hist(conc_values, bins=10, color='salmon', edgecolor='black')
    hist_ax.set_title(hist_title)
    hist_ax.set_xlabel('Value')
    hist_ax.set_ylabel('Frequency')

# Figure title and layout
plt.suptitle('Q-Q plot and Histogram of concordances in MI and Baseline (Unimputed)', fontsize=20)
plt.subplots_adjust(top=0.88)
plt.tight_layout()

# Render the figure
plt.show()


In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt

fig, axs = plt.subplots(2, 2, figsize=(8, 6))

# One row of panels per sample: Q-Q plot on the left, histogram on the right.
# Row 0 shows baseline_best_conc_coxnet, row 1 shows MI_mean_conc.
panels = [
    (baseline_best_conc_coxnet, 'Q-Q plot of KNN-Imputation', 'Histogram of KNN-Imputation'),
    (MI_mean_conc, 'Q-Q plot of MI', 'Histogram of MI'),
]

for row, (conc_values, qq_title, hist_title) in enumerate(panels):
    qq_ax, hist_ax = axs[row, 0], axs[row, 1]

    # probplot writes its own title and labels, so ours are set afterwards
    stats.probplot(conc_values, dist="norm", plot=qq_ax)
    qq_ax.set_title(qq_title)
    qq_ax.set_xlabel('Theoretical quantiles')
    qq_ax.set_ylabel('Ordered Values')

    hist_ax.hist(conc_values, bins=10, color='salmon', edgecolor='black')
    hist_ax.set_title(hist_title)
    hist_ax.set_xlabel('Value')
    hist_ax.set_ylabel('Frequency')

# Figure title and layout
plt.suptitle('Q-Q plot and Histogram of concordances in MI and KNN-Imputation', fontsize=20)
plt.subplots_adjust(top=0.88)
plt.tight_layout()

# Render the figure
plt.show()


# Comparison between KNN imputation and Multiple Imputation using the t-test.

In [None]:
# MI: averages of the per-dataset mean and std; KNN: mean and std over the CV folds
mi_avg_conc = np.mean(MI_score['Avg Conc Test'])
mi_avg_std = np.mean(MI_score['Std Conc Test'])
knn_avg_conc = round(np.mean(baseline_best_conc_coxnet), 3)
knn_std_conc = round(np.std(baseline_best_conc_coxnet), 3)
print(
    f"Means MI: {mi_avg_conc:.3f} +/-{mi_avg_std:.3f}\n"
    f"Means KNN Imputation: {knn_avg_conc} +/-{knn_std_conc}"
)


In [None]:
# Two-sided independent two-sample t-test (scipy.stats.ttest_ind) on the per-fold test C-indices:
# multiple imputation (fold-wise mean over the imputed datasets) vs. KNN imputation.
# NOTE(review): the scores come from repeated cross-validation, so the samples may not be
# independent — confirm that an independent-samples test is appropriate here.
ttest_ind(MI_mean_conc, baseline_best_conc_coxnet, alternative="two-sided")

# Comparison between the unimputed baseline and Multiple Imputation using the t-test.

In [None]:
# MI: averages of the per-dataset mean and std; unimputed: mean and std over the CV folds
mi_avg_conc = np.mean(MI_score['Avg Conc Test'])
mi_avg_std = np.mean(MI_score['Std Conc Test'])
unimp_avg_conc = round(np.mean(unimp_best_conc_coxnet), 3)
unimp_std_conc = round(np.std(unimp_best_conc_coxnet), 3)
print(
    f"Means MI: {mi_avg_conc:.3f} +/-{mi_avg_std:.3f}\n"
    f"Baseline (Unimputed) : {unimp_avg_conc} +/-{unimp_std_conc}"
)


In [None]:
# Two-sided independent two-sample t-test (scipy.stats.ttest_ind) on the per-fold test C-indices:
# multiple imputation (fold-wise mean over the imputed datasets) vs. the unimputed model.
# NOTE(review): the scores come from repeated cross-validation, so the samples may not be
# independent — confirm that an independent-samples test is appropriate here.
ttest_ind(MI_mean_conc, unimp_best_conc_coxnet, alternative="two-sided")

# Model-based imputation (single)

### Model

In [None]:
# Single model-based imputed dataset. Use a path relative to the notebook, like the other
# data paths in this notebook, instead of an absolute path that only exists on one machine.
# NOTE(review): assumes '../../R' is the same 'New-Study/R' directory the absolute path pointed to — confirm.
file_pattern = '../../R/datasets/singlenew_studyM1.csv'

df = pd.read_csv(file_pattern, sep=',', index_col=0)
# Drop the patient identifier before building the model inputs
df = df.drop('PATNO', axis=1)

X, y, tuple_y, target_columns = x_y_multiple(df)

In [None]:
# Cross-validated grid search for the Coxnet model (sksurv) on the single-imputed dataset
alphas = [0, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 3, 5, 10, 20, 50, 70, 100, 200, 500, 700, 1000]
l1_ratios = [0.0001, 0.001, 0.01, 0.1]

single_results_coxnet = {}        # (alpha, l1_ratio) -> [avg_conc_test, avg_conc_train, avg_brier]
single_coefficients_coxnet = {}   # (alpha, l1_ratio) -> list with the fitted coefficient vector of every fold
single_conc_coxnet = {}           # (alpha, l1_ratio) -> list with the test C-index of every fold

for l1_ratio in l1_ratios:
    for alpha in alphas:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)

            coxnet = CoxnetSurvivalAnalysis(l1_ratio=l1_ratio, alphas=[alpha], fit_baseline_model=True)
            conc_train = []
            conc_test = []
            brier = []
            coef = []

            print(f'alpha: {alpha}, l1_ratio: {l1_ratio}')

            for train, test in mcv.split(X, tuple_y):
                X_train, X_test = X.iloc[train], X.iloc[test]
                y_train, y_test = y[train], y[test]

                # NOTE(review): the MI cells call Preprocessing_without_imputing on already-imputed
                # data, while this cell calls Preprocessing — confirm which one is intended here.
                X_train, X_test = Preprocessing(X_train=X_train, X_test=X_test, y_train=y_train, target_columns=target_columns)

                # Restrict the test fold to the follow-up range seen in training:
                # test samples with a time beyond the largest training time are dropped.
                times_train_max = y_train['time'].max()
                times_test_min = y_test['time'].min()
                times_test_max = y_test['time'].max()
                if times_test_max > times_train_max:
                    within_train_range = y_test['time'] <= times_train_max
                    y_test = y_test[within_train_range]
                    X_test = X_test[within_train_range]
                    times_test_max = y_test['time'].max()
                # Evaluation grid for the Brier score: [min test time, max test time)
                times_test = np.arange(times_test_min, times_test_max)

                coxnet.fit(X_train, y_train)

                # Keep this fold's coefficients; previously `coef` was never filled, so
                # single_coefficients_coxnet only ever contained empty lists.
                coef.append(np.ravel(coxnet.coef_).copy())

                # Compute the C-index for test data and train data
                conc_train.append(coxnet.score(X_train, y_train))
                conc_test.append(coxnet.score(X_test, y_test))

                # Brier Score (np.vstack replaces the deprecated alias np.row_stack)
                surv_prob_test = np.vstack([fn(times_test) for fn in coxnet.predict_survival_function(X_test)])
                brier.append(integrated_brier_score(y_train, y_test, surv_prob_test, times_test))

            single_coefficients_coxnet[(alpha, l1_ratio)] = coef

            # Evaluate and record the results after each alpha and l1_ratio combination
            avg_conc_test = np.mean(conc_test)
            avg_conc_train = np.mean(conc_train)
            avg_brier = np.mean(brier)

            single_results_coxnet[(alpha, l1_ratio)] = [avg_conc_test, avg_conc_train, avg_brier]

            single_conc_coxnet[(alpha, l1_ratio)] = conc_test

result = [{
    'Alpha': alpha,
    'L1 Ratio': l1_ratio,
    'Conc test': avg_conc_test,
    'Conc train': avg_conc_train,
    'Brier Score': avg_brier,
} for (alpha, l1_ratio), (avg_conc_test, avg_conc_train, avg_brier) in single_results_coxnet.items()]

# Create the DataFrame (from here on single_results_coxnet is a table, no longer a dict)
single_results_coxnet = pd.DataFrame(result)


### Score

In [None]:
# Rank the hyperparameter combinations by test C-index, best first, with a fresh 0..n-1 index
single_scores_coxnet = (
    single_results_coxnet
    .sort_values(by='Conc test', ascending=False)
    .reset_index(drop=True)
)

# Show the ten best combinations
single_scores_coxnet.head(10)

### Find the c-index

In [None]:
# The top row of the ranked table holds the best hyperparameters for the single-imputed data
single_alpha = single_scores_coxnet['Alpha'].iloc[0]
single_l1_ratio = single_scores_coxnet['L1 Ratio'].iloc[0]

# Look up the per-fold C-indices with the single-imputation winners; the lookup previously used
# baseline_alpha / baseline_l1_ratio, i.e. the best parameters of a different model.
single_best_conc_coxnet = single_conc_coxnet[(single_alpha, single_l1_ratio)]
single_best_conc_coxnet

# Comparison between KNN imputation and single model-based imputation using the t-test.

In [None]:
# Mean and standard deviation of the per-fold C-indices for both models
single_avg_conc = round(np.mean(single_best_conc_coxnet), 3)
single_std_conc = round(np.std(single_best_conc_coxnet), 3)
knn_avg_conc = round(np.mean(baseline_best_conc_coxnet), 3)
knn_std_conc = round(np.std(baseline_best_conc_coxnet), 3)
print(
    f"Model-based (single): {single_avg_conc} +/-{single_std_conc}\n"
    f"KNN imputation : {knn_avg_conc} +/-{knn_std_conc}"
)


In [None]:
# Two-sided independent two-sample t-test (scipy.stats.ttest_ind) on the per-fold test C-indices:
# single model-based imputation vs. KNN imputation.
# NOTE(review): the scores come from repeated cross-validation, so the samples may not be
# independent — confirm that an independent-samples test is appropriate here.
ttest_ind(single_best_conc_coxnet, baseline_best_conc_coxnet, alternative="two-sided")

# Comparison between the unimputed baseline and single model-based imputation using the t-test.

In [None]:
# Mean per-fold C-index of both models
single_avg_conc = round(np.mean(single_best_conc_coxnet), 3)
unimp_avg_conc = round(np.mean(unimp_best_conc_coxnet), 3)
print(
    f"Model-based (single): {single_avg_conc}\n"
    f"Baseline (unimputed) : {unimp_avg_conc}"
)

In [None]:
# Two-sided independent two-sample t-test (scipy.stats.ttest_ind) on the per-fold test C-indices:
# single model-based imputation vs. the unimputed model.
# NOTE(review): the scores come from repeated cross-validation, so the samples may not be
# independent — confirm that an independent-samples test is appropriate here.
ttest_ind(single_best_conc_coxnet, unimp_best_conc_coxnet, alternative="two-sided")