# Px Models Benchmarks

## Load PaCMAP-LGBM data

In [1]:
import pandas as pd
import sys
sys.path.append('../')
from source.bokeh_plots import *
from source.data_visualization import *
output_notebook()

file_path = '../data/al_atlas_main_results.xlsx'
model_name = 'AML Epigenomic Risk'

# Load the results table, indexed by its first column and ordered by index
df = pd.read_excel(file_path, index_col=0)
df = df.sort_index()

# Split the rows by their 'Train-Test' label
split_label = df['Train-Test']
df_train = df[split_label == 'Train Sample']
df_test = df[split_label == 'Test Sample']

# Training rows that have a recorded vital status
df_px = df_train[df_train['Vital Status'].notna()]

# Training rows that have a WHO 2022 diagnosis label
df_dx = df_train[df_train['WHO 2022 Diagnosis'].notna()]

# Diagnosis classes left out of df_dx (noted as having fewer than 10 samples)
rare_classes = [
    'MPAL with t(v;11q23.3)/KMT2A-r',
    'B-ALL with hypodiploidy',
    'AML with t(16;21); FUS::ERG',
    'AML with t(9;22); BCR::ABL1',
]
df_dx = df_dx[~df_dx['WHO 2022 Diagnosis'].isin(rare_classes)]

# Samples from the COG AAML0531, AAML1031 and AAML03P1 trials, restricted to
# the diagnostic sample types listed below
cog_trials = ['AAML0531', 'AAML1031', 'AAML03P1']
dx_sample_types = [
    'Diagnosis',
    'Primary Blood Derived Cancer - Bone Marrow',
    'Primary Blood Derived Cancer - Peripheral Blood',
]
df_cog = df[df['Clinical Trial'].isin(cog_trials)]
df_cog = df_cog[df_cog['Sample Type'].isin(dx_sample_types)]

# One row per patient: when a Patient_ID repeats, keep its last occurrence
df_cog = df_cog.drop_duplicates(subset='Patient_ID', keep='last')

# Expose the model outputs under the names used in the benchmark tables
df_cog = df_cog.rename(columns={
    'AML Epigenomic Risk_int': 'PaCMAP-LGBM',
    'P(High Risk)': 'PaCMAP-LGBM P(High Risk)',
})

## Load EWAS-Cox datasets

In [2]:
# Directory holding the processed EWAS-Cox result workbooks
output_path = '/mnt/d/MethylScore/Processed_Data/'
path_os = output_path + 'multivariate_cox_lasso/ewas_cog_os_MethylScoreAML_Px.xlsx'
path_efs = output_path + 'multivariate_cox_lasso/ewas_cog_efs_MethylScoreAML_Px_efs.xlsx'

# Source column -> benchmark column; '(cont)' marks the continuous score and
# the unsuffixed name the '_cat_bin' column
os_rename = {
    'MethylScoreAML_Px': 'EWASCox_OS_48CpGs (cont)',
    'MethylScoreAML_Px_cat_bin': 'EWASCox_OS_48CpGs',
}
efs_rename = {
    'MethylScore55_NewRiskEFS': 'EWASCox_EFS_55CpGs (cont)',
    'MethylScore55_NewRiskEFS_cat_bin': 'EWASCox_EFS_55CpGs',
}

# Read each workbook, keep only the two score columns, then rename them
ewas_cox_OS_48CpGs = (
    pd.read_excel(path_os, index_col=0)[list(os_rename)]
    .rename(columns=os_rename)
)
ewas_cox_EFS_55CpGs = (
    pd.read_excel(path_efs, index_col=0)[list(efs_rename)]
    .rename(columns=efs_rename)
)

## Combine datasets

In [27]:
# Attach both EWAS-Cox score tables to the COG samples; DataFrame.join aligns
# on the index and keeps every row of df_cog (left join by default)
df_combined = df_cog.join(ewas_cox_OS_48CpGs).join(ewas_cox_EFS_55CpGs)

# Categorical predictions and continuous scores, each alongside the 'os.evnt'
# column that is later used as the ROC target
df_cat = df_combined[['os.evnt', 'EWASCox_OS_48CpGs', 'EWASCox_EFS_55CpGs', 'PaCMAP-LGBM']]
df_cont = df_combined[['os.evnt', 'EWASCox_OS_48CpGs (cont)', 'EWASCox_EFS_55CpGs (cont)', 'PaCMAP-LGBM P(High Risk)']]

# Give the continuous columns the same names as their categorical counterparts
# so both tables produce matching legend entries
df_cont = df_cont.rename(columns={'PaCMAP-LGBM P(High Risk)':'PaCMAP-LGBM',
                                  'EWASCox_OS_48CpGs (cont)': 'EWASCox_OS_48CpGs',
                                  'EWASCox_EFS_55CpGs (cont)': 'EWASCox_EFS_55CpGs'})

# Take an explicit copy: the columns are overwritten below, and assigning into
# a bare column slice of df_cog is the chained-assignment pattern that raises
# pandas' SettingWithCopyWarning
risk = df_cog[['Risk Group AAML1831','Risk Group']].copy()

# Numeric encoding of the clinical risk labels (both spellings are accepted)
low_high_dict = {'Low': 0, 'Low Risk': 0,
                'Standard':0.5, 'Standard Risk': 0.5,
                'High': 1, 'High Risk': 1}

# Labels that are not keys of low_high_dict become NaN here and are removed
# by the dropna() calls below
risk['Risk Group'] = risk['Risk Group'].map(low_high_dict)
risk['Risk Group AAML1831'] = risk['Risk Group AAML1831'].map(low_high_dict)

df_cont_risk = df_cont.join(risk)
df_cat_risk = df_cat.join(risk)

# Keep only rows where every column is present, so each column within a table
# is evaluated on the same set of samples
df_cont_risk = df_cont_risk.dropna()
df_cat_risk = df_cat_risk.dropna()



## ROC-AUC comparison

In [42]:
def plot_roc_auc(df, target, title=None):
    """
    Plot one ROC curve per score column of ``df`` on a single Bokeh figure.

    Every column of ``df`` other than ``target`` is treated as a score and is
    passed, together with ``df[target]``, to ``roc_curve``; the area under
    each curve (from ``auc``) is shown in that column's legend entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the target column plus one column per score to compare.
    target : str
        Name of the column used as the true labels.
    title : str, optional
        Figure title. When given, ``', n=<number of rows>'`` is appended;
        when omitted (or empty) the figure gets an empty title.

    Returns
    -------
    The Bokeh figure, containing a dashed diagonal reference line and one
    line per score column.
    """
    palette = ['blue', 'green', 'red', 'orange', 'purple', 'brown', 'pink',
               'gray', 'olive', 'cyan', 'black']

    title_ = f"{title}, n={len(df)}" if title else ''

    p = figure(title=title_,
               x_axis_label='False Positive Rate',
               y_axis_label='True Positive Rate',
               width=425, height=425,
               tools='save,reset,pan')

    # Diagonal reference line from (0, 0) to (1, 1)
    p.line([0, 1], [0, 1], line_dash="dashed", color="gray", line_width=1)

    # Index.difference returns the remaining labels in sorted order, so the
    # column-to-colour assignment does not depend on the column order of df
    score_columns = df.columns.difference([target])

    for position, column in enumerate(score_columns):
        # Wrap around the palette instead of zipping against it, so score
        # columns beyond the palette length are still drawn
        color = palette[position % len(palette)]
        fpr, tpr, _ = roc_curve(df[target], df[column])
        roc_auc = auc(fpr, tpr)
        p.line(fpr, tpr, legend_label=f"{column} ({roc_auc:.2f})",
               color=color, line_width=2, alpha=0.8)

    # Legend and axis cosmetics
    p.legend.location = "bottom_right"
    p.legend.click_policy = "hide"  # clicking a legend entry toggles its curve
    p.legend.label_text_font_size = '8pt'
    p.legend.spacing = 2
    p.legend.background_fill_alpha = 0.8
    p.toolbar.logo = None
    p.xaxis.axis_label_text_font_style = "normal"
    p.yaxis.axis_label_text_font_style = "normal"
    p.title.text_font_size = '10pt'

    return p

# ROC panels for the continuous scores and for the categorical predictions,
# both evaluated against the 'os.evnt' column
p1 = plot_roc_auc(df_cont_risk, target='os.evnt', title='Continuous (prob. of high risk)')
p2 = plot_roc_auc(df_cat_risk, target='os.evnt', title='Categorical (high-low risk)')

# Place the two panels side by side in one row, with the toolbar above
roc_row = [p1, p2]
p = gridplot([roc_row], toolbar_location='above')

show(p)