In [None]:
### Run this cell only when executing in Google Colab.
### It mounts Google Drive at /content/drive; the functions below read and
### write their pickle files under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
! pip install umap-learn

In [None]:
### Modules ###
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib.cm import ScalarMappable
from tqdm.notebook import tqdm
# Register tqdm with pandas (adds the progress_* variants of apply/map);
# the description below is the label shown on those progress bars.
tqdm.pandas(desc="progress: ")

from sklearn.preprocessing import StandardScaler
import umap

import warnings

# Silence only the UserWarning whose message matches the pattern below
# (presumably raised by UMAP because Run_UMAP passes a fixed random_state);
# all other warnings are still shown.
warnings.filterwarnings('ignore', category=UserWarning, message='n_jobs value.*overridden to 1 by setting random_state')


In [None]:
###
### Preparing dataset for UMAP ###
###
def Dataset_for_UMAP(data_dir="/content/drive/MyDrive/res_death_destiny/data", num_time_points=90):
    """Merge SHAP values and test values of every time point into one table and save it.

    For each time point in 1..num_time_points and each confusion-matrix label
    ("tp", "fn", "fp", "tn") one SHAP-value pickle and one test-value pickle
    are read.  Every column except the last one is treated as a test item and
    gets the suffix "_shap" / "_test"; the two frames are inner-joined on
    "Patient_ID" and tagged with "Cm_label" and "Time_point".

    Two pickles are written to ``<data_dir>/shap_test_values/``:

    * ``shap_test_values_all_before_UMAP.pkl`` - the merged table as is.
    * ``shap_test_values_all_before_UMAP_standardized.pkl`` - the same table
      with the "_test" columns standardized by ``StandardScaler`` (fitted on
      all rows at once); the "_shap" columns are left untouched.

    Parameters
    ----------
    data_dir : str
        Root folder that contains ``shap_values/`` and ``test_values/`` and
        receives ``shap_test_values/``.
    num_time_points : int
        Number of time points to load (files ``*_tp_1.pkl`` .. ``*_tp_<n>.pkl``).

    Raises
    ------
    ValueError
        If ``num_time_points`` is smaller than 1.
    """
    if num_time_points < 1:
        raise ValueError("num_time_points must be at least 1")

    id_cols = ["Time_point", "Cm_label", "Patient_ID"]
    ls_cm_label = ["tp", "fn", "fp", "tn"]

    # Frames are collected in lists and concatenated once; growing a DataFrame
    # with pd.concat inside the loop copies all previous rows on every step.
    frames_all = []

    bar = tqdm(total=num_time_points)
    bar.set_description("Preparing dataset for UMAP >>> ")

    try:
        for time_point in range(1, num_time_points + 1):

            frames_time_point = []

            for cm_label in ls_cm_label:

                # NOTE: pickle files can execute arbitrary code when loaded;
                # only point data_dir at files you produced yourself.
                shap_filename = f"shap_values_{cm_label}_tp_{time_point}.pkl"
                df_shap_one_cmlabel = pd.read_pickle(f"{data_dir}/shap_values/shap_values_{cm_label}/{shap_filename}")

                testval_filename = f"test_values_{cm_label}_tp_{time_point}.pkl"
                df_testval_one_cmlabel = pd.read_pickle(f"{data_dir}/test_values/test_values_{cm_label}/{testval_filename}")

                # The merge key is "Patient_ID".  A legacy "PT_ID" header is
                # mapped onto it; rename() ignores the entry when the column
                # is absent, so files that already use "Patient_ID" are unaffected.
                columns_shap = {"PT_ID": "Patient_ID"}
                columns_testval = {"PT_ID": "Patient_ID"}

                # Every column but the last is a test item; the last one is
                # expected to hold the patient identifier.
                for test_item in df_shap_one_cmlabel.columns[:-1]:
                    columns_shap[test_item] = test_item + "_shap"
                    columns_testval[test_item] = test_item + "_test"

                df_merged_one_cmlabel = pd.merge(
                    df_shap_one_cmlabel.rename(columns=columns_shap),
                    df_testval_one_cmlabel.rename(columns=columns_testval),
                    on=["Patient_ID"],
                    how="inner",
                )
                df_merged_one_cmlabel["Cm_label"] = cm_label

                frames_time_point.append(df_merged_one_cmlabel)

            df_time_point = pd.concat(frames_time_point, ignore_index=True)
            df_time_point["Time_point"] = time_point

            # Identifier columns first, then all "_shap" columns followed by
            # all "_test" columns (the order produced by the merge).
            value_cols = [col for col in df_time_point.columns if col not in id_cols]
            frames_all.append(df_time_point[id_cols + value_cols])

            bar.update(1)
    finally:
        # Close the progress bar even when a file is missing or unreadable.
        bar.close()

    df_all = pd.concat(frames_all, ignore_index=True)

    # Half of the non-identifier columns are SHAP values, the other half test values.
    num_test_item = (df_all.shape[1] - len(id_cols)) // 2
    split_at = len(id_cols) + num_test_item

    ### Standardize Laboratory Test values ###
    df_all_testval = df_all.iloc[:, split_at:]

    scaler = StandardScaler()
    df_all_testval_standardized = pd.DataFrame(
        scaler.fit_transform(df_all_testval),
        columns=df_all_testval.columns,
        index=df_all.index,
    )

    df_all_standardized = df_all.iloc[:, :split_at].join(df_all_testval_standardized, how="inner")

    ### Save DataFrame ###
    filename = "shap_test_values_all_before_UMAP.pkl"
    df_all.to_pickle(f"{data_dir}/shap_test_values/{filename}")

    filename_std = "shap_test_values_all_before_UMAP_standardized.pkl"
    df_all_standardized.to_pickle(f"{data_dir}/shap_test_values/{filename_std}")



###
### Reduction to two components by UMAP ###
###
def Run_UMAP(metric="euclidean", random_seed=42, n_neighbors=20, min_dist=0.1, data_dir="/content/drive/MyDrive/res_death_destiny/data"):
    """Embed SHAP values and standardized test values into two UMAP dimensions each.

    Reads ``shap_test_values_all_before_UMAP_standardized.pkl`` from
    ``<data_dir>/shap_test_values/``, fits one UMAP model on the "_shap"
    columns and a second, identically configured one on the remaining value
    columns, and writes ``shap_test_values_all_after_UMAP_standardized.pkl``
    to the same folder.  The output columns are: the three identifier columns,
    "SHAP_umap1", "SHAP_umap2", "Test_umap1", "Test_umap2", then the original
    SHAP columns and test-value columns.

    Parameters
    ----------
    metric : str
        Distance metric passed to ``umap.UMAP``.
    random_seed : int
        Passed as ``random_state`` to both UMAP models.
    n_neighbors : int
        ``n_neighbors`` of both UMAP models.
    min_dist : float
        ``min_dist`` of both UMAP models.
    data_dir : str
        Root data folder containing ``shap_test_values/``.

    Raises
    ------
    ValueError
        If the input table has no column ending in "_shap".
    """
    ### Dataset ###
    filename = "shap_test_values_all_before_UMAP_standardized.pkl"
    df = pd.read_pickle(f"{data_dir}/shap_test_values/{filename}")

    header = ["Time_point", "Cm_label", "Patient_ID"]

    # Count SHAP columns by their "_shap" suffix.  A plain substring test
    # would also count a "<item>_test" column whose item name happens to
    # contain "shap", which would shift the positional split below.
    num_test_item = sum(col.endswith("_shap") for col in df.columns)
    if num_test_item == 0:
        raise ValueError("No '_shap' columns found in " + filename)

    df_header = df.loc[:, header]

    # Layout expected from Dataset_for_UMAP: header | *_shap | *_test
    split_at = len(header) + num_test_item
    df_shap = df.iloc[:, len(header):split_at]
    df_testval = df.iloc[:, split_at:]

    ### Reduction by UMAP; SHAP values and test values respectively ###
    # Both models share the same settings; two components are requested
    # explicitly because exactly two output columns are named below.
    umap_params = dict(n_components=2, metric=metric, random_state=random_seed, n_neighbors=n_neighbors, min_dist=min_dist)
    fit_shap = umap.UMAP(**umap_params)
    fit_testval = umap.UMAP(**umap_params)

    u_shap = fit_shap.fit_transform(df_shap)
    print("Umaped SHAP values")
    u_testval = fit_testval.fit_transform(df_testval)
    print("Umaped test values")

    # Reuse df's index so the index-based joins below always line up row by row.
    df_shap_umaped = pd.DataFrame(u_shap, columns=["SHAP_umap1", "SHAP_umap2"], index=df.index)
    df_testval_umaped = pd.DataFrame(u_testval, columns=["Test_umap1", "Test_umap2"], index=df.index)

    df_umaped = df_header.join(df_shap_umaped).join(df_testval_umaped).join(df_shap).join(df_testval)

    ### Save DataFrame ###
    filename = "shap_test_values_all_after_UMAP_standardized.pkl"
    df_umaped.to_pickle(f"{data_dir}/shap_test_values/{filename}")


In [None]:
###
### UMAP Visualization ###
###
def Visualize_SHAP_behavior_via_UMAP(val_type, cm_label="tp", fontsize=15, cbar_labelsize=10, figsize=7, alpha=0.3, size=10, data_dir="/content/drive/MyDrive/res_death_destiny/data"):
    """Scatter-plot one UMAP embedding for one confusion-matrix label, coloured by time point.

    Reads ``shap_test_values_all_after_UMAP_standardized.pkl`` from
    ``<data_dir>/shap_test_values/`` and plots the columns
    ``<val_type>_umap1`` / ``<val_type>_umap2`` of the rows whose "Cm_label"
    equals ``cm_label``.  The axis limits are computed from all rows of the
    table (plus a 5 % margin), not only from the selected label; the colour
    scale spans the time points present in the selected rows.

    Parameters
    ----------
    val_type : str
        Prefix of the embedding columns; the table written by ``Run_UMAP``
        contains "SHAP" and "Test".
    cm_label : str
        Confusion-matrix label to display ("tp", "fn", "fp" or "tn").
    fontsize : float
        Base font size; title, axis labels and tick labels are scaled from it.
    cbar_labelsize : float
        Base size of the colorbar tick labels (doubled when applied).
    figsize : float
        Width and height of the square figure in inches.
    alpha : float
        Opacity of the scatter points.
    size : float
        Marker size of the scatter points.
    data_dir : str
        Root data folder containing ``shap_test_values/``.

    Raises
    ------
    KeyError
        If the embedding columns for ``val_type`` are not in the table.
    ValueError
        If no row carries the requested ``cm_label``.
    """
    ### Dataset ###
    filename = "shap_test_values_all_after_UMAP_standardized.pkl"
    df = pd.read_pickle(f"{data_dir}/shap_test_values/{filename}")

    # Tips: The "val_type" argument is either "SHAP" or "Test"
    c_1 = val_type+"_umap1"
    c_2 = val_type+"_umap2"

    # Validate before creating the figure so a bad argument does not leave
    # an empty plot behind.
    missing = [col for col in (c_1, c_2) if col not in df.columns]
    if missing:
        raise KeyError("Column(s) " + ", ".join(missing) + " not found; val_type must be 'SHAP' or 'Test'")

    df_cm_label = df.query("Cm_label==@cm_label")
    if df_cm_label.empty:
        raise ValueError("No rows with Cm_label == " + repr(cm_label))

    # Axis limits come from the full table, with a 5 % margin on each side.
    (xmin, xmax, ymin, ymax) = (df[c_1].min(), df[c_1].max(), df[c_2].min(), df[c_2].max())
    x_bks = (xmax - xmin)*0.05
    y_bks = (ymax - ymin)*0.05

    label = df_cm_label["Time_point"].to_numpy()

    # One Normalize instance is shared by the scatter and the colorbar so the
    # two can never disagree about the colour scale.
    norm = colors.Normalize(vmin=np.amin(label), vmax=np.amax(label))
    cm_name = "viridis_r"
    n_colors = 90  # number of discrete colour levels in the colormap
    cmap = plt.get_cmap(cm_name, n_colors)

    fig, ax = plt.subplots(figsize=(figsize,figsize))

    ### Scatter Plot ###
    ax.scatter(df_cm_label[c_1], df_cm_label[c_2], alpha=alpha, s=size, c=label, cmap=cmap, norm=norm)

    ax.set_title("Confusion_matrix_label: "+str(cm_label), fontsize=fontsize*1.2)
    ax.set_xlabel("umap1 ("+val_type+")", fontsize=fontsize*1.3)
    ax.set_ylabel("umap2 ("+val_type+")", fontsize=fontsize*1.3)

    ax.tick_params(axis='x', labelsize=fontsize*1.5)
    ax.tick_params(axis='y', labelsize=fontsize*1.5)

    ax.set_xlim(xmin-x_bks, xmax+x_bks)
    ax.set_ylim(ymin-y_bks, ymax+y_bks)

    ax.grid(linestyle=":", linewidth=1)

    ### Colorbar ###
    # A separate mappable is used instead of the scatter itself so the
    # colorbar is drawn fully opaque rather than with the points' alpha.
    axpos = ax.get_position()
    cbar_ax = fig.add_axes([1.0, axpos.y0, 0.05, axpos.height])
    mappable = ScalarMappable(cmap=cmap, norm=norm)
    mappable.set_array([])  # public replacement for assigning the private _A attribute
    pp = fig.colorbar(mappable, cax=cbar_ax)

    pp.set_label(label="Time_point", size=fontsize*1.2)

    pp.ax.tick_params(labelsize=cbar_labelsize*2.0)

    plt.show()


In [None]:
### Step 1: build the merged SHAP / test-value table and save it (raw and standardized) ###
Dataset_for_UMAP()

### Step 2: reduce SHAP values and test values to two UMAP components each; reads the file saved in step 1 ###
Run_UMAP()

### Step 3: plot both embeddings for the "tp" label; reads the file saved in step 2 ###
Visualize_SHAP_behavior_via_UMAP(val_type="SHAP", cm_label="tp")
Visualize_SHAP_behavior_via_UMAP(val_type="Test", cm_label="tp")