In [None]:
### Please use this when running code in Google Colab.
### Mounts Google Drive at /content/drive; the data, model and figure paths
### used further down in this notebook all start with /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
! pip install shap==0.46.0

In [None]:
### Module ###
import os
import pickle
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn import metrics
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

# NOTE(review): `lgb` is referenced by KFold_one (lgb.Dataset, lgb.train,
# lgb.LightGBMTunerCV) but is not imported in this cell -- TODO confirm the
# intended import and add it here.

tqdm.pandas(desc="progress: ")

# Ignore UserWarning
warnings.filterwarnings('ignore', category=UserWarning)

In [None]:
### Flatten Doubled List ###
def flatten(l: list):
    """Collapse a "doubled" list by keeping element 0 of every item.

    Args:
        l: Iterable whose items are indexable (e.g. ``[[a], [b], [c]]``).

    Returns:
        A new list ``[item[0] for each item]``; any further elements of an
        item are ignored.
    """
    return [inner[0] for inner in l]

### F1score ###
def f1_score(i, precision, recall):
    """Return the F1 score (harmonic mean) of ``precision[i]`` and ``recall[i]``.

    Args:
        i: Index into both sequences.
        precision: Indexable sequence of precision values.
        recall: Indexable sequence of recall values.

    Returns:
        ``2 * p * r / (p + r)``, or ``0.0`` when ``p + r`` is zero.
    """
    p = precision[i]
    r = recall[i]
    denominator = p + r
    # With no true positives both precision and recall are 0. The raw formula
    # would then give 0/0 (NaN for numpy floats, ZeroDivisionError for Python
    # floats); a NaN score sorts last and would be mistaken for the maximum.
    if denominator == 0:
        return 0.0
    return 2 * p * r / denominator


In [None]:
###
##### Main #####
###
def model_construction_main(time_point: int):
    """Run the full 5-fold cross-validation pipeline for one time point.

    Loads the labelled dataset for ``time_point``, splits the unique patient
    IDs into five shuffled folds (``random_state=0``), trains and evaluates one
    model per fold via ``KFold_one`` and finally summarises the collected
    AUROC values with ``AUROC_stat``.

    Args:
        time_point: Time point passed to ``load_prediction_dataset`` and to
            the per-fold / summary helpers.
    """
    n_splits = 5  # number of outer cross-validation folds

    ### Load dataset ###
    prediction_dataset = load_prediction_dataset(time_point=time_point)

    # The split is done on unique patient IDs, so every row of a patient
    # ends up on the same side of the train/test boundary.
    ls_pt_id = prediction_dataset["Patient_ID"].unique()
    df_pt_id = pd.DataFrame(ls_pt_id, columns=["Patient_ID"])

    # Filled in place by KFold_one (one AUROC per fold).
    ls_auroc = []

    ### 5-fold cross validation ###
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)

    # CV Progress bar # (context manager closes it even if a fold raises)
    with tqdm(total=n_splits) as bar:
        bar.set_description("5-fold cross validation >>> ")

        for cv_index, (train_index, test_index) in enumerate(kf.split(df_pt_id), start=1):
            s_train = flatten(df_pt_id.iloc[train_index].values)
            s_test = flatten(df_pt_id.iloc[test_index].values)

            ### Model Training and Evaluation ###
            KFold_one(time_point, s_train, s_test, prediction_dataset, cv_index, ls_auroc, ls_pt_id)

            bar.update(1)

    ### AUROC Statistics ###
    AUROC_stat(time_point, ls_auroc)



###
##### Load Datasets #####
###
def load_prediction_dataset(time_point: int,
                            data_path: str = "/content/drive/MyDrive/res_death_destiny/data/dummy_time_series_EHRdata.csv",
                            negative_time_point: int = 168):
    """Load the time-series CSV and build a binary-labelled dataset.

    Rows whose ``Time_point`` equals ``time_point`` get ``Answer = 1``; rows
    whose ``Time_point`` equals ``negative_time_point`` get ``Answer = 0``.
    The two subsets are concatenated (positives first) and the ``Time_point``
    column is dropped.

    Args:
        time_point: Time point whose rows are labelled 1.
        data_path: CSV file to read; must contain a ``Time_point`` column.
        negative_time_point: Time point whose rows are labelled 0.

    Returns:
        DataFrame with the original columns except ``Time_point`` plus
        ``Answer``; the original row index is preserved.

    Note:
        If ``time_point == negative_time_point`` the same rows appear twice
        with contradictory labels.
    """
    df_all = pd.read_csv(data_path)

    ### Labeling ###
    df_death_positive = df_all.query("Time_point == @time_point").copy()
    df_death_negative = df_all.query("Time_point == @negative_time_point").copy()
    df_death_positive.loc[:, "Answer"] = 1
    df_death_negative.loc[:, "Answer"] = 0

    prediction_dataset = pd.concat(
        [df_death_positive, df_death_negative], axis=0
    ).drop("Time_point", axis=1)

    return prediction_dataset



###
##### Model Training and Evaluation #####
###
def KFold_one(time_point: int, s_train: list, s_test: list, prediction_dataset: pd.DataFrame,
              cv_index: int, ls_auroc: list, ls_pt_id: list,
              model_dir: str = "/content/drive/MyDrive/res_death_destiny/data/models/"):
    """Train, save and evaluate one LightGBM model for a single outer CV fold.

    Steps:
      1. Split the training patients 80/20 into train / validation (seed 42).
      2. Tune hyperparameters with ``lgb.LightGBMTunerCV`` using a 4-fold
         ``GroupKFold`` on the training rows.
      3. Fit the final model with the tuned parameters and early stopping on
         the validation set, then pickle it into ``model_dir``.
      4. Predict on the test patients, derive the threshold that maximises
         F1 and hand the scores to ``AUROC`` (which appends to ``ls_auroc``).

    Args:
        time_point: Time point identifier, used in output file names.
        s_train: Patient IDs available for training; not modified.
        s_test: Patient IDs held out for evaluation.
        prediction_dataset: Frame with ``Patient_ID``, ``Answer`` and feature
            columns.
        cv_index: Fold number, used in output file names.
        ls_auroc: List that receives this fold's AUROC (via ``AUROC``).
        ls_pt_id: Unused; kept for call compatibility.
        model_dir: Directory the pickled model is written to.
    """
    exclude_col = ["Patient_ID", "Answer"]

    ### Patient-level train / validation split (80 / 20) ###
    # A private generator seeded with 42 yields the same permutation as
    # seeding the global generator, but leaves the global RNG state and the
    # caller's list untouched.
    shuffled_ids = list(s_train)
    random.Random(42).shuffle(shuffled_ids)
    split_idx = int(len(shuffled_ids) * 0.8)
    s_train_train = shuffled_ids[:split_idx]
    s_train_valid = shuffled_ids[split_idx:]

    patient_ids = prediction_dataset["Patient_ID"]
    # LightGBM reads `group` as the sizes of *consecutive* row blocks. The
    # dataset is ordered label-wise (all positives, then all negatives), so
    # rows are stably sorted by patient to line up with the sorted group sizes
    # below. Otherwise the inner GroupKFold would mix rows of one patient
    # across folds.
    df_train = (prediction_dataset[patient_ids.isin(s_train_train)]
                .sort_values("Patient_ID", kind="stable"))
    df_valid = prediction_dataset[patient_ids.isin(s_train_valid)]
    df_test = prediction_dataset[patient_ids.isin(s_test)]

    X_train = df_train.drop(columns=exclude_col).values
    y_train = df_train["Answer"].astype(int).to_numpy()

    X_valid = df_valid.drop(columns=exclude_col).values
    y_valid = df_valid["Answer"].astype(int).to_numpy()

    # Rows per patient, in ascending Patient_ID order (groupby sorts its keys).
    g = df_train.groupby("Patient_ID").size().values.astype(int)

    train_set = lgb.Dataset(X_train, label=y_train, group=g)
    valid_set = lgb.Dataset(X_valid, label=y_valid, reference=train_set)

    ### Hyperparameter tuning ###
    lgb_params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "num_threads": 1,
        "feature_pre_filter": False,
        "deterministic": True,
        "force_row_wise": True,
        "verbosity": -1
    }

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` keyword arguments
    # are only accepted by older LightGBM / Optuna releases -- confirm the
    # installed versions before upgrading.
    folds = GroupKFold(n_splits=4)
    tuner_cv = lgb.LightGBMTunerCV(
        lgb_params, train_set, folds=folds,
        num_boost_round=100, early_stopping_rounds=10,
        verbose_eval=False, verbosity=-1,
        show_progress_bar=False, seed=0
    )
    tuner_cv.run()

    best_params = tuner_cv.best_params
    print("  Params: ")
    for key, value in best_params.items():
        print("    {}: {}".format(key, value))

    model = lgb.train(
        best_params, train_set,
        valid_sets=valid_set,
        early_stopping_rounds=10,
        verbose_eval=-1
    )

    ### Save model ###
    model_name = f"mpmodel_tp_{time_point}_cv_{cv_index}.pkl"
    # `with` guarantees the file handle is flushed and closed.
    with open(os.path.join(model_dir, model_name), "wb") as model_file:
        pickle.dump(model, model_file)

    ##### Evaluation Part #####

    ### Dataset for Evaluation ###
    X_test = df_test.drop(columns=exclude_col).astype(float)
    y_test = df_test["Answer"].astype(float)

    ### Prediction ###
    y_pred = model.predict(X_test)

    # precision/recall have one more entry than thresholds; align on thresholds.
    precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred)
    k = len(thresholds)

    df_f1_score = pd.DataFrame({
        "f1_score": [f1_score(i, precision, recall) for i in range(k)],
        "precision": precision[:k],
        "recall": recall[:k],
        "thresholds": thresholds,
    })

    ### Threshold which maximize F1-score ###
    # An undefined (NaN) F1 counts as 0 so it can never be picked as the maximum.
    best_row = df_f1_score["f1_score"].fillna(0.0).idxmax()
    thresholds_f1max = df_f1_score.loc[best_row, "thresholds"]

    # Per-row predictions at the max-F1 threshold. Currently neither returned
    # nor saved; kept for inspection while debugging.
    df_prediction_label = pd.DataFrame(y_test)
    df_prediction_label["pred_proba"] = y_pred.tolist()
    df_prediction_label["pred_label"] = (
        df_prediction_label["pred_proba"] >= thresholds_f1max
    ).astype(int)

    ### ROC curve and AUROC ###
    AUROC(time_point, y_test, y_pred, cv_index, ls_auroc)



###
##### ROC curve and AUROC #####
###
def AUROC(time_point: int, y_test: pd.Series, y_pred: pd.Series,
          cv_index: int, ls_auroc: list,
          output_dir: str = "/content/drive/MyDrive/res_death_destiny/data/roc_curves/"):
    """Compute the AUROC for one fold, record it and save/show the ROC curve.

    Args:
        time_point: Time point identifier, used in the figure file name.
        y_test: True binary labels.
        y_pred: Predicted scores for the positive class.
        cv_index: Fold number, used in the figure file name.
        ls_auroc: List the AUROC value is appended to (mutated in place).
        output_dir: Directory the PNG is written to.
    """
    ### AUC ###
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    auroc_value = metrics.auc(fpr, tpr)
    ls_auroc.append(auroc_value)

    ### ROC curve ###
    fig = plt.figure(figsize=(2.8,2.8), dpi=300, tight_layout=True, facecolor="w")
    try:
        ax = fig.add_subplot(111)
        ax.plot(fpr, tpr, label="AUROC = %.3f" % auroc_value, lw=1.0, c="blue")
        ax.plot([0,1], [0,1], "k--", lw=0.6)  # chance-level diagonal

        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")

        ls_ticks = [0,0.2,0.4,0.6,0.8,1.0]
        ax.set_xticks(ls_ticks)
        ax.set_xticklabels(ls_ticks)
        ax.set_yticks(ls_ticks)
        ax.set_yticklabels(ls_ticks)
        ax.tick_params(axis="x", labelsize=8)
        ax.tick_params(axis="y", labelsize=8)
        ax.set_title("ROC Curve")

        ax.legend(frameon=False, loc='lower right')  # legend

        ### Save ROC curve ###
        figname = "roc_curve_tp_"+str(time_point)+"_cv_"+str(cv_index)+".png"
        fig.savefig(os.path.join(output_dir, figname))
        plt.show()
    finally:
        # This function is called once per fold and time point; release the
        # figure so they do not pile up in memory.
        plt.close(fig)



###
##### AUROC Statistics #####
###
def AUROC_stat(time_point: int, ls_auroc: list,
               output_dir: str = "/content/drive/MyDrive/res_death_destiny/data/auroc/"):
    """Print and save summary statistics of the per-fold AUROC values.

    Computes the mean, the (population) standard deviation and the
    coefficient of variation (``std / mean``), prints them and writes a
    one-row CSV ``auroc_tp_<time_point>.csv`` with columns
    ``auroc_1 .. auroc_N, mean, std, cv``.

    Args:
        time_point: Time point identifier, used in the CSV file name.
        ls_auroc: Per-fold AUROC values; any number of folds, not modified.
        output_dir: Directory the CSV is written to.

    Raises:
        ValueError: If ``ls_auroc`` is empty.
    """
    if len(ls_auroc) == 0:
        raise ValueError("ls_auroc must contain at least one AUROC value")

    mean = np.mean(ls_auroc)
    std = np.std(ls_auroc)
    cv = std/mean

    print("Mean of AUROC    : {:.5f} ".format(mean))
    print("SD of AUROC : {:.5f} ".format(std))
    print("CV of AUROC : {:.5f} ".format(cv))

    ### Save Results ###
    # Labels follow the number of folds actually supplied, and the statistics
    # go into a fresh list so the caller's list is left untouched.
    labels = ["auroc_{}".format(k) for k in range(1, len(ls_auroc) + 1)]
    labels += ["mean", "std", "cv"]
    values = list(ls_auroc) + [mean, std, cv]

    df_desc = pd.DataFrame(values, index=labels).T
    filename = "auroc_tp_"+str(time_point)+".csv"
    df_desc.to_csv(os.path.join(output_dir, filename), index=False)



In [None]:
###
### SHAP value calculation over `time_points` ###
### NOTE(review): the header used to announce time points 1 to 90 and the
### progress bar was sized for 90, but the loop only covers time point 1.
### Widen `time_points` (e.g. range(1, 90+1)) for the full run -- confirm intent.
###
time_points = range(1, 1+1)

# Size the bar from the actual iteration count and close it when done.
with tqdm(total=len(time_points)) as bar:
    bar.set_description("Calculating SHAP values >>> ")

    for time_point in time_points:
        shap_value_calsulation_main(time_point=time_point, shap_summary_plot=True)
        bar.update(1)
