# Import the libraries required for the Kaplan-Meier analysis by group

In [None]:
import pandas as pd
import numpy as np
import scipy.stats
import pickle
import fnmatch
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid", palette="colorblind", color_codes=True)

from survive import datasets
from survive import SurvivalData
from survive import KaplanMeier, Breslow, NelsonAalen
from sksurv.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

%matplotlib inline

from pprint import pprint
pd.set_option('display.width', None)
pd.set_option('display.max_column',None)
pd.set_option('display.max_rows',None)

import warnings
warnings.filterwarnings('ignore')

# KM Functions

In [None]:
def generate_kaplan_meier_group_for_status(survival_type="OS", years=1, save_to_csv_filename = "None"):
    
    """
    Main Kaplan Meier function: builds one stacked DF holding the Kaplan Meier
    table of every stage group for a single survival type, with an option to
    save that DF as csv. No chart is drawn here.

    survival_type = prefix of the "<survival_type>_days" column to analyse (OS, DFS, CSS)
    years = survival horizon used to pick the cohort; any value is accepted, not only 1/5/10
    save_to_csv_filename = csv path to write; the string "None" (default) or None skips saving

    Returns the concatenated DF (columns time, estimate, lower, upper, group_label).
    """
    
    y_features = ['status', survival_type + "_days"]
    # "group 3" is treated by kaplan_meier_group_layeredData as the complement
    # of the stages listed for it
    group_dict = {
        "group 1": {"stage": ['stage 4']},
        "group 2": {"stage": ['dcis/lcis non-invasive']},
        "group 3": {"stage": ['stage 4', 'dcis/lcis non-invasive']},
    }
    
    # Building base cohort
    listToDrop = ['NRIC','dob','Has Bills?','Side','Hospital','KKH','NCCS','SGH','END_OF_ENTRY']
    clinical = kaplan_meier_group_dataSetting(listToDrop)

    # Only the requested horizon is read below, so compute just that one;
    # this also lets `years` take values other than 1, 5 and 10.
    df_dict = kaplan_meier_group_ComputeYears(clinical, [years])
    model_data_dict = kaplan_meier_group_layeredData(df_dict, group_dict, y_features, years, survival_type)

    # Build one Kaplan Meier table per group
    output_dfs_list = []
    for group, group_df in model_data_dict.items():
        # status False -> 0, anything else -> 1; time converted from days to years.
        # NOTE(review): kaplan_meier_group_dataSetting sets status to False when
        # Count_as_OS == "dead", so deaths are coded 0 here. Confirm this is the
        # event coding SurvivalData expects.
        temp_df = pd.DataFrame({
            "status": (group_df["status"] != False).astype(int),
            "time": group_df[survival_type + "_days"] / 365.25,
        })

        # build KM object
        surv = SurvivalData(time="time", status="status", data=temp_df)
        km = KaplanMeier()
        km.fit(surv)
    
        # KM summary as DF, tagged with the group it belongs to
        temp_KM_df = kaplan_meier_group_to_df(km)
        temp_KM_df["group_label"] = group
        output_dfs_list.append(temp_KM_df)
        print("Generated df for ", group,  survival_type)
        
    output_df = pd.concat(output_dfs_list, ignore_index=True)
    
    # save only when a real filename was given
    if save_to_csv_filename is not None and save_to_csv_filename != "None":
        output_df.to_csv(save_to_csv_filename)

    return output_df
        
        
def generate_kaplan_meier_group_with_class(years=1, save_to_csv_filename="None"):
    """
    Stack the per-group Kaplan Meier tables of the OS, DFS and CSS survival types.

    years = survival horizon forwarded to generate_kaplan_meier_group_for_status
    save_to_csv_filename = csv path for the combined DF; the string "None" skips saving

    Returns the combined DF, where class_label holds the survival type of each row.
    """
    # the per-type call never writes its own csv; only the combined DF may be saved
    labelled_frames = [
        generate_kaplan_meier_group_for_status(
            survival_type=endpoint, years=years, save_to_csv_filename="None"
        ).assign(class_label=endpoint)
        for endpoint in ("OS", "DFS", "CSS")
    ]
    combined = pd.concat(labelled_frames, ignore_index=True)

    if save_to_csv_filename != "None":
        combined.to_csv(save_to_csv_filename)

    return combined

def kaplan_meier_group_to_df(KM_object):
    """
    Turn the printed summary of a fitted estimator into a DF.

    The text of KM_object.summary is split into lines, the first six lines are
    skipped, and on every remaining line only the space-separated tokens that
    are numeric or contain a "." are kept. Those tokens are read as the columns
    time, events, at_risk, estimate, std_error and the two 95% CI bounds.

    Returns a DF with the columns time, estimate, lower and upper; the values
    are the tokens exactly as printed (strings).
    """
    column_names = ["time", "events", "at_risk",  "estimate",  "std_error",  "95%_CI_lower",  "95%_CI_upper"]

    table_lines = str(KM_object.summary).split("\n")[6:]
    parsed_rows = [
        [token for token in line.split(" ") if token.isnumeric() or ("." in token)]
        for line in table_lines
    ]
    full_table = pd.DataFrame(parsed_rows, columns=column_names)

    # keep only what is needed to plot the curve and its confidence band
    return pd.DataFrame({
        "time": full_table["time"],
        "estimate": full_table["estimate"],
        "lower": full_table["95%_CI_lower"],
        "upper": full_table["95%_CI_upper"],
    })



# Data Processing Functions

In [None]:
def kaplan_meier_group_drop_by_index(X,indexes):
    """
    helper function to drop rows of a dataframe by index label and return a
    new dataframe without those rows, renumbered from 0

    X = dataframe to filter (not modified)
    indexes = index labels of the rows to remove
    """
    # reset_index(drop=True) discards the old index directly, so a genuine
    # column called "index" is kept and a named index no longer raises KeyError
    return X.drop(indexes).reset_index(drop=True)

def kaplan_meier_group_dataSetting(dropCol,FILE_FOLDER = "C:\\SMU_v2\\"):
    '''
    function to read clinical_output.pkl from the datasource folder and prepare it
        1. Remove rows whose dx_date is the string "NA".
        2. Drop the columns listed in dropCol.
        3. Drop all rows where the date fields or Age_@_Dx are NULL.
        4. Convert date columns into datetime format
        5. Derive OS, CSS, DFS days based on dx_date
        6. Create status column: False when Count_as_OS is "dead", True otherwise

    dropCol = column names to remove from the raw data
    FILE_FOLDER = folder holding clinical_output.pkl; it is concatenated as-is,
                  so it must end with a path separator
    '''
    df = pd.read_pickle(FILE_FOLDER + "clinical_output.pkl").reset_index(drop=True)

    # a diagnosis date of "NA" cannot anchor any survival time
    df = df[df['dx_date'] != "NA"].reset_index(drop=True)

    df.drop(columns=dropCol,inplace = True)

    # drop all rows where dates are null
    df.dropna(axis=0,\
                    subset=['Date_for_DFS','Date_for_OS','Date_for_CSS','dx_date','Age_@_Dx'],\
                    inplace=True)
    
    # convert the dates used for the survival times into datetime format
    for column in ('Date_for_DFS', 'Date_for_OS', 'Date_for_CSS', 'dx_date'):
        df[column] = pd.to_datetime(df[column])

    # Parse last_seen / dob from their OWN values (they used to be overwritten
    # with dx_date). A column removed through dropCol is not re-created.
    # These columns are not part of the NULL filter above, so values that
    # cannot be parsed become NaT instead of raising.
    for column in ('last_seen', 'dob'):
        if column in df.columns:
            df[column] = pd.to_datetime(df[column], errors="coerce")

    # calculate in days
    for prefix in ("DFS", "OS", "CSS"):
        df[prefix + "_days"] = (df["Date_for_" + prefix] - df['dx_date'] )/np.timedelta64(1, 'D')

    # alive or dead
    # NOTE(review): status is False for "dead" and True otherwise; confirm that
    # every consumer of this column expects that orientation.
    df['status'] = np.where(df['Count_as_OS'] == "dead", False, True)

    return df

def kaplan_meier_group_ComputeYears(df, Year_list):
    '''
    Create a dict holding the df for each requested number of years of survival.
    For every year i and every survival type (DFS, CSS, OS) a boolean column
    "<type>_<i>_years" is added to df; it is True when death_age > 0 or when
    "<type>_days" covers at least i years (365.25 days per year).

    df = prepared clinical dataframe (flag columns are added to it in place)
    Year_list = iterable of year horizons

    Returns {"<i>_years": {"DFS": df rows, "CSS": df rows, "OS": df rows}},
    each entry holding the rows whose flag is True.
    '''

    df_dict = {}
    # does not depend on the horizon, so evaluate it once
    deceased = df['death_age'] > 0

    for i in Year_list:
        tmp = {}
        for x in ("DFS", "CSS", "OS"):
            flag_col = '{}_{}_years'.format(x, i)
            # days -> years, then compare with the horizon (the divisor used to
            # be 365.25*i, which demanded i*i years of follow-up)
            df[flag_col] = np.logical_or(deceased, df['{}_days'.format(x)] / 365.25 >= i)
            tmp[x] = df[df[flag_col] == True]
        df_dict['{}_years'.format(i)] = tmp
    return df_dict



# Group Processing Functions

In [None]:
def kaplan_meier_group_settingXY(df, X_features, Y_features, OHE_LOCATION = "C:\\SMU_v2\\OHE\\", name=""):
    '''
    This function returns the X and Y features need for model training
        - The function also generates one pkl that contains the One Hot Encoder for new raw data 
    
    df = source dataframe (left unmodified; the dtype casts are applied to a copy)
    X_features = features to use for X
    Y_features = features to use for Y 
    OHE_LOCATION = location to store the pkl file (concatenated as-is with the file name)
    name = prefix of the "<name>_encoder.pickle" file

    Returns (X, Y): X is the encoder output, Y a numpy structured array with
    one field per Y feature.
    '''
    # nodespos, Age_@_Dx and size_precise are numeric; everything else is categorical.
    # The cast is done with astype on the selected columns because assigning
    # through df.loc[:, col] can write into the existing column and keep its
    # old dtype on recent pandas versions.
    numeric_features = ('nodespos', 'Age_@_Dx', 'size_precise')
    dtype_by_feature = {
        feature: ("float32" if feature in numeric_features else "category")
        for feature in X_features
    }

    X = df[X_features].astype(dtype_by_feature)
    Y = df[Y_features]

    # Save enconder so that we can OHE new data
    enc = OneHotEncoder()
    enc.fit(X)
    
    # OHE for probability
    X = enc.transform(X)
    with open(OHE_LOCATION + name + '_encoder.pickle', 'wb') as f:
        pickle.dump(enc, f) 
                  
    # convert Y to structured array: one (name, dtype) field per Y column
    s = Y.dtypes
    Y = np.array([tuple(x) for x in Y.values], dtype=list(zip(s.index, s)))
   
    return X, Y

def kaplan_meier_group_layeredData(df, group_dict,y_features, YEAR, STYPE):
    
    '''
        this function generates the dataframe required for each group we hope to analyze

        df = nested dict as built by kaplan_meier_group_ComputeYears:
             df["<YEAR>_years"][STYPE] is the dataframe to split
        group_dict = {group name: {"stage": [stage values]}}
        y_features = columns to keep in every group dataframe
        YEAR = years of patient record interested
        STYPE = survival type (OS, DFS, CSS)

        Every group keeps the rows whose Stage is one of its listed stage values,
        except the group named "group 3", which keeps the rows whose Stage is
        NOT one of its listed values (rows with a missing Stage land there too).

        Returns {group name: dataframe restricted to y_features}.
    '''
    TO_USE = df['{}_years'.format(YEAR)][STYPE]
    
    print("Overall initial size: {} \n".format(TO_USE.shape[0]))

    model_data_dict = {}
    for key, value in group_dict.items():
        # isin honours every listed stage rather than only the first one or two
        in_listed_stage = TO_USE['Stage'].isin(value['stage'])

        # "group 3" is the complement group
        selected = ~in_listed_stage if key == "group 3" else in_listed_stage

        # boolean selection already returns new data, so no explicit copy is needed
        model_data_dict[key] = TO_USE.loc[selected, y_features]
        
    return model_data_dict

def kaplan_meier_group_loadOHE(df,OHE_LOCATION = "C:\\SMU_v2\\OHE\\", name=""):
    '''
    apply a previously saved encoder to new raw data for prediction

    df = new raw data; its object columns are cast to category in place
    OHE_LOCATION = folder holding the encoder file (concatenated as-is)
    name = prefix of the "<name>_encoder.pickle" file

    Returns whatever the loaded encoder's transform(df) returns.
    '''
    encoder_path = "{}{}{}".format(OHE_LOCATION, name, '_encoder.pickle')

    # NOTE: unpickling can run arbitrary code, so only load encoder files
    # that come from a trusted location
    with open(encoder_path, 'rb') as handle:
        encoder = pickle.load(handle)

    # type cast object to category
    object_columns = df.select_dtypes(include=[object]).columns.tolist()
    df[object_columns] = df[object_columns].astype("category")

    return encoder.transform(df)

# Kaplan Meier Analysis By Groups

In [None]:
# Build the stacked Kaplan Meier table for years=1 across the OS, DFS and CSS
# survival types (the "None" filename means no csv is written) and preview
# its first rows in the notebook.
output = generate_kaplan_meier_group_with_class(years=1, save_to_csv_filename="None")
display(output.head())
