### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import fnmatch
import pickle
import gc
import matplotlib.pyplot as plt
from sksurv.preprocessing import OneHotEncoder
from sklearn.externals import joblib 
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sklearn.model_selection import train_test_split
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import concordance_index_censored

pd.set_option('display.width', None)
pd.set_option('display.max_column',None)
pd.set_option('display.max_rows',None)

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline



### Functions

In [2]:
def drop_by_index(X,indexes):
    """
    Drop the rows labelled by ``indexes`` and renumber the remaining rows 0..n-1.

    X       : pandas DataFrame to filter (a new frame is returned, X is untouched)
    indexes : index labels of the rows to remove

    Returns the filtered DataFrame with a fresh default integer index.
    """
    # reset_index(drop=True) throws the old index away directly.  The earlier
    # reset_index().drop(columns="index") round-trip failed when the index had
    # a name (the helper column is then not called "index") or when the frame
    # already owned a column called "index".
    return X.drop(indexes).reset_index(drop=True)

def dataSetting(dropCol,FILE_FOLDER = "C:\\SMU_v2\\"):
    '''
    Read "clinical_output.pkl" from FILE_FOLDER and prepare it for survival modelling.

    Steps:
        1. Remove rows whose dx_date is the literal string "NA".
        2. Drop the columns listed in dropCol.
        3. Drop rows where any of Date_for_DFS / Date_for_OS / Date_for_CSS /
           dx_date / Age_@_Dx is null.
        4. Convert the date columns to datetime.
        5. Derive DFS_days, OS_days and CSS_days as days elapsed since dx_date.
        6. Add a boolean status column: True when death_age is present.

    dropCol     : list of column names to remove
    FILE_FOLDER : folder prefix (including the trailing separator) holding the pickle

    Returns the prepared DataFrame.
    '''
    df = pd.read_pickle(FILE_FOLDER + "clinical_output.pkl")

    # missing diagnosis dates are stored as the string "NA"; renumber afterwards
    df = df[df['dx_date'] != "NA"].reset_index(drop=True)

    df = df.drop(columns=dropCol)

    # drop all rows where the dates (or the age at diagnosis) are null
    df = df.dropna(axis=0,
                   subset=['Date_for_DFS','Date_for_OS','Date_for_CSS','dx_date','Age_@_Dx'])

    # convert all date columns into datetime format for processing
    for col in ("Date_for_DFS", "Date_for_OS", "Date_for_CSS", "dx_date"):
        df[col] = pd.to_datetime(df[col])

    # last_seen / dob used to be overwritten with dx_date (copy-paste slip), which
    # also re-created them after they had been dropped.  Convert each column from
    # its own values, and only when it is still present.  Unparseable entries
    # become NaT instead of aborting the load, because these two columns are not
    # covered by the null filter above.
    for col in ("last_seen", "dob"):
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")

    # calculate survival times in days
    one_day = np.timedelta64(1, 'D')
    for prefix in ("DFS", "OS", "CSS"):
        df["{}_days".format(prefix)] = (df["Date_for_{}".format(prefix)] - df['dx_date']) / one_day

    # alive or dead: a recorded death_age means the patient has died
    df['status'] = df['death_age'].notnull()

    return df

def ComputeYears(df, Year_list):
    '''
    Build, for every horizon in Year_list, the sub-frames used for DFS / CSS / OS.

    A patient is kept for a horizon of N years when either death_age > 0 or the
    respective <type>_days value is at least N years (N * 365.25 days).

    df        : DataFrame with death_age, DFS_days, CSS_days and OS_days columns.
                A boolean helper column "<type>_<N>_years" is added to it in place.
    Year_list : iterable of horizons in years

    Returns {"<N>_years": {"DFS": frame, "CSS": frame, "OS": frame}}.
    '''
    df_dict = {}
    deceased = df['death_age'] > 0

    for years in Year_list:
        # The threshold used to be days/(365.25*N) >= N, i.e. N*N years, which is
        # only right for N == 1.  Compare against N years expressed in days.
        min_days = 365.25 * years
        per_type = {}
        for stype in ("DFS", "CSS", "OS"):
            flag_col = '{}_{}_years'.format(stype, years)
            df[flag_col] = deceased | (df['{}_days'.format(stype)] >= min_days)
            per_type[stype] = df[df[flag_col]]
        df_dict['{}_years'.format(years)] = per_type
    return df_dict

def dropSubGroup(df,colToDropSubGroup,subgroups, notDropSubgroups):
    '''
    Remove patient records whose staging columns carry a sub-group marker
    (e.g. 'a', 'b', 'c'), except for rows that qualify as protected.

    df                : DataFrame holding the staging columns (string valued)
    colToDropSubGroup : columns to inspect
    subgroups         : patterns that mark a row for removal (matched with str.contains)
    notDropSubgroups  : values that should protect a row from removal

    A row is protected only when, in every inspected column, each sub-group marker
    found in the cell is accompanied by every value of notDropSubgroups.
    NOTE(review): with more than one protected value this is rarely satisfiable;
    "any protected value" may have been intended - confirm.

    Returns a new DataFrame without the removed rows, index renumbered.
    '''
    flagged = set()
    protection_candidates = set()
    for column in colToDropSubGroup:
        cells = df['{}'.format(column)]
        for pattern in subgroups:
            flagged.update(list(df[cells.str.contains(pattern)].index))
        for keeper in notDropSubgroups:
            protection_candidates.update(list(df[cells.str.contains(keeper)].index))

    def _is_protected(row_label):
        # a single marker without its protected value is enough to lose protection
        for column in colToDropSubGroup:
            cell = df[column][row_label]
            for pattern in subgroups:
                for keeper in notDropSubgroups:
                    if pattern in cell and keeper not in cell:
                        return False
        return True

    # in-depth filtering of the candidate rows
    protected = {row_label for row_label in protection_candidates if _is_protected(row_label)}

    rows_to_drop = list(flagged - protected)

    return drop_by_index(df,rows_to_drop)

def train_test(X, Y, test_size = 0.33, random_state = 42):
    '''
    Split features and targets into a training part and a test part.

    X, Y         : features and targets, forwarded to train_test_split
    test_size    : share of the rows placed in the test part
    random_state : seed handed to train_test_split

    Returns (X_train, X_test, Y_train, Y_test).
    '''
    parts = train_test_split(X, Y, random_state=random_state, test_size=test_size)
    features_train, features_test, target_train, target_test = parts
    return features_train, features_test, target_train, target_test

def Cox(X_train,Y_train,alpha = 1e-4, verbose = 0):
    """
    Fit a CoxPHSurvivalAnalysis model on the training data and return it.

    alpha   : regularisation value passed to the estimator; the features are
              highly correlated, and a small alpha is what lets the model learn
    verbose : verbosity level passed to the estimator
    """
    estimator = CoxPHSurvivalAnalysis(verbose = verbose, alpha = alpha)
    estimator.fit(X_train, Y_train)
    return estimator

def fit_and_score_features(X, y):
    '''
    Score every feature on its own with a single-feature Cox model.

    Each column of X is fitted separately with CoxPHSurvivalAnalysis(alpha=1e-4)
    and scored on the same data, so the result shows which features carry the
    most signal by themselves.

    X : 2-D array (rows = samples, columns = features)
    y : survival target accepted by CoxPHSurvivalAnalysis

    Returns a 1-D float array holding one score per column of X.
    '''
    estimator = CoxPHSurvivalAnalysis(alpha = 1e-4)
    column_count = X.shape[1]

    def _univariate_score(col):
        # keep the slice 2-D: the estimator expects a feature matrix
        single_feature = X[:, col:col+1]
        estimator.fit(single_feature, y)
        return estimator.score(single_feature, y)

    return np.fromiter((_univariate_score(col) for col in range(column_count)),
                       dtype=float, count=column_count)

def plotGraph(df, YEAR, STYPE, UNITS = 0, ):
    '''
    Plot the Kaplan-Meier survival curve for one horizon and one survival type.

    df    : dict shaped like the ComputeYears result, {"<N>_years": {STYPE: frame}}
    YEAR  : horizon used to pick the "<YEAR>_years" entry
    STYPE : survival type, e.g. "OS", "DFS" or "CSS"
    UNITS : {0: days, 1: years} - unit of the time axis

    Returns the matplotlib.pyplot module so the caller can show or save the figure.
    '''
    # Imported here because the notebook's import cell does not bring it in;
    # without it the call below fails with a NameError.
    from sksurv.nonparametric import kaplan_meier_estimator

    in_years = UNITS == 1
    unit = "Years" if in_years else "Days"

    subset = df['{}_years'.format(YEAR)][STYPE]
    time, survival_prob = kaplan_meier_estimator(subset['status'],
                                                 subset['{}_days'.format(STYPE)])

    if in_years:
        time = time/365.25
    plt.step(time, survival_prob, where="post")

    # raw string: "\h" is not a valid escape sequence in a normal string literal
    plt.ylabel(r"est. probability of survival $\hat{S}(t)$")
    plt.xlabel("time $t$ ({})".format(unit))
    plt.title("{} Years Survival Rate for {}".format(YEAR,STYPE))
    plt.grid(True)
    return plt

def FeaturesPriority(df,x_features,y_features):
    """
    Merge the two recorded T, N and M stages of each patient into one column
    each, keeping whichever value ranks first in the score tables below.

    "unknown" N stages are treated as "nx".  The caller's frame is left untouched.

    df         : DataFrame with tstage/c_tstage, Mstage/cMstage, cNstage/nstage
    x_features : feature column names; c_tstage, cNstage and cMstage are removed
                 from this list because they are folded into tstage/nstage/Mstage
    y_features : target column names (list)

    Returns a DataFrame with the remaining x_features followed by y_features;
    tstage, nstage and Mstage are categorical.
    Raises ValueError when a stage value has no entry in the score tables.
    """
    # Copy first so nothing below changes the caller's frame (the "unknown" -> "nx"
    # replacement used to run in place on the original before the copy was taken).
    working_df = df.copy()

    # change unknown to nx
    for col in ('cNstage', 'nstage'):
        working_df[col] = working_df[col].replace(to_replace="unknown", value="nx")

    # a lower score wins when the two recorded stages disagree
    t_score = {
                "t4d": 1, 't4c':2,'t4b':3,'t4a':4,'t4':5,
                't3': 6,'t2':7,
                't1c': 8,'t1b':9,'t1a': 10,'t1mic':11,'t1': 12,
                't0':13,'tis':14,'tx': 15
               }

    m_score = {'m1a': 1, 'm1': 2, 'm0': 3, 'mx': 4}

    n_score = {
                'n3c':1,'n3b':2,'n3a':3,'n3':4,
                'n2b':5,'n2a':6,'n2':7,
                'n1c':8,'n1b':9,'n1a':10,'n1mic':11,'n1':12,
                'n0 (i+)':13,'n0':14,'nx':15
                }

    def _pick_lower_score(first, second, score):
        """Per row, return the value of `second` when it scores strictly lower, else `first`."""
        rank_first = working_df[first].map(score).astype("float64")
        rank_second = working_df[second].map(score).astype("float64")
        for col, rank in ((first, rank_first), (second, rank_second)):
            unmapped = rank.isnull()
            if unmapped.any():
                bad = sorted(set(working_df.loc[unmapped, col].astype(str)))
                raise ValueError("unmapped stage value(s) in '{}': {}".format(col, bad))
        chosen = np.where(rank_first > rank_second, working_df[second], working_df[first])
        return pd.Series(chosen, index=working_df.index).astype("category")

    # Direct column assignment so the categorical dtype really replaces the old one.
    # Ties keep the first-named column: tstage, cNstage and Mstage respectively.
    working_df['tstage'] = _pick_lower_score('tstage', 'c_tstage', t_score)
    working_df['Mstage'] = _pick_lower_score('Mstage', 'cMstage', m_score)
    working_df['nstage'] = _pick_lower_score('cNstage', 'nstage', n_score)

    x_features = [e for e in x_features if e not in ['c_tstage','cNstage','cMstage']]
    return working_df[x_features+y_features]

def settingXY(df, X_features, Y_features, OHE_LOCATION = "C:\\SMU_v2\\OHE\\", name=""):
    '''
    Build the one-hot encoded X and the structured-array Y used for model training.

    The fitted encoder is also pickled to "<OHE_LOCATION><name>_encoder.pickle"
    so that new raw data can be encoded the same way later.

    df           : source DataFrame; the X_features columns are type-cast in place
    X_features   : features to use for X
    Y_features   : features to use for Y
    OHE_LOCATION : folder prefix (including trailing separator) for the encoder pickle
    name         : label used as the encoder file name prefix

    Returns (X, Y): the encoded feature frame and a numpy structured array with
    one field per Y feature.
    '''
    numeric_features = ('nodespos','Age_@_Dx','size_precise')
    for feature in X_features:
        target_dtype = "float32" if feature in numeric_features else "category"
        # Plain column assignment: df.loc[:, col] = ... writes the values into the
        # existing column and keeps its old dtype on pandas >= 2, so the cast was lost.
        df[feature] = df[feature].astype(target_dtype)

    X = df[X_features]
    Y = df[Y_features]

    # Fit the encoder and save it so that we can OHE new data
    enc = OneHotEncoder()
    enc.fit(X)

    # OHE for probability
    X = enc.transform(X)
    with open(OHE_LOCATION + name + '_encoder.pickle', 'wb') as f:
        pickle.dump(enc, f)

    # convert Y to a structured array, one named field per Y feature
    s = Y.dtypes
    Y = np.array([tuple(x) for x in Y.values], dtype=list(zip(s.index, s)))

    return X, Y
def layeredData(df, group_dict,y_features, YEAR, STYPE, output_folder = "C:\\SMU_v2\\Layered Folder\\"):
    '''
    Generate the training data for every group / layer combination to analyse.

    There are 3 groups; each group has several layers (feature sets):
        Group 1: patients with stage 4 cancer
        Group 2: patients with unknown records or at initial diagnosis stage
        Group 3: patients that do not belong to the groups above

    df            : dict shaped like the ComputeYears result
    group_dict    : {group: {"stage": [...], "wave": {layer: [feature, ...]}}}
    y_features    : target column names
    YEAR, STYPE   : select df["<YEAR>_years"][STYPE] as the working frame
    output_folder : folder prefix (including trailing separator) where every
                    "<group>_<layer>.pkl" frame is written

    Returns {group: {layer: {"X": encoded features, "Y": structured array}}}.
    '''
    model_data_dict = {}
    TO_USE = df['{}_years'.format(YEAR)][STYPE]

    print("Overall initial size: {} \n".format(TO_USE.shape[0]))

    for key,value in group_dict.items():
        waves = value['wave']
        stages = value['stage']

        if key != "group 3":
            # group 1 and group 2: keep the rows whose Stage equals the group's stage
            group_rows = TO_USE.loc[TO_USE['Stage'] == stages[0]]
        else:
            # group 3: keep the rows whose Stage is none of the listed stages
            group_rows = TO_USE.loc[~TO_USE['Stage'].isin(stages)]

        print("{} data size: {}".format(key,len(group_rows)))

        tmp = {}
        for wave, features in waves.items():
            columns = features + y_features
            layer_rows = group_rows[columns].copy()

            print("\t{} data size before dropping nan: {}".format(wave,len(layer_rows)))

            # the result of reset_index used to be thrown away, so the index was never renumbered
            layer_rows = layer_rows.dropna(axis=0,subset=columns).reset_index(drop=True)

            print("\t\t after dropping nan: {}".format(len(layer_rows)))

            X, Y = settingXY(layer_rows, features, y_features,name= "{}_{}".format(key,wave))

            # saved after settingXY so the pickle holds the type-cast columns
            layer_rows.to_pickle("{}{}_{}.pkl".format(output_folder,key,wave))

            tmp[wave] = {"X": X, "Y": Y}

        model_data_dict[key] = tmp

    return model_data_dict

def loadOHE(df,OHE_LOCATION = "C:\\SMU_v2\\OHE\\", name=""):
    '''
    One-hot encode new raw data with a previously saved encoder.

    df           : DataFrame to encode; its object columns are cast to category in place
    OHE_LOCATION : folder prefix (including trailing separator) of the encoder pickles
    name         : label of the encoder, file "<OHE_LOCATION><name>_encoder.pickle"

    Returns whatever the encoder's transform produces for df.
    '''
    encoder_path = "{}{}{}".format(OHE_LOCATION, name, '_encoder.pickle')
    with open(encoder_path, 'rb') as handle:
        encoder = pickle.load(handle)

    # type cast object columns to category
    object_columns = df.select_dtypes(include=[object]).columns.tolist()
    df[object_columns] = df[object_columns].astype("category")

    return encoder.transform(df)

def survivalTable(modelName, raw_data,OHE_LOCATION = "C:\\SMU_v2\\OHE\\",interval = (0.5,1,2,5,10)):
    '''
    Predict a survival curve for raw patient data and read it off at chosen horizons.

    modelName    : file name (without ".pkl") of a model saved in "Model_folder\\";
                   its last 4 characters (e.g. "_rsf" / "_cox") are stripped to
                   find the matching "<label>_encoder.pickle"
    raw_data     : dict {feature: [value, ...]}; it is not modified
    OHE_LOCATION : folder prefix (including trailing separator) of the encoder pickles
    interval     : horizons in years

    Returns (dic, graphaxis): dic maps each horizon to a survival probability
    (NaN when the curve ends before that horizon) and graphaxis is a DataFrame
    with the curve's x (event times) and y (probabilities).  When several rows
    are predicted, the curve of the last row is used.
    '''
    # Normalise on a copy so the caller's dict keeps its values.  Purely
    # alphabetic strings are lower-cased, with the first entry repeated for every
    # row; values stay lists so a dict made only of strings still builds a frame.
    cleaned = {}
    for key, values in raw_data.items():
        first = values[0]
        if isinstance(first, str) and first.isalpha():
            cleaned[key] = [first.lower()] * len(values)
        else:
            cleaned[key] = values

    frame = pd.DataFrame.from_dict(cleaned)

    model = joblib.load('Model_folder\\{}.pkl'.format(modelName))

    with open( "{}{}{}".format(OHE_LOCATION, modelName[:-4], '_encoder.pickle'), 'rb') as f:
        enc = pickle.load(f)

    # type cast object columns to category
    typeCastList = list(frame.select_dtypes(include=[object]).columns)
    frame[typeCastList] = frame[typeCastList].astype("category")
    data = enc.transform(frame)

    curves = list(model.predict_survival_function(data))
    if not curves:
        raise ValueError("raw_data produced no rows to predict")

    x = model.event_times_
    y = curves[-1]
    graphaxis = pd.DataFrame({'x':x,'y':y}, columns = ['x','y'])

    dic = {}
    for years in interval:
        # NOTE(review): the curve is read at (years + 1) years, not at `years` -
        # confirm whether the +1 offset is intended.
        beyond = np.where(x > (365.25*(years+1)))[0]
        # no event time past the horizon: report NaN instead of an IndexError
        dic[years] = y[beyond[0]] if beyond.size else np.nan

    return dic,graphaxis

def ann_structure(input_shape,output_units):
    """
    Declare the ANN structure in one place, purely for code cleanliness.

    input_shape  : number of input features
    output_units : number of units of the final layer

    Returns an uncompiled tf.keras.Sequential model.
    """
    def _dense(units):
        # every layer after the input layer uses leaky ReLU
        return layers.Dense(units, activation=tf.nn.leaky_relu)

    stack = [layers.Dense(32, input_shape=(input_shape,))]     # input layer
    stack.append(_dense(32))
    stack.append(layers.Dropout(.5))
    stack.extend(_dense(32) for _ in range(3))
    stack.append(layers.BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001, center=True, scale=True,
                                           beta_initializer='zeros', gamma_initializer='ones',
                                           moving_mean_initializer='zeros', moving_variance_initializer='ones',
                                           beta_regularizer=None, gamma_regularizer=None,
                                           beta_constraint=None, gamma_constraint=None))
    stack.extend(_dense(32) for _ in range(4))
    stack.append(_dense(16))
    stack.append(layers.Dropout(.5))
    stack.extend(_dense(16) for _ in range(4))
    stack.append(layers.Dropout(.5))
    stack.extend(_dense(16) for _ in range(2))
    stack.append(_dense(output_units))                         # output layer

    model = tf.keras.Sequential()
    for layer in stack:
        model.add(layer)
    return model
def display_graph(scope,predictions_scaled_reverse,y_test_scaled_reverse):
    """
    Draw an area chart of model performance over tolerance levels from 0% to 199%.

    For every tolerance x in 0.00, 0.01, ..., 1.99 the chart shows
    get_percentage(predictions_scaled_reverse, y_test_scaled_reverse, x).

    scope : text placed in the chart title

    NOTE(review): get_percentage is not defined in the visible part of this
    notebook - it must be available before this function is called.
    """
    # matplotlib.ticker is not imported at file level
    import matplotlib.ticker as mtick

    graph = pd.DataFrame(np.arange(0,2,.01),columns=["Percentage"])
    # Series.map replaces the deprecated DataFrame.applymap on this one-column frame
    graph["viz"] = graph["Percentage"].map(
        lambda x: get_percentage(predictions_scaled_reverse,y_test_scaled_reverse,x))
    show = graph.plot.area(x="Percentage")
    show.set_title("Model performance ({})".format(scope))
    show.set_xlabel("Percentage Difference from Ground Truth")
    show.set_ylabel("Percentage of all our predictions")
    show.xaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    show.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

In [3]:
path = 'C:\\SMU_v2\\Layered Folder\\'

y_features = list(['status','OS_days'])

# Cached "<group>_<layer>.pkl" frames written by layeredData, if any.  Only the
# top level of the folder is considered: files are opened as path + file name,
# so walking into sub-folders (as os.walk did) could only produce missing-file errors.
cached_files = fnmatch.filter(os.listdir(path), '*.pkl')

if len(cached_files) > 0:

    # Rebuild X / Y (and the encoder pickles) from the cached frames
    model_data_dict = {}
    for file in cached_files:
        group = str(file).split("_")[0]
        wave = str(file).split("_")[1].split(".")[0]
        tmp = pd.read_pickle(path+file)

        x_features = [i for i in tmp.columns if i not in y_features]

        X, Y = settingXY(tmp, x_features, y_features,name= "{}_{}".format(group,wave))

        model_data_dict.setdefault(group, {})[wave] = {"X": X, "Y": Y}

        # release the frames before loading the next file
        del X
        del Y
        del tmp
    print("Data loaded!")
else:
    # Data Processing
    listToDrop = ['NRIC','dob','Has Bills?','Side','Hospital','KKH','NCCS','SGH','END_OF_ENTRY']
    clinical = dataSetting(listToDrop)
    print(clinical.shape)

    # Data of our interest are 5 and 10 years. Patients that are new (do not have
    # sufficient records) will disturb and mess up our accuracy level, so only
    # data with a longer timeframe than the given interval is returned.

    year_list = [1,5,10]
    df_dict = ComputeYears(clinical,year_list)

    # Display shape of data after filtering
    for i in df_dict:
        for s_type in df_dict[i]:
            print("Year: {}, survival category: {}, size: {}".format(i,s_type,df_dict[i][s_type].shape[0]))

    YEAR = 1
    STYPE = "OS"

    # feature sets shared by several layers
    base_features = ['Age_@_Dx', 'diff', 'ER', 'PR','Her2']

    group_dict = {
                    "group 1": {
                                 "stage": ['stage 4'],
                                 'wave': {
                                             "layer 1": base_features + ['Stage'],
                                             "layer 2": base_features + ['T (no subgroup)', 'N (no subgroup)'],
                                             "layer 3": base_features + ['T', 'N'],
                                             "layer 4": base_features + ['size_precise', 'nodespos']
                                         }
                               },
                    "group 2": {
                                 'stage': ['dcis/lcis non-invasive'],
                                 'wave': {
                                             "layer 1": base_features + ['Size'],
                                             "layer 2": base_features + ['size_precise']
                                         }
                               },
                    "group 3": {
                                 "stage": ['stage 4','dcis/lcis non-invasive'],
                                 'wave': {
                                             "layer 1": base_features + ['Stage'],
                                             "layer 2": base_features + ['T (no subgroup)', 'N (no subgroup)',
                                                                         'M (no subgroup)'],
                                             "layer 3": base_features + ['T', 'N', 'M'],
                                             "layer 4": base_features + ['size_precise', 'nodespos','M'],
                                             'layer 5':['T','N', 'M', 'ER', 'PR', 'Her2',
                                                        'size_precise', 'nodespos', 'Age_@_Dx']
                                         }
                               },
                    }
    model_data_dict = layeredData(df_dict, group_dict,y_features,YEAR, STYPE)
    print("Processing Done!")
model_data_dict['group 3']['layer 1']['X'].head(1)

Data loaded!


Unnamed: 0,Age_@_Dx,diff=grade 2,diff=grade 3,diff=unknown,ER=negative,ER=positive,ER=unknown,PR=negative,PR=positive,PR=unknown,Her2=negative,Her2=not done,Her2=positive,Her2=unknown,Stage=stage 0,Stage=stage 1,Stage=stage 1a,Stage=stage 1b,Stage=stage 2,Stage=stage 2a,Stage=stage 2b,Stage=stage 3,Stage=stage 3a,Stage=stage 3b,Stage=stage 3c,Stage=stage 4,Stage=unknown
2,46.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
def model_creation(X, Y, label, cox_alpha=1e-4, n_estimators = 1000, random_state = 20, file_location = "Model_folder\\"):
    """
    Train a random survival forest and a Cox model on one data set, save both
    and report how they score on the held-out part.

    X, Y          : encoded features and structured survival array (the first
                    field is the event indicator, the second the time)
    label         : prefix of the saved files, "<label>_rsf.pkl" / "<label>_cox.pkl"
    cox_alpha     : alpha passed to the Cox model
    n_estimators  : number of trees in the forest
    random_state  : seed of the forest (the train/test split uses train_test's own default)
    file_location : folder prefix (including trailing separator) for the saved models

    Returns {"rsf": forest score on the test part, "cox": concordance index of
    the Cox model on the test part}.
    """
    # make sure the target folder exists before spending time on training
    out_dir = os.path.dirname(file_location)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    result = {}
    X_train, X_test, Y_train, Y_test = train_test(X, Y)
    print("\t\t X_train:{}, X_test:{}".format(X_train.shape,X_test.shape))

    # read the field names from Y instead of hard-coding "status" / "OS_days"
    event_field, time_field = Y.dtype.names[:2]

    rsf = RandomSurvivalForest(n_estimators= n_estimators ,
                               max_depth=None,
                               max_leaf_nodes=None,
                               bootstrap=True,
                               oob_score=False,
                               min_samples_split=10,
                               min_samples_leaf=15,
                               max_features="sqrt",
                               n_jobs=-1,
                               random_state=random_state)

    model_rsf = rsf.fit(X_train, Y_train)
    result["rsf"] = model_rsf.score(X_test, Y_test)

    # save the model to disk
    joblib.dump(model_rsf, '{}{}_rsf.pkl'.format(file_location,label))

    # free the forest before fitting the next model
    del model_rsf
    gc.collect()

    model_cox = Cox(X_train,Y_train, cox_alpha)
    prediction = model_cox.predict(X_test)
    result["cox"] = concordance_index_censored(Y_test[event_field],
                                               Y_test[time_field], prediction)[0]
    joblib.dump(model_cox, '{}{}_cox.pkl'.format(file_location,label))

    del model_cox
    gc.collect()

    for k,v in result.items():
        print("\t\t{}:{}".format(k,v))
    return result

In [5]:
# listToDrop = ['NRIC','dob','Has Bills?','Side','Hospital','KKH','NCCS','SGH',\
#               'Count_as_DFS','Count_as_CSS']

# clinical = dataSetting(listToDrop)
# year_list = list([1,5,10])
# df_dict = ComputeYears(clinical,year_list)

# x_features = list(['T','N', 'M', 'ER', 'PR', 'Her2', 'size_precise', 'nodespos', 'Age_@_Dx'])
# y_features = list(['status','OS_days'])
# tmp = df_dict['1_years']["OS"]

# tmp.dropna(axis=0,\
#             subset=x_features,\
#             inplace=True)
# X, Y = settingXY(tmp, x_features, y_features)

# model_creation(X, Y, "group 3_layer 5", cox_alpha=1e-4, n_estimators = 1000, random_state = 20)

In [6]:
import tensorflow as tf
from tensorflow.keras import layers
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler

# scores of every trained model, keyed by its "<group>_<layer>" label
model_scores = {}

for group,wave_dict in model_data_dict.items():
    print("{}:".format(group))

    # group 3 is trained with a larger Cox alpha and fewer trees than the other groups
    if group == "group 3":
        cox_alpha, n_estimators = 1e-2, 400
    else:
        cox_alpha, n_estimators = 1e-4, 1000

    for wave, wave_data in wave_dict.items():
        print("\t {}:".format(wave))
        X = wave_data['X']
        Y = wave_data['Y']

        label = "{}_{}".format(group,wave)
        result = model_creation(X,Y, label, cox_alpha, n_estimators )
        model_scores[label] = result

#             #### ANN
#             Y = pd.DataFrame(Y.tolist(), columns = ['Status','OS_days'])
#             Y['Status'] = np.where(Y['Status']== True, 1, 0)
#             X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

#             model = ann_structure(X.shape[1],Y.shape[1])
#             model.compile(optimizer=tf.keras.optimizers.Adam(),\
#                                       loss='mean_squared_error')
#             # Run the stochastic gradient descent for specified epochs
#             epochs = 100
#             filepath="weights.best.{}.h5".format(wave)
#             callbacks_list = []
#             callbacks_list.append(ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True))
#             # callbacks_list.append(LearningRateScheduler(scheduler))

#             model.fit(X_train, y_train, epochs=epochs, callbacks = callbacks_list, validation_data=(X_test, y_test))

#             pred = pd.DataFrame(model.predict(X_test))
#             o+=1

group 1:
	 layer 1:
		 X_train:(1359, 27), X_test:(670, 27)
		rsf:0.6313131313131313
		cox:0.6324765415185222
	 layer 2:
		 X_train:(1359, 25), X_test:(670, 25)
		rsf:0.648603907753531
		cox:0.6497455719737744
	 layer 3:
		 X_train:(1359, 42), X_test:(670, 42)
		rsf:0.650446879994781
		cox:0.650060888758413
	 layer 4:
		 X_train:(306, 16), X_test:(151, 16)
		rsf:0.6971401781528364
		cox:0.6901078293483357
group 2:
	 layer 1:
		 X_train:(1313, 20), X_test:(648, 20)
		rsf:0.6639286469717267
		cox:0.6738701262074315
	 layer 2:
		 X_train:(1333, 15), X_test:(657, 15)
		rsf:0.6638031208499336
		cox:0.6493193891102258
group 3:
	 layer 1:
		 X_train:(12699, 27), X_test:(6255, 27)
		rsf:0.738791538076107
		cox:0.7326479878165673
	 layer 2:
		 X_train:(12699, 27), X_test:(6255, 27)
		rsf:0.7410006491577815
		cox:0.7365595953994541
	 layer 3:
		 X_train:(12699, 45), X_test:(6255, 45)
		rsf:0.7382000763521447
		cox:0.737469876692932
	 layer 4:
		 X_train:(8616, 19), X_test:(4244, 19)
		rsf:0.7427

In [7]:
# group 1
# one-patient input: every feature maps to a single-element list (one row)
raw_data = {
    'ER': ['Positive'],
    'PR': ['positive'],
    'Her2': ['negative'],
    'size_precise': [1.3],
    'nodespos': [0],
    'Age_@_Dx': [21],
    'diff': ['grade 3'],
}

# # group 2
# raw_data = {
#             'ER': ['Positive'],\
#             'PR': ['positive'],\
#             'Her2': ['negative'],\
#             'Size': [">"],\
#             'Age_@_Dx': [21],\
#             'diff': ['m0']
#            }

# # group 3
# raw_data = {
#             'ER': ['Positive'],\
#             'PR': ['positive'],\
#             'Her2': ['negative'],\
#             'size_precise': [1.3],\
#             'nodespos': [0],\
#             'Age_@_Dx': [21],\
#             'T':['Tis'],\
#             'N': ['n0'],\
#             'M': ['m0']
#            }


# z  : survival probability per horizon, as returned by survivalTable
# df : the predicted survival curve (columns x and y), displayed below
z,df = survivalTable("group 1_layer 4_rsf",raw_data)
df

Unnamed: 0,x,y
0,21.0,0.998539
1,23.0,0.998539
2,25.0,0.998492
3,59.0,0.998354
4,63.0,0.998354
5,77.0,0.997593
6,81.0,0.997593
7,85.0,0.997503
8,90.0,0.997483
9,91.0,0.997343


### Predicting

For prediction, a sample is dropped down each tree in the forest until it reaches a terminal node. The data in each terminal node is used to non-parametrically estimate the survival function and the cumulative hazard function, using the Kaplan-Meier and Nelson-Aalen estimators respectively. In addition, a risk score can be computed that represents the expected number of events for one particular terminal node. The ensemble prediction is simply the average across all trees in the forest.

In [None]:
# NOTE(review): X_test is only assigned inside model_creation in the cells shown
# above, so this cell needs an X_test DataFrame at notebook level to run.
X_test_values = X_test.values
# structured array holding the two sort keys for every test row
a = np.empty(X_test.shape[0], dtype=[("Age_@_Dx", float), ("nodespos", float)])
# assumes the last column of X_test is Age_@_Dx and the one before it is nodespos - TODO confirm
a["Age_@_Dx"] = X_test_values[:, -1]
a["nodespos"] = X_test_values[:, -2]

# row order when sorting by nodespos first, then by Age_@_Dx
sort_idx = np.argsort(a, order=["nodespos", "Age_@_Dx"])

# keep only the first row of that ordering, as a one-row DataFrame
X_test_sel = pd.DataFrame(
    X_test_values[(sort_idx[:1])],
    columns=list(X_test.columns))

In [None]:
# predict the risk score of the selected test row
# NOTE(review): rsf is only a local variable of model_creation in the cells shown;
# a fitted model must be available under that name at notebook level first.
pd.Series(rsf.predict(X_test_sel))


### Feature Selection - (C index)

'''
In survival analysis, the hazard ratio (HR) is the ratio of the hazard rates corresponding
to the conditions described by two levels of an explanatory variable. 
    For example, in a drug study, the treated population may die at twice the rate per unit time
    as the control population. The hazard ratio would be 2, indicating higher hazard of death from the treatment. 
    Or in another study, men receiving the same treatment may suffer a certain complication ten times more
    frequently per unit time than women, giving a hazard ratio of 10. - wiki
'''

In [None]:
# scores = fit_and_score_features(X_test.values, Y_test)
# pd.Series(scores, index=X_test.columns).sort_values(ascending=False).head(10)

### Measuring the Performance of Survival Models
Our test data is usually subject to censoring (only verified records are available; events that happen in between are not observed), therefore metrics like root mean squared error or correlation are unsuitable. Instead, we use a generalization of the area under the receiver operating characteristic (ROC) curve called Harrell's concordance index, or c-index.

The interpretation is identical to the traditional area under the ROC curve metric for binary classification:

- a value of 0.5 denotes a random model,
- a value of 1.0 denotes a perfect model,
- a value of 0.0 denotes a perfectly wrong model.

### Calculation matrix for CoxPHSurvivalAnalysis - Cox's proportional hazard's model

`tol` is the convergence tolerance of the optimizer (it is not a p-value). Fitting stops once
|1 - (new neg. log-likelihood / old neg. log-likelihood)| < tol

In [None]:
### Determine features that are useful - Cox
# from sklearn.feature_selection import SelectKBest
# from sklearn.pipeline import Pipeline

# pipe = Pipeline([('encode', OneHotEncoder()),
#                  ('select', SelectKBest(fit_and_score_features, k=3)),
#                  ('model', CoxPHSurvivalAnalysis(alpha = 1e-6, tol= 1e-6))])

# from sklearn.model_selection import GridSearchCV

# param_grid = {'select__k': np.arange(1, X.shape[1] + 1)}
# gcv = GridSearchCV(pipe, param_grid, return_train_score=True, cv=3, iid=True)
# gcv.fit(X, Y)

# pd.DataFrame(gcv.cv_results_).sort_values(by='mean_test_score', ascending=False)

# pipe.set_params(**gcv.best_params_)
# pipe.fit(X, Y)

# encoder, transformer, final_estimator = [s[1] for s in pipe.steps]
# pd.Series(final_estimator.coef_, index=encoder.encoded_columns_[transformer.get_support()])

### Permutation-based Feature Importance

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# NOTE(review): rsf, X_test, Y_test and random_state are not assigned at notebook
# level in the cells shown - define them before running this cell.
feature_names = X.columns.tolist()
# permutation importance of the fitted model, 15 shuffles per feature
perm = PermutationImportance(rsf, n_iter=15, random_state=random_state)
perm.fit(X_test, Y_test)
# display the importances labelled with the encoded column names
eli5.show_weights(perm, feature_names=feature_names)

### Testing

In [None]:
# new_X_features = X_features.dropna(axis = 0, how ='any')  
# print("Old data frame length:", len(X_features)) 
# print("New data frame length:", len(new_X_features)) 
# print("Number of rows with at least 1 NA value: ", (len(X_features)-len(new_X_features))) 

In [None]:
    # from sksurv.nonparametric import kaplan_meier_estimator

    # YEAR = 5
    # STYPE = "OS"
    # # k = plotGraph(df_dict,YEAR, STYPE, 0)

    # for survival in ("DFS", "OS", 'CSS'):
    #     time_treatment, survival_prob_treatment = kaplan_meier_estimator(
    #         df_dict['{}_years'.format(YEAR)][survival]['status'],
    #         df_dict['{}_years'.format(YEAR)][survival]['{}_days'.format(survival)])

    #     plt.step(time_treatment/365.25, survival_prob_treatment, where="post",
    #              label="Survival Type = {}".format(survival))

    # plt.ylabel("est. probability of survival $\hat{S}(t)$")
    # plt.xlabel("time $t$ (Days)")
    # plt.title("{} Years Surivial Rate For Each Category".format(YEAR))
    # plt.grid(True)
    # plt.legend(loc="best")

    # # plt.rcParams["figure.figsize"] = (30,10)

    # # for value in df_dict['{}_years'.format(YEAR)]['OS']["TNM_Stage"].unique():
    # #     mask = df_dict['{}_years'.format(YEAR)]['OS']["TNM_Stage"] == value
    # #     time_cell, survival_prob_cell = kaplan_meier_estimator(df_dict['{}_years'.format(YEAR)][STYPE]['status'][mask],
    # #                                                            df_dict['{}_years'.format(YEAR)][STYPE]['{}_days'.format(STYPE)][mask])
    # #     plt.step(time_cell, survival_prob_cell, where="post",
    # #              label= '{} (n = {})'.format(value, mask.sum()))

    # # plt.ylabel("est. probability of survival $\hat(t)$")
    # # plt.xlabel("time $t$")
    # # plt.grid(True)
    # # plt.legend(loc="best")

In [None]:
# hyperparams = {
#     'L2_reg': 10.0,
#     'batch_norm': True,
#     'dropout': 0.4,
#     'hidden_layers_sizes': [25, 25],
#     'learning_rate': 1e-05,
#     'lr_decay': 0.001,
#     'momentum': 0.9,
#     'n_in': train_data['x'].shape[1],
#     'standardize': True
# }

### Limitation: Not enough data per group