In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sksurv.preprocessing import OneHotEncoder
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sklearn.model_selection import train_test_split

pd.set_option('display.width', None)
pd.set_option('display.max_column',None)
pd.set_option('display.max_rows',None)

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [6]:
def drop_by_index(X,indexes):
    """
    helper function to drop rows of dataframe and return new dataframe without those rows with indexes resetted
    """
    X = X.drop(indexes)
    X = X.reset_index().drop(columns="index")
    return(X)

def dataSetting(dropCol,FILE_FOLDER = "C:\\SMU_v2\\"):
    '''
    function to read the pkl from from datasource
        1. Remove dx_date that is NULL.
        2. Drop all rows where crucial fields for X_features are NULL.
        3. Convert Date columns into datetime format
        4. Derive OS, CSS, DFS days based on dx_date
        5. Create status column to indicate if the patient is dead or alive base on if death_age exists
    '''
    df = pd.read_pickle(FILE_FOLDER + "clinical_output.pkl").reset_index().drop(columns="index")
    to_drop = df[df['dx_date']=="NA"].index
    df = drop_by_index(df,to_drop)

    df.drop(columns=dropCol,inplace = True)

    # drop all rows where dates are null
    df.dropna(axis=0,\
                    subset=['Date_for_DFS','Date_for_OS','Date_for_CSS','dx_date','size_precise', 'nodespos'],\
                    inplace=True)
    # convert all datetime in dataframe into dateime format for processing
    df["Date_for_DFS"] = pd.to_datetime(df["Date_for_DFS"])
    df["Date_for_OS"] = pd.to_datetime(df["Date_for_OS"])
    df["Date_for_CSS"] = pd.to_datetime(df["Date_for_CSS"])
    df["dx_date"] = pd.to_datetime(df["dx_date"])
    df['last_seen']= pd.to_datetime(df["dx_date"])

    # calculate in days
    df["DFS_days"] = (df["Date_for_DFS"] - df['dx_date'] )/np.timedelta64(1, 'D')
    df["OS_days"] = (df["Date_for_OS"] - df['dx_date'] )/np.timedelta64(1, 'D')
    df["CSS_days"] = (df["Date_for_CSS"] - df['dx_date'] )/np.timedelta64(1, 'D')

    # alive or dead
    df['status'] = np.where(df['Count_as_OS'].isnull(), False, True)
    
    return df

def ComputeYears(df, Year_list):
    '''
    Create a list to contain df for different years of survival
    The df will filter those patient that has deceased or days of survival longer than the defined years.
    '''

    df_dict = {}
    for i in Year_list:
        tmp = {}
        for x in list(["DFS", "CSS", "OS"]):
            df['{}_{}_years'.format(x, i)] = np.where(
                                                      np.logical_or(df['death_age'] > 0,\
                                                      df['{}_days'.format(x)]/(365.25*i) >= i),\
                                                      True,False)
            tmp[x] = df[df['{}_{}_years'.format(x, i)] == True]
        df_dict['{}_years'.format(i)] = tmp
    return df_dict

def settingXY(df_dict, X_features, Y_features, YEAR, STYPE, OHE_LOCATION = "C:\\SMU_v2\\"):
    '''
    This function returns the X and Y features need for model training
        - The function also generates one pkl that contains the One Hot Encoder for new raw data 
    
    X_features = features to use for X
    Y_features = features to use for Y 
    YEAR = years of patient record interested
    SYTPE = survival type (OS, DFS, CSS)
    OHE_LOCATION = location to store the pkl file
    '''
    import pickle
    
    DF_TO_USE = df_dict['{}_years'.format(YEAR)][STYPE]

    X = DF_TO_USE[X_features]
    Y = DF_TO_USE[Y_features]
    # convert to int since some fields fro age_@_dx is null
    X.loc[:,"Age_@_Dx"] = X["Age_@_Dx"].astype("int16")

    # Save enconder so that we can OHE new data
    enc = OneHotEncoder()
    enc.fit(X)
    
    # OHE for probability
    X = enc.transform(X)
    with open(OHE_LOCATION + 'encoder.pickle', 'wb') as f:
        pickle.dump(enc, f) 
                  
    # convert Y to structured array
    s = Y.dtypes
    Y = np.array([tuple(x) for x in Y.values], dtype=list(zip(s.index, s)))
   
    return X, Y

def train_test(X, Y, test_size = 0.33, random_state = 42,):
    '''
    Splitting the dataset into the Training set and Test set
    '''
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y,  test_size=test_size, random_state=random_state)
    
    return X_train, X_test, Y_train, Y_test

def Cox(X_train,Y_train,alpha = 1e-4, verbose = 0):
        
    # since features are highly corelated, reducing alpha values to smaller values allows the learning
    model = CoxPHSurvivalAnalysis(alpha = alpha, verbose = verbose)
    model.fit(X_train, Y_train)
    
    '''
    In survival analysis, the hazard ratio (HR) is the ratio of the hazard rates corresponding
    to the conditions described by two levels of an explanatory variable. 
        For example, in a drug study, the treated population may die at twice the rate per unit time
        as the control population. The hazard ratio would be 2, indicating higher hazard of death from the treatment. 
        Or in another study, men receiving the same treatment may suffer a certain complication ten times more
        frequently per unit time than women, giving a hazard ratio of 10. - wiki
    '''
    tmp = pd.Series(model.coef_, index=X_train.columns)
    tmp = tmp.to_frame("Log Hazarad Ratio")
    tmp = tmp.sort_values(by=['Log Hazarad Ratio'])
    
    return model, tmp

def fit_and_score_features(X, y):
    '''
    Based on the Cox model, rank the scores of each feature to understand which X features plays the key role in
    modelling
    '''
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = CoxPHSurvivalAnalysis(alpha = 1e-4)
    for j in range(n_features):
        Xj = X[:, j:j+1]
        m.fit(Xj, y)
        scores[j] = m.score(Xj, y)
    return scores

def loadOHE(df,OHE_LOCATION = "C:\\SMU_v2\\"):
    '''
    load enconder to OHE new raw data for prediction
    '''
    import pickle
    with open(OHE_LOCATION + 'encoder.pickle', 'rb') as f:
        enc = pickle.load(f) 
    
    #type case object to category
    typeCastList = list(df.select_dtypes(include=[object]).columns)
    df[typeCastList] = df[typeCastList].astype("category")
    OHE_New_Data = enc.transform(df)
    
    return OHE_New_Data

In [7]:
listToDrop = ['NRIC','dob','Has Bills?','Side','Hospital','KKH','NCCS','SGH',\
              'Count_as_DFS','Count_as_CSS']

clinical = dataSetting(listToDrop)
year_list = list([1,5,10])
df_dict = ComputeYears(clinical,year_list)
YEAR = 1
STYPE = "OS"
x_features = list(['tstage','nstage', 'Mstage', 'ER', 'PR', 'Her2', 'size_precise', 'nodespos', 'Age_@_Dx'])
y_features = list(['status','OS_days'])

X, Y = settingXY(df_dict, x_features, y_features, YEAR, STYPE)

X_train, X_test, Y_train, Y_test = train_test(X, Y)

print("X shape: {}".format(X.shape))
print("Y shape: {}".format(Y.shape))


CoxEstimator, Log_Hazard_Ratio = Cox(X,Y)

from sksurv.metrics import concordance_index_censored

prediction = CoxEstimator.predict(X_test)
result = concordance_index_censored(Y_test["status"], Y_test["OS_days"], prediction)
result[0]

KeyError: 'Count_as_OS'

In [None]:
from sksurv.ensemble import RandomSurvivalForest

random_state = 20
rsf = RandomSurvivalForest(n_estimators=1000,
                           min_samples_split=10,
                           min_samples_leaf=15,
                           max_features="sqrt",
                           n_jobs=-1,
                           random_state=random_state)
rsf.fit(X_train, Y_train)

In [None]:
rsf.score(X_test, Y_test)