## Include libraries

In [1]:
import os
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sksurv.column import encode_categorical

## dataset loading...

In [11]:
surv_data = pd.read_csv(os.path.join('..', '..', 'data', '(v4)STROKE_VITAL_SIGN_missForest.csv'))

## data preprocessing

In [12]:
from sklearn import preprocessing
from sklearn_pandas import DataFrameMapper

def data_processing(data_df):
    data_df_x = data_df.drop(['LOC', 'UID', 'Mortality', 'SurvivalDays', 'CVDeath', 'SurvivalMonths',
                              'admission_date', 'discharge_date', 'death_date'], axis=1)

    data_df_y = data_df[['Mortality', 'SurvivalDays']]

    X_temp = data_df_x[(data_df.LOC == '3') | (data_df.LOC == '2') | (data_df.LOC == '6')]
    y_temp = data_df_y[(data_df.LOC == '3') | (data_df.LOC == '2') | (data_df.LOC == '6')]
    X_df_train, X_df_val, y_df_train, y_df_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=369)

    X_df_test_kao = data_df_x[data_df.LOC == '8']
    y_df_test_kao = data_df_y[data_df.LOC == '8']

    categorical_columns = ['Sex', 'AF', 'DM', 'HTN', 'Dyslipidemia', 'CHF', 'Smoking',
                           'Cancer.before.adm']
    
    numerical_columns = np.setdiff1d(data_df_x.columns, categorical_columns).tolist()

    categorical_ix = [data_df_x.columns.get_loc(col) for col in categorical_columns]
    numerical_ix =  np.setdiff1d(list(range(0, len(data_df_x.columns))), categorical_ix).tolist()

    scaler = preprocessing.StandardScaler()

    standardize = [([col], scaler) for col in numerical_columns]
    leave = [(col, None) for col in categorical_columns]

    x_mapper = DataFrameMapper(standardize + leave)

    X_df_train = pd.DataFrame(data=x_mapper.fit_transform(X_df_train),
                              columns=numerical_columns+categorical_columns,
                              index=X_df_train.index)

    X_df_val = pd.DataFrame(data=x_mapper.fit_transform(X_df_val),
                            columns=numerical_columns+categorical_columns,
                            index=X_df_val.index)

    X_df_test_kao = pd.DataFrame(data=x_mapper.fit_transform(X_df_test_kao),
                                 columns=numerical_columns+categorical_columns,
                                 index=X_df_test_kao.index)
    # since those categorical variable are all boolen... no need to one-hot them
    # https://stackoverflow.com/questions/43515877/should-binary-features-be-one-hot-encoded
    
#     X_df_train = encode_categorical(X_df_train, columns=categorical_columns)
#     X_df_val = encode_categorical(X_df_val, columns=categorical_columns)
#     X_df_test_kao = encode_categorical(X_df_test_kao, columns=categorical_columns)
    
    return X_df_train, X_df_val, y_df_train, y_df_val, X_df_test_kao, y_df_test_kao


get_target = lambda df: (df['SurvivalDays'].values, df['Mortality'].values)

In [13]:
X_train, X_val, y_train, y_val, X_test_kao, y_test_kao = data_processing(surv_data)

In [40]:
print(X_train)

            ALT       Age      CHOL       CKD   DBPCV.G    HRCV.G  Mean.PP.G  \
146   -0.617113 -0.400944  0.524164 -1.211742 -0.320314 -0.003625  -1.420171   
671    0.094539  0.510618 -0.964399 -1.211742 -1.438132 -1.042963  -0.096977   
2591   0.213148 -0.628834  0.710234  0.075247 -0.320314 -1.042963  -1.420171   
16130  0.055003  0.510618  0.291576 -1.211742  0.797503  1.035713  -1.420171   
15659 -0.142678 -0.173053  0.152023  0.075247  0.797503  2.075052   1.226216   
...         ...       ...       ...       ...       ...       ...        ...   
10536 -0.458968  0.282728 -0.569000  0.075247 -0.320314 -1.042963   1.226216   
6468  -0.458968 -0.552871 -0.313153 -1.211742 -0.320314 -0.003625  -1.420171   
2055   7.171525 -0.932688 -0.196859  0.075247 -0.320314 -1.042963   0.564619   
386   -0.340359  1.270253 -0.313153  1.362235  0.797503 -0.003625   0.564619   
10545  0.331757 -0.249017 -0.313153  1.362235 -0.320314 -0.003625  -0.096977   

       Mean.SBP.G  MeanDBP.G  MeanHR.G 

## Feature selection

In [62]:
rf_list = pd.read_csv(os.path.join('rfs_FS.csv')).sort_values(by='weight', ascending=False).feature
stg_list = pd.read_csv(os.path.join('stg_FS.csv')).sort_values(by='weight', ascending=False).feature

## Deepsur

In [64]:
import torch
import torchtuples as tt
from pycox.evaluation import EvalSurv
from pycox.models import CoxPH

def Deepsur(Xtrain, Xval, Ytrain, Yval):    
    # preprocessing data
    Xtrain = Xtrain.values.astype('float32')
    Xval = Xval.values.astype('float32')
    Ytrain = get_target(Ytrain)
    Yval = get_target(Yval)
    val = Xval, Yval
    
    # parameters
    in_features = Xtrain.shape[1]
    num_nodes = [25, 25]
    out_features = 1
    batch_norm = True
    dropout = 0.1
    output_bias = False
    batch_size = 128
    if Xtrain.shape[0]%batch_size == 1:
        batch_size = batch_size - 1
    epochs = 100
    callbacks = [tt.callbacks.EarlyStopping(patience=20)]
    verbose = False
    
    # network
    net = tt.practical.MLPVanilla(in_features, num_nodes, out_features, batch_norm,
                                  dropout, output_bias=output_bias)
    model = CoxPH(net, tt.optim.Adam)
    
    # train
    lrfinder = model.lr_finder(Xtrain, Ytrain, batch_size, tolerance=10)
    #best_lr = lrfinder.get_best_lr()
    #_ = lrfinder.plot()
    model.optimizer.set_lr(0.01)
    log = model.fit(Xtrain, Ytrain, batch_size, epochs, callbacks, verbose,
                    val_data=val, val_batch_size=batch_size)
    return model
    

### DeepSur

In [66]:
durations_test_kao, events_test_kao = get_target(y_test_kao)

for i in range(len(rf_list)):
    seed = 369
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    #
    features = rf_list.iloc[0:i+1].values
    all_features = X_train.columns.values
    drop_features = np.setdiff1d(all_features, features)
    if len(features) > 0:   
        X_train_fs = X_train.drop(drop_features, axis=1)
        X_val_fs = X_val.drop(drop_features, axis=1)
        X_test_kao_fs = X_test_kao.drop(drop_features, axis=1).values.astype('float32')
        
        deep_sur = Deepsur(X_train_fs, X_val_fs, y_train, y_val)
        # prediction
        _ = deep_sur.compute_baseline_hazards()
        surv_kao = deep_sur.predict_surv_df(X_test_kao_fs)
        # evaluation
        ev_kao = EvalSurv(surv_kao, durations_test_kao, events_test_kao, censor_surv='km')
        print(X_train_fs.shape[1])
        print('Kao C-index = %.3f' %(ev_kao.concordance_td()))

1
Kao C-index = 0.685
2
Kao C-index = 0.753
3
Kao C-index = 0.773
4
Kao C-index = 0.790
5
Kao C-index = 0.798
6
Kao C-index = 0.798
7
Kao C-index = 0.803
8
Kao C-index = 0.804
9
Kao C-index = 0.801
10
Kao C-index = 0.806
11
Kao C-index = 0.803
12
Kao C-index = 0.804
13
Kao C-index = 0.804
14
Kao C-index = 0.808
15
Kao C-index = 0.808
16
Kao C-index = 0.806
17
Kao C-index = 0.805
18
Kao C-index = 0.808
19
Kao C-index = 0.810
20
Kao C-index = 0.805
21
Kao C-index = 0.808
22
Kao C-index = 0.807
23
Kao C-index = 0.804
24
Kao C-index = 0.806
25
Kao C-index = 0.808


## RandomSurvivalForest

In [8]:
from sksurv.ensemble import RandomSurvivalForest

for i in range(len(rf_list)):
    features = rf_list.iloc[0:i+1].values
    if len(features) > 0:
        X_train_fs = X_train[features]
        X_val_fs = X_val[features]
        X_test_kao_fs = X_test_kao[features].values.astype('float32')
        
        y_train_cox =  y_train.copy()
        y_train_cox.loc[:, 'Mortality'] = y_train_cox['Mortality'].astype(bool)
        y_train_cox = np.array(list(y_train_cox.to_records(index=False)))
        y_test_kao_cox =  y_test_kao.copy()
        y_test_kao_cox.loc[:, 'Mortality'] = y_test_kao_cox['Mortality'].astype(bool)
        y_test_kao_cox = np.array(list(y_test_kao_cox.to_records(index=False)))
        
        rsf = RandomSurvivalForest(n_estimators=100,
                           min_samples_split=10,
                           min_samples_leaf=15,
                           max_features="sqrt",
                           n_jobs=-1,
                           random_state=369)
        rsf.fit(X_train_fs, y_train_cox)
        print(X_train_fs.shape[1])
        print('Kao C-index = %.3f' %(rsf.score(X_test_kao_fs, y_test_kao_cox)))

1
Kao C-index = 0.685
2
Kao C-index = 0.750
3
Kao C-index = 0.765
4
Kao C-index = 0.781
5
Kao C-index = 0.792
6
Kao C-index = 0.797
7
Kao C-index = 0.801
8
Kao C-index = 0.802
9
Kao C-index = 0.801
10
Kao C-index = 0.802
11
Kao C-index = 0.804
12
Kao C-index = 0.804
13
Kao C-index = 0.805
14
Kao C-index = 0.806
15
Kao C-index = 0.806
16
Kao C-index = 0.807
17
Kao C-index = 0.806
18
Kao C-index = 0.808
19
Kao C-index = 0.807
20
Kao C-index = 0.806
21
Kao C-index = 0.808
22
Kao C-index = 0.807
23
Kao C-index = 0.807
24
Kao C-index = 0.807
25
Kao C-index = 0.808
