# Include libraries

In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sksurv.column import encode_categorical
from sklearn import preprocessing
from sklearn_pandas import DataFrameMapper

# data preprocessing

In [2]:
def data_processing(data_df):
    data_df_x = data_df.drop(['LOC', 'UID', 'Hospital_ID', 'SurvivalWeeks', 'admission_date',
                              'discharge_date', 'death_date', 'Mortality', 'CVDeath', 'SurvivalDays', 'CAD'], axis=1)

    data_df_y = data_df[['Mortality', 'SurvivalWeeks']]

    data_df_x = data_df_x.drop(['ICU'], axis=1)

    X_temp = data_df_x[(data_df.LOC == '3') | (data_df.LOC == '2') | (data_df.LOC == '6')]
    y_temp = data_df_y[(data_df.LOC == '3') | (data_df.LOC == '2') | (data_df.LOC == '6')]
    X_df_train, X_df_val, y_df_train, y_df_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=369)

    X_df_test_kao = data_df_x[data_df.LOC == '8']
    y_df_test_kao = data_df_y[data_df.LOC == '8']

    categorical_columns = ['Sex', 'AF', 'DM', 'HTN', 'Hyperlipidemia', 'CHF', 'Smoking',
                           'Cancer.before.adm', 'Foley', 'NG', 'Dyslipidemia']
    numerical_columns = np.setdiff1d(data_df_x.columns, categorical_columns).tolist()

    categorical_ix = [data_df_x.columns.get_loc(col) for col in categorical_columns]
    numerical_ix = np.setdiff1d(list(range(0, len(data_df_x.columns))), categorical_ix).tolist()

    scaler = preprocessing.StandardScaler()

    standardize = [([col], scaler) for col in numerical_columns]
    leave = [(col, None) for col in categorical_columns]

    x_mapper = DataFrameMapper(standardize + leave)

    X_df_train = pd.DataFrame(data=x_mapper.fit_transform(X_df_train),
                              columns=numerical_columns + categorical_columns,
                              index=X_df_train.index)

    X_df_val = pd.DataFrame(data=x_mapper.fit_transform(X_df_val),
                            columns=numerical_columns + categorical_columns,
                            index=X_df_val.index)

    X_df_test_kao = pd.DataFrame(data=x_mapper.fit_transform(X_df_test_kao),
                                 columns=numerical_columns + categorical_columns,
                                 index=X_df_test_kao.index)

    X_df_train = encode_categorical(X_df_train, columns=categorical_columns)
    X_df_val = encode_categorical(X_df_val, columns=categorical_columns)
    X_df_test_kao = encode_categorical(X_df_test_kao, columns=categorical_columns)

    return X_df_train, X_df_val, y_df_train, y_df_val, X_df_test_kao, y_df_test_kao

# STG setting

In [3]:
from stg import STG
import stg.utils as utils

In [4]:
data = pd.read_csv(os.path.join('..', '..', 'data', '(v2)STROKE_VITAL_SIGN_MICE.csv'))
X_train, X_val, y_train, y_val, X_test_kao, y_test_kao = data_processing(data)

train_X = X_train.values
train_y = {'e': y_train['Mortality'].values, 't': y_train['SurvivalWeeks'].values}
valid_X = X_val.values
valid_y = {'e': y_val['Mortality'].values, 't': y_val['SurvivalWeeks'].values}
test_X = X_test_kao.values
test_y = {'e': y_test_kao['Mortality'].values, 't': y_test_kao['SurvivalWeeks'].values}


train_data={}
train_data['X'], train_data['E'], \
        train_data['T'] = utils.prepare_data(train_X, train_y)
train_data['ties'] = 'noties'

valid_data={}
valid_data['X'], valid_data['E'], \
        valid_data['T'] = utils.prepare_data(valid_X, valid_y)
valid_data['ties'] = 'noties'

test_data = {}
test_data['X'], test_data['E'], \
        test_data['T'] = utils.prepare_data(test_X, test_y)
test_data['ties'] = 'noties'

In [5]:
import torch
seed = 369
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

model = STG(task_type='cox', input_dim=train_data['X'].shape[1], output_dim=1, hidden_dims=[46, 32, 8], activation='relu',
    optimizer='Adam', learning_rate=0.0005, batch_size=train_data['X'].shape[0], feature_selection=True,
    sigma=0.5, lam=0.004, random_state=369, device='cpu')

In [6]:
model.fit(train_data['X'], {'E': train_data['E'], 'T': train_data['T']}, nr_epochs=600,
        valid_X=valid_data['X'], valid_y={'E': valid_data['E'], 'T': valid_data['T']}, print_interval=100)

Epoch: 100: CI=0.763156 loss=50.775986 valid_CI=0.756817 valid_loss=7.278597
Epoch: 200: CI=0.798891 loss=49.069801 valid_CI=0.798499 valid_loss=7.111500
Epoch: 300: CI=0.800245 loss=47.332664 valid_CI=0.806494 valid_loss=7.080393
Epoch: 400: CI=0.799623 loss=45.455093 valid_CI=0.807413 valid_loss=7.073556
Epoch: 500: CI=0.789933 loss=43.396740 valid_CI=0.808727 valid_loss=7.098788
Epoch: 600: CI=0.805910 loss=41.111671 valid_CI=0.807766 valid_loss=7.116194


In [7]:
model.evaluate(test_data['X'], {'E': test_data['E'], 'T': test_data['T']})

 test_CI=0.813013 test_loss=7.779222


In [8]:
gates_porb = model.get_gates(mode='prob')
gates_porb

array([0.15060955, 0.15631801, 0.18764704, 0.1767317 , 0.17185378,
       0.16733521, 0.15187135, 0.17705327, 0.16521072, 0.16717038,
       0.17274958, 0.16694689, 0.18363988, 0.16668785, 0.17750025,
       0.16266236, 0.15730035, 0.17983463, 0.15913281, 0.16195878,
       0.16702688, 0.17219648, 0.16437775, 0.17542702, 0.1696685 ,
       0.17900321, 0.18218982, 0.1707021 , 0.16304517, 0.15504864,
       0.16012797, 0.17580304, 0.17532668, 0.1604434 , 0.19385886,
       0.17422047, 0.16233423, 0.16567776, 0.16683578, 0.17302325,
       0.18504089, 0.180648  , 0.18246707, 0.17109063, 0.15805474,
       0.16836125, 0.17077863, 0.16559455, 0.1839453 , 0.16380775,
       0.17442945, 0.15704015], dtype=float32)

In [10]:
df = pd.DataFrame(data={'feature': X_train.columns.values, 'weight': gates_porb}, columns=['feature', 'weight']).sort_values(by=['weight'], ascending=False)
df.to_csv('stg_FS.csv', index=False)