# Include libraries

In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sksurv.column import encode_categorical
from sklearn import preprocessing
from sklearn_pandas import DataFrameMapper

# data preprocessing

In [2]:
from sklearn import preprocessing
from sklearn_pandas import DataFrameMapper

def data_processing(data_df):
    data_df_x = data_df.drop(['LOC', 'UID', 'Mortality', 'SurvivalDays'], axis=1)

    data_df_y = data_df[['Mortality', 'SurvivalDays']]

    X_temp = data_df_x[(data_df.LOC == '3') | (data_df.LOC == '2') | (data_df.LOC == '6')]
    y_temp = data_df_y[(data_df.LOC == '3') | (data_df.LOC == '2') | (data_df.LOC == '6')]
    X_df_train, X_df_val, y_df_train, y_df_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=369)

    X_df_test_kao = data_df_x[data_df.LOC == '8']
    y_df_test_kao = data_df_y[data_df.LOC == '8']

    categorical_columns = ['Sex', 'AF', 'DM', 'HTN', 'Dyslipidemia', 'CHF', 'Smoking',
                           'Cancer.before.adm']
    
    numerical_columns = np.setdiff1d(data_df_x.columns, categorical_columns).tolist()

    categorical_ix = [data_df_x.columns.get_loc(col) for col in categorical_columns]
    numerical_ix =  np.setdiff1d(list(range(0, len(data_df_x.columns))), categorical_ix).tolist()

    scaler = preprocessing.StandardScaler()

    standardize = [([col], scaler) for col in numerical_columns]
    leave = [(col, None) for col in categorical_columns]

    x_mapper = DataFrameMapper(standardize + leave)

    X_df_train = pd.DataFrame(data=x_mapper.fit_transform(X_df_train),
                              columns=numerical_columns+categorical_columns,
                              index=X_df_train.index)

    X_df_val = pd.DataFrame(data=x_mapper.fit_transform(X_df_val),
                            columns=numerical_columns+categorical_columns,
                            index=X_df_val.index)

    X_df_test_kao = pd.DataFrame(data=x_mapper.fit_transform(X_df_test_kao),
                                 columns=numerical_columns+categorical_columns,
                                 index=X_df_test_kao.index)

    X_df_train = encode_categorical(X_df_train, columns=categorical_columns)
    X_df_val = encode_categorical(X_df_val, columns=categorical_columns)
    X_df_test_kao = encode_categorical(X_df_test_kao, columns=categorical_columns)
    
    return X_df_train, X_df_val, y_df_train, y_df_val, X_df_test_kao, y_df_test_kao

# STG setting

In [3]:
from stg import STG
import stg.utils as utils

### 3_1

In [4]:
data_1 = pd.read_csv(os.path.join('..', '..', 'data', '(v3_1)STROKE_VITAL_SIGN_MICE.csv'))
X_train_1, X_val_1, y_train_1, y_val_1, X_test_kao_1, y_test_kao_1 = data_processing(data_1)

train_X_1 = X_train_1.values
train_y_1 = {'e': y_train_1['Mortality'].values, 't': y_train_1['SurvivalDays'].values}
valid_X_1 = X_val_1.values
valid_y_1 = {'e': y_val_1['Mortality'].values, 't': y_val_1['SurvivalDays'].values}
test_X_1 = X_test_kao_1.values
test_y_1 = {'e': y_test_kao_1['Mortality'].values, 't': y_test_kao_1['SurvivalDays'].values}


train_data_1={}
train_data_1['X'], train_data_1['E'], \
        train_data_1['T'] = utils.prepare_data(train_X_1, train_y_1)
train_data_1['ties'] = 'noties'

valid_data_1={}
valid_data_1['X'], valid_data_1['E'], \
        valid_data_1['T'] = utils.prepare_data(valid_X_1, valid_y_1)
valid_data_1['ties'] = 'noties'

test_data_1 = {}
test_data_1['X'], test_data_1['E'], \
        test_data_1['T'] = utils.prepare_data(test_X_1, test_y_1)
test_data_1['ties'] = 'noties'

In [5]:
import torch
seed = 369
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

model_1 = STG(task_type='cox', input_dim=train_data_1['X'].shape[1], output_dim=1, hidden_dims=[46, 32, 8], activation='selu',
    optimizer='Adam', learning_rate=0.0005, batch_size=train_data_1['X'].shape[0], feature_selection=True,
    sigma=0.5, lam=0.004, random_state=369, device='cpu')

In [6]:
model_1.fit(train_data_1['X'], {'E': train_data_1['E'], 'T': train_data_1['T']}, nr_epochs=600,
        valid_X=valid_data_1['X'], valid_y={'E': valid_data_1['E'], 'T': valid_data_1['T']}, print_interval=100)

Epoch: 100: CI=0.774042 loss=31.928207 valid_CI=0.793443 valid_loss=7.115953
Epoch: 200: CI=0.762109 loss=31.099302 valid_CI=0.802943 valid_loss=7.080545
Epoch: 300: CI=0.787539 loss=30.091082 valid_CI=0.803953 valid_loss=7.081430
Epoch: 400: CI=0.796311 loss=28.968315 valid_CI=0.804640 valid_loss=7.085434
Epoch: 500: CI=0.734156 loss=28.092934 valid_CI=0.804710 valid_loss=7.095154
Epoch: 600: CI=0.772636 loss=26.636143 valid_CI=0.803914 valid_loss=7.145357


In [7]:
model_1.evaluate(test_data_1['X'], {'E': test_data_1['E'], 'T': test_data_1['T']})

 test_CI=0.809185 test_loss=7.802935


In [8]:
gates_porb_1 = model_1.get_gates(mode='prob')
gates_porb_1

array([0.1720022 , 0.1650821 , 0.17899826, 0.17026329, 0.17516837,
       0.17714384, 0.16902727, 0.15841234, 0.16204837, 0.17817828,
       0.18751878, 0.18225992, 0.17757505, 0.16896787, 0.17426854,
       0.15224138, 0.17449906, 0.16896147, 0.17244554, 0.17359254,
       0.16986838, 0.1601704 , 0.1668942 , 0.16205293, 0.17217636,
       0.17719975, 0.16024968, 0.16388535, 0.15852427], dtype=float32)

In [9]:
df = pd.DataFrame(data={'feature': X_train_1.columns.values, 'weight': gates_porb_1}, columns=['feature', 'weight']).sort_values(by=['weight'], ascending=False)
df.to_csv('(v3_1)stg_FS.csv', index=False)

### 3_2

In [10]:
data_2 = pd.read_csv(os.path.join('..', '..', 'data', '(v3_2)STROKE_VITAL_SIGN_MICE.csv'))
X_train_2, X_val_2, y_train_2, y_val_2, X_test_kao_2, y_test_kao_2 = data_processing(data_2)

train_X_2 = X_train_2.values
train_y_2 = {'e': y_train_2['Mortality'].values, 't': y_train_2['SurvivalDays'].values}
valid_X_2 = X_val_2.values
valid_y_2 = {'e': y_val_2['Mortality'].values, 't': y_val_2['SurvivalDays'].values}
test_X_2 = X_test_kao_2.values
test_y_2 = {'e': y_test_kao_2['Mortality'].values, 't': y_test_kao_2['SurvivalDays'].values}


train_data_2={}
train_data_2['X'], train_data_2['E'], \
        train_data_2['T'] = utils.prepare_data(train_X_2, train_y_2)
train_data_2['ties'] = 'noties'

valid_data_2={}
valid_data_2['X'], valid_data_2['E'], \
        valid_data_2['T'] = utils.prepare_data(valid_X_2, valid_y_2)
valid_data_2['ties'] = 'noties'

test_data_2 = {}
test_data_2['X'], test_data_2['E'], \
        test_data_2['T'] = utils.prepare_data(test_X_2, test_y_2)
test_data_2['ties'] = 'noties'

In [11]:
import torch
seed = 369
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

model_2 = STG(task_type='cox', input_dim=train_data_2['X'].shape[1], output_dim=1, hidden_dims=[46, 32, 8], activation='selu',
    optimizer='Adam', learning_rate=0.0005, batch_size=train_data_2['X'].shape[0], feature_selection=True,
    sigma=0.5, lam=0.004, random_state=369, device='cpu')

In [12]:
model_2.fit(train_data_2['X'], {'E': train_data_2['E'], 'T': train_data_2['T']}, nr_epochs=600,
        valid_X=valid_data_2['X'], valid_y={'E': valid_data_2['E'], 'T': valid_data_2['T']}, print_interval=100)

Epoch: 100: CI=0.790208 loss=33.551567 valid_CI=0.793581 valid_loss=7.109633
Epoch: 200: CI=0.807377 loss=32.544941 valid_CI=0.802513 valid_loss=7.067464
Epoch: 300: CI=0.804721 loss=31.531528 valid_CI=0.804314 valid_loss=7.066954
Epoch: 400: CI=0.801127 loss=30.392361 valid_CI=0.804688 valid_loss=7.084258
Epoch: 500: CI=0.754313 loss=29.325293 valid_CI=0.802205 valid_loss=7.106374
Epoch: 600: CI=0.771502 loss=27.945999 valid_CI=0.802088 valid_loss=7.145290


In [13]:
model_2.evaluate(test_data_2['X'], {'E': test_data_2['E'], 'T': test_data_2['T']})

 test_CI=0.809027 test_loss=7.797143


In [14]:
gates_porb_2 = model_2.get_gates(mode='prob')
gates_porb_2

array([0.17555398, 0.18069726, 0.16855249, 0.15197012, 0.16840973,
       0.15350133, 0.16618699, 0.16162458, 0.1710538 , 0.17726776,
       0.17689234, 0.17532003, 0.1706869 , 0.141705  , 0.17739707,
       0.16627815, 0.15794629, 0.17076465, 0.17205364, 0.1738455 ,
       0.17375329, 0.17092082, 0.17168385, 0.15695411, 0.16742298,
       0.18028265, 0.18107969, 0.17271113, 0.17209682, 0.1871933 ,
       0.18056929], dtype=float32)

In [15]:
df = pd.DataFrame(data={'feature': X_train_2.columns.values, 'weight': gates_porb_2}, columns=['feature', 'weight']).sort_values(by=['weight'], ascending=False)
df.to_csv('(v3_2)stg_FS.csv', index=False)