# Include libraries

In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sksurv.column import encode_categorical
from sklearn import preprocessing
from sklearn_pandas import DataFrameMapper

# data preprocessing

In [2]:
from sklearn import preprocessing
from sklearn_pandas import DataFrameMapper

def data_processing(data_df):
    data_df_x = data_df.drop(['LOC', 'UID', 'Mortality', 'SurvivalDays', 'CVDeath'], axis=1)

    data_df_y = data_df[['CVDeath', 'SurvivalDays']]

    X_temp = data_df_x[(data_df.LOC == '3') | (data_df.LOC == '2') | (data_df.LOC == '6')]
    y_temp = data_df_y[(data_df.LOC == '3') | (data_df.LOC == '2') | (data_df.LOC == '6')]
    X_df_train, X_df_val, y_df_train, y_df_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=369)

    X_df_test_kao = data_df_x[data_df.LOC == '8']
    y_df_test_kao = data_df_y[data_df.LOC == '8']

    categorical_columns = ['Sex', 'AF', 'DM', 'HTN', 'Dyslipidemia', 'CHF', 'Smoking',
                           'Cancer.before.adm']
    
    numerical_columns = np.setdiff1d(data_df_x.columns, categorical_columns).tolist()

    categorical_ix = [data_df_x.columns.get_loc(col) for col in categorical_columns]
    numerical_ix =  np.setdiff1d(list(range(0, len(data_df_x.columns))), categorical_ix).tolist()

    scaler = preprocessing.StandardScaler()

    standardize = [([col], scaler) for col in numerical_columns]
    leave = [(col, None) for col in categorical_columns]

    x_mapper = DataFrameMapper(standardize + leave)

    X_df_train = pd.DataFrame(data=x_mapper.fit_transform(X_df_train),
                              columns=numerical_columns+categorical_columns,
                              index=X_df_train.index)

    X_df_val = pd.DataFrame(data=x_mapper.fit_transform(X_df_val),
                            columns=numerical_columns+categorical_columns,
                            index=X_df_val.index)

    X_df_test_kao = pd.DataFrame(data=x_mapper.fit_transform(X_df_test_kao),
                                 columns=numerical_columns+categorical_columns,
                                 index=X_df_test_kao.index)

    X_df_train = encode_categorical(X_df_train, columns=categorical_columns)
    X_df_val = encode_categorical(X_df_val, columns=categorical_columns)
    X_df_test_kao = encode_categorical(X_df_test_kao, columns=categorical_columns)
    
    return X_df_train, X_df_val, y_df_train, y_df_val, X_df_test_kao, y_df_test_kao

# STG setting

In [3]:
from stg import STG
import stg.utils as utils

### 3_2

In [4]:
data_2 = pd.read_csv(os.path.join('..', '..', 'data', '(v3_2)STROKE_VITAL_SIGN_MICE.csv'))
X_train_2, X_val_2, y_train_2, y_val_2, X_test_kao_2, y_test_kao_2 = data_processing(data_2)

train_X_2 = X_train_2.values
train_y_2 = {'e': y_train_2['CVDeath'].values, 't': y_train_2['SurvivalDays'].values}
valid_X_2 = X_val_2.values
valid_y_2 = {'e': y_val_2['CVDeath'].values, 't': y_val_2['SurvivalDays'].values}
test_X_2 = X_test_kao_2.values
test_y_2 = {'e': y_test_kao_2['CVDeath'].values, 't': y_test_kao_2['SurvivalDays'].values}


train_data_2={}
train_data_2['X'], train_data_2['E'], \
        train_data_2['T'] = utils.prepare_data(train_X_2, train_y_2)
train_data_2['ties'] = 'noties'

valid_data_2={}
valid_data_2['X'], valid_data_2['E'], \
        valid_data_2['T'] = utils.prepare_data(valid_X_2, valid_y_2)
valid_data_2['ties'] = 'noties'

test_data_2 = {}
test_data_2['X'], test_data_2['E'], \
        test_data_2['T'] = utils.prepare_data(test_X_2, test_y_2)
test_data_2['ties'] = 'noties'

In [5]:
import torch
seed = 369
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

model_2 = STG(task_type='cox', input_dim=train_data_2['X'].shape[1], output_dim=1, hidden_dims=[46, 32, 8], activation='selu',
    optimizer='Adam', learning_rate=0.0005, batch_size=train_data_2['X'].shape[0], feature_selection=True,
    sigma=0.5, lam=0.004, random_state=369, device='cpu')

In [6]:
model_2.fit(train_data_2['X'], {'E': train_data_2['E'], 'T': train_data_2['T']}, nr_epochs=600,
        valid_X=valid_data_2['X'], valid_y={'E': valid_data_2['E'], 'T': valid_data_2['T']}, print_interval=100)

Epoch: 100: CI=0.817955 loss=33.461906 valid_CI=0.829371 valid_loss=6.969312
Epoch: 200: CI=0.832722 loss=32.457500 valid_CI=0.835709 valid_loss=6.925262
Epoch: 300: CI=0.836591 loss=31.424000 valid_CI=0.836032 valid_loss=6.925278
Epoch: 400: CI=0.833038 loss=30.270580 valid_CI=0.836663 valid_loss=6.942208
Epoch: 500: CI=0.785350 loss=29.237690 valid_CI=0.836521 valid_loss=6.958551
Epoch: 600: CI=0.809851 loss=27.814484 valid_CI=0.834234 valid_loss=7.001463


In [7]:
model_2.evaluate(test_data_2['X'], {'E': test_data_2['E'], 'T': test_data_2['T']})

 test_CI=0.828722 test_loss=7.736737


In [8]:
gates_porb_2 = model_2.get_gates(mode='prob')
gates_porb_2

array([0.17559525, 0.18007356, 0.16827187, 0.15200248, 0.1684562 ,
       0.15307623, 0.16590935, 0.16243443, 0.17195392, 0.17685339,
       0.17721921, 0.17527494, 0.17090687, 0.14169383, 0.17714873,
       0.16640091, 0.15833268, 0.17076302, 0.17180958, 0.17389861,
       0.1752742 , 0.17102194, 0.17582846, 0.15670225, 0.16815946,
       0.18006286, 0.18107843, 0.17268446, 0.17217767, 0.18713242,
       0.17846012], dtype=float32)

In [9]:
df = pd.DataFrame(data={'feature': X_train_2.columns.values, 'weight': gates_porb_2}, columns=['feature', 'weight']).sort_values(by=['weight'], ascending=False)
df.to_csv('(v3_2_CVD)stg_FS.csv', index=False)