In [1]:
from npsurvival.FeatureEngineer import FeatureEngineer
from npsurvival.Utils import model_prepare
from npsurvival.Utils import calculate_dataset_size
from npsurvival.Utils import evaluate_predict_result
from npsurvival.IPEC import IPEC

from npsurvival.RandomSurvivalForest import RandomSurvivalForest
from npsurvival.CoxPHModel import CoxPHModel
from npsurvival.KNNKaplanMeier import KNNKaplanMeier
from npsurvival.AalenAdditiveModel import AalenAdditiveModel

import numpy as np


Using TensorFlow backend.


In [2]:
# Feature-engineering helper configured with the relative dataset directory.
fe = FeatureEngineer(
    data_path="../../dataset/",
    verbose=False,
)
# Value returned by get_diseases_list(); presumably the available disease
# names -- TODO confirm against FeatureEngineer.
sources = fe.get_diseases_list()


In [3]:
# Which dataset to load and the thresholds forwarded to the loader.
# The value threshold is defined as one third of the event threshold.
dataset_idx = 0
file_prefix = ""
low_event_thd = 0.03
low_value_thd = low_event_thd / 3

# Load the raw patient data together with the train/test id split.
(patient_dict, feature_set, train_id_list, test_id_list) = fe.load_data_as_dict(
    dataset_idx,
    file_prefix=file_prefix,
    low_freq_event_thd=low_event_thd,
    low_freq_value_thd=low_value_thd,
)

# Turn the loaded data into the train/test frames used by every model below.
(train_df, test_df, feature_list) = model_prepare(
    patient_dict,
    feature_set,
    train_id_list,
    test_id_list,
)


  return (np.sqrt(ssd) / old_norm) < self.convergence_threshold


In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd

# Number of principal components to keep and the seed handed to PCA.
# The seed matters: depending on the input size scikit-learn may pick the
# randomized SVD solver, in which case an unseeded PCA gives slightly
# different projections on every run.
n_pca_components = 5
pca_seed = 0

# Feature matrix = every column except the duration ("LOS") and event ("OUT")
# columns, standardised before the projection.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_df.drop(columns=["LOS", "OUT"]).values)

pca = PCA(n_components=n_pca_components, random_state=pca_seed)
pca.fit(train_x)
reduced_x = pca.transform(train_x)

# Total fraction of variance retained by the kept components.
print(sum(pca.explained_variance_ratio_))

# Re-attach the original row index and the duration/event columns so the
# reduced frame has the same layout as train_df.
reduced_df = pd.DataFrame(reduced_x).set_index(train_df.index)
reduced_df["LOS"] = train_df["LOS"]
reduced_df["OUT"] = train_df["OUT"]
reduced_df.head(5)


0.7832160578632994


Unnamed: 0,0,1,2,3,4,LOS,OUT
1030.0,44.311848,-14.000458,10.972961,11.88474,-1.351692,3.429572,1
82960.0,23.445461,38.557584,-11.105762,14.825832,-4.548299,2.721134,1
77842.0,42.55929,13.425733,13.134972,2.314506,14.306481,32.856181,1
15379.0,-35.822291,-6.13045,-0.151834,5.678025,-0.825627,0.98125,1
81938.0,-10.160845,1.253623,0.958453,-3.389403,1.539441,2.917454,1


In [5]:
# Preview the first five rows of the PCA-reduced training frame.
reduced_df.head(5)

Unnamed: 0,0,1,2,3,4,LOS,OUT
1030.0,-2429.670608,3858.513236,-1844.603853,276.895009,-1380.11065,3.429572,1
82960.0,14931.3644,3867.634699,-107.109033,-346.569587,-117.876588,2.721134,1
77842.0,2099.80175,1694.666033,-1575.863347,7388.614597,2077.368343,32.856181,1
15379.0,-510.68516,-1761.47992,356.68046,-260.077089,258.367008,0.98125,1
81938.0,-321.637477,-1175.026586,162.049916,-26.137027,-51.236292,2.917454,1


In [70]:
import pandas as pd

# Wrap the projected features in a frame that shares train_df's index, then
# append the duration ("LOS") and event ("OUT") columns from train_df.
reduced_df = (
    pd.DataFrame(reduced_x)
    .set_index(train_df.index)
    .assign(LOS=train_df["LOS"], OUT=train_df["OUT"])
)
reduced_df.head(5)


Unnamed: 0,0,1,2,3,4,LOS,OUT
1030.0,-2429.88385,3858.220846,-1844.501245,277.247874,-1379.772314,3.429572,1
82960.0,14931.139979,3868.522574,-107.090993,-346.488908,-117.740921,2.721134,1
77842.0,2099.760858,1694.533988,-1575.693809,7388.664863,2077.027921,32.856181,1
15379.0,-510.488548,-1761.770685,356.692987,-259.990861,258.477208,0.98125,1
81938.0,-321.548745,-1175.08335,162.037378,-26.171006,-51.265014,2.917454,1


In [80]:
# Show the column labelled 4 for the first five rows as a plain list.
print(list(reduced_df[4].head(5)))


[-1379.7723144867496, -117.74092149473688, 2077.0279211339307, 258.47720772832037, -51.26501446098022]


## Models


### Random Survival Forest

In [9]:
%%time

# Hyper-parameters passed to the random survival forest.
n_trees, max_features, max_depth = 10, 20, 5

rd = RandomSurvivalForest(
    n_trees=n_trees,
    max_features=max_features,
    max_depth=max_depth,
)
# Fit on the training frame; 'LOS' is the duration column, 'OUT' the event column.
rd.fit(
    train_df,
    duration_col='LOS',
    event_col='OUT',
    num_workers=2,
)


CPU times: user 92.1 ms, sys: 82.8 ms, total: 175 ms
Wall time: 20.9 s


In [10]:
# Median-time predictions from the forest on the test frame, scored with
# evaluate_predict_result (which prints the concordance).
test_time_median_pred = rd.pred_median_time(test_df)
concordance = evaluate_predict_result(
    test_time_median_pred,
    test_df,
    print_result=True,
)


concordance: 0.5766841853798376


In [11]:
# Forest prediction for a single test row (position 3) at time=1.0.
# iloc[[3]] keeps the row as a one-row DataFrame.
proba_pred = rd.pred_proba(
    test_df.iloc[[3]],
    time=1.0,
)
print(proba_pred)


[0.8707142857142858]


In [14]:
# IPEC score of the forest with a numeric t_step of 0.05 and t_thd=1.
ipec = IPEC(
    train_df,
    rd.pred_proba,
    g_type="All_One",
    t_thd=1,
    t_step=0.05,
    time_col='LOS',
    death_identifier='OUT',
    verbose=True,
)
ipec_score = ipec.avg_ipec(
    test_df,
    num_workers=2,
    print_result=True,
)


T: 101.73895833333333
number of check points: 2036
IPEC: 0.04558912103312208


In [15]:
# Same forest IPEC evaluation, but with t_step="obs" instead of a numeric step
# (presumably check points are taken at observed times -- TODO confirm in IPEC).
ipec = IPEC(
    train_df,
    rd.pred_proba,
    g_type="All_One",
    t_thd=1,
    t_step="obs",
    time_col='LOS',
    death_identifier='OUT',
    verbose=True,
)
ipec_score = ipec.avg_ipec(
    test_df,
    num_workers=2,
    print_result=True,
)


T: 101.73895833333333
number of check points: 280
IPEC: 0.0438401651289349


In [56]:
%%time
# Forest IPEC evaluation with t_step="obs" and the lower threshold t_thd=0.8.
ipec = IPEC(
    train_df,
    rd.pred_proba,
    g_type="All_One",
    t_thd=0.8,
    t_step="obs",
    time_col='LOS',
    death_identifier='OUT',
    verbose=True,
)
ipec_score = ipec.avg_ipec(
    test_df,
    num_workers=2,
    print_result=True,
)


T: 11.147627314814814
number of check points: 224
IPEC: 0.19312927661172513
CPU times: user 29.1 s, sys: 799 ms, total: 29.9 s
Wall time: 37.2 s


### Cox


In [37]:
# Hyper-parameters forwarded to CoxPHModel as alpha / lambda_.
alpha, lambd = 1., 0.07

cox = CoxPHModel(
    alpha=alpha,
    lambda_=lambd,
)
# Fit on the training frame; 'LOS' is the duration column, 'OUT' the event column.
cox.fit(
    train_df,
    duration_col='LOS',
    event_col='OUT',
)


In [38]:
# Median-time predictions from the Cox model on the test frame, scored with
# evaluate_predict_result (which prints the concordance).
test_time_median_pred = cox.pred_median_time(test_df)
concordance = evaluate_predict_result(
    test_time_median_pred,
    test_df,
    print_result=True,
)


concordance: 0.559483994266603


In [41]:
%%timeit
proba_pred = cox.pred_proba(test_df.iloc[[3]], time=1.0)
print(proba_pred)


[0.8667383355455066]
[0.8667383355455066]
[0.8667383355455066]
[0.8667383355455066]
[0.8667383355455066]
[0.8667383355455066]
[0.8667383355455066]
[0.8667383355455066]
151 ms ± 38.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [52]:
%%time

# IPEC score of the Cox model with t_step="obs" and t_thd=0.8.
ipec = IPEC(
    train_df,
    cox.pred_proba,
    g_type="All_One",
    t_thd=0.8,
    t_step="obs",
    time_col='LOS',
    death_identifier='OUT',
    verbose=True,
)
# use_multiprocess=False is passed explicitly here, unlike the forest runs.
ipec_score = ipec.avg_ipec(
    test_df,
    num_workers=2,
    print_result=True,
    use_multiprocess=False,
)


T: 11.147627314814814
number of check points: 224
IPEC: 0.1944921555046284
CPU times: user 37min 8s, sys: 6.59 s, total: 37min 15s
Wall time: 37min 32s


KeyboardInterrupt: 

Process ForkPoolWorker-5:
  File "/usr/local/lib/python3.6/site-packages/multiprocess/pool.py", line 108, in worker
    task = get()
Process ForkPoolWorker-6:
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/multiprocess/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/multiprocess/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.6/site-packages/multiprocess/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/lib/python3.6/site-packages/multiprocess/pool.py", line 108, in worker
    task = get()
  File "/usr/local/lib/python3.6/site-packages/multiprocess/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.6/site-packages/multiprocess/queues.py", line 337, in get
    with self._rlock:
  File "/usr/local/lib/python3.6/site-packages/multiprocess

### Aalen Additive Model

In [46]:
# Penalizer value handed to the Aalen additive model.
coef_penalizer = 0.1

aaf = AalenAdditiveModel(
    coef_penalizer=coef_penalizer,
)
# Fit on the training frame; 'LOS' is the duration column, 'OUT' the event column.
aaf.fit(
    train_df,
    duration_col='LOS',
    event_col='OUT',
)


In [47]:
# Median-time predictions from the Aalen model on the test frame, scored with
# evaluate_predict_result (which prints the concordance).
test_time_median_pred = aaf.pred_median_time(test_df)
concordance = evaluate_predict_result(
    test_time_median_pred,
    test_df,
    print_result=True,
)


concordance: 0.5126612517916865


In [50]:
# Aalen-model prediction for a single test row (position 3) at time=1.0.
# iloc[[3]] keeps the row as a one-row DataFrame.
proba_pred = aaf.pred_proba(
    test_df.iloc[[3]],
    time=1.0,
)
print(proba_pred)


[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182

[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182838548]
[0.2636051182

### KNN Kaplan Meier Model

In [48]:
# Neighbour count handed to the KNN-based Kaplan-Meier model.
n_neighbors = 12

knn_based_kmf = KNNKaplanMeier(
    n_neighbors=n_neighbors,
)
# Fit on the training frame; "LOS" is the duration column, "OUT" the event column.
knn_based_kmf.fit(
    train_df,
    duration_col="LOS",
    event_col="OUT",
)


In [49]:
# Median-time predictions from the KNN Kaplan-Meier model on the test frame,
# scored with evaluate_predict_result (which prints the concordance).
test_time_median_pred = knn_based_kmf.pred_median_time(test_df)
concordance = evaluate_predict_result(
    test_time_median_pred,
    test_df,
    print_result=True,
)


concordance: 0.5857620640229336


In [51]:
# KNN Kaplan-Meier prediction for a single test row (position 3) at time=1.0.
# iloc[[3]] keeps the row as a one-row DataFrame.
proba_pred = knn_based_kmf.pred_proba(
    test_df.iloc[[3]],
    time=1.0,
)
print(proba_pred)


[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316136398]
[0.9194243316

## Data Preparation


In [None]:
dataset_idx = 0
train_dfs = []
test_dfs = []

# Load the five cross-validation folds stored under the
# "cross_val/10-5fold_<i>_" file prefix and collect one train/test frame per fold.
for i in range(5):
    print(i)
    patient_dict, feature_set, train_id_list, test_id_list = \
        fe.load_data_as_dict(dataset_idx, file_prefix="cross_val/10-5fold_"+str(i)+"_",
                             low_freq_event_thd=0.03, low_freq_value_thd=0.01)
    # Unpack three values, matching the single-split cell near the top of the
    # notebook that calls model_prepare with the same arguments; the previous
    # seven-name unpacking here disagreed with that executed call.
    # NOTE(review): confirm against the installed npsurvival.Utils.model_prepare.
    train_df, test_df, feature_list = \
        model_prepare(patient_dict, feature_set, train_id_list, test_id_list)
    train_dfs.append(train_df)
    test_dfs.append(test_df)


## Read Result


In [2]:
import pickle


### KNN

In [None]:
# Load the saved tuning results together with the hyper-parameter grids they
# were produced with. Each *_value list is written out by hand here; it is not
# read from the pickle.
# NOTE: pickle.load can execute arbitrary code -- only open result files that
# were produced by this project.
with open('KNN.pickle', 'rb') as f:
    knn_small_result = pickle.load(f)
knn_small_value = [5, 10, 15, 20, 30]
# Fixed: this used to print the undefined name `knn_value` (NameError).
print(knn_small_value)

with open('KNN_large.pickle', 'rb') as f:
    knn_large_result = pickle.load(f)
knn_large_value = [50, 80, 100, 150, 200]
print(knn_large_value)

with open('AAM.pickle', 'rb') as f:
    aam_small_result = pickle.load(f)
aam_small_value = [0.05, 0.08, 0.1, 0.12, 0.15, 0.2, 0.3]
print(aam_small_value)

with open('AAM_large.pickle', 'rb') as f:
    aam_large_result = pickle.load(f)
aam_large_value = [0.3, 0.35, 0.4, 0.45, 0.5, 0.6]
print(aam_large_value)


In [22]:
# Full KNN grid: the small values followed by the large values.
[*knn_small_value, *knn_large_value]


[5, 10, 15, 20, 30, 50, 80, 100, 150, 200]

In [27]:
# Index 0 of each result is read as the concordance scores; small-grid values
# come first, then the large-grid values.
print("KNN, ICH, concordance")
print([*knn_small_result[0]["ich"], *knn_large_result[0]["ich"]])


KNN, ICH, concordance
[0.5729085627977228, 0.5761260601835716, 0.5753041710235854, 0.5694813523875915, 0.5817065179505054, 0.580925641919368, 0.5851056117113977, 0.5811187405600092, 0.582252236551644, 0.582298361798536]


In [32]:
# Index 1 of each result is read as the IPEC scores; small-grid values come
# first, then the large-grid values.
print("KNN on ICH, IPEC")
print([*knn_small_result[1]["ich"], *knn_large_result[1]["ich"]])


KNN on ICH, IPEC
[0.20196751757442835, 0.19296079998560986, 0.18821019543353534, 0.1865315687278688, 0.1823964997497527, 0.18069443220029754, 0.18058193441643783, 0.18090214450003048, 0.18094019267265044, 0.18195513032279723]


In [30]:
# Concordance entries (index 0) for the "pancreatitis" key, small grid then large grid.
print("KNN, pancreatitis, concordance")
print([*knn_small_result[0]["pancreatitis"], *knn_large_result[0]["pancreatitis"]])


KNN, pancreatitis, concordance
[0.5569867724867725, 0.5537400192400193, 0.5455050505050505, 0.5540447330447331, 0.535077441077441, 0.5358395863395864, 0.5170675805675806, 0.5121450216450216, 0.5064367484367485, 0.49512578162578164]


In [33]:
# IPEC entries (index 1) for the "pancreatitis" key, small grid then large grid.
print("KNN on pancreatitis, IPEC")
print([*knn_small_result[1]["pancreatitis"], *knn_large_result[1]["pancreatitis"]])


KNN on pancreatitis, IPEC
[0.20042574754000636, 0.1906304152136668, 0.18946485692283332, 0.18613777947475532, 0.18686605112389412, 0.18639644724663376, 0.18622910181153443, 0.18628138147036263, 0.18528973831190673, 0.18542862670993487]


### AAM

In [35]:
# Full AAM grid: the small values followed by the large values.
print(aam_small_value + aam_large_value)

# For each dataset key, print index 0 (labelled concordance) and index 1
# (labelled IPEC) of the results, small grid first and large grid second.
print("AAM, ICH, concordance")
print([*aam_small_result[0]["ich"], *aam_large_result[0]["ich"]])

print("AAM on ICH, IPEC")
print([*aam_small_result[1]["ich"], *aam_large_result[1]["ich"]])

print("AAM, pancreatitis, concordance")
print([*aam_small_result[0]["pancreatitis"], *aam_large_result[0]["pancreatitis"]])

print("AAM on pancreatitis, IPEC")
print([*aam_small_result[1]["pancreatitis"], *aam_large_result[1]["pancreatitis"]])


AAM, ICH, concordance
[0.5523046357615894, 0.5604739165795283, 0.5627684442895318, 0.5596718949692111, 0.5611541768328104, 0.5629286627163936, 0.5646888579063554, 0.5649883815499013, 0.5681879865225979, 0.5707011734634599, 0.5704489369118161, 0.571710119670036, 0.5722579295921923]
AAM on ICH, IPEC
[0.322790796513818, 0.3160184153638755, 0.3126440685512023, 0.30981338323329644, 0.30605018405875056, 0.30089251065984113, 0.29305254802558156, 0.2930766261057086, 0.29028580098181733, 0.28786780218632974, 0.285733940611933, 0.28390186029190767, 0.28083039626243467]
AAM, pancreatitis, concordance
[0.5100310245310246, 0.5102688792688793, 0.5102388167388168, 0.510380230880231, 0.5098114478114477, 0.51010101010101, 0.5115495430495431, 0.5108840788840789, 0.5105363155363155, 0.5097409812409812, 0.5102984607984609, 0.5101803751803751, 0.5105103415103415]
AAM on pancreatitis, IPEC
[0.3787472324322354, 0.3782282409657072, 0.37788626417727444, 0.3775722269219542, 0.3771360045539843, 0.376425018847365

In [33]:
# `result` was never defined in this notebook; the recorded output matches the
# small-grid AAM results, so read them from aam_small_result explicitly.
print("AAM, ICH, concordance")
print(aam_small_result[0]["ich"])


AAM, ICH, concordance
[0.55230464 0.56047392 0.56276844 0.55967189 0.56115418 0.56292866
 0.56468886]


In [34]:
# `result` was never defined in this notebook; the recorded output matches the
# small-grid AAM results, so read them from aam_small_result explicitly.
print("AAM, ICH, IPEC")
print(aam_small_result[1]["ich"])


AAM, ICH, IPEC
[0.3227908  0.31601842 0.31264407 0.30981338 0.30605018 0.30089251
 0.29305255]


In [35]:
# `result` was never defined in this notebook; the recorded output matches the
# small-grid AAM results, so read them from aam_small_result explicitly.
print("AAM, pancreatitis, concordance")
print(aam_small_result[0]["pancreatitis"])


AAM, pancreatitis, concordance
[0.51003102 0.51026888 0.51023882 0.51038023 0.50981145 0.51010101
 0.51154954]


In [36]:
# `result` was never defined in this notebook; the recorded output matches the
# small-grid AAM results, so read them from aam_small_result explicitly.
print("AAM, pancreatitis, IPEC")
print(aam_small_result[1]["pancreatitis"])


AAM, pancreatitis, IPEC
[0.37874723 0.37822824 0.37788626 0.37757223 0.377136   0.37642502
 0.37527073]


In [11]:
# `result` was never defined in this notebook; the recorded output matches the
# large-grid AAM results, so read them from aam_large_result explicitly.
print("AAM, ICH, concordance")
print(aam_large_result[0]["ich"])


AAM, ICH, concordance
[0.56498838 0.56818799 0.57070117 0.57044894 0.57171012 0.57225793]


In [12]:
# `result` was never defined in this notebook; the recorded output matches the
# large-grid AAM results, so read them from aam_large_result explicitly.
print("AAM, ICH, IPEC")
print(aam_large_result[1]["ich"])


AAM, ICH, IPEC
[0.29307663 0.2902858  0.2878678  0.28573394 0.28390186 0.2808304 ]


In [13]:
# `result` was never defined in this notebook; the recorded output matches the
# large-grid AAM results, so read them from aam_large_result explicitly.
print("AAM, pancreatitis, concordance")
print(aam_large_result[0]["pancreatitis"])


AAM, pancreatitis, concordance
[0.51088408 0.51053632 0.50974098 0.51029846 0.51018038 0.51051034]


In [14]:
# `result` was never defined in this notebook; the recorded output has six
# values, the size of the large AAM grid, so read them from aam_large_result
# explicitly.
print("AAM, pancreatitis, IPEC")
print(aam_large_result[1]["pancreatitis"])


AAM, pancreatitis, IPEC
[0.37526597 0.37451314 0.37386782 0.37324159 0.37266741 0.37170037]
