In [1]:
import pandas as pd
import numpy as np
import os

ex_data = pd.read_csv(os.path.join('..', 'data', 'ex_data.csv'))

ex_data['admin_date'] = ex_data['admin_date'].astype(int).astype(str)
in_date = pd.to_datetime(ex_data['admin_date'], format='%Y/%m/%d', errors='coerce')

ex_data['discharge_date'] = ex_data['discharge_date'].astype(int).astype(str)
out_date = pd.to_datetime(ex_data['discharge_date'], format='%Y/%m/%d', errors='coerce')

day_diff = out_date - in_date
ex_data['duration'] = day_diff.dt.days


print(ex_data.head())

                                         ID  \
0  0000DF74B811298E7996D362F838C50350D17CA8   
1  0003838B85827FE9BD3FC99F3E899242013CBEBA   
2  0008066764A0E935659492B10A486D4EBE09AF08   
3  1B5DB5B8DF80AD37539AFB7F3D0A19154CC4B3EA   
4  000EC867960E20845A09FCC5FD412B0E28825B7C   

                                     CHT_NO admin_date discharge_date  Sex  \
0  39FAF362A02156E2E9692C7FF1143C6719B39477   20120131       20120206  1.0   
1  AF62D5193B84B1C154DEAD3D5ECA2D0B9A34107D   20160308       20160316  1.0   
2  DC6E8FE8E1CD827B38415DF485BDA51791A9F0ED   20150622       20150627  0.0   
3  C6A772EC2E08E5F456F148D9E89BA0735D97E27C   20160607       20160615  1.0   
4  C6B3DC2C6B2831B2762B6238756438365E036FEA   20170204       20170210  1.0   

    Age   AF   DM  HTN  Dyslipidemia  ...  MeanRR G  RR SD  RRSD G  RR CV  \
0  62.0  0.0  1.0  0.0           0.0  ...       2.0    1.3     2.0   0.08   
1  81.0  1.0  0.0  1.0           1.0  ...       2.0    2.4     3.0   0.15   
2  80.0  0.0  1.0

In [2]:
y_data = ex_data[['SurvivalWeeks']]
X_data = ex_data.drop(['ID', 'CHT_NO', 'admin_date', 'discharge_date',
                       'AllMortality', 'CVDeath  ', 'Death Date', 'SurvivalWeeks'], axis=1)

categorical_columns = ['Sex', 'AF', 'DM', 'HTN', 'Dyslipidemia', 'CHF', 'Smoking', 'Cancer before adm']
numerical_columns =  np.setdiff1d(X_data.columns, categorical_columns)


# Regression-Week

In [3]:
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesRegressor

# one-hot
X_data_one_hot = pd.get_dummies(X_data, columns=categorical_columns)
print(X_data_one_hot.shape, y_data.shape)
# for i in range(10):
#     X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.1, random_state=i, stratify=y_data)
for train_index, test_index in KFold(n_splits=10, random_state=42, shuffle=True).split(X_data_one_hot):
    X_train, X_test = X_data_one_hot.iloc[train_index], X_data_one_hot.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]
    # scaling
    # scaler = preprocessing.MinMaxScaler()
    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    reg = ExtraTreesRegressor(n_estimators=200, random_state=0).fit(X_train, y_train.values.ravel())
    print('R-squ', reg.score(X_test, y_test.values.ravel()))

(13244, 50) (13244, 1)
R-squ 0.1484765645386844
R-squ 0.11860279843772992
R-squ 0.11025565297853568
R-squ 0.13683069293158467
R-squ 0.1263900775891117
R-squ 0.10818910346175903
R-squ 0.07830673454536752
R-squ 0.11960401011451116
R-squ 0.1269539491262609
R-squ 0.11594733648771172


In [4]:
# dummy
X_data_dummy = pd.get_dummies(X_data, columns=categorical_columns, drop_first=True)
print(X_data_dummy.shape, y_data.shape)

from sklearn import linear_model
for train_index, test_index in KFold(n_splits=10, random_state=42, shuffle=True).split(X_data_one_hot):
    X_train, X_test = X_data_one_hot.iloc[train_index], X_data_one_hot.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]
    # scaling
    # scaler = preprocessing.MinMaxScaler()
    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    clf = linear_model.Lasso(alpha=0.1).fit(X_train, y_train.values.ravel())
    print('R-squ', clf.score(X_test, y_test.values.ravel()))

(13244, 42) (13244, 1)
R-squ 0.11137738108331763
R-squ 0.09244164380468889
R-squ 0.10276383902240005
R-squ 0.11980129623048041
R-squ 0.13100962408021632
R-squ 0.0975574765975612
R-squ 0.09532068482119982
R-squ 0.10722445803144387
R-squ 0.1131203977765145
R-squ 0.10825343956912048


# Regression-Month

In [5]:
y_data_m = (y_data/4).round(0)
y_data_m

Unnamed: 0,SurvivalWeeks
0,90.0
1,37.0
2,46.0
3,33.0
4,25.0
...,...
13239,6.0
13240,55.0
13241,23.0
13242,62.0


In [6]:
# one-hot
X_data_one_hot = pd.get_dummies(X_data, columns=categorical_columns)
# for i in range(10):
#     X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.1, random_state=i, stratify=y_data)
for train_index, test_index in KFold(n_splits=10, random_state=42, shuffle=True).split(X_data_one_hot):
    X_train, X_test = X_data_one_hot.iloc[train_index], X_data_one_hot.iloc[test_index]
    y_train, y_test = y_data_m.iloc[train_index], y_data_m.iloc[test_index]
    # scaling
    # scaler = preprocessing.MinMaxScaler()
    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    clf = linear_model.Lasso(alpha=0.1).fit(X_train, y_train.values.ravel())
    print('R-squ', clf.score(X_test, y_test.values.ravel()))

R-squ 0.10768038921043621
R-squ 0.09105650133900467
R-squ 0.10504955774132863
R-squ 0.11948521809943768
R-squ 0.12691236514332194
R-squ 0.09763268099439615
R-squ 0.0965080840097422
R-squ 0.11035254626269642
R-squ 0.11196797571086059
R-squ 0.10618049990025513


# 6-Month Mortality (Binary classification)

In [9]:
y_data_od = (y_data<24).astype(int)
print(y_data_od)
print(y_data_od.groupby(['SurvivalWeeks']).size())

       SurvivalWeeks
0                  0
1                  0
2                  0
3                  0
4                  0
...              ...
13239              0
13240              0
13241              0
13242              0
13243              0

[13244 rows x 1 columns]
SurvivalWeeks
0    12834
1      410
dtype: int64


In [10]:
from sklearn.ensemble import ExtraTreesClassifier
from imblearn import over_sampling
from sklearn.metrics import auc, roc_curve

all_auroc = []
for train_index, test_index in KFold(n_splits=10, random_state=42, shuffle=True).split(X_data_one_hot):
    X_train, X_test = X_data_one_hot.iloc[train_index], X_data_one_hot.iloc[test_index]
    y_train, y_test = y_data_od.iloc[train_index], y_data_od.iloc[test_index]
    
    # scaling
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # over-sampling
    # print('before', y_train.groupby(['SurvivalWeeks']).size())
    sm = over_sampling.SVMSMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    # print('after', y_train.groupby(['SurvivalWeeks']).size())

    # define the model
    model = ExtraTreesClassifier(n_estimators=250,  random_state=42)
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred[:, 1])
    auroc = auc(fpr, tpr)
    print('auc', auroc)
    all_auroc.append(auroc)
print(np.mean(all_auroc), np.std(all_auroc))

auc 0.8409791040344431
auc 0.8153142129992168
auc 0.7969055309646289
auc 0.790996988716758
auc 0.8362677364237427
auc 0.7625314916167144
auc 0.8036137710898312
auc 0.8347823347823347
auc 0.8217497100925802
auc 0.838242043300409
0.8141382924020659 0.02425518637866377
