In [1]:
import pandas as pd
import numpy as np
import os

ex_data = pd.read_csv(os.path.join('..', '..', 'data', 'tidy_Stroke_Vital_Sign.csv'))

ex_data['admission_date'] = ex_data['admission_date'].astype(int).astype(str)
in_date = pd.to_datetime(ex_data['admission_date'], format='%Y/%m/%d', errors='coerce')

ex_data['discharge_date'] = ex_data['discharge_date'].astype(int).astype(str)
out_date = pd.to_datetime(ex_data['discharge_date'], format='%Y/%m/%d', errors='coerce')

day_diff = out_date - in_date
ex_data['duration'] = day_diff.dt.days


print(ex_data.head())

                                        UID  \
0  0000DF74B811298E7996D362F838C50350D17CA8   
1  0003838B85827FE9BD3FC99F3E899242013CBEBA   
2  0008066764A0E935659492B10A486D4EBE09AF08   
3  000EC867960E20845A09FCC5FD412B0E28825B7C   
4  002257471042A6CF612F151CD78C68812E1BC87E   

                                Hospital_ID  Sex   Age   AF   DM  HTN  \
0  39FAF362A02156E2E9692C7FF1143C6719B39477  1.0  62.0  0.0  1.0  0.0   
1  AF62D5193B84B1C154DEAD3D5ECA2D0B9A34107D  1.0  81.0  1.0  0.0  1.0   
2  DC6E8FE8E1CD827B38415DF485BDA51791A9F0ED  0.0  80.0  0.0  1.0  0.0   
3  C6B3DC2C6B2831B2762B6238756438365E036FEA  1.0  82.0  0.0  1.0  1.0   
4  41F75C287201660F2A29F575E88CE6525F42AD59  1.0  89.0  0.0  0.0  1.0   

   Hyperlipidemia  CHF  Smoking  ...  RRSD G     RR CV  RRCV G  Mortality  \
0             0.0  0.0      1.0  ...     2.0  0.078459     2.0        0.0   
1             1.0  0.0      0.0  ...     3.0  0.146458     4.0        0.0   
2             1.0  0.0      0.0  ...     2.0  0

In [8]:
y_data = ex_data[['SurvivalWeeks']]
X_data = ex_data.drop(['UID', 'Hospital_ID', 'admission_date', 'discharge_date',
                       'Mortality', 'CVDeath', 'death_date', 'SurvivalWeeks'], axis=1)

categorical_columns = ['Sex', 'AF', 'DM', 'HTN', 'Hyperlipidemia', 'CHF', 'Smoking', 'Cancer before adm', 'Foley', 'ICU']
numerical_columns =  np.setdiff1d(X_data.columns, categorical_columns)


# Regression-Week

In [9]:
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesRegressor

# one-hot
X_data_one_hot = pd.get_dummies(X_data, columns=categorical_columns)
print(X_data_one_hot.shape, y_data.shape)
# for i in range(10):
#     X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.1, random_state=i, stratify=y_data)
for train_index, test_index in KFold(n_splits=10, random_state=42, shuffle=True).split(X_data_one_hot):
    X_train, X_test = X_data_one_hot.iloc[train_index], X_data_one_hot.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]
    # scaling
    # scaler = preprocessing.MinMaxScaler()
    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    reg = ExtraTreesRegressor(n_estimators=200, random_state=0).fit(X_train, y_train.values.ravel())
    print('R-squ', reg.score(X_test, y_test.values.ravel()))

(13623, 54) (13623, 1)
R-squ 0.1553380551626291
R-squ 0.16954192015334246
R-squ 0.15828824464905744
R-squ 0.15792394152899714
R-squ 0.14911167446684004
R-squ 0.17798833697140592
R-squ 0.1302273703165614
R-squ 0.17127899396122148
R-squ 0.1704754474780824
R-squ 0.16150192136248487


In [10]:
# dummy
X_data_dummy = pd.get_dummies(X_data, columns=categorical_columns, drop_first=True)
print(X_data_dummy.shape, y_data.shape)

from sklearn import linear_model
for train_index, test_index in KFold(n_splits=10, random_state=42, shuffle=True).split(X_data_one_hot):
    X_train, X_test = X_data_one_hot.iloc[train_index], X_data_one_hot.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]
    # scaling
    # scaler = preprocessing.MinMaxScaler()
    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    clf = linear_model.Lasso(alpha=0.1).fit(X_train, y_train.values.ravel())
    print('R-squ', clf.score(X_test, y_test.values.ravel()))

(13623, 44) (13623, 1)
R-squ 0.1332115090894591
R-squ 0.1618281236564043
R-squ 0.13906027670111476
R-squ 0.1493874504154985
R-squ 0.13785679159707376
R-squ 0.17231037133872296
R-squ 0.15349416805873684
R-squ 0.14516059459780783
R-squ 0.14128846315715116
R-squ 0.15892777481643128


# Regression-Month

In [11]:
y_data_m = (y_data/4).round(0)
y_data_m

Unnamed: 0,SurvivalWeeks
0,90.0
1,37.0
2,46.0
3,25.0
4,1.0
...,...
13618,1.0
13619,55.0
13620,23.0
13621,62.0


In [12]:
# one-hot
X_data_one_hot = pd.get_dummies(X_data, columns=categorical_columns)
# for i in range(10):
#     X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.1, random_state=i, stratify=y_data)
for train_index, test_index in KFold(n_splits=10, random_state=42, shuffle=True).split(X_data_one_hot):
    X_train, X_test = X_data_one_hot.iloc[train_index], X_data_one_hot.iloc[test_index]
    y_train, y_test = y_data_m.iloc[train_index], y_data_m.iloc[test_index]
    # scaling
    # scaler = preprocessing.MinMaxScaler()
    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    clf = linear_model.Lasso(alpha=0.1).fit(X_train, y_train.values.ravel())
    print('R-squ', clf.score(X_test, y_test.values.ravel()))

R-squ 0.12855039181390115
R-squ 0.16003132821103772
R-squ 0.13676661669926005
R-squ 0.1488674762650497
R-squ 0.13755439919647683
R-squ 0.1676386340543109
R-squ 0.15209491831047606
R-squ 0.14198010627381197
R-squ 0.14236784831065674
R-squ 0.15374807920950417


# 6-Month Mortality (Binary classification)

In [13]:
y_data_od = (y_data<24).astype(int)
print(y_data_od)
print(y_data_od.groupby(['SurvivalWeeks']).size())

       SurvivalWeeks
0                  0
1                  0
2                  0
3                  0
4                  1
...              ...
13618              1
13619              0
13620              0
13621              0
13622              0

[13623 rows x 1 columns]
SurvivalWeeks
0    12834
1      789
dtype: int64


In [14]:
from sklearn.ensemble import ExtraTreesClassifier
from imblearn import over_sampling
from sklearn.metrics import auc, roc_curve

all_auroc = []
for train_index, test_index in KFold(n_splits=10, random_state=42, shuffle=True).split(X_data_one_hot):
    X_train, X_test = X_data_one_hot.iloc[train_index], X_data_one_hot.iloc[test_index]
    y_train, y_test = y_data_od.iloc[train_index], y_data_od.iloc[test_index]
    
    # scaling
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # over-sampling
    # print('before', y_train.groupby(['SurvivalWeeks']).size())
    sm = over_sampling.SVMSMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    # print('after', y_train.groupby(['SurvivalWeeks']).size())

    # define the model
    model = ExtraTreesClassifier(n_estimators=250,  random_state=42)
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred[:, 1])
    auroc = auc(fpr, tpr)
    print('auc', auroc)
    all_auroc.append(auroc)
print(np.mean(all_auroc), np.std(all_auroc))

auc 0.8643354668674699
auc 0.9092935414245392
auc 0.8796466691203534
auc 0.8910999745659731
auc 0.8675871847236494
auc 0.8749232626667758
auc 0.8746925238380395
auc 0.8541545162936501
auc 0.8998523246951219
auc 0.8600852272727272
0.8775670691468298 0.016835541170404447
