In [1]:
import torch
from torch import nn
import torch.utils.data as Data
import torch.nn.functional as F
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import KFold
from sklearn import preprocessing
from imblearn import over_sampling
from sklearn.metrics import auc, roc_curve
import matplotlib.pyplot as plt

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Executing the model on :", device)

Executing the model on : cuda:0


In [3]:
ex_data = pd.read_csv(os.path.join('..', '..', 'data', 'tidy_Stroke_Vital_Sign.csv'))

ex_data['admission_date'] = ex_data['admission_date'].astype(int).astype(str)
in_date = pd.to_datetime(ex_data['admission_date'], format='%Y/%m/%d', errors='coerce')


ex_data['discharge_date'] = ex_data['discharge_date'].astype(int).astype(str)
out_date = pd.to_datetime(ex_data['discharge_date'], format='%Y/%m/%d', errors='coerce')

day_diff = out_date - in_date
ex_data['duration'] = day_diff.dt.days

y_data = ex_data[['SurvivalWeeks']]
X_data = ex_data.drop(['UID', 'Hospital_ID', 'admission_date', 'discharge_date',
                       'Mortality', 'CVDeath', 'death_date', 'SurvivalWeeks'], axis=1)

categorical_columns = ['Sex', 'AF', 'DM', 'HTN', 'CHF', 'Smoking', 'Cancer before adm']
numerical_columns = np.setdiff1d(X_data.columns, categorical_columns)

# one-hot
X_data_one_hot = pd.get_dummies(X_data, columns=categorical_columns)
y_data_od = (y_data < 4).astype(int)

## Basic NN

In [4]:
class DNN (nn.Module):
    def __init__(self, input_len):
        super(DNN, self).__init__()
        self.fc1 = nn.Linear(input_len, 15)
        self.fc2 = nn.Linear(15, 7)
        self.fc3 = nn.Linear(7, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

In [5]:
# Settings
epochs = 10
batch_size = 128
lr = 5e-4

## with vital sign

In [6]:
model = DNN(input_len=X_data_one_hot.shape[1]).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_function = nn.MSELoss()

all_auroc = []
for train_index, test_index in KFold(n_splits=10, random_state=42, shuffle=True).split(X_data_one_hot):
    X_train, X_test = X_data_one_hot.iloc[train_index], X_data_one_hot.iloc[test_index]
    y_train, y_test = y_data_od.iloc[train_index], y_data_od.iloc[test_index]

    # scaling
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # over-sampling
    # print('before', y_train.groupby(['SurvivalWeeks']).size())
    sm = over_sampling.SVMSMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    # print('after', y_train.groupby(['SurvivalWeeks']).size())

    # DataLoader
    train_xt = torch.from_numpy(X_train.astype(np.float32)).cuda(device)
    train_yt = torch.from_numpy(y_train.values.astype(np.float32)).cuda(device)
    train_data = Data.TensorDataset(train_xt, train_yt)
    train_loader = Data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

    test_xt = torch.from_numpy(X_test.astype(np.float32)).cuda(device)
    test_yt = torch.from_numpy(y_test.values.astype(np.float32)).cuda(device)

    train_loss_all = []

    model.train()
    for epoch in range(epochs):
        for step, (inputs, labels) in enumerate(train_loader):
            output = model(inputs)
            train_loss = loss_function(output, labels)
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
            train_loss_all.append(train_loss.item())
#         print("train epoch %d, loss %s:" % (epoch + 1, train_loss.item()))
    print('training complete')

    model.eval()
    y_pred = model(test_xt).cpu().data.numpy()
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    auroc = auc(fpr, tpr)
    print('auc', auroc)
    all_auroc.append(auroc)
    # plt.figure()
    # plt.plot(train_loss_all, "g-")
    # plt.title("DNN: Train loss per iteration")
    # plt.show()
print(np.mean(all_auroc), np.std(all_auroc))

training complete
auc 0.9257180650037793
training complete
auc 0.9412294598204579
training complete
auc 0.904089969947408
training complete
auc 0.9728652784879313
training complete
auc 0.9785633107340204
training complete
auc 0.975155458314141
training complete
auc 0.9705636826204485
training complete
auc 0.9503148021639186
training complete
auc 0.9613567388605325
training complete
auc 0.9431409576532064
0.9522997723605844 0.023004006083111244


## with vital sign / without ICU

In [7]:
X_data_one_hot_no_icu = X_data_one_hot.drop('ICU', axis=1)

model = DNN(input_len=X_data_one_hot_no_icu.shape[1]).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_function = nn.MSELoss()

all_auroc = []
for train_index, test_index in KFold(n_splits=10, random_state=42, shuffle=True).split(X_data_one_hot_no_icu):
    X_train, X_test = X_data_one_hot_no_icu.iloc[train_index], X_data_one_hot_no_icu.iloc[test_index]
    y_train, y_test = y_data_od.iloc[train_index], y_data_od.iloc[test_index]

    # scaling
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # over-sampling
    # print('before', y_train.groupby(['SurvivalWeeks']).size())
    sm = over_sampling.SVMSMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    # print('after', y_train.groupby(['SurvivalWeeks']).size())

    # DataLoader
    train_xt = torch.from_numpy(X_train.astype(np.float32)).cuda(device)
    train_yt = torch.from_numpy(y_train.values.astype(np.float32)).cuda(device)
    train_data = Data.TensorDataset(train_xt, train_yt)
    train_loader = Data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

    test_xt = torch.from_numpy(X_test.astype(np.float32)).cuda(device)
    test_yt = torch.from_numpy(y_test.values.astype(np.float32)).cuda(device)

    train_loss_all = []

    model.train()
    for epoch in range(epochs):
        for step, (inputs, labels) in enumerate(train_loader):
            output = model(inputs)
            train_loss = loss_function(output, labels)
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
            train_loss_all.append(train_loss.item())
#         print("train epoch %d, loss %s:" % (epoch + 1, train_loss.item()))
    print('training complete')

    model.eval()
    y_pred = model(test_xt).cpu().data.numpy()
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    auroc = auc(fpr, tpr)
    print('auc', auroc)
    all_auroc.append(auroc)
    # plt.figure()
    # plt.plot(train_loss_all, "g-")
    # plt.title("DNN: Train loss per iteration")
    # plt.show()
print(np.mean(all_auroc), np.std(all_auroc))

training complete
auc 0.9126417233560091
training complete
auc 0.9374708787624638
training complete
auc 0.9375469571750563
training complete
auc 0.9784720630498792
training complete
auc 0.967002390563154
training complete
auc 0.9666052510363887
training complete
auc 0.964521386547941
training complete
auc 0.937851959897343
training complete
auc 0.9591150503517726
training complete
auc 0.9523501565046434
0.9513577817244651 0.018739768465919644


## without vital sign

In [8]:
X_data_one_hot_no_vital = X_data_one_hot.drop(['Mean HR', 'MeanHR G', 'HR SD', 'HRSD G', 'HR CV', 'HRCV G', 'Mean SBP',
                                               'Mean SBP G', 'SBP SD', 'SBPSD G', 'SBP CV', 'SBPCV G', 'Mean DBP',
                                               'MeanDBP G', 'DBP SD', 'DBPSD G', 'DBP CV', 'DBPCV G', 'Mean RR',
                                               'MeanRR G', 'RR SD', 'RRSD G', 'RR CV'], axis=1)

model = DNN(input_len=X_data_one_hot_no_vital.shape[1]).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_function = nn.MSELoss()

all_auroc = []
for train_index, test_index in KFold(n_splits=10, random_state=42, shuffle=True).split(X_data_one_hot_no_vital):
    X_train, X_test = X_data_one_hot_no_vital.iloc[train_index], X_data_one_hot_no_vital.iloc[test_index]
    y_train, y_test = y_data_od.iloc[train_index], y_data_od.iloc[test_index]

    # scaling
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # over-sampling
    # print('before', y_train.groupby(['SurvivalWeeks']).size())
    sm = over_sampling.SVMSMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    # print('after', y_train.groupby(['SurvivalWeeks']).size())

    # DataLoader
    train_xt = torch.from_numpy(X_train.astype(np.float32)).cuda(device)
    train_yt = torch.from_numpy(y_train.values.astype(np.float32)).cuda(device)
    train_data = Data.TensorDataset(train_xt, train_yt)
    train_loader = Data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

    test_xt = torch.from_numpy(X_test.astype(np.float32)).cuda(device)
    test_yt = torch.from_numpy(y_test.values.astype(np.float32)).cuda(device)

    train_loss_all = []

    model.train()
    for epoch in range(epochs):
        for step, (inputs, labels) in enumerate(train_loader):
            output = model(inputs)
            train_loss = loss_function(output, labels)
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
            train_loss_all.append(train_loss.item())
#         print("train epoch %d, loss %s:" % (epoch + 1, train_loss.item()))
    print('training complete')

    model.eval()
    y_pred = model(test_xt).cpu().data.numpy()
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    auroc = auc(fpr, tpr)
    print('auc', auroc)
    all_auroc.append(auroc)
    # plt.figure()
    # plt.plot(train_loss_all, "g-")
    # plt.title("DNN: Train loss per iteration")
    # plt.show()
print(np.mean(all_auroc), np.std(all_auroc))

training complete
auc 0.9377739984882841
training complete
auc 0.9244400956729725
training complete
auc 0.9169562359128475
training complete
auc 0.9740818449494861
training complete
auc 0.9725673080691304
training complete
auc 0.9713553661906955
training complete
auc 0.9578828112577517
training complete
auc 0.9320175033695832
training complete
auc 0.9743412884535798
training complete
auc 0.9638357865328401
0.952525223889717 0.021320443560194764


## only vital sign

In [9]:
X_data_one_hot_only_vital = X_data_one_hot[['Mean HR', 'MeanHR G', 'HR SD', 'HRSD G', 'HR CV', 'HRCV G', 'Mean SBP',
                                       'Mean SBP G', 'SBP SD', 'SBPSD G', 'SBP CV', 'SBPCV G', 'Mean DBP',
                                       'MeanDBP G', 'DBP SD', 'DBPSD G', 'DBP CV', 'DBPCV G', 'Mean RR',
                                       'MeanRR G', 'RR SD', 'RRSD G', 'RR CV']]

model = DNN(input_len=X_data_one_hot_only_vital.shape[1]).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_function = nn.MSELoss()

all_auroc = []
for train_index, test_index in KFold(n_splits=10, random_state=42, shuffle=True).split(X_data_one_hot_only_vital):
    X_train, X_test = X_data_one_hot_only_vital.iloc[train_index], X_data_one_hot_only_vital.iloc[test_index]
    y_train, y_test = y_data_od.iloc[train_index], y_data_od.iloc[test_index]

    # scaling
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # over-sampling
    # print('before', y_train.groupby(['SurvivalWeeks']).size())
    sm = over_sampling.SVMSMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    # print('after', y_train.groupby(['SurvivalWeeks']).size())

    # DataLoader
    train_xt = torch.from_numpy(X_train.astype(np.float32)).cuda(device)
    train_yt = torch.from_numpy(y_train.values.astype(np.float32)).cuda(device)
    train_data = Data.TensorDataset(train_xt, train_yt)
    train_loader = Data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

    test_xt = torch.from_numpy(X_test.astype(np.float32)).cuda(device)
    test_yt = torch.from_numpy(y_test.values.astype(np.float32)).cuda(device)

    train_loss_all = []

    model.train()
    for epoch in range(epochs):
        for step, (inputs, labels) in enumerate(train_loader):
            output = model(inputs)
            train_loss = loss_function(output, labels)
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
            train_loss_all.append(train_loss.item())
#         print("train epoch %d, loss %s:" % (epoch + 1, train_loss.item()))
    print('training complete')

    model.eval()
    y_pred = model(test_xt).cpu().data.numpy()
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    auroc = auc(fpr, tpr)
    print('auc', auroc)
    all_auroc.append(auroc)
    # plt.figure()
    # plt.plot(train_loss_all, "g-")
    # plt.title("DNN: Train loss per iteration")
    # plt.show()
print(np.mean(all_auroc), np.std(all_auroc))

training complete
auc 0.8748866213151927
training complete
auc 0.907215854378281
training complete
auc 0.8727695341848234
training complete
auc 0.9245023537916321
training complete
auc 0.8365795352118196
training complete
auc 0.8733302625518194
training complete
auc 0.8815391954205756
training complete
auc 0.9184837798415834
training complete
auc 0.8700682852807283
training complete
auc 0.8383216493778618
0.8797697071354318 0.02836895241864546
