In [1]:
from __future__ import print_function, division
%load_ext autoreload

In [2]:
%autoreload

import copy, math, os, pickle, time, pandas as pd, numpy as np, scipy.stats as ss

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score

import torch, torch.utils.data as utils, torch.nn as nn, torch.nn.functional as F, torch.optim as optim
from torch.autograd import Variable
from torch.nn.parameter import Parameter


from src.model.mmd_grud_utils import *
from utils.eda_functions import (load_from_pickle, save_to_pickle)
from config import *

In [3]:
BLINDED = False
RANDOM = 0

In [4]:
X_train = load_from_pickle(os.path.join(DATA_DIR, 'X_train.pkl'))
X_dev = load_from_pickle(os.path.join(DATA_DIR, 'X_dev.pkl'))
X_test = load_from_pickle(os.path.join(DATA_DIR, 'X_test.pkl'))

Ys_train = load_from_pickle(os.path.join(DATA_DIR, 'Y_train.pkl'))
Ys_dev = load_from_pickle(os.path.join(DATA_DIR, 'Y_dev.pkl'))
Ys_test = load_from_pickle(os.path.join(DATA_DIR, 'Y_test.pkl'))

In [6]:
GAP_TIME          = 6  # In hours
WINDOW_SIZE       = 24 # In hours
SEED              = 1
ID_COLS           = ['subject_id', 'hadm_id', 'icustay_id']
GPU               = '2'

os.environ['CUDA_VISIBLE_DEVICES'] = GPU
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1784663e5b0>

In [7]:
class DictDist():
    def __init__(self, dict_of_rvs): self.dict_of_rvs = dict_of_rvs
    def rvs(self, n):
        a = {k: v.rvs(n) for k, v in self.dict_of_rvs.items()}
        out = []
        for i in range(n): out.append({k: vs[i] for k, vs in a.items()})
        return out

class Choice():
    def __init__(self, options): self.options = options
    def rvs(self, n): return [self.options[i] for i in ss.randint(0, len(self.options)).rvs(n)]

In [6]:
print(X_train.info())
print(X_train.columns)
print(X_train.head())

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 402240 entries, (3, 211552, 145834, 0) to (99995, 229633, 137810, 23)
Columns: 301 entries, absolute_time to nivdurations
dtypes: float64(267), int64(16), uint8(18)
memory usage: 880.3 MB
None
Index(['absolute_time', 'albumin_mask', 'albumin_mean',
       'albumin_time_since_measured', 'albumin ascites_mask',
       'albumin ascites_mean', 'albumin ascites_time_since_measured',
       'albumin pleural_mask', 'albumin pleural_mean',
       'albumin pleural_time_since_measured',
       ...
       'dopamine', 'epinephrine', 'isuprel', 'milrinone', 'norepinephrine',
       'phenylephrine', 'vasopressin', 'colloid_bolus', 'crystalloid_bolus',
       'nivdurations'],
      dtype='object', length=301)
                                        absolute_time  albumin_mask  \
subject_id icustay_id hadm_id hours_in                                
3          211552     145834  0                    19           1.0   
                              1  

In [7]:
print(Ys_train.info())
print(Ys_train.columns)
print(Ys_train.head())

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 16760 entries, (3, 145834, 211552) to (99995, 137810, 229633)
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   mort_hosp  16760 non-null  int64
 1   mort_icu   16760 non-null  int64
dtypes: int64(2)
memory usage: 5.0 MB
None
Index(['mort_hosp', 'mort_icu'], dtype='object')
                               mort_hosp  mort_icu
subject_id hadm_id icustay_id                     
3          145834  211552              0         0
4          185777  294638              0         0
6          107064  228232              0         0
9          150750  220597              1         1
11         194540  229441              0         0


### Task Prediction

#### Hyperparameters

In [10]:
N = 10

GRU_D_dist = DictDist({
    'cell_size': ss.randint(50, 75),
    'hidden_size': ss.randint(65, 95),
    'learning_rate': ss.uniform(2e-3, 1e-1),
    'num_epochs': ss.randint(15, 150),
    'patience': ss.randint(3, 7),
    'batch_size': ss.randint(35, 65),
    'early_stop_frac': ss.uniform(0.05, 0.1),
    'seed': ss.randint(1, 10000),
})
np.random.seed(SEED)
GRU_D_hyperparams_list = GRU_D_dist.rvs(N)

# with open('../src/model/baselines_gru-d.pkl', mode='rb') as f: results = pickle.load(f)

EOFError: Ran out of input

In [12]:
results = {}

In [14]:
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,absolute_time,albumin_mask,albumin_mean,albumin_time_since_measured,albumin ascites_mask,albumin ascites_mean,albumin ascites_time_since_measured,albumin pleural_mask,albumin pleural_mean,albumin pleural_time_since_measured,...,dopamine,epinephrine,isuprel,milrinone,norepinephrine,phenylephrine,vasopressin,colloid_bolus,crystalloid_bolus,nivdurations
subject_id,icustay_id,hadm_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
3,211552,145834,0,19,1.0,0.170213,-0.966384,0.0,0.31033,-1.186357,0.0,0.473643,-1.115815,...,0,0,0,0,0,0,0,0,0,0
3,211552,145834,1,20,0.0,0.170213,-0.947027,0.0,0.31033,-1.186357,0.0,0.473643,-1.115815,...,1,0,0,0,0,1,0,0,0,0
3,211552,145834,2,21,0.0,0.170213,-0.927670,0.0,0.31033,-1.186357,0.0,0.473643,-1.115815,...,1,0,0,0,0,1,0,0,0,0
3,211552,145834,3,22,0.0,0.170213,-0.908314,0.0,0.31033,-1.186357,0.0,0.473643,-1.115815,...,0,0,0,0,0,1,0,0,0,0
3,211552,145834,4,23,0.0,0.170213,-0.888957,0.0,0.31033,-1.186357,0.0,0.473643,-1.115815,...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,229633,137810,19,8,0.0,0.461151,0.272453,0.0,0.31033,1.652568,0.0,0.473643,1.340010,...,0,0,0,0,0,0,0,0,0,1
99995,229633,137810,20,9,0.0,0.461151,0.291810,0.0,0.31033,1.652722,0.0,0.473643,1.340146,...,0,0,0,0,0,0,0,0,0,1
99995,229633,137810,21,10,0.0,0.461151,0.311167,0.0,0.31033,1.652876,0.0,0.473643,1.340282,...,0,0,0,0,0,0,0,0,0,1
99995,229633,137810,22,11,0.0,0.461151,0.330523,0.0,0.31033,1.653031,0.0,0.473643,1.340417,...,0,0,0,0,0,0,0,0,0,1


#### GRU-D

In [13]:
model_name       = 'GRU-D'
tasks             = ['mort_hosp', 'mort_icu']
hyperparams_list = GRU_D_hyperparams_list
RERUN            = False

if model_name not in results: results[model_name] = {}

for t in tasks:
    if t not in results[model_name]: results[model_name][t] = {}
    for n, X_train, X_dev, X_test in (('full_X', X_train, X_dev, X_test),):
        print("Running model %s on target %s with representation %s" % (model_name, t, n))
        X_mean = np.nanmean(
            to_3D_tensor(
                X_train.loc[:, pd.IndexSlice[:, 'mean']] *
                np.where((X_train.loc[:, pd.IndexSlice[:, 'mask']] == 1).values, 1, np.NaN)
            ),
            axis=0, keepdims=True
        ).transpose([0, 2, 1])
        base_params = {'X_mean': X_mean, 'output_last': True, 'input_size': X_mean.shape[2]}

        if n in results[model_name][t]:
            if not RERUN:
                print("Final results for model %s on target %s with representation %s" % (model_name, t, n))
                print(results[model_name][t][n])
                continue
            best_s, best_hyperparams = results[model_name][t][n][-1], results[model_name][t][n][1]
            print("Loading best hyperparams", best_hyperparams)
        else:
            best_s, best_hyperparams = -np.Inf, None
            for i, hyperparams in enumerate(hyperparams_list):
                print("On sample %d / %d (hyperparams = %s)" % (i+1, len(hyperparams_list), repr((hyperparams))))

                early_stop_frac,batch_size,seed = [hyperparams[k] for k in ('early_stop_frac','batch_size','seed')]

                np.random.seed(seed)
                all_train_subjects = list(
                    np.random.permutation(Ys_train.index.get_level_values('subject_id').values)
                )
                N_early_stop        = int(len(all_train_subjects) * early_stop_frac)
                train_subjects      = all_train_subjects[:-N_early_stop]
                early_stop_subjects = all_train_subjects[-N_early_stop:]
                X_train_obs         = X_train[X_train.index.get_level_values('subject_id').isin(train_subjects)]
                Ys_train_obs        = Ys_train[Ys_train.index.get_level_values('subject_id').isin(train_subjects)]

                X_train_early_stop  = X_train[X_train.index.get_level_values('subject_id').isin(early_stop_subjects)]
                Ys_train_early_stop = Ys_train[
                    Ys_train.index.get_level_values('subject_id').isin(early_stop_subjects)
                ]

                train_dataloader      = prepare_dataloader(X_train_obs, Ys_train_obs[t], batch_size=batch_size)
                early_stop_dataloader = prepare_dataloader(
                    X_train_early_stop, Ys_train_early_stop[t], batch_size=batch_size
                )
                dev_dataloader        = prepare_dataloader(X_dev, Ys_dev[t], batch_size=batch_size)
                test_dataloader       = prepare_dataloader(X_test, Ys_test[t], batch_size=batch_size)

                model_hyperparams = copy.copy(base_params)
                model_hyperparams.update(
                    {k: v for k, v in hyperparams.items() if k in ('cell_size', 'hidden_size', 'batch_size')}
                )
                model = GRUD(**model_hyperparams)

                best_model, _ = Train_Model(
                    model, train_dataloader, early_stop_dataloader,
                    **{k: v for k, v in hyperparams.items() if k in (
                        'num_epochs', 'patience', 'learning_rate', 'batch_size'
                    )}
                )

                probabilities_dev, labels_dev = predict_proba(best_model, dev_dataloader)
                probabilities_dev = np.concatenate(probabilities_dev)[:, 1]
                labels_dev        = np.concatenate(labels_dev)
                s = roc_auc_score(labels_dev, probabilities_dev)
                if s > best_s:
                    best_s, best_hyperparams = s, hyperparams
                    print("New Best Score: %.2f @ hyperparams = %s" % (100*best_s, repr((best_hyperparams))))

        ## Test
        np.random.seed(SEED)
        hyperparams = best_hyperparams # In case I forgot a replace below
        early_stop_frac,batch_size,seed = [best_hyperparams[k] for k in ('early_stop_frac','batch_size','seed')]

        X_train_concat, Ys_train_concat = pd.concat((X_train, X_dev)), pd.concat((Ys_train, Ys_dev))

        all_train_subjects = list(np.random.permutation(Ys_train_concat.index.get_level_values('subject_id').values))
        N_early_stop = int(len(all_train_subjects) * early_stop_frac)
        train_subjects, early_stop_subjects = all_train_subjects[:-N_early_stop], all_train_subjects[-N_early_stop:]
        X_train_obs         = X_train_concat[X_train_concat.index.get_level_values('subject_id').isin(train_subjects)]
        Ys_train_obs        = Ys_train_concat[Ys_train_concat.index.get_level_values('subject_id').isin(train_subjects)]

        X_train_early_stop  = X_train_concat[X_train_concat.index.get_level_values('subject_id').isin(early_stop_subjects)]
        Ys_train_early_stop = Ys_train_concat[Ys_train_concat.index.get_level_values('subject_id').isin(early_stop_subjects)]

        train_dataloader      = prepare_dataloader(X_train_obs, Ys_train_obs[t], batch_size=batch_size)
        early_stop_dataloader = prepare_dataloader(X_train_early_stop, Ys_train_early_stop[t], batch_size=batch_size)
        test_dataloader       = prepare_dataloader(X_test, Ys_test[t], batch_size=batch_size)

        model_hyperparams = copy.copy(base_params)
        model_hyperparams.update(
            {k: v for k, v in best_hyperparams.items() if k in ('cell_size', 'hidden_size', 'batch_size')}
        )
        model = GRUD(**model_hyperparams)

        best_model, (losses_train, losses_early_stop, losses_epochs_train, losses_epochs_early_stop) = Train_Model(
            model, train_dataloader, early_stop_dataloader,
            **{k: v for k, v in best_hyperparams.items() if k in (
                'num_epochs', 'patience', 'learning_rate', 'batch_size'
            )}
        )

        probabilities_test, labels_test = predict_proba(best_model, test_dataloader)

        y_score = np.concatenate(probabilities_test)[:, 1]
        y_pred  = np.argmax(probabilities_test)
        y_true  = np.concatenate(labels_test)

        auc   = roc_auc_score(y_true, y_score)
        auprc = average_precision_score(y_true, y_score)
        acc   = accuracy_score(y_true, y_pred)
        F1    = f1_score(y_true, y_pred)
        print("Final results for model %s on target %s with representation %s" % (model_name, t, n))
        print(auc, auprc, acc, F1)

        results[model_name][t][n] = None, best_hyperparams, auc, auprc, acc, F1, best_s
        with open('../src/model/baselines_gru-d.pkl', mode='wb') as f: pickle.dump(results, f)

Running model GRU-D on target mort_hosp with representation full_X


TypeError: unhashable type: 'slice'