In [1]:
from __future__ import print_function, division
%load_ext autoreload

In [None]:
%autoreload
from src.model.mmd_grud_utils import *
from utils.eda_functions import (load_from_pickle, save_to_pickle)
from config import *

In [None]:
import copy, math, os, pickle, time, pandas as pd, numpy as np, scipy.stats as ss

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score

import torch, torch.utils.data as utils, torch.nn as nn, torch.nn.functional as F, torch.optim as optim
from torch.autograd import Variable
from torch.nn.parameter import Parameter

In [None]:
# Commented-out HDF5 inputs; nothing in the visible cells reads them.
# DATA_FILEPATH     = '/scratch/mmd/mimic_data/final/grouping_5/all_hourly_data.h5'
# RAW_DATA_FILEPATH = '/scratch/mmd/mimic_data/final/nogrouping_5/all_hourly_data.h5'

# Time-window settings, both expressed in hours.
GAP_TIME, WINDOW_SIZE = 6, 24
# Base seed handed to both NumPy and PyTorch at the end of this cell.
SEED = 1
# Names of the identifier columns / index levels.
ID_COLS = ['subject_id', 'hadm_id', 'icustay_id']
# Exported verbatim as CUDA_VISIBLE_DEVICES.
GPU = '2'

# Set the visible device before seeding the two global RNGs.
os.environ['CUDA_VISIBLE_DEVICES'] = GPU
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
class DictDist():
    """Bundle of named distributions that is sampled as a list of dicts.

    `dict_of_rvs` maps a name to any object exposing `rvs(n)` that returns
    an indexable collection of `n` draws.
    """

    def __init__(self, dict_of_rvs):
        self.dict_of_rvs = dict_of_rvs

    def rvs(self, n):
        """Return `n` dicts, the i-th holding the i-th draw of every named distribution."""
        # Draw all n values per name first, in the mapping's iteration order.
        draws = {}
        for name, dist in self.dict_of_rvs.items():
            draws[name] = dist.rvs(n)
        # Then regroup column-wise draws into one dict per sample.
        return [{name: values[idx] for name, values in draws.items()} for idx in range(n)]
    
class Choice():
    """Sampler that picks uniformly at random from a fixed sequence of options."""

    def __init__(self, options):
        self.options = options

    def rvs(self, n):
        """Return a list of `n` options, chosen with replacement."""
        # scipy's randint excludes the upper bound, so indices stay within the sequence.
        picked = ss.randint(0, len(self.options)).rvs(n)
        return [self.options[idx] for idx in picked]

In [None]:
# Feature frames for the train / dev / test splits, loaded in that order.
vitals_train, vitals_dev, vitals_test = [
    load_from_pickle(os.path.join(DATA_DIR, fname))
    for fname in ('Vitals_train_gru.pkl', 'Vitals_dev_gru.pkl', 'Vitals_test_gru.pkl')
]

# Matching label frames; later cells index them by target name and by 'subject_id'.
Ys_train, Ys_dev, Ys_test = [
    load_from_pickle(os.path.join(DATA_DIR, fname))
    for fname in ('Y_train.pkl', 'Y_dev.pkl', 'Y_test.pkl')
]

### Task Prediction

#### Hyperparams

In [None]:
# Number of random hyperparameter configurations to draw for the search.
N = 10

# Random-search space: each value is a frozen scipy.stats distribution, sampled N times per key
# by DictDist.rvs in the key order written here.
# NOTE(review): scipy's randint(low, high) excludes `high`, and uniform(loc, scale) spans
# [loc, loc + scale] -- so 'learning_rate' covers 0.002-0.102 and 'early_stop_frac' covers
# 0.05-0.15. Confirm these are the intended ranges.
GRU_D_dist = DictDist({
    'cell_size': ss.randint(50, 75),
    'hidden_size': ss.randint(65, 95), 
    'learning_rate': ss.uniform(2e-3, 1e-1),
    'num_epochs': ss.randint(15, 150),
    'patience': ss.randint(3, 7),
    'batch_size': ss.randint(35, 65),
    'early_stop_frac': ss.uniform(0.05, 0.1),
    'seed': ss.randint(1, 10000),
})
# Re-seed NumPy's global RNG right before sampling so the draw starts from a fixed state.
np.random.seed(SEED)
GRU_D_hyperparams_list = GRU_D_dist.rvs(N)

# Optional: reload previously pickled results instead of starting from an empty dict.
# with open('/scratch/mmd/extraction_baselines_gru-d.pkl', mode='rb') as f: results = pickle.load(f)

In [None]:
# Results store, filled by the GRU-D cell as results[model_name][target][representation].
# Re-running this cell discards any results accumulated in the current kernel.
results = {}

### GRU-D

In [None]:
model_name       = 'GRU-D'
hyperparams_list = GRU_D_hyperparams_list
RERUN            = False  # True: retrain/re-test cached entries with their stored best hyperparameters


def _split_early_stop(X, Ys, early_stop_frac):
    """Randomly hold out a fraction of subjects for early stopping.

    The permutation is drawn from NumPy's global RNG, so the caller must seed it first.

    Args:
        X: feature frame with a 'subject_id' index level.
        Ys: label frame with a 'subject_id' index level.
        early_stop_frac: fraction of the permuted subject list that goes to early stopping.

    Returns:
        (X_obs, Ys_obs, X_early_stop, Ys_early_stop) row subsets of X and Ys.
    """
    all_subjects = list(np.random.permutation(Ys.index.get_level_values('subject_id').values))
    n_early_stop = int(len(all_subjects) * early_stop_frac)
    # Slice from an explicit boundary: with n_early_stop == 0 the negative-index form
    # [:-0] would leave the training set EMPTY and send every subject to early stopping.
    boundary = len(all_subjects) - n_early_stop
    train_subjects, early_stop_subjects = all_subjects[:boundary], all_subjects[boundary:]
    # NOTE(review): if Ys holds several rows per subject_id, the permuted list contains
    # repeats and one subject can land on both sides of the split -- confirm one row per subject.

    def _rows_for(frame, subjects):
        return frame[frame.index.get_level_values('subject_id').isin(subjects)]

    return (
        _rows_for(X, train_subjects), _rows_for(Ys, train_subjects),
        _rows_for(X, early_stop_subjects), _rows_for(Ys, early_stop_subjects),
    )


def _fit_grud(base_params, hyperparams, train_dataloader, early_stop_dataloader):
    """Build a GRUD model from base_params + sampled hyperparams and train it.

    Returns whatever Train_Model returns: (best_model, loss_histories).
    """
    model_hyperparams = copy.copy(base_params)
    model_hyperparams.update(
        {k: v for k, v in hyperparams.items() if k in ('cell_size', 'hidden_size', 'batch_size')}
    )
    model = GRUD(**model_hyperparams)
    return Train_Model(
        model, train_dataloader, early_stop_dataloader,
        **{k: v for k, v in hyperparams.items() if k in (
            'num_epochs', 'patience', 'learning_rate', 'batch_size'
        )}
    )


if model_name not in results: results[model_name] = {}
for t in ['mort_icu', 'mort_hosp']:
    if t not in results[model_name]: results[model_name][t] = {}
    for n, X_train, X_dev, X_test in (
        ('lvl2', vitals_train, vitals_dev, vitals_test),
#         ('raw', raw_train, raw_dev, raw_test)
    ):
        print("Running model %s on target %s with representation %s" % (model_name, t, n))
        # Mean of each feature over entries whose mask == 1; masked-out entries become NaN
        # and are ignored by nanmean. np.nan (not np.NaN) keeps this working on NumPy >= 2.0.
        X_mean = np.nanmean(
            to_3D_tensor(
                X_train.loc[:, pd.IndexSlice[:, 'mean']] *
                np.where((X_train.loc[:, pd.IndexSlice[:, 'mask']] == 1).values, 1, np.nan)
            ),
            axis=0, keepdims=True
        ).transpose([0, 2, 1])
        base_params = {'X_mean': X_mean, 'output_last': True, 'input_size': X_mean.shape[2]}

        if n in results[model_name][t]:
            if not RERUN:
                print("Final results for model %s on target %s with representation %s" % (model_name, t, n))
                print(results[model_name][t][n])
                continue
            # Stored tuple layout: (None, best_hyperparams, auc, auprc, acc, F1, best_s).
            best_s, best_hyperparams = results[model_name][t][n][-1], results[model_name][t][n][1]
            print("Loading best hyperparams", best_hyperparams)
        else:
            # Random search: keep the hyperparameters with the highest dev-set AUROC.
            best_s, best_hyperparams = -np.inf, None
            for i, hyperparams in enumerate(hyperparams_list):
                print("On sample %d / %d (hyperparams = %s)" % (i+1, len(hyperparams_list), repr((hyperparams))))

                early_stop_frac,batch_size,seed = [hyperparams[k] for k in ('early_stop_frac','batch_size','seed')]

                np.random.seed(seed)
                X_train_obs, Ys_train_obs, X_train_early_stop, Ys_train_early_stop = _split_early_stop(
                    X_train, Ys_train, early_stop_frac
                )

                train_dataloader      = prepare_dataloader(X_train_obs, Ys_train_obs[t], batch_size=batch_size)
                early_stop_dataloader = prepare_dataloader(
                    X_train_early_stop, Ys_train_early_stop[t], batch_size=batch_size
                )
                dev_dataloader        = prepare_dataloader(X_dev, Ys_dev[t], batch_size=batch_size)
                # The test dataloader is only built once, for the final model below.

                best_model, _ = _fit_grud(base_params, hyperparams, train_dataloader, early_stop_dataloader)

                probabilities_dev, labels_dev = predict_proba(best_model, dev_dataloader)
                probabilities_dev = np.concatenate(probabilities_dev)[:, 1]
                labels_dev        = np.concatenate(labels_dev)
                s = roc_auc_score(labels_dev, probabilities_dev)
                if s > best_s:
                    best_s, best_hyperparams = s, hyperparams
                    print("New Best Score: %.2f @ hyperparams = %s" % (100*best_s, repr((best_hyperparams))))

        ## Test
        if best_hyperparams is None:
            raise ValueError(
                "No hyperparameters were selected for model %s on target %s with representation %s"
                % (model_name, t, n)
            )
        hyperparams = best_hyperparams # In case I forgot a replace below
        early_stop_frac,batch_size,seed = [best_hyperparams[k] for k in ('early_stop_frac','batch_size','seed')]
        # Seed with the BEST configuration's seed. Seeding before this lookup used the last
        # trial's seed, or a stale/undefined one when best hyperparameters were loaded.
        np.random.seed(seed)

        # The final model is trained on train + dev, with a fresh early-stopping holdout.
        X_train_concat, Ys_train_concat = pd.concat((X_train, X_dev)), pd.concat((Ys_train, Ys_dev))
        X_train_obs, Ys_train_obs, X_train_early_stop, Ys_train_early_stop = _split_early_stop(
            X_train_concat, Ys_train_concat, early_stop_frac
        )

        train_dataloader      = prepare_dataloader(X_train_obs, Ys_train_obs[t], batch_size=batch_size)
        early_stop_dataloader = prepare_dataloader(X_train_early_stop, Ys_train_early_stop[t], batch_size=batch_size)
        test_dataloader       = prepare_dataloader(X_test, Ys_test[t], batch_size=batch_size)

        best_model, (losses_train, losses_early_stop, losses_epochs_train, losses_epochs_early_stop) = _fit_grud(
            base_params, best_hyperparams, train_dataloader, early_stop_dataloader
        )

        probabilities_test, labels_test = predict_proba(best_model, test_dataloader)
        # predict_proba returns per-batch arrays; join them before scoring.
        probabilities_test = np.concatenate(probabilities_test)

        y_score = probabilities_test[:, 1]
        # Per-sample predicted class. argmax without axis over the un-concatenated batches
        # returned a single flat index, which the accuracy/F1 metrics cannot use.
        y_pred  = np.argmax(probabilities_test, axis=1)
        y_true  = np.concatenate(labels_test)

        auc   = roc_auc_score(y_true, y_score)
        auprc = average_precision_score(y_true, y_score)
        acc   = accuracy_score(y_true, y_pred)
        F1    = f1_score(y_true, y_pred)
        print("Final results for model %s on target %s with representation %s" % (model_name, t, n))
        print(auc, auprc, acc, F1)

        results[model_name][t][n] = None, best_hyperparams, auc, auprc, acc, F1, best_s
        # Checkpoint after every (target, representation) so partial runs are not lost.
        with open('../src/model/baseline_model_gru-d.pkl', mode='wb') as f: pickle.dump(results, f)

In [None]:
# Recompute the GRU-D base parameters from vitals_train, outside the training loop.
# Entries whose mask != 1 are turned into NaN and skipped by nanmean; the mean is taken over
# axis 0 of the 3-D tensor and the last two axes are then swapped.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
X_mean = np.nanmean(
    to_3D_tensor(
        vitals_train.loc[:, pd.IndexSlice[:, 'mean']] *
        np.where((vitals_train.loc[:, pd.IndexSlice[:, 'mask']] == 1).values, 1, np.nan)
    ),
    axis=0, keepdims=True
).transpose([0, 2, 1])
# input_size is the length of X_mean's last axis.
base_params = {'X_mean': X_mean, 'output_last': True, 'input_size': X_mean.shape[2]}

In [None]:
# Scratch copy of the hyperparameter-search loop with training/scoring commented out.
# NOTE: relies on names left behind by the GRU-D cell (hyperparams_list, X_train, X_dev,
# X_test and the target `t`) plus base_params, so it fails on a fresh kernel unless those
# cells have been run first.
best_s, best_hyperparams = -np.inf, None  # np.Inf was removed in NumPy 2.0
for i, hyperparams in enumerate(hyperparams_list):
    print("On sample %d / %d (hyperparams = %s)" % (i+1, len(hyperparams_list), repr((hyperparams))))

    early_stop_frac,batch_size,seed = [hyperparams[k] for k in ('early_stop_frac','batch_size','seed')]

    np.random.seed(seed)
    all_train_subjects = list(
        np.random.permutation(Ys_train.index.get_level_values('subject_id').values)
    )
    N_early_stop        = int(len(all_train_subjects) * early_stop_frac)
    # Explicit boundary: [:-N] with N == 0 would leave the training set empty.
    split_at            = len(all_train_subjects) - N_early_stop
    train_subjects      = all_train_subjects[:split_at]
    early_stop_subjects = all_train_subjects[split_at:]
    X_train_obs         = X_train[X_train.index.get_level_values('subject_id').isin(train_subjects)]
    Ys_train_obs        = Ys_train[Ys_train.index.get_level_values('subject_id').isin(train_subjects)]

    X_train_early_stop  = X_train[X_train.index.get_level_values('subject_id').isin(early_stop_subjects)]
    Ys_train_early_stop = Ys_train[
        Ys_train.index.get_level_values('subject_id').isin(early_stop_subjects)
    ]

    train_dataloader      = prepare_dataloader(X_train_obs, Ys_train_obs[t], batch_size=batch_size)
    early_stop_dataloader = prepare_dataloader(
        X_train_early_stop, Ys_train_early_stop[t], batch_size=batch_size
    )
    dev_dataloader        = prepare_dataloader(X_dev, Ys_dev[t], batch_size=batch_size)
    test_dataloader       = prepare_dataloader(X_test, Ys_test[t], batch_size=batch_size)

    model_hyperparams = copy.copy(base_params)
    model_hyperparams.update(
        {k: v for k, v in hyperparams.items() if k in ('cell_size', 'hidden_size', 'batch_size')}
    )
    model = GRUD(**model_hyperparams)
    # Training and dev scoring are disabled below, so best_s / best_hyperparams keep
    # their initial values after this loop.
    #
    # best_model, _ = Train_Model(
    #     model, train_dataloader, early_stop_dataloader,
    #     **{k: v for k, v in hyperparams.items() if k in (
    #         'num_epochs', 'patience', 'learning_rate', 'batch_size'
    #     )}
    # )
    #
    # probabilities_dev, labels_dev = predict_proba(best_model, dev_dataloader)
    # probabilities_dev = np.concatenate(probabilities_dev)[:, 1]
    # labels_dev        = np.concatenate(labels_dev)
    # s = roc_auc_score(labels_dev, probabilities_dev)
    # if s > best_s:
    #     best_s, best_hyperparams = s, hyperparams
    #     print("New Best Score: %.2f @ hyperparams = %s" % (100*best_s, repr((best_hyperparams))))

In [None]:
# Step-by-step version of one search iteration, using the first sampled configuration.
hyperparams_list = GRU_D_hyperparams_list
hyperparams = hyperparams_list[0]
# print("On sample %d / %d (hyperparams = %s)" % (i+1, len(hyperparams_list), repr((hyperparams))))

early_stop_frac,batch_size,seed = [hyperparams[k] for k in ('early_stop_frac','batch_size','seed')]

# Seed NumPy so the subject permutation below is reproducible for this configuration.
np.random.seed(seed)
all_train_subjects = list(
    np.random.permutation(Ys_train.index.get_level_values('subject_id').values)
)
N_early_stop        = int(len(all_train_subjects) * early_stop_frac)
# Explicit boundary: [:-N] with N == 0 would leave the training set empty and put
# every subject in the early-stopping set.
split_at            = len(all_train_subjects) - N_early_stop
train_subjects      = all_train_subjects[:split_at]
early_stop_subjects = all_train_subjects[split_at:]
X_train_obs         = vitals_train[vitals_train.index.get_level_values('subject_id').isin(train_subjects)]
Ys_train_obs        = Ys_train[Ys_train.index.get_level_values('subject_id').isin(train_subjects)]

X_train_early_stop  = vitals_train[vitals_train.index.get_level_values('subject_id').isin(early_stop_subjects)]
Ys_train_early_stop = Ys_train[
    Ys_train.index.get_level_values('subject_id').isin(early_stop_subjects)
]

In [None]:
# Dataloaders for the 'mort_hosp' target: training rows first, then the early-stopping holdout.
train_dataloader, early_stop_dataloader = [
    prepare_dataloader(features, labels['mort_hosp'], batch_size=batch_size)
    for features, labels in (
        (X_train_obs, Ys_train_obs),
        (X_train_early_stop, Ys_train_early_stop),
    )
]

In [None]:
# Dataloaders for the 'mort_hosp' target on the dev and test splits, built in that order.
dev_dataloader, test_dataloader = [
    prepare_dataloader(features, labels['mort_hosp'], batch_size=batch_size)
    for features, labels in ((vitals_dev, Ys_dev), (vitals_test, Ys_test))
]

In [None]:
# Model constructor arguments: a shallow copy of base_params overlaid with the sampled
# values for the three keys listed below (only those present in hyperparams).
model_hyperparams = copy.copy(base_params)
for _name in ('cell_size', 'hidden_size', 'batch_size'):
    if _name in hyperparams:
        model_hyperparams[_name] = hyperparams[_name]
# model = GRUD(**model_hyperparams)

In [None]:
import torch

In [2]:
# Reports whether PyTorch can see a CUDA device. Needs `torch` imported in the current kernel:
# the NameError recorded below comes from running this cell before any import cell.
torch.cuda.is_available()

NameError: name 'torch' is not defined