In [1]:
import pickle
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from preprocess import Preprocess
from samples import SampleMaker

## Data Cleaning

In [13]:
def display_box_plot(dataset, cols, exclude=None):
  """Draw one box plot per selected column on a three-column grid.

  Args:
    dataset: Table indexed by column name; every selected column must
      support ``.dropna().values`` (as a pandas Series does).
    cols: Column names to plot, in display order.
    exclude: Optional collection of column names to skip. ``None`` (the
      default) skips nothing.

  Returns:
    None. The figure is rendered with ``plt.show()``. Nothing is drawn when
    no column is left to plot.
  """
  skipped = () if exclude is None else exclude
  to_plot = [col for col in cols if col not in skipped]
  if not to_plot:
    # Nothing selected: do not create a zero-height, empty figure.
    return

  # Size the grid from the columns that are actually drawn so excluded
  # columns do not leave blank rows at the bottom of the figure.
  rows = math.ceil(len(to_plot) / 3)
  plt.figure(figsize=(30, 5 * rows))
  for position, col in enumerate(to_plot, start=1):
    values = dataset[col].dropna().values
    plt.subplot(rows, 3, position)
    plt.boxplot(values)
    plt.title(col, fontsize=20)
  plt.tight_layout()
  plt.show()

def preprocess(dataset_path='dataset.obj'):
    """Load the pickled dataset and run the cleaning pipeline on it.

    Steps, in order: convert `temperature` to a numeric type, keep only the
    selected columns, remove outliers from every numeric column except
    `hadm_id`, `stay_id` and `age`, group/order the rows, then fill nulls.

    Args:
        dataset_path: Path of the pickled dataset to load. Defaults to
            'dataset.obj', the file this function always read before the
            parameter existed.

    Returns:
        The object returned by `Preprocess.group()`, after it has been passed
        to `Preprocess.fill_null()` (whose return value is ignored here, so
        it presumably fills in place).
    """
    # NOTE: pickle.load can execute arbitrary code; only open dataset files
    # that come from a trusted source.
    # The context manager closes the handle even when unpickling fails.
    with open(dataset_path, 'rb') as fileObj:
        dataset = pickle.load(fileObj)

    # `temperature` should not be categorical => change to numeric type
    dataset['temperature'] = pd.to_numeric(dataset['temperature'])

    preprocessor = Preprocess(dataset)
    cols_to_select = [
        'tidal_volume_observed', 'pao2fio2ratio', 'platelet', 'rbc', 'wbc',
        'temperature', 'glucose', 'age', 'vaso_amount', 'minute_volume',
        'heart_rate', 'mbp', 'tidal_volume_set', 'lactate', 'vaso_rate',
        'resp_rate', 'potassium', 'calcium', 'pco2', 'ph', 'peep',
        'charttime', 'hadm_id', 'stay_id',
    ]
    print("Selecting columns ...\n")
    preprocessor.select_cols(cols_to_select)

    # Remove outliers column by column. To inspect the effect, call
    # display_box_plot(preprocessor.dataset, ['minute_volume']) before and
    # after this loop.
    print("Removing outliers ...\n")
    # Identifier columns and `age` are left untouched.
    untouched_cols = ['hadm_id', 'stay_id', 'age']
    for col in preprocessor.dataset._get_numeric_data().columns:
        if col not in untouched_cols:
            preprocessor.remove_outliers(col)

    # group and order
    print("Grouping and ordering ...\n")
    grouped = preprocessor.group()

    # fill null values for `peep` and `pao2fio2ratio`
    print("Filling null values ... \n")
    preprocessor.fill_null(grouped)

    return grouped

# Run the cleaning pipeline. `grouped` is the module-level name that
# make_samples() reads when it constructs its SampleMaker.
# NOTE(review): the "Creating samples" cell rebinds `grouped` from
# 'grouped.obj' before calling make_samples() — confirm which copy is intended.
grouped = preprocess()

Selecting columns ...

Removing outliers ...

Grouping and ordering ...

Filling null values ... 



## Creating samples

In [3]:
# make samples
def make_samples(p, l, lookback=3):
    """Build positive/negative RNN samples (and timestamps) from `grouped`.

    Reads the module-level `grouped` object, so that name must be bound
    before this function is called.

    Args:
        p: Forwarded as the first argument of `SampleMaker.make_samples`.
            NOTE(review): its meaning is not visible here — confirm against
            SampleMaker.
        l: Forwarded to `SampleMaker.make_samples` as `min_size`.
        lookback: Forwarded as the second argument of
            `SampleMaker.make_rnn_samples`. Defaults to 3, the value that
            used to be hard-coded.

    Returns:
        Tuple `(positive_rnn, negative_rnn, pos_timestamps_rnn,
        neg_timestamps_rnn)`, each produced by `make_rnn_samples`.
    """
    sample_maker = SampleMaker(grouped)
    print("Making samples ...\n")
    positives, negatives, pos_timestamps, neg_timestamps = sample_maker.make_samples(p, min_size=l)
    print(f"Positive Cases: {len(positives)}, Negative Cases: {len(negatives)}")

    # The same `lookback` is applied to values and timestamps.
    positive_rnn = sample_maker.make_rnn_samples(positives, lookback)
    negative_rnn = sample_maker.make_rnn_samples(negatives, lookback)
    pos_timestamps_rnn = sample_maker.make_rnn_samples(pos_timestamps, lookback)
    neg_timestamps_rnn = sample_maker.make_rnn_samples(neg_timestamps, lookback)
    print(f"No. of positive samples: {len(positive_rnn)}")
    print(f"No. of negative samples: {len(negative_rnn)}")

    return positive_rnn, negative_rnn, pos_timestamps_rnn, neg_timestamps_rnn

# NOTE(review): this rebinds `grouped`, replacing the value returned by
# preprocess() earlier with whatever is stored in 'grouped.obj'. No cell shown
# in this notebook writes that file — confirm it is produced elsewhere and is
# up to date.
# pickle.load can execute arbitrary code; only load files from a trusted source.
with open('grouped.obj', 'rb') as fileObj:
    grouped = pickle.load(fileObj)
# Called with p=12 and l=3 (l is forwarded as `min_size`). make_samples() reads
# the `grouped` bound just above and returns positives, negatives and their
# timestamps, in that order.
X_pos, X_neg, T_pos, T_neg = make_samples(12, 3)

Making samples ...

Positive Cases: 377, Negative Cases: 213
No. of positive samples: 1511
No. of negative samples: 2318


## Resample data

In [4]:
from sklearn.utils import resample, shuffle

# Balance the classes by resampling the positive samples (together with their
# timestamps) to the number of negative samples.
# Assumes the negative class is the larger one.
# NOTE: X_pos/T_pos are overwritten, so re-running this cell resamples the
# already-resampled arrays; re-run the sample-making cell first.
RANDOM_STATE = 42  # one seed for resampling and shuffling => reproducible runs
X_pos, T_pos = resample(
    X_pos,
    T_pos,
    n_samples=len(X_neg),
    random_state=RANDOM_STATE,
)

X = np.concatenate([X_pos, X_neg], axis=0)
# Labels follow the concatenation order: positives (1) first, then negatives (0).
y = np.concatenate([
    np.ones(len(X_pos), dtype=np.uint8),
    np.zeros(len(X_neg), dtype=np.uint8),
])
T = np.concatenate([T_pos, T_neg], axis=0)
X, y, T = shuffle(X, y, T, random_state=RANDOM_STATE)

## Refactoring for GRU-D

In [5]:
from refactor import Refactor

# Features and labels are passed through unchanged; timestamps and masks come
# from the Refactor helper.
# NOTE(review): `input` shadows the Python builtin of the same name. It is kept
# because the np.savez call in the next cell reads it.
input, label_taskname = X, y
refactorer = Refactor(X, y, T)
timestamp = refactorer.standardize_timestamps()
masking = refactorer.get_masks()

In [6]:
import os
# Write the four objects to ./data.npz, one named entry per keyword.
np.savez(
    os.path.join('.', 'data.npz'),
    input=input,
    masking=masking,
    timestamp=timestamp,
    label_taskname=label_taskname,
)