This notebook takes the one-row-per-patient data used in previous notebooks (patients from the top 5 hospitals of the eICU dataset) and prepares it so that it can be used on a remote worker.

We only use the APACHE and lab result variables, which are continuous. The first 3 hospitals are used for training, whereas the 4th and 5th are merged to form the test set.

In [7]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import os
import torch
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

Establish a connection to the DB and import the helper functions for running queries.

In [15]:
import pandas as pd
from proto.etl.config import SSHInfoEicu, DBInfoEicu
from proto.etl.utils import connect_to_db_via_ssh, run_eicu_query, get_column_completeness, load_schema_for_modelling

# SQL statement that points the search path at the eicu_crd schema
# (only defined here, not executed in this cell).
query_schema = 'set search_path to eicu_crd;'

# Open the DB connection through the SSH tunnel settings and grab a cursor.
conn = connect_to_db_via_ssh(SSHInfoEicu, DBInfoEicu)
cursor = conn.cursor()

### Add hospital id to the dataset 

In [71]:
# Look up the hospital of every patient in the top5-hospital mortality
# dataset, giving one row per patientunitstayid.
query = """
    select m.patientunitstayid, hospitalid
    from patient_top5hospitals_mort_dataset as m
    join
        (
        select hospitalid, patientunitstayid
        from patient
        ) as p
    on
        p.patientunitstayid=m.patientunitstayid
"""

# Index by patient stay id so the result can later be joined on the index.
df_hospitals = run_eicu_query(query, conn).set_index('patientunitstayid')

In [50]:
# find out the top5's size and hospitalid
query = """
    select hospitalid, count(patientunitstayid) as n
    from patient 
    group by hospitalid 
    order by n desc
"""

# Use a dedicated name for this exploratory result: binding it to
# `df_hospitals` would overwrite the patient -> hospital mapping that the
# data preparation cell joins on when the notebook is run top to bottom.
df_hospital_sizes = run_eicu_query(query, conn)
df_hospital_sizes.head()

Unnamed: 0,hospitalid,n
0,73,7059
1,167,6092
2,264,5237
3,420,4679
4,176,4328


In [72]:
# load data, add hospital id (joined on the shared index)
df = pd.read_csv('orpp_all.csv', index_col=0)
df = df.join(df_hospitals)

# recode hospital ids - largest 3 we keep separate, 4th and 5th become test
# A single mapping applied through column assignment replaces the previous
# chained-indexing writes (`df.hospitalid[mask] = v`), which trigger
# SettingWithCopyWarning and do not modify `df` under pandas Copy-on-Write.
# Values that are not keys of the mapping are left unchanged.
hospital_recode = {73: 1, 167: 2, 264: 3, 420: 4, 176: 4}
df['hospitalid'] = df['hospitalid'].replace(hospital_recode)

# only keep numeric cols and hospitalid
# (columns at positions 4..106 plus the last column, i.e. the joined hospitalid)
cols_to_keep = df.columns[list(range(4,107)) + [-1]]
df = df[cols_to_keep]

# save the X matrix (the index is not written to the file)
df.to_csv('../src/proto/workers/x.csv', index=False)

In [74]:
# create the y outcome csv
df_y = pd.read_csv('mort_y.csv', index_col=0)

# make sure we have the patients in the same order
# x.csv and y.csv are written without their index, so rows are matched purely
# by position downstream. The previous bare comparison expression was
# evaluated and discarded; an assert actually stops the cell on a mismatch.
assert df_y.index.equals(df.index), 'X and y rows are not in the same patient order'

df_y.to_csv('../src/proto/workers/y.csv', index=False)

### Define PyTorch dataset from eICU dataset

In [14]:
class EicuDataset(Dataset):
    """
    PyTorch dataset over the eICU one-row-per-patient data of one hospital.

    Loads the data of 16k patients from the eICU dataset and the
    corresponding labels for mortality prediction and length of stay
    prediction, then keeps only the rows of the requested hospital.

    We only use the apache and lab result features which are continuous.
    Each patient is represented as a single row, with their data from
    the first 24 hours.

    Features are scaled with a RobustScaler that is fitted on the rows of
    the selected hospital only.
    """

    def __init__(self, root_dir, hospital, outcome='hosp_mort'):
        """
        Args:
            root_dir (string): Directory containing 'x.csv' and 'y.csv'.
            hospital (int): Which hospitals data to keep
                1-3 are training, 4 is testing
            outcome (string): Column of 'y.csv' used as the label,
                'hosp_mort' or 'icu_los_hours'

        Raises:
            ValueError: If 'x.csv' and 'y.csv' have a different number of
                rows, in which case they cannot be matched by position.
        """
        self.root_dir = root_dir

        # load the full feature matrix and the requested outcome column
        x_all = pd.read_csv(os.path.join(self.root_dir, 'x.csv'))
        y_all = pd.read_csv(
            os.path.join(self.root_dir, 'y.csv')
        )[outcome].values

        # rows of the two files are matched by position only
        if len(x_all) != len(y_all):
            raise ValueError(
                'x.csv has %d rows but y.csv has %d rows'
                % (len(x_all), len(y_all))
            )

        # restrict x to the requested hospital and scale it
        to_keep = x_all.hospitalid.values == hospital
        self.df_x = x_all.loc[to_keep].drop('hospitalid', axis=1)
        scaler = RobustScaler(quantile_range=(10.0, 90.0))
        self.x = scaler.fit_transform(self.df_x.values)

        # apply the same row mask to the labels; without it self.y would
        # still hold every hospital's labels and self.y[idx] would belong
        # to a different patient than self.x[idx]
        self.y = y_all[to_keep]

    def __len__(self):
        """Number of patients of the selected hospital."""
        return len(self.df_x)

    def __getitem__(self, idx):
        """Return the (scaled feature row(s), label(s)) pair at idx."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.x[idx,:], self.y[idx]

In [15]:
# Build the dataset for hospital 1 and wrap it in a shuffling data loader
# that yields batches of 50 patients (the last, smaller batch is kept).
eicu_data = EicuDataset(root_dir='../src/proto/workers/', hospital=1)
dataloader = DataLoader(
    dataset=eicu_data,
    batch_size=50,
    shuffle=True,
    num_workers=1,
    drop_last=False,
)

In [80]:
# Smoke-test the dataset through the data loader: for every batch print its
# index together with the sizes of the feature tensor and the label tensor.
for i_batch, sample_batched in enumerate(dataloader):
    features, labels = sample_batched
    print(i_batch, features.size(), labels.size())


0 torch.Size([50, 103]) torch.Size([50])
1 torch.Size([50, 103]) torch.Size([50])
2 torch.Size([50, 103]) torch.Size([50])
3 torch.Size([50, 103]) torch.Size([50])
4 torch.Size([50, 103]) torch.Size([50])
5 torch.Size([50, 103]) torch.Size([50])
6 torch.Size([50, 103]) torch.Size([50])
7 torch.Size([50, 103]) torch.Size([50])
8 torch.Size([50, 103]) torch.Size([50])
9 torch.Size([50, 103]) torch.Size([50])
10 torch.Size([50, 103]) torch.Size([50])
11 torch.Size([50, 103]) torch.Size([50])
12 torch.Size([50, 103]) torch.Size([50])
13 torch.Size([50, 103]) torch.Size([50])
14 torch.Size([50, 103]) torch.Size([50])
15 torch.Size([50, 103]) torch.Size([50])
16 torch.Size([50, 103]) torch.Size([50])
17 torch.Size([50, 103]) torch.Size([50])
18 torch.Size([50, 103]) torch.Size([50])
19 torch.Size([50, 103]) torch.Size([50])
20 torch.Size([50, 103]) torch.Size([50])
21 torch.Size([50, 103]) torch.Size([50])
22 torch.Size([50, 103]) torch.Size([50])
23 torch.Size([50, 103]) torch.Size([50])
24