# Check the distribution of length of stay in the ER

Grab ADT data for patients in our adjusted_cohort and check out how long they stayed in the ER.

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt

## Query database

Grab ADT columns for our adjusted cohort.

In [None]:
%load_ext google.cloud.bigquery

In [None]:
%%bigquery adjusted_cohort_adt
select jc_uid, pat_enc_csn_id_coded, 
  pat_class, seq_num_in_enc, effective_time_jittered_utc, pat_lv_of_care
from starr_datalake2018.adt
where pat_enc_csn_id_coded in 
  (select distinct pat_enc_csn_id_coded from traige_TE.triage_cohort_adjusted)

In [None]:
# cache the query result on disk so later sessions can skip the BigQuery call
adj_cohort_adt_file = "adjusted_cohort_adt.csv"
adjusted_cohort_adt.to_csv(adj_cohort_adt_file, index=False)

In [None]:
# how many distinct encounters (CSNs) did the query return?
adjusted_cohort_adt['pat_enc_csn_id_coded'].nunique()

## Start here if CSV file is already saved to system

Read in the CSV.

In [None]:
# reload the cached ADT extract instead of re-querying the database
adj_cohort_adt_file = "adjusted_cohort_adt.csv"
adjusted_cohort_adt = pd.read_csv(adj_cohort_adt_file)

# timestamps come back from the CSV as strings, so parse them into datetimes again
adjusted_cohort_adt['effective_time_jittered_utc'] = pd.to_datetime(
    adjusted_cohort_adt['effective_time_jittered_utc'])

# order the events of each encounter by their ADT sequence number
adjusted_cohort_adt = adjusted_cohort_adt.sort_values(['pat_enc_csn_id_coded', 'seq_num_in_enc'])

# ID columns that are dropped whenever a frame is displayed
hidecols = ['jc_uid', 'pat_enc_csn_id_coded']
adjusted_cohort_adt.drop(columns=hidecols).head(20)

## Check the time between the first Inpatient Event and the first Emergency Services event for each CSN

This does not use the admit time that we are using. We will check that afterwards and compare to this.

Use some of Conor's code here.

In [None]:
# setting up variables to use conor's code
df_change = adjusted_cohort_adt
# NOTE: this is every CSN in the cohort, so the isin() filter below keeps all rows
inpatient_ids = set(adjusted_cohort_adt.pat_enc_csn_id_coded.values)

# conor's code: admit time = timestamp of the first 'Inpatient' event of each CSN
# (relies on the frame already being sorted by CSN and seq_num_in_enc)
df_admit_times = df_change[df_change['pat_enc_csn_id_coded'].isin(inpatient_ids)]
df_admit_times = (
    df_admit_times[df_admit_times['pat_class'] == 'Inpatient']
    .groupby('pat_enc_csn_id_coded')
    .first()
    .reset_index()[['pat_enc_csn_id_coded', 'effective_time_jittered_utc']]
    .rename(columns={'effective_time_jittered_utc': 'admit_time_jittered'})
)

# if this cell is re-run the column already exists and the merge would produce
# admit_time_jittered_x / _y, so drop any previous copy first
adjusted_cohort_adt = adjusted_cohort_adt.drop(columns=['admit_time_jittered'], errors='ignore')

# Changed this time_since_admit to be the opposite of Conor's time so I can have positive values to look at
adjusted_cohort_adt = pd.merge(adjusted_cohort_adt, df_admit_times, how='left', on='pat_enc_csn_id_coded')
# vectorised column subtraction (same result as the row-wise apply, but much faster)
adjusted_cohort_adt['time_before_admit'] = (
    adjusted_cohort_adt.admit_time_jittered - adjusted_cohort_adt.effective_time_jittered_utc)
adjusted_cohort_adt.drop(hidecols, axis=1)

In [None]:
# Flag the ADT events that are the first Inpatient event right after an
# Emergency Services event of the same encounter.
# NOTE: the shift()-based flags assume the frame is sorted by CSN and event order.

# previous row had pat class Emergency Services
adjusted_cohort_adt['prev_emerg'] = adjusted_cohort_adt['pat_class'].shift() == 'Emergency Services'
# current row has pat class Inpatient
adjusted_cohort_adt['curr_inpatient'] = adjusted_cohort_adt['pat_class'] == 'Inpatient'
# current row belongs to the same CSN as the previous row
adjusted_cohort_adt['continued'] = (
    adjusted_cohort_adt['pat_enc_csn_id_coded'] == adjusted_cohort_adt['pat_enc_csn_id_coded'].shift())

# a row is an ER -> Inpatient transition only when all three flags hold
cols = ['prev_emerg', 'curr_inpatient', 'continued']
adjusted_cohort_adt['first_ip'] = (
    adjusted_cohort_adt['prev_emerg']
    & adjusted_cohort_adt['curr_inpatient']
    & adjusted_cohort_adt['continued'])

adjusted_cohort_adt.drop(columns=hidecols)

In [None]:
# keep only the rows where pat_class differs from the row directly above it,
# i.e. the events at which the patient class changes
# NOTE(review): the comparison uses a plain shift() over the whole frame, so it does
# not reset at CSN boundaries - the first row of a CSN is dropped when it has the same
# pat_class as the last row of the previous CSN. Confirm this is acceptable.
change_pat = adjusted_cohort_adt[~adjusted_cohort_adt.pat_class.eq(adjusted_cohort_adt.pat_class.shift())]
change_pat.drop(hidecols, axis=1)

In [None]:
# distribution of the number of pat_class-change rows per CSN
# lots of these still have many events - meaning they moved around a lot
change_pat.groupby('pat_enc_csn_id_coded').jc_uid.count().value_counts()

In [None]:
# keep the event with pat_class == Emergency Services, that occurs before the first inpatient event
# check to make sure that we're looking at the same CSN between both events
# don't take any where admit ocurred before ER event - happens when people move around a lot (ER -> Inpatient -> ER -> Inpatient)

# flags describing the *next* row; fill_value=False keeps the column boolean
# (the last row has no successor and therefore can never qualify)
next_is_first_ip = change_pat.first_ip.shift(-1, fill_value=False)
next_same_csn = change_pat.pat_enc_csn_id_coded.shift(-1) == change_pat.pat_enc_csn_id_coded

# .copy() so that columns can be added to keep_visits later without touching change_pat
keep_visits = change_pat[(change_pat.pat_class == 'Emergency Services') &
                         next_is_first_ip &
                         next_same_csn &
                         (change_pat.time_before_admit >= timedelta(days=0))].copy()
keep_visits.drop(hidecols, axis=1)

In [None]:
# look at the number of csn's that were dropped because admit time was before first ER event
# prints the ID column names, then the number of distinct (jc_uid, CSN) pairs
# before (change_pat) and after (keep_visits) the filtering
print(hidecols)
print(change_pat[hidecols].drop_duplicates().shape[0])

print(keep_visits[hidecols].drop_duplicates().shape[0])

# we lose about 300 csns where admit occurs before the first ER

In [None]:
# compute the time lapse between the ER visit and admission in hours
# assign() returns a new frame, so this never writes into a slice of change_pat
keep_visits = keep_visits.assign(
    hours_before_admit=keep_visits.time_before_admit / np.timedelta64(1, 'h'))
keep_visits.drop(hidecols, axis=1)

In [None]:
# view the histogram for length of stay in ER (hours between the kept ER event and admit)
keep_visits.hist('hours_before_admit')

In [None]:
# view the distribution (summary statistics of ER length of stay in hours)
print(keep_visits.hours_before_admit.describe())

# how many unique csns is this?
print("\nUnique CSNs: ", keep_visits.pat_enc_csn_id_coded.nunique())

### Check out extreme cases

The longest stay in the ED was ~190 hrs (almost 8 days). I wanted to check what this stay looked like to make sure nothing weird is happening.

In [None]:
# list the visits from shortest to longest ER stay (ID columns hidden)
keep_visits.sort_values('hours_before_admit').drop(hidecols, axis=1)

In [None]:
# look at the patient with the longest ER stay (~190 hrs, i.e. almost 8 days) -- seems legit
# show up to 100 rows so the full ADT history of this encounter is visible
pd.options.display.max_rows = 100

# grab the last row (largest hours_before_admit) to get the extreme case
csn = keep_visits.sort_values('hours_before_admit').iloc[-1,:].pat_enc_csn_id_coded
adjusted_cohort_adt[adjusted_cohort_adt.pat_enc_csn_id_coded == csn].drop(hidecols, axis=1)

In [None]:
# zoom in a little on the histogram

# upper bound (exclusive) on the ER stay, in hours
# NOTE: `hours` stays defined as a notebook-level global after this cell runs
hours = 10
# look at people who stayed in ER for <hours hrs
shorter_visits = keep_visits[keep_visits.hours_before_admit < hours]
shorter_visits.hist('hours_before_admit')
print(shorter_visits.hours_before_admit.describe())

# how many unique csns is this?
print("\nUnique CSNs: ", shorter_visits.pat_enc_csn_id_coded.nunique())

## Bring in demographics table

Use the demographics table to get info about label distribution and also use the adjusted admit time.

In [None]:
# load the data from csv
demo = pd.read_csv("../minh/cohort_demo.csv")
# copy admit_time under a distinct name so it cannot be confused with the
# ADT-derived admit_time_jittered after the merge
demo['admit_time_demo'] = demo['admit_time']

# we're only keeping some of the columns from demographics
cols = ['jc_uid', 'pat_enc_csn_id_coded', 'label', 'admit_time_demo']
# default (inner) merge: visits without a matching demographics row are dropped
keep_joined = keep_visits.merge(demo[cols], on=['jc_uid', 'pat_enc_csn_id_coded'])

# convert admit_time from demographics table into datetime
# use this to compute time spent in ER based on this admit time
# NOTE(review): appending '+00:00' presumes admit_time is a timezone-naive UTC string - confirm
keep_joined['admit_time_demo'] = keep_joined.admit_time_demo + '+00:00'
keep_joined['admit_time_demo'] = pd.to_datetime(keep_joined.admit_time_demo)
keep_joined['time_before_admit_demo'] = keep_joined.admit_time_demo - keep_joined.effective_time_jittered_utc
# convert to hours spent in ER
keep_joined['hours_before_admit_demo'] = keep_joined.time_before_admit_demo / np.timedelta64(1, 'h')
# (bare expression in the middle of a cell: has no effect, only the last expression is displayed)
keep_joined

# get rid of some of the unneeded columns
keep_joined.drop(['prev_emerg', 'curr_inpatient', 'continued', 'first_ip'], axis=1, inplace=True)

# ID columns hidden from displayed output
hidecols = ['jc_uid', 'pat_enc_csn_id_coded']
keep_joined.drop(hidecols, axis=1)

In [None]:
# look at the histogram of ER stay computed from the demographics admit time,
# looks about the same as before
keep_joined.hist('hours_before_admit_demo')
keep_joined.hours_before_admit_demo.describe()

In [None]:
# inspect the negative end of the distribution (the ~190 hr case was already reviewed above)

# how many cases were admitted before their first ADT ER event?
n_admit_before_er = (keep_joined.hours_before_admit_demo < 0).sum()
print("Number of cases where admit occurred before first ER ADT event: ", n_admit_before_er)

# the row with the smallest (most negative) ER stay
most_negative = keep_joined.sort_values('hours_before_admit_demo').iloc[0]
csn = most_negative.pat_enc_csn_id_coded

# the admit time that we use for the adjusted cohort
print("\nAdmit time for most extreme case: ", most_negative.admit_time_demo)

# admit time is before the first Emergency Services event - the patient might have
# had more entries under a different CSN
adjusted_cohort_adt[adjusted_cohort_adt.pat_enc_csn_id_coded == csn].drop(hidecols, axis=1)

## Check out the distributions using the adjusted admit time

Below we look at the cohort size and label distribution in the training set when we move the index time further ahead of the admit time.

In [None]:
# check on how index time affects the cohorts
# this function prints out some info for the specified hours (index time)

def check_index(hours):
    """Print cohort statistics for ER visits that lasted longer than `hours`.

    Reads the notebook-level `keep_joined` frame, keeps the rows whose
    `hours_before_admit_demo` exceeds `hours`, prints CSN counts (overall and
    for the train/test split at 2018-01-01 UTC), the training label
    distribution and the length-of-stay distribution, and draws two
    histograms (full range, and stays under 10 hours).

    Parameters:
        hours: index time, in hours before admit; only visits with an ER stay
            strictly longer than this are kept.

    Returns:
        Tuple (unique_csn_counts, train_csn_counts, test_csn_counts,
        n_label_0_train, n_label_1_train). The last two are the numbers of
        training rows with label 0 (negative) and label 1 (positive).
    """
    print("** COHORT WITH INDEX TIME {} HOURS BEFORE ADMIT".format(hours))

    # keep the visits whose ER stay was longer than the index time
    cohort = keep_joined[keep_joined.hours_before_admit_demo > hours]

    # how many unique csns is this?
    unique_csn_counts = cohort.pat_enc_csn_id_coded.nunique()
    print("\nUnique CSNs: ", unique_csn_counts)

    # split this into train and test on the admit time
    split_time = '2018-01-01 00:00:00+00:00'
    train = cohort[cohort.admit_time_demo < split_time]
    test = cohort[cohort.admit_time_demo >= split_time]

    # how many unique csns is this?
    train_csn_counts = train.pat_enc_csn_id_coded.nunique()
    test_csn_counts = test.pat_enc_csn_id_coded.nunique()
    print("\nUnique CSNs in Training Set: ", train_csn_counts)
    print("Unique CSNs in Test Set: ", test_csn_counts)

    # label distribution; counting with a comparison (instead of value_counts().loc[...])
    # yields 0 rather than a KeyError when one class is absent from the training set
    train_neg = int((train.label == 0).sum())
    train_pos = int((train.label == 1).sum())
    labelled = train_neg + train_pos
    neg_perc = round(train_neg / labelled * 100, 2) if labelled else float('nan')
    pos_perc = round(100 - neg_perc, 2)
    print("\nTraining Set:\n \
            Negative Cases: {} ({}%)\n \
            Positive Cases: {} ({}%)\n ".format(train_neg, neg_perc, train_pos, pos_perc))

    # view the distribution
    print("\nDistribution of length of ER stays")
    print("Median: ", cohort.hours_before_admit_demo.median())
    print(cohort.hours_before_admit_demo.describe())

    # VIEW HISTOGRAMS
    # full histogram first, then only the stays shorter than 10 hours
    print("\n***Plotting full histogram, then histogram with upper bound at 10 hrs for better view")
    cohort.hist('hours_before_admit_demo')
    bound_visits = cohort[cohort.hours_before_admit_demo < 10]
    bound_visits.hist('hours_before_admit_demo')

    # order kept as before: (total, train, test, label-0 count, label-1 count)
    return unique_csn_counts, train_csn_counts, test_csn_counts, train_neg, train_pos

In [None]:
_ = check_index(0)

In [None]:
_ = check_index(1)

In [None]:
_ = check_index(2)

In [None]:
_ = check_index(3)

In [None]:
_ = check_index(4)

In [None]:
# gather the check_index() results for index times 0..9 hours into one list of rows
# (the printed output is a by-product; the rows feed the dataframe built next)

data = []

for i in range(10):
    data.append((i, *check_index(i)))

In [None]:
# check out the data for different index times (hours_prior = index time is X hours before admit)
# the column order matches the tuples collected above: the loop index followed by
# check_index()'s return values (total, train, test, label-0 count, label-1 count)
data_df = pd.DataFrame(data, columns = ['hours_prior', 'total_unique_csn_counts', 'train_unique_csn_counts', 'test_unique_csn_counts', 'train_neg_csns', 'train_pos_csns'])

# share of positive labels in the training set, in percent
data_df['train_pos_percent'] = round(data_df.train_pos_csns / (data_df.train_neg_csns + data_df.train_pos_csns) * 100, 2)
data_df

In [None]:
# Plot the data from the dataframe
# plot from a copy without the percentage column (different scale than the counts);
# not dropping in place keeps this cell re-runnable
plot_df = data_df.drop(columns=['train_pos_percent'], errors='ignore')

# style: matplotlib 3.6 renamed the seaborn styles to 'seaborn-v0_8-*' and later
# removed the old names, so use whichever name this installation provides
style_name = ('seaborn-v0_8-darkgrid'
              if 'seaborn-v0_8-darkgrid' in plt.style.available
              else 'seaborn-darkgrid')
plt.style.use(style_name)

# create a color palette
palette = plt.get_cmap('Set1')

# multiple line plot: one line per count column
for num, column in enumerate(plot_df.drop('hours_prior', axis=1), start=1):
    plt.plot(plot_df['hours_prior'], plot_df[column], marker='',
             color=palette(num), linewidth=1, alpha=0.9, label=column)

# legend and titles only need to be set once, after all lines are drawn
plt.legend()
plt.title("Unique CSNs at different index times", loc='left', fontsize=12, fontweight=0, color='orange')
plt.xlabel("Index time: hours prior to admit")
plt.ylabel("Number of unique CSNs")

## Index time based on time *since* entering ER

Rather than defining the index time as a number of hours prior to the (inpatient) admit time, it might make more sense to define it based on the time since entering the ER. This makes sense in terms of deployment, because the time of admission is not yet known while the patient is still in the ER.

Let's look at cohort stats in terms of time since entering the ER.

In [None]:
# bring in labs and vitals to see how these are affected by index time
# (long-format table read from disk; displayed with the ID columns hidden)
labs_vitals = pd.read_csv("../cohort_vitals_labs_long.csv")
labs_vitals.drop(hidecols, axis=1)

In [None]:
# number of labs/vitals rows per CSN (non-null jc_uid entries), with summary and histogram
labs_vitals_count = labs_vitals.groupby('pat_enc_csn_id_coded').jc_uid.count()
print(labs_vitals_count.describe())
labs_vitals_count.hist()

In [None]:
# attach the labs/vitals rows to the kept ER visits; the inner join keeps only
# (jc_uid, CSN) pairs present in both tables
labs_vitals_adt = keep_joined.merge(labs_vitals, on=['jc_uid', 'pat_enc_csn_id_coded'], how='inner')
# drop the ADT-derived admit columns so only the demographics-based admit time remains
# NOTE(review): 'admit_time' presumably comes from the labs/vitals CSV - confirm
labs_vitals_adt.drop(['admit_time_jittered', 'time_before_admit', 'hours_before_admit', 'admit_time'], axis=1, inplace=True)
labs_vitals_adt.drop(hidecols, axis=1)

In [None]:
# check on how index time affects the cohorts
# this function prints out some info for the specified hours (index time)

def index_bounds(hours_since_ER, hours_before_admit):
    """Print cohort statistics for visits with an ER stay longer than `hours_before_admit`.

    The previous body was a line-for-line copy of `check_index`, except that its
    header printed the unrelated notebook global `hours` instead of the argument.
    It now delegates to `check_index`, so the header shows `hours_before_admit`
    and the two functions cannot drift apart.

    Parameters:
        hours_since_ER: accepted for the planned "time since entering the ER"
            bound, but not applied yet (the original body never used it either).
        hours_before_admit: index time in hours before admit; only visits with
            a strictly longer ER stay are kept.

    Returns:
        Whatever `check_index(hours_before_admit)` returns: total, train and
        test CSN counts followed by the training label-0 and label-1 counts.
    """
    # TODO: apply hours_since_ER once the lower bound on time in the ER is defined
    return check_index(hours_before_admit)

In [None]:
# after the join: number of CSNs left and labs/vitals rows per CSN
labs_vitals_adt_count = labs_vitals_adt.groupby('pat_enc_csn_id_coded').jc_uid.count()
print(labs_vitals_adt.pat_enc_csn_id_coded.nunique())
print(labs_vitals_adt_count.describe())
labs_vitals_adt_count.hist()

In [None]:
# how long after the kept ER event was each lab/vital recorded?

# the result timestamp is read from CSV as text, so parse it first
labs_vitals_adt['time'] = pd.to_datetime(labs_vitals_adt['time'])

# elapsed time as a timedelta, and the same value expressed in hours
labs_vitals_adt['results_elapsed_ER_time'] = (
    labs_vitals_adt['time'] - labs_vitals_adt['effective_time_jittered_utc'])
labs_vitals_adt['results_hours_elapsed_ER_time'] = (
    labs_vitals_adt['results_elapsed_ER_time'] / np.timedelta64(1, 'h'))
labs_vitals_adt.drop(columns=hidecols)

In [None]:
# adjust some of the data frame names
# 'label' becomes 'label_24hr' so it can sit next to the 12 hour labels added later
# NOTE: 'ER_lengh_of_stay_hours' is misspelled here; the cell that saves this table
# renames it to 'ER_length_of_stay_hours'
labs_vitals_adt.rename({'label': 'label_24hr',
                        'time_before_admit_demo': 'ER_length_of_stay',
                          'hours_before_admit_demo': 'ER_lengh_of_stay_hours'}, axis='columns', inplace=True)
labs_vitals_adt.drop(hidecols, axis=1)

## Look into label changes based on window

We want to look at different windows for the level of care change for patients. We said we'd look at 12 hours, so I'm going to grab these labels now. I'll use Conor's code that was made to create the original labels.

In [None]:
# load in a fresh copy of the adjusted cohort table here

# read in the cohort after saving the first time
adj_cohort_adt_file = "adjusted_cohort_adt.csv"
adjusted_cohort_adt = pd.read_csv(adj_cohort_adt_file)

# change the effective time to datetime since read in from csv
adjusted_cohort_adt.effective_time_jittered_utc = pd.to_datetime(adjusted_cohort_adt.effective_time_jittered_utc)

# order events within each CSN; groupby(...).first() below depends on this
adjusted_cohort_adt.sort_values(['pat_enc_csn_id_coded', 'seq_num_in_enc'], inplace=True)

# use this to hide ID columns from view
hidecols = ['jc_uid', 'pat_enc_csn_id_coded']

# setting up variables to use conor's code
df_change = adjusted_cohort_adt
df_lofc = adjusted_cohort_adt
# NOTE: this is every CSN in the cohort, so the isin() filter below keeps all rows
inpatient_ids = set(adjusted_cohort_adt.pat_enc_csn_id_coded.values)

# conor's code: admit time = timestamp of the first 'Inpatient' event of each CSN
df_admit_times = df_change[df_change['pat_enc_csn_id_coded'].isin(inpatient_ids)]
df_admit_times = (
    df_admit_times[df_admit_times['pat_class'] == 'Inpatient']
    .groupby('pat_enc_csn_id_coded')
    .first()
    .reset_index()[['pat_enc_csn_id_coded', 'effective_time_jittered_utc']]
    .rename(columns={'effective_time_jittered_utc': 'admit_time_jittered'})
)

# Merge to df_lofc and create column called time_since_admit
df_lofc = pd.merge(df_lofc, df_admit_times, how='left', on='pat_enc_csn_id_coded')
# vectorised column subtraction (same result as the row-wise apply, but much faster)
df_lofc['time_since_admit'] = df_lofc.effective_time_jittered_utc - df_lofc.admit_time_jittered
# Filter df_lofc so that we only look 12 hours into admission
# NOTE: events before the admit time (negative time_since_admit) also pass this filter
df_lofc = df_lofc[df_lofc['time_since_admit'] < timedelta(hours=12)]

def was_placed_in_critical_care(arr):
    """Return 1 if any entry of `arr` equals 'Critical Care', otherwise 0.

    Intended as a groupby aggregation over the level-of-care values of one
    encounter. The time window (e.g. 12 or 24 hours after admit) is not
    checked here; the caller is expected to have filtered the rows already.
    """
    return int(any(level == 'Critical Care' for level in arr))

# one row per CSN: first jc_uid and admit time of the group, and label = 1 when any
# remaining (already time-filtered) event has level of care 'Critical Care'
df_labels = df_lofc.groupby('pat_enc_csn_id_coded').agg({
    'jc_uid' : 'first',
    'admit_time_jittered' : 'first',
    'pat_lv_of_care' : was_placed_in_critical_care}).rename(
    columns={"pat_lv_of_care" : 'label'}).reset_index()[['jc_uid', 'pat_enc_csn_id_coded', 'admit_time_jittered', 'label']]
df_labels.head()

# save this to join later
# NOTE: this is an alias, not a copy - columns added to hr12_labels also appear on df_labels
hr12_labels = df_labels

# number of CSNs per label value
df_labels.groupby('label').count()

In [None]:
# add these 12 hour labels back on to the labs_vitals_adt table
hr12_labels['label_12hr'] = hr12_labels.label

# left join on CSN: rows whose CSN has no 12 hour label get NaN in label_12hr
# NOTE: re-running this cell merges again and produces label_12hr_x / label_12hr_y
labs_vitals_adt = labs_vitals_adt.merge(hr12_labels[['pat_enc_csn_id_coded', 'label_12hr']], 
                                        on='pat_enc_csn_id_coded', how='left')

# add columns to tell us how long before admission these results were recorded
# (positive values mean the result was recorded before the demographics admit time)
labs_vitals_adt['results_time_before_admit'] = labs_vitals_adt.admit_time_demo - labs_vitals_adt.time
labs_vitals_adt['results_hours_before_admit'] = labs_vitals_adt.results_time_before_admit / np.timedelta64(1, 'h')

labs_vitals_adt.drop(hidecols, axis=1)

In [None]:
# save the parts of this table that will be useful later
# fix the misspelled column name introduced by the earlier rename
labs_vitals_adt.rename({'ER_lengh_of_stay_hours': 'ER_length_of_stay_hours'}, axis='columns', inplace=True)
# IDs, timing columns, the long-format feature columns and both label variants
cols = ['jc_uid', 'pat_enc_csn_id_coded', 'effective_time_jittered_utc', 'admit_time_demo', 'ER_length_of_stay_hours', 'results_hours_elapsed_ER_time', 'results_hours_before_admit', 'features', 'time', 'values', 'feature_type', 'label_24hr', 'label_12hr']

labs_vitals_adt[cols].to_csv("labs_vitals_for_time_filter.csv", index=False)

### Check different index time constraints

Now we have all of the data that we need to explore what happens with different index time constraints.

We can also look at how the 12 hour window for labels affects our cohort.

In [None]:
def _count_case_labels(frame, label_col):
    """Return (positives, negatives) for `label_col`, counting each CSN once."""
    # the frame has one row per lab/vital, so collapse to one row per CSN first;
    # otherwise encounters with many results would dominate the label counts
    per_csn = frame.drop_duplicates(subset='pat_enc_csn_id_coded')[label_col]
    return int((per_csn == 1).sum()), int((per_csn == 0).sum())


def _percent_split(pos, neg):
    """Return (positive %, negative %) rounded to 2 decimals; NaN when there are no cases."""
    total = pos + neg
    if total == 0:
        return float('nan'), float('nan')
    pos_perc = round(pos / total * 100, 2)
    return pos_perc, round(100 - pos_perc, 2)


def describe_df(df, hours_prior_to_admit, hours_in_ER, KEEP_SHORT_STAYS=0):
    """Print and return cohort statistics for one labs/vitals data window.

    A row of `df` (one lab/vital result) is kept when it was recorded at most
    `hours_in_ER` hours after the ER event and at least `hours_prior_to_admit`
    hours before admit.

    Parameters:
        df: long-format labs/vitals frame. The columns read here are
            results_hours_elapsed_ER_time, results_hours_before_admit,
            ER_length_of_stay_hours, pat_enc_csn_id_coded, jc_uid,
            admit_time_demo, feature_type, label_24hr and label_12hr.
        hours_prior_to_admit: minimum number of hours between a result and admit.
        hours_in_ER: maximum number of hours between the ER event and a result.
        KEEP_SHORT_STAYS: when non-zero, results of encounters whose whole ER
            stay is no longer than `hours_prior_to_admit` are kept as well.

    Returns:
        Tuple (unique_csn_counts, train_csn_counts, test_csn_counts,
        train_labs_vitals_median, train_labs_median, train_vitals_median,
        train_pos_24hr, train_neg_24hr, train_pos_12hr, train_neg_12hr).
        The four label counts are numbers of training CSNs (each encounter is
        counted once), not numbers of lab/vital rows.
    """
    print("** DATA WINDOW: \
    {} HOURS SINCE ENTERING ER, \
    {} HOURS PRIOR TO ADMIT **\n".format(hours_in_ER, hours_prior_to_admit))

    # the KEEP_SHORT_STAYS flag allows us
    # to look at keeping labs for patients with shorter ER stays than the
    # hours_prior_to_admit time

    # filter this df down
    # keep labs that occurred during the hours_in_ER time
    in_er_window = df.results_hours_elapsed_ER_time <= hours_in_ER
    # keep labs that occur before hours_prior_to_admit
    early_enough = df.results_hours_before_admit >= hours_prior_to_admit
    if KEEP_SHORT_STAYS != 0:
        # also keep labs if the ER length of stay is less than the hours_prior_to_admit time
        early_enough = early_enough | (df.ER_length_of_stay_hours <= hours_prior_to_admit)
    df = df[in_er_window & early_enough]

    # how many unique csns do we have? These labs are already filtered to 1 hr prior to admit
    unique_csn_counts = df.pat_enc_csn_id_coded.nunique()
    print("Unique CSNs: ", unique_csn_counts)

    # how many labs/vitals per CSN do we have?
    print("\nVitals/Labs per CSN: ")
    csn_groups = df.groupby('pat_enc_csn_id_coded').jc_uid.count()
    print("median\t ", csn_groups.median())
    print(csn_groups.describe())

    # how many unique csns in train/test set?
    # split this into train and test on the admit time
    split_time = '2018-01-01 00:00:00+00:00'
    train = df[df.admit_time_demo < split_time]
    test = df[df.admit_time_demo >= split_time]

    train_csn_counts = train.pat_enc_csn_id_coded.nunique()
    test_csn_counts = test.pat_enc_csn_id_coded.nunique()
    print("\nUnique CSNs in TRAIN: ", train_csn_counts)
    print("Unique CSNs in TEST: ", test_csn_counts)

    # how many labs/vitals per CSN in TRAIN?
    print("\nVitals/Labs per CSN in TRAIN Set: ")
    csn_groups = train.groupby('pat_enc_csn_id_coded').jc_uid.count()
    train_labs_vitals_median = csn_groups.median()
    print("median\t ", train_labs_vitals_median)
    print(csn_groups.describe())

    # counts per (feature_type, CSN); select each feature type with a mask so that a
    # type missing from the window gives an empty result instead of an AttributeError
    type_counts = train.groupby(['feature_type', 'pat_enc_csn_id_coded']).jc_uid.count()
    feature_level = type_counts.index.get_level_values('feature_type')

    # how many labs per CSN?
    labs_counts = type_counts[feature_level == 'labs']
    print("\nLabs per CSN in TRAIN set: ")
    train_labs_median = labs_counts.median()
    print("median\t ", train_labs_median)
    print(labs_counts.describe())

    # how many vitals per CSN?
    vitals_counts = type_counts[feature_level == 'vitals']
    print("\nVitals per CSN in TRAIN set: ")
    train_vitals_median = vitals_counts.median()
    print("median\t ", train_vitals_median)
    print(vitals_counts.describe())

    # what's our label distribution? (counted per CSN, see _count_case_labels)
    train_pos_24hr, train_neg_24hr = _count_case_labels(train, 'label_24hr')
    pos_perc, neg_perc = _percent_split(train_pos_24hr, train_neg_24hr)
    print("\nTraining Set (24 hr labels):\n \
            Negative Cases: {} ({}%)\n \
            Positive Cases: {} ({}%)\n ".format(train_neg_24hr, neg_perc, train_pos_24hr, pos_perc))

    train_pos_12hr, train_neg_12hr = _count_case_labels(train, 'label_12hr')
    pos_perc, neg_perc = _percent_split(train_pos_12hr, train_neg_12hr)
    print("\nTraining Set (12hr labels):\n \
            Negative Cases: {} ({}%)\n \
            Positive Cases: {} ({}%)\n ".format(train_neg_12hr, neg_perc, train_pos_12hr, pos_perc))

    return (unique_csn_counts, train_csn_counts, test_csn_counts,
            train_labs_vitals_median, train_labs_median, train_vitals_median,
            train_pos_24hr, train_neg_24hr, train_pos_12hr, train_neg_12hr)
    
# try the summary once for the tightest window: results recorded within 1 hour of
# the ER event and at least 1 hour before admit
hours_prior_to_admit = 1
hours_in_ER = 1
describe_df(labs_vitals_adt, hours_prior_to_admit, hours_in_ER)

In [None]:
# run describe_df for every combination of 1..9 hours prior to admit and
# 1..9 hours in the ER, collecting one row of statistics per combination
data = []
for hours_prior_to_admit in range(1, 10):
    for hours_in_ER in range(1, 10):
        window_stats = describe_df(labs_vitals_adt, hours_prior_to_admit, hours_in_ER)
        data.append((hours_prior_to_admit, hours_in_ER, *window_stats))

In [None]:
# column names: the two window parameters followed by describe_df's return values
cols = [
    'hours_prior_to_admit', 'hours_in_ER',
    'unique_csn_counts', 'train_csn_counts', 'test_csn_counts',
    'train_labs_vitals_median', 'train_labs_median', 'train_vitals_median',
    'train_pos_24hr', 'train_neg_24hr', 'train_pos_12hr', 'train_neg_12hr',
]

# tabulate the collected rows and keep a copy on disk
data_df = pd.DataFrame(data, columns=cols)
data_df.to_csv('index_time_lab_vitals.csv', index=False)
data_df

In [None]:
# unique CSN counts per window, as a grid of hours_prior_to_admit x hours_in_ER
cols = ['hours_prior_to_admit', 'hours_in_ER', 'unique_csn_counts']
csns = data_df[cols].copy()

# express the counts in units of 10,000 CSNs
csns['unique_csn_counts'] = csns['unique_csn_counts'] / 10000

csns = csns.pivot(index='hours_prior_to_admit', columns='hours_in_ER', values='unique_csn_counts')
csns

In [None]:
# looks like hours in ER doesn't have much effect on the number of unique CSNs we include
def simple_spaghetti(df, title, xlab, ylab, legend_title='Hours in ER'):
    """Draw one line per column of `df` against its index on a new 10x8 figure.

    Parameters:
        df: frame whose index supplies the x values and whose columns each
            become one labelled line.
        title: plot title (left aligned).
        xlab: x-axis label.
        ylab: y-axis label.
        legend_title: heading shown above the legend entries.
    """
    # matplotlib 3.6 renamed the seaborn styles to 'seaborn-v0_8-*' and later removed
    # the old names, so use whichever name this installation provides
    style_name = ('seaborn-v0_8-darkgrid'
                  if 'seaborn-v0_8-darkgrid' in plt.style.available
                  else 'seaborn-darkgrid')
    # select the style before creating the figure so the figure itself picks it up
    plt.style.use(style_name)

    plt.figure(figsize=(10, 8))

    # create a color palette
    palette = plt.get_cmap('tab10')

    # multiple line plot; colours start at palette index 1, as before
    for num, column in enumerate(df, start=1):
        plt.plot(df.index, df[column], marker='', color=palette(num), linewidth=5, alpha=0.9, label=column)

    # legend, title and axis labels only need to be set once, after all lines exist
    plt.legend(title=legend_title)
    plt.title(title, loc='left', fontsize=12, fontweight=0, color='orange')
    plt.xlabel(xlab)
    plt.ylabel(ylab)

        
# csns holds the counts divided by 10,000, so the axis label has to state that unit
title = 'Unique CSN counts with index time changes'
xlab = 'Hours prior to Admit'
ylab = 'Number of Unique CSNs (in 10,000s)'
simple_spaghetti(csns, title, xlab, ylab)

In [None]:
# look at the median number of labs/vitals per CSN in the training set for each window
# NOTE: this reuses the name `labs_vitals`, replacing the raw labs/vitals table that was
# loaded from CSV earlier in the notebook
cols = ['hours_prior_to_admit', 'hours_in_ER', 'train_labs_vitals_median']
labs_vitals = pd.DataFrame(data_df[cols])

# rows: hours prior to admit, columns: hours in ER
labs_vitals = labs_vitals.pivot(index='hours_prior_to_admit', columns='hours_in_ER', values='train_labs_vitals_median')
labs_vitals

Note: I thought it was odd to see this v-shape in the plots below, but I think it does make sense.

We'd expect that the number of labs/vitals should increase as hours in the ER increases, because there is more time to collect more labs/vitals. 

We'd expect that the number of labs/vitals should decrease as the hours prior to admit increases. We're essentially shortening the available window to collect these labs/vitals.

In combination, as we increase the hours prior to admit, we essentially exclude patients who have been in the ER for less than that window time. This means our cohort is biased towards people with longer stays in the ER. As such, the median number of labs increases.

We probably want to look into keeping cases where the entire length of stay in the ER is shorter than the "time prior to admit" cutoff that we use to prevent data leakage. We may have to keep this cutoff at a minimum of 1 hour prior to admit, though: our early SQL already applied that filter, so none of the downstream data tables contain results from the last hour before admit.

In [None]:
# one line per "hours in ER" value, x axis = hours prior to admit
# not many labs are available within one hour of entering ER, 2 hours+ are all pretty similar

title = 'Labs/vitals available within window (TRAIN SET)'
xlab = 'Hours prior to admit'
ylab = 'Median number of labs/vitals available'
simple_spaghetti(labs_vitals, title, xlab, ylab)

In [None]:
# look at the training labels
# label counts per window, taken from the describe_df results stored in data_df
cols = ['hours_prior_to_admit', 'hours_in_ER', 'train_pos_24hr', 'train_neg_24hr', 'train_pos_12hr', 'train_neg_12hr']
labels = pd.DataFrame(data_df[cols])

# add some percentages that we're going to plot
# (share of positive labels among all labelled training cases, in percent)
labels['percent_pos_24hr'] = round(labels.train_pos_24hr / (labels.train_pos_24hr + labels.train_neg_24hr) * 100,2)
labels['percent_pos_12hr'] = round(labels.train_pos_12hr / (labels.train_pos_12hr + labels.train_neg_12hr) * 100,2)

labels

In [None]:
# look at the labels separately
# 24 hour labels: percent positive as a grid of hours_prior_to_admit x hours_in_ER
cols = ['hours_prior_to_admit', 'hours_in_ER', 'percent_pos_24hr']
labels_24hr = pd.DataFrame(labels[cols])

labels_24hr = labels_24hr.pivot(index='hours_prior_to_admit', columns='hours_in_ER', values='percent_pos_24hr')
labels_24hr

In [None]:
# plot the percent of positive 24 hour labels, one line per "hours in ER" value
title = 'Positive 24hr labels within window'
xlab = 'Hours prior to admit'
ylab = 'Percent positive labels in training cohort'
simple_spaghetti(labels_24hr, title, xlab, ylab)

In [None]:
# look at 12 hour labels
# percent positive as a grid of hours_prior_to_admit x hours_in_ER
cols = ['hours_prior_to_admit', 'hours_in_ER', 'percent_pos_12hr']
labels_12hr = pd.DataFrame(labels[cols])

labels_12hr = labels_12hr.pivot(index='hours_prior_to_admit', columns='hours_in_ER', values='percent_pos_12hr')
labels_12hr

In [None]:
# plot the percent of positive 12 hour labels, one line per "hours in ER" value
title = 'Positive 12hr labels within window'
xlab = 'Hours prior to admit'
ylab = 'Percent positive labels in training cohort'
simple_spaghetti(labels_12hr, title, xlab, ylab)

## Keeping patients with shorter stays than the "hours prior to admit" value

The lab counts looked a little weird with that v-shape earlier, so now we'll check what happens when we keep patients' labs if their ER stay was shorter than the hours_prior_to_admit time.

This is what we originally said we would do with the shorter stays. 

*Note* These still don't include any labs that occur within one hour of admit, because those were filtered out by the SQL that created a table used in an earlier step. We need to change the original table if we want to get those cases.

In [None]:
# same 9 x 9 grid of windows as before, but keeping the labs/vitals of encounters
# whose whole ER stay is shorter than the hours_prior_to_admit value
data_keepshort = []
for hours_prior_to_admit in range(1, 10):
    for hours_in_ER in range(1, 10):
        window_stats = describe_df(labs_vitals_adt, hours_prior_to_admit, hours_in_ER,
                                   KEEP_SHORT_STAYS=1)
        data_keepshort.append((hours_prior_to_admit, hours_in_ER, *window_stats))

In [None]:
# column names: the two window parameters followed by describe_df's return values
cols = [
    'hours_prior_to_admit', 'hours_in_ER',
    'unique_csn_counts', 'train_csn_counts', 'test_csn_counts',
    'train_labs_vitals_median', 'train_labs_median', 'train_vitals_median',
    'train_pos_24hr', 'train_neg_24hr', 'train_pos_12hr', 'train_neg_12hr',
]

# tabulate the keep-short-stays rows and keep a copy on disk
data_keepshort_df = pd.DataFrame(data_keepshort, columns=cols)
data_keepshort_df.to_csv('index_time_lab_vitals_keepshort.csv', index=False)
data_keepshort_df

In [None]:
# look at the median number of labs/vitals per CSN in the training set for each window,
# this time from the keep-short-stays results
cols = ['hours_prior_to_admit', 'hours_in_ER', 'train_labs_vitals_median']
labs_vitals = pd.DataFrame(data_keepshort_df[cols])

# rows: hours prior to admit, columns: hours in ER
labs_vitals = labs_vitals.pivot(index='hours_prior_to_admit', columns='hours_in_ER', values='train_labs_vitals_median')
labs_vitals

In [None]:
# same plot as before, now for the keep-short-stays results
# not many labs are available within one hour of entering ER, 2 hours+ are all pretty similar

title = 'Labs/vitals available within window (TRAIN SET)'
xlab = 'Hours prior to admit'
ylab = 'Median number of labs/vitals available'
simple_spaghetti(labs_vitals, title, xlab, ylab)

## Create tables that will allow for easily choosing the labs/vitals that we need

These will be joined with the triage_cohort_adjusted_vitals_labs_4binning table so that the labs/vitals can be easily filtered based on whatever time points we decide on.

Also, store the labels for the CSNs for easy use later.

In [None]:
# %load_ext google.cloud.bigquery

In [None]:
# %%bigquery triage_cohort_adjusted_vitals_labs_4binning
# select *
# from traige_TE.triage_cohort_adjusted_vitals_labs_4binning

In [None]:
# triage_cohort_adjusted_vitals_labs_4binning.to_csv("triage_cohort_adjusted_vitals_labs_4binning.csv")

In [None]:
# %%bigquery triage_cohort_adjusetd_demographics_clean
# select *
# from traige_TE.triage_cohort_adjusted_demographics_clean

In [None]:
# triage_cohort_adjusetd_demographics_clean.to_csv("triage_cohort_adjusetd_demographics_clean.csv")

In [None]:
# load the cleaned demographics table from its CSV export
# (the 'adjusetd' spelling is part of the file name on disk and of the variable name)
triage_cohort_adjusetd_demographics_clean = pd.read_csv("triage_cohort_adjusetd_demographics_clean.csv")

In [None]:
# one row per (jc_uid, CSN) with its 24 hour and 12 hour labels
labels = labs_vitals_adt[['jc_uid', 'pat_enc_csn_id_coded', 'label_24hr', 'label_12hr']]
labels = labels.drop_duplicates(keep='first')

# left join: demographics rows without a match keep NaN in label_24hr / label_12hr
joined_demo = triage_cohort_adjusetd_demographics_clean.merge(labels,
                                                             on=['jc_uid', 'pat_enc_csn_id_coded'],
                                                             how='left')

# we're missing about 3,073 CSNs here - these should be the patients who were dropped because
# their admit time was before their first ER visit
# NOTE(review): an earlier cell reports losing "about 300" CSNs for that reason - the two
# figures differ by a factor of ten, so confirm which filter accounts for the rest
# count of distinct (jc_uid, CSN) pairs in demographics that received no 24 hour label
joined_demo[(joined_demo.label != joined_demo.label_24hr) & 
           (joined_demo.label_24hr.isnull())][hidecols].drop_duplicates().shape[0]

