# Labels

This notebook grabs two labels for our adjusted cohort.

1) Outcome labels: labels that we're trying to predict

2) Switch labels: indicator of people this predictor can help

## Part 1: Outcome labels
Get the labels for different windows (12 and 24 hours) after admission. 

0 if patient doesn't go to critical care within X hours

1 if patient goes to critical care within X hours

Find the labels for the adjusted cohort. Attach to the BQ dataframe.

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
%load_ext google.cloud.bigquery

### Download adjusted_cohort table from BQ

In [None]:
%%bigquery triage_cohort_adjusted
-- Download every column of the adjusted cohort table; the result is used below as `triage_cohort_adjusted`.
select *
from traige_TE.triage_cohort_adjusted

In [None]:
# Cache the downloaded cohort as a local CSV (row index omitted) so it can be reloaded without re-querying.
triage_cohort_adjusted.to_csv("triage_cohort_adjusted.csv", index=False)

In [None]:
%%bigquery triage_adjusted_cohort_deaths
-- select the deaths for our adjusted cohorts
-- (this comment must stay below the %%bigquery line: a cell magic is only
--  recognised when it is the very first line of the cell)
select rit_uid, death_date_jittered from starr_datalake2018.demographic
where rit_uid in 
  (
  select jc_uid from traige_TE.triage_cohort_adjusted
  )

In [None]:
# Save the death dates of the adjusted cohort as a local CSV (row index omitted).
triage_adjusted_cohort_deaths.to_csv("triage_adjusted_cohort_deaths.csv", index=False)

### Get the labels for different windows

We need to use the ADT table to get the labels for the different CSNs since we need to know the trajectories.

In [None]:
# Reload the adjusted cohort from its local CSV copy (triage_cohort_adjusted.csv).
triage_cohort_adjusted = pd.read_csv("triage_cohort_adjusted.csv")

In [None]:
# Load a fresh copy of the ADT events for the adjusted cohort.
# NOTE(review): no cell in this notebook writes this CSV; it has to exist on disk already.
adj_cohort_adt_file = "adjusted_cohort_adt.csv"
adjusted_cohort_adt = pd.read_csv(adj_cohort_adt_file)

# timestamps are read from the CSV as strings, so parse them back into datetimes
adjusted_cohort_adt["effective_time_jittered_utc"] = pd.to_datetime(
    adjusted_cohort_adt["effective_time_jittered_utc"])

# keep each encounter's events together, ordered by their sequence number
adjusted_cohort_adt = adjusted_cohort_adt.sort_values(
    by=["pat_enc_csn_id_coded", "seq_num_in_enc"])

# ID columns that are dropped before a frame is displayed
hidecols = ["jc_uid", "pat_enc_csn_id_coded"]


def get_labels(window_hours):
    """Build the critical-care outcome label for every encounter in the ADT table.

    Reads the module-level ``adjusted_cohort_adt`` frame (one row per ADT event;
    it is expected to be sorted so that each encounter's events are in order).

    The admit time of an encounter is the first non-null timestamp among its
    rows with ``pat_class == 'Inpatient'``. Every event whose time since admit
    is below ``window_hours`` is considered; there is no lower bound, so events
    recorded before the admit time are included as well. Encounters without an
    'Inpatient' row, or without any event inside the window, do not appear in
    the result.

    Parameters
    ----------
    window_hours : int or float
        Length of the window after admission, in hours.

    Returns
    -------
    pandas.DataFrame
        One row per encounter with the columns ``jc_uid``,
        ``pat_enc_csn_id_coded``, ``admit_time_jittered`` and
        ``label_<window_hours>hr`` (1 if any considered event has
        ``pat_lv_of_care == 'Critical Care'``, else 0).

    Side effects
    ------------
    Prints the per-label row counts.
    """
    adt = adjusted_cohort_adt

    # admit time per encounter: first 'Inpatient' event
    admit_times = (
        adt[adt['pat_class'] == 'Inpatient']
        .groupby('pat_enc_csn_id_coded')
        .first()
        .reset_index()[['pat_enc_csn_id_coded', 'effective_time_jittered_utc']]
        .rename(columns={'effective_time_jittered_utc': 'admit_time_jittered'})
    )

    # attach the admit time to every event and keep only events inside the window;
    # the subtraction is vectorised (a row-wise apply gives the same values, far slower)
    events = pd.merge(adt, admit_times, how='left', on='pat_enc_csn_id_coded')
    events['time_since_admit'] = (
        events['effective_time_jittered_utc'] - events['admit_time_jittered'])
    events = events[events['time_since_admit'] < timedelta(hours=window_hours)]

    label_name = "label_{}hr".format(window_hours)

    # per-event indicator; the per-encounter max is 1 as soon as one event is critical care
    is_critical = (events['pat_lv_of_care'] == 'Critical Care').astype(int)
    events = events.assign(**{label_name: is_critical})

    df_labels = (
        events.groupby('pat_enc_csn_id_coded')
        .agg({'jc_uid': 'first',
              'admit_time_jittered': 'first',
              label_name: 'max'})
        .reset_index()[['jc_uid', 'pat_enc_csn_id_coded', 'admit_time_jittered', label_name]]
    )

    print(df_labels.groupby(label_name).count())

    return df_labels

In [None]:
# Outcome labels for the two windows of interest (each call also prints its label counts).
labels_24hr = get_labels(24)
labels_12hr = get_labels(12)

### Connect 24 hour labels to the adjusted_cohort table

These labels should be the same as those already in the table. Use this as a check to make sure we're doing things properly.

Everything matches so we're good!

In [None]:
# connect the new labels to the adjusted cohort
triage_cohort_adjusted_label24 = triage_cohort_adjusted.merge(labels_24hr, 
                                                             on=['jc_uid', 'pat_enc_csn_id_coded']
                                                             )

# check that the labels match the previous ones
triage_cohort_adjusted_label24[triage_cohort_adjusted_label24.label != 
                              triage_cohort_adjusted_label24.label_24hr]

### Connect the 12 hour labels 

In [None]:
# connect the new labels to the adjusted cohort
triage_cohort_adjusted_label24_label12 = triage_cohort_adjusted_label24.merge(labels_12hr, 
                                                             on=['jc_uid', 'pat_enc_csn_id_coded'])

# remove the admit time columns that we added from the ADT tables
triage_cohort_adjusted_label24_label12_clean = triage_cohort_adjusted_label24_label12.drop(
    ['admit_time_jittered_x', 'admit_time_jittered_y', 'label'], axis=1)

hidecols=['jc_uid', 'pat_enc_csn_id_coded'] # use this to hide output
triage_cohort_adjusted_label24_label12_clean.drop(hidecols, axis=1)

## Part 2: Switch labels

Now we add the switch labels. These labels tell us whether a patient was placed into acute care prior to critical care and then was switched to critical care within our window time.

1 for patients who were in acute care and then moved to critical care within X hours

0 otherwise

In [None]:
# Build the module-level `df_lofc` frame: every ADT event with its encounter's
# admit time and time since admit, restricted to events less than `window_hours`
# after admission. The switch-label function defined further down reads this frame.
window_hours = 24

# names kept from the original helper code
df_change = adjusted_cohort_adt
inpatient_ids = set(adjusted_cohort_adt.pat_enc_csn_id_coded.values)

# admit time per encounter = first 'Inpatient' event
# (inpatient_ids holds every encounter id of the same table, so the isin filter keeps all rows)
df_admit_times = df_change[df_change['pat_enc_csn_id_coded'].isin(inpatient_ids)]
df_admit_times = (
    df_admit_times[df_admit_times['pat_class'] == 'Inpatient']
    .groupby('pat_enc_csn_id_coded')
    .first()
    .reset_index()[['pat_enc_csn_id_coded', 'effective_time_jittered_utc']]
    .rename(columns={'effective_time_jittered_utc': 'admit_time_jittered'})
)

# Merge the admit time onto every event and compute time_since_admit with a
# vectorised subtraction (same values as a row-wise apply, much faster)
df_lofc = pd.merge(adjusted_cohort_adt, df_admit_times, how='left', on='pat_enc_csn_id_coded')
df_lofc['time_since_admit'] = (
    df_lofc['effective_time_jittered_utc'] - df_lofc['admit_time_jittered'])

# Keep only events less than window_hours after admission
df_lofc = df_lofc[df_lofc['time_since_admit'] < timedelta(hours=window_hours)]

In [None]:
# get the labels for switching from acute to critical care within the window time

# assumes that outcome labels for this window_hours has already been created, 
# will not work unless this is true!

def get_switch_labels(window_hours, cohort_df):
    """Add an ``acute_to_critical_<window_hours>hr`` column to ``cohort_df``.

    The label is 1 for encounters whose level of care, within ``window_hours``
    of admission, is first something other than 'Critical Care' and later
    becomes 'Critical Care'. It is 0 otherwise, including encounters whose
    first recorded level of care is already 'Critical Care' and encounters
    whose outcome label for this window is not 1.

    Reads the module-level ``df_lofc`` frame (ADT events with a
    ``time_since_admit`` column).
    NOTE(review): events that were already filtered out of ``df_lofc`` cannot
    be recovered here, so ``window_hours`` should not exceed the window
    ``df_lofc`` was built with - confirm against the cell that builds it.

    Parameters
    ----------
    window_hours : int or float
        Window length in hours; the outcome column ``label_<window_hours>hr``
        must already exist.
    cohort_df : pandas.DataFrame
        Cohort table keyed by ``pat_enc_csn_id_coded``. When it carries the
        outcome column it is used to pick the positive encounters; otherwise
        the module-level ``triage_cohort_adjusted_label24_label12`` is used.

    Returns
    -------
    pandas.DataFrame
        ``cohort_df`` left-joined with the new integer (0/1) label column.

    Side effects
    ------------
    Prints a progress line and the value counts of the new label.
    """
    print("Creating switch labels for {} hours".format(window_hours))

    label_col = "label_{}hr".format(window_hours)
    newlab = "acute_to_critical_{}hr".format(window_hours)

    # positive cases = encounters sent to critical care within the window;
    # prefer the frame we were given over the hidden global
    if label_col in cohort_df.columns:
        label_source = cohort_df
    else:
        label_source = triage_cohort_adjusted_label24_label12
    pos_csn = label_source.loc[label_source[label_col] == 1, 'pat_enc_csn_id_coded'].values

    # their events inside the window, in time order per encounter
    pos_adt = df_lofc[df_lofc['pat_enc_csn_id_coded'].isin(pos_csn)]
    pos_adt = pos_adt[pos_adt['time_since_admit'] < timedelta(hours=window_hours)]
    pos_adt = pos_adt.sort_values(['pat_enc_csn_id_coded', 'effective_time_jittered_utc'])

    # keep a row when the encounter changed or the level of care changed
    # relative to the row above (missing values always count as a change)
    csn = pos_adt['pat_enc_csn_id_coded']
    care = pos_adt['pat_lv_of_care']
    pos_adt_changed = pos_adt[(csn.shift() != csn) | (care.shift() != care)]

    def has_critical_after_acute(arr):
        """True when the first 'Critical Care' entry is preceded by another level.

        Assumes ``arr`` is ordered by time. Returns False when the sequence
        starts in 'Critical Care' or never reaches it.
        """
        seen_other_level = False
        for level in arr:
            if level != 'Critical Care':
                seen_other_level = True
            else:
                # the first critical-care entry decides the outcome
                return seen_other_level
        return False

    # one boolean per positive encounter (rows without a level of care are ignored)
    df_temp = (
        pos_adt_changed[['pat_enc_csn_id_coded', 'pat_lv_of_care']]
        .dropna()
        .groupby('pat_enc_csn_id_coded')
        .agg({'pat_lv_of_care': has_critical_after_acute})
        .reset_index()
        .rename(columns={'pat_lv_of_care': newlab})
    )

    # join onto the cohort; encounters without a switch flag come back as NaN.
    # Assign the converted column back instead of a chained in-place fillna,
    # which does not update the frame under pandas copy-on-write.
    cohort_label24_label12_atc = cohort_df.merge(
        df_temp, on=['pat_enc_csn_id_coded'], how='left')
    cohort_label24_label12_atc[newlab] = (
        cohort_label24_label12_atc[newlab].eq(True).astype(int))

    print(cohort_label24_label12_atc[newlab].value_counts())

    return cohort_label24_label12_atc

In [None]:
# create a new cohort table for the labels
cohort_label24_label12_atc = triage_cohort_adjusted_label24_label12_clean

# get the labels for 24 hours
cohort_label24_label12_atc = get_switch_labels(24, cohort_label24_label12_atc)

print()
# get the labels for 12 hours
cohort_label24_label12_atc = get_switch_labels(12, cohort_label24_label12_atc)

In [None]:
# Display the labelled cohort without the ID columns (the stored frame itself is not modified).
cohort_label24_label12_atc.drop(hidecols, axis=1)

In [None]:
# convert the inpatient IDs to strings to get rid of decimal:
# cast to str, then keep only the text before the first '.'
# NOTE(review): a missing ID would become the literal string 'nan' - confirm there are none
cohort_label24_label12_atc.inpatient_data_id_coded = cohort_label24_label12_atc.inpatient_data_id_coded.astype(str).str.split('.').str[0]

In [None]:
# Save the cohort with outcome and switch labels as a CSV (row index omitted).
cohort_label24_label12_atc.to_csv("triage_cohort_adjusted_multilabel.csv", index=False)

In [None]:
# Attach one outcome label column per window length (0..47 hours) to the cohort.
triage_cohort_labels = triage_cohort_adjusted
label_keys = ['jc_uid', 'pat_enc_csn_id_coded']

# get the labels for a range of hours
hours = 48
for i in range(hours):
    print("Getting labels for hour ", i)
    labels = get_labels(i)
    # Merge only the keys and the label column. get_labels also returns
    # admit_time_jittered; merging it on every iteration makes pandas add
    # _x/_y suffixes that collide by the fourth merge, which raises a
    # MergeError on pandas >= 2.0.
    label_col = "label_{}hr".format(i)
    # NOTE(review): this is an inner join, so an encounter missing from any one
    # hour's labels is dropped from the table for all hours - confirm intended.
    triage_cohort_labels = triage_cohort_labels.merge(labels[label_keys + [label_col]],
                                                      on=label_keys)

triage_cohort_labels

In [None]:
## get labels for many different hour marks
# drop any admit-time helper columns that came along with the labels
dropcols = [x for x in triage_cohort_labels.columns if 'admit_time_jittered' in x]
triage_cohort_labels_clean = triage_cohort_labels.drop(dropcols, axis=1)

# index=False keeps the row index out of the file; otherwise it comes back as an
# extra "Unnamed: 0" column when the CSV is read again
triage_cohort_labels_clean.to_csv("traige_cohort_all_labels_clean.csv", index=False)


In [None]:
# Reload the per-hour label table from its CSV copy.
triage_cohort_labels_clean = pd.read_csv("traige_cohort_all_labels_clean.csv")

Find out how the percentage of positive labels changes as we move the window used to create our labels.

In [None]:

# Positive / negative label counts and percentages for every window length.
hours = 48
cols = ["label_{}hr".format(i) for i in range(hours)]

# positives = sum of the 0/1 label column, negatives = the remaining rows
pos_count = [sum(triage_cohort_labels_clean[col]) for col in cols]
neg_count = [len(triage_cohort_labels_clean[col]) - pos
             for col, pos in zip(cols, pos_count)]

print(pos_count)
print(neg_count)

# share of each class among all labelled rows, in percent
total_count = np.array(pos_count) + np.array(neg_count)
pos_perc = np.array(pos_count) / total_count * 100
neg_perc = np.array(neg_count) / total_count * 100

print(pos_perc)
print(neg_perc)

In [None]:
# Plot 1: bar chart of the percentage of positive labels for every window (0-47 hours)
names = [x for x in range(48)]
plt.figure(figsize=(15,8))
# one bar per hour; the 24 hour bar is recoloured so it stands out
barlist = plt.bar(names, pos_perc, color='cornflowerblue', edgecolor='whitesmoke')
barlist[24].set_color('palevioletred')

# one x tick per hour

plt.xticks(names)
plt.xlabel("Hours since admit")
plt.ylabel("Percent positive cases")
plt.title("Percent positive labels X hours after admission")
 
# Show graphic
plt.show()


# Plot 2: the same percentages as a line plot
names = [x for x in range(48)]
plt.figure(figsize=(15,8))
# line plot (the variable is still called barlist, but it holds the plotted line)
barlist = plt.plot(names, pos_perc, color='cornflowerblue')

# one x tick per hour

plt.xticks(names)
plt.xlabel("Hours since admit")
plt.ylabel("Percent positive cases")
plt.title("Percent positive labels X hours after admission")
 
# Show graphic
plt.show()





# Plot 3: line plot again, this time leaving out the 0 hour window
names = [x for x in range(1,48)]
plt.figure(figsize=(15,8))
# pos_perc[1:] skips the first entry so the values line up with hours 1-47
barlist = plt.plot(names, pos_perc[1:], color='cornflowerblue')

# one x tick per hour

plt.xticks(names)
plt.xlabel("Hours since admit")
plt.ylabel("Percent positive cases")
plt.title("Percent positive labels X hours after admission")
 
# Show graphic
plt.show()

In [None]:
# Plot 1: bar chart of the number of positive labels for every window (0-47 hours)
names = [x for x in range(48)]
plt.figure(figsize=(15,8))
# one bar per hour; the 24 hour bar is recoloured red so it stands out
barlist = plt.bar(names, pos_count, color='slateblue', edgecolor='white')
barlist[24].set_color('r')

# one x tick per hour

plt.xticks(names)
plt.xlabel("Hours since admit")
plt.ylabel("Positive cases")
plt.title("Positive labels X hours after admission")
 
# Show graphic
plt.show()


# Plot 2: the same counts as a line plot
names = [x for x in range(48)]
plt.figure(figsize=(15,8))
# line plot (the variable is still called barlist, but it holds the plotted line)
barlist = plt.plot(names, pos_count, color='cornflowerblue')

# one x tick per hour

plt.xticks(names)
plt.xlabel("Hours since admit")
plt.ylabel("Positive cases")
plt.title("Positive labels X hours after admission")
 
# Show graphic
plt.show()



# Plot 3: line plot again, this time leaving out the 0 hour window
names = [x for x in range(1,48)]
plt.figure(figsize=(15,8))
# pos_count[1:] skips the first entry so the values line up with hours 1-47
barlist = plt.plot(names, pos_count[1:], color='cornflowerblue')

# one x tick per hour

plt.xticks(names)
plt.xlabel("Hours since admit")
plt.ylabel("Positive cases")
plt.title("Positive labels X hours after admission")
 
# Show graphic
plt.show()

In [None]:
# Difference between the positive rate of each window and the 24 hour window.
# diffs[i] = (percent positive at i hours) - (percent positive at 24 hours),
# so windows shorter than 24 hours come out negative or zero.
perc24 = pos_perc[24]
diffs = pos_perc - perc24

# Plot 1: all windows (0-47 hours)
names = [x for x in range(48)]
plt.figure(figsize=(15,8))
barlist = plt.bar(names, diffs, color='dodgerblue', edgecolor='white')

# one x tick per hour
plt.xticks(names)
plt.xlabel("Hours since admit")
# axis label matches the plotted quantity (X hours minus 24 hours)
plt.ylabel("Percent positive cases at X hours - Percent positive cases at 24hrs")
plt.title("Difference in percentage of positive labels at X hours after admission vs 24 hours after admission")
 
# Show graphic
plt.show()



# Plot 2: the same differences without the 0 hour window
names = [x for x in range(1,48)]
plt.figure(figsize=(15,8))
# diffs[1:] skips the first entry so the values line up with hours 1-47
barlist = plt.bar(names, diffs[1:], color='dodgerblue', edgecolor='white')

# one x tick per hour
plt.xticks(names)
plt.xlabel("Hours since admit")
plt.ylabel("Percent positive cases at X hours - Percent positive cases at 24hrs")
plt.title("Difference in percentage of positive labels at X hours after admission vs 24 hours after admission")
 
# Show graphic
plt.show()

In [None]:
# get the labels for switching
cohort_label24_label12_atc = pd.read_csv("triage_cohort_adjusted_multilabel.csv")

# get those who died
died = triage_adjusted_cohort_deaths[~triage_adjusted_cohort_deaths.death_date_jittered.isnull()]

# join death dates to the atc
died['jc_uid'] = died.rit_uid
joined = cohort_label24_label12_atc.merge(died, on='jc_uid', how='inner')

# find out how much time has passed since admit time and death
joined['admit_time'] = pd.to_datetime(joined.admit_time) 
joined['death_date_jittered'] = pd.to_datetime(joined.death_date_jittered)
# joined['death_date_jittered'] = pd.Timestamp.to_pydatetime(joined.death_date_jittered)
# get the date from admit time
joined['admit_date'] = joined.admit_time.dt.date

# joined.dtypes
joined['admit_date'] = pd.to_datetime(joined.admit_date)
# calculate time between death and admit
joined['admit_death_delta'] = joined.death_date_jittered - joined.admit_date

joined

In [None]:
# Patients whose death date is at most 7 days after their admit date (boundary included).
died_7days = joined[joined.admit_death_delta <= timedelta(days=7)]

died_7days

In [None]:
# Among patients who died within 7 days: how the 24 hour switch label is distributed ...
print(died_7days.acute_to_critical_24hr.value_counts())

# ... and how the 24 hour outcome label is distributed.
print(died_7days.label_24hr.value_counts())

## Choose days until death here

In [None]:
# join the death deltas to the label table (left join: encounters without a death record are kept)
joined_clean = joined[['jc_uid', 'pat_enc_csn_id_coded', 'death_date_jittered', 'admit_death_delta']]
all_labels = triage_cohort_labels_clean.merge(joined_clean, on=['jc_uid', 'pat_enc_csn_id_coded'], how='left')

# number of days after admission within which a death is counted
days=14
# create label for whether patient died within `days` days
# (strictly fewer than `days`; a missing delta compares as False)
# NOTE(review): the 7 day cell uses <= while this uses < - confirm which boundary is intended
all_labels['death_{}days'.format(days)] = all_labels.admit_death_delta < timedelta(days=days)

# interaction between death within `days` days and the outcome label at each hour:
# non-zero only when the label is 1 and the patient died within `days` days
for i in range(48):
    key = "death_{}days_label{}".format(days, i)
    all_labels[key] = all_labels["label_{}hr".format(i)] * all_labels['death_{}days'.format(days)]


all_labels

In [None]:
# For every window length: percentage of positively labelled patients who died within `days` days.
props = [
    sum(all_labels["death_{}days_label{}".format(days, i)])
    / sum(all_labels["label_{}hr".format(i)]) * 100
    for i in range(48)
]

props

In [None]:
# Line plot of the death percentages for windows of 1-47 hours
names = [x for x in range(1,48)]
plt.figure(figsize=(15,8))
# props[1:] skips the 0 hour window so the values line up with hours 1-47
barlist = plt.plot(names, props[1:], color='darkslategray')
# barlist[24].set_color('gold')

# one x tick per hour

plt.xticks(names)
plt.title("Percentage of deaths within {} days for patients transfered to ICU within X hours".format(days))
plt.ylabel("Percentage of deaths within {} days".format(days))
plt.xlabel("Hours since admit")
 
# Show graphic
plt.show()

In [None]:
# For every window length: number of positively labelled patients who died within `days` days.
counts = [sum(all_labels["death_{}days_label{}".format(days, i)]) for i in range(48)]

counts

In [None]:
# Line plot of the death counts for windows of 1-47 hours
names = [x for x in range(1,48)]
plt.figure(figsize=(15,8))
# counts[1:] skips the 0 hour window so the values line up with hours 1-47
barlist = plt.plot(names, counts[1:], color='darkslategray')

# one x tick per hour

plt.xticks(names)
plt.title("Number of deaths within {} days for patients transfered to ICU within X hours".format(days))
plt.ylabel("Number of deaths within {} days".format(days))
plt.xlabel("Hours since admit")
 
# Show graphic
plt.show()

In [None]:
# how many people move to the ICU within each window:
# the sum of a 0/1 label column is the number of positively labelled encounters
moved = [sum(all_labels["label_{}hr".format(i)]) for i in range(48)]

# line plot of those counts for windows of 0-47 hours
names = [x for x in range(48)]
plt.figure(figsize=(15,8))
barlist = plt.plot(names, moved, color='darkslategray')

# one x tick per hour
plt.xticks(names)
# title and y label describe what is plotted: transfer counts, not death percentages
plt.title("Number of patients transfered to ICU within X hours")
plt.ylabel("Number of patients transfered to ICU")
plt.xlabel("Hours since admit")
 
# Show graphic
plt.show()

# Admits only during that hour

Look at admits only at that hour to separate the cumulative effect that's going on with the earlier graphs.

In [None]:
# Per-hour (non-cumulative) indicator: positive at hour i but not yet at hour i-1.
# Hour 0 has no earlier window, so it is just the 0 hour label.
all_labels["admit_at_0hr"] = all_labels["label_0hr"]
for hour in range(1, 48):
    current_label = all_labels["label_{}hr".format(hour)]
    previous_label = all_labels["label_{}hr".format(hour - 1)]
    all_labels["admit_at_{}hr".format(hour)] = current_label & ~previous_label

cols = [name for name in all_labels.columns if "admit_at" in name]
all_labels[cols]

In [None]:
# interaction between death within `days` days and the per-hour indicator
death_col = 'death_{}days'.format(days)
for hour in range(48):
    interaction_col = "death_{}days_admit_at_{}hr".format(days, hour)
    all_labels[interaction_col] = all_labels["admit_at_{}hr".format(hour)] * all_labels[death_col]

# percentage of patients flagged at each hour who died within `days` days
at_props = [
    sum(all_labels["death_{}days_admit_at_{}hr".format(days, hour)])
    / sum(all_labels["admit_at_{}hr".format(hour)]) * 100
    for hour in range(48)
]

print(at_props)

In [None]:
# Scatter plot with a fitted regression line of the per-hour death percentages
names = [x for x in range(48)]
plt.figure(figsize=(15,8))
# x and y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments
sns.regplot(x=names, y=at_props, fit_reg=True)

# one x tick per hour
plt.xticks(names)
plt.title("Percentage of patient deaths within {} days for patients transfered to ICU at X hours".format(days))
plt.ylabel("Percentage of patient deaths within {} days".format(days))
plt.xlabel("Hours since admit")
 
# Show graphic
plt.show()

# degree-1 polynomial fit (its result is not displayed because it is not the last expression)
np.polyfit(x=names, y=at_props, deg=1)

# linear regression summary; this is the value the cell displays
stats.linregress(x=names, y=at_props)

In [None]:
# try binning rows: pool every n consecutive hours into one bin
n = 3
n_bins = int(48 / n)

# x position of a bin = its first hour; y value = sum of the hourly percentages in the bin
# NOTE(review): a sum of percentages is plotted on an axis labelled as a percentage -
# confirm whether a mean or a pooled rate was intended
binned_x = [i * n for i in range(n_bins)]
binned = [sum(at_props[i * n:(i + 1) * n]) for i in range(n_bins)]

model = stats.linregress(x=binned_x, y=binned)
print(model)

# scatter plot with a fitted regression line of the binned values
plt.figure(figsize=(15,8))
# x and y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments
sns.regplot(x=binned_x, y=binned, fit_reg=True)

# tick labels show each bin's hour range; they follow n instead of a hard-coded 3
labels = ["{}-{}".format(x * n, (x + 1) * n) for x in range(n_bins)]
plt.xticks(binned_x, labels=labels)
plt.title("Percentage of patient deaths within {} days for patients transfered to ICU at X hours".format(days))
plt.ylabel("Percentage of deaths within {} days".format(days))
plt.xlabel("Hours since admit")
 
# Show graphic
plt.show()

In [None]:
# looking at normality of data and residuals
slope, intercept = model.slope, model.intercept

# residual = observed binned value minus the value predicted by the fitted line
fitted = np.array(intercept) + np.array(slope) * np.array(binned_x)
residuals = binned - fitted

# distribution of the binned values
plt.hist(binned)
plt.show()

# residuals against the bin position
plt.scatter(binned_x, residuals)
plt.show()