# Deaths Analysis 2019

Checking on the patients that died in our cohort. Specifically looking for any that died within 24 hours of being admitted.

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Format the data

Reading in files that were pulled and saved from BQ.

`cohort_2019_deaths.csv` and `cohort_2018_prior_deaths.csv` each contain the demographics table for our cohort with the death date included. If the death date is NaN, the individual is considered alive.

The 2018 data was pulled from the initial database, and then 2019 data was pulled from a later release.

In [None]:
# read in the saved extracts that the rest of the notebook works from
datadir = "../../../../2019_data/"  # point to directory with saved data files
# demographics + death dates from the later (2019) pull; later cells read its ANON_ID / DEATH_DATE_JITTERED columns
deaths = pd.read_csv("{}cohort_2019_deaths.csv".format(datadir))
# demographics + death dates from the earlier (2018 and prior) pull; later cells read its rit_uid / death_date_jittered columns
deaths18 = pd.read_csv("{}cohort_2018_prior_deaths.csv".format(datadir))
# the cohort table; later cells use it both as the cohort and as the labels table
cohort = pd.read_csv("{}triage_to_keep_cohort_with_labels_updated.csv".format(datadir))

# NOTE(review): cells further down reference a `labels` dataframe, but the read that
# would create it is commented out here - confirm which table those cells should use.
# labels = pd.read_csv("{}triage_cohort_2019_all_labels.csv".format(datadir))
# cohort = pd.read_csv("{}features_demos_vitals_labs.csv".format(datadir))

# use this to identify test/train set - pulled from Box
# (an encounter is later flagged as in the test set when its csn appears in this file)
results = pd.read_csv("{}lightgbm_test_results_24hr_max.csv".format(datadir))

In [None]:
deaths18.head()
deaths.ANON_ID.nunique()

In [None]:
# Switch controlling whether identifier columns are hidden when dataframes
# are displayed in this notebook.
forrepo = 1  # change this to 0 if you want to see hidden columns displayed

# Identifier columns that get dropped before display; empty when nothing is hidden.
hidecols = ['anon_id', 'pat_enc_csn_id_coded', 'inpatient_data_id_coded'] if forrepo else []

In [None]:
# number of unique anon_id and csns in our cohort to begin with
# 29,891 anon_id
# 43,207 csns
print(cohort.anon_id.nunique())
print(cohort.pat_enc_csn_id_coded.nunique())
# cohort.drop(hidecols, axis=1, errors='ignore').head()

In [None]:
# how many anon_ids do we have in the deaths table? 28,985
cohort_deaths = deaths[deaths.ANON_ID.isin(cohort.anon_id)]
print(cohort_deaths.ANON_ID.nunique())
# cohort_deaths.head()

In [None]:
# how many anon_id (aka rit_uid) are in the deaths18 table? 26,670
cohort_deaths = deaths18[deaths18.rit_uid.isin(cohort.anon_id)]
print(cohort_deaths.rit_uid.nunique())
# deaths18.head()

In [None]:
# get the csns that we care about
# NOTE(review): `csns` is not referenced again in the visible cells
csns = cohort.pat_enc_csn_id_coded.unique()

# the cohort table is reused as the labels table from here on
# (this is an alias, not a copy: in-place changes to `cohort` also show up in `labels_2019`)
labels_2019 = cohort

In [None]:
# Death dates from the 2018-and-prior pull, restricted to cohort patients.
# rename(columns=...) returns a new frame, so the boolean-filtered slice of
# `deaths18` is never modified in place (no SettingWithCopyWarning).
cohort_deaths18 = deaths18[deaths18.rit_uid.isin(labels_2019.anon_id)].rename(
    columns={'rit_uid': 'anon_id',
             'death_date_jittered': 'death_date_2018'})

# Death dates from the 2019 pull, restricted to the same patients and
# renamed so both frames share the `anon_id` key.
cohort_deaths19 = deaths[deaths.ANON_ID.isin(labels_2019.anon_id)].rename(
    columns={'ANON_ID': 'anon_id',
             'DEATH_DATE_JITTERED': 'death_date_2019'})

In [None]:
# cohort_deaths18.head()

In [None]:
# join 2018prior and 2019 death dates
# how='outer' keeps patients that appear in only one of the two pulls.
# NOTE(review): no `on=` is given, so pandas joins on every column name the two
# frames share; presumably that is only `anon_id` after the renames - confirm.
all_deaths = cohort_deaths18.merge(cohort_deaths19, how='outer')
# all_deaths.head()

In [None]:
# keep the rows that have a death date in at least one of the two pulls,
# i.e. drop only the rows where both death dates are missing
died = all_deaths[~(all_deaths.death_date_2018.isnull() & all_deaths.death_date_2019.isnull())]
# died

In [None]:
# attach the death dates from both pulls to every row of the labels table
labels_deaths = labels_2019.merge(all_deaths, how='left')

# parse both death-date columns into datetimes
for date_col in ('death_date_2018', 'death_date_2019'):
    labels_deaths[date_col] = pd.to_datetime(labels_deaths[date_col])

# True when either pull has a death date for the patient
labels_deaths['died'] = (labels_deaths['death_date_2018'].notnull()
                         | labels_deaths['death_date_2019'].notnull())


In [None]:
# take the 2019 date only if 2018prior is NaN
# ie they died during or after 2019

# everyone who died in 2020 or 2019 has their death dates in the 2019 deaths data
labels_deaths[(labels_deaths.died) & 
              (labels_deaths.death_date_2018.isnull())].drop(hidecols, axis=1, errors='ignore').head()

#              (~labels_deaths.death_year_2019.isin([2019,2020]))]

In [None]:
# keep the earliest death date - will be from 2018prior if it exists 
# DON'T DO THIS! -- all dates are rejittered for the 2019 data so it's okay to just use the 2019 death dates
# labels_deaths['death_date'] = labels_deaths[['death_date_2019', 'death_date_2018']].min(axis=1)

In [None]:
labels_deaths.drop(hidecols, axis=1, errors='ignore').head()

In [None]:
# save a copy of the table with death dates from each database pull.
# `columns=` is required: a bare mapping passed to DataFrame.rename is applied
# to the index labels, which would leave both column names unchanged.
to_save = labels_deaths.rename(columns={'death_date_2018': 'death_date_starrdatalake2018',
                                        'death_date_2019': 'death_date_shccore'})
to_save.to_csv("{}labels_deaths_2019_2018_dates.csv".format(datadir))

In [None]:
to_save.drop(hidecols, axis=1, errors='ignore').head()

In [None]:
# missing_death2019 = labels_deaths[(labels_deaths.death_date_2019.isnull()) &
#              (~labels_deaths.death_date_2018.isnull())]

# missing_death2019[['anon_id', 'admit_date', 'death_date_2018', 'death_date_2019', 'admit_death_delta']]

In [None]:
labels_deaths['death_date'] = labels_deaths.death_date_2019

In [None]:
# labels_deaths.drop(['death_date_2018', 'death_date_2019', 
#                     'match_2018_2019', 'death_year_2019', 'death_year_2018'],
#                   axis=1, inplace=True)

In [None]:
# make sure both timestamps are datetimes before doing date arithmetic
labels_deaths['admit_time'] = pd.to_datetime(labels_deaths['admit_time'])
labels_deaths['death_date'] = pd.to_datetime(labels_deaths['death_date'])

# admit date = admit time with the time of day dropped, stored as a datetime
labels_deaths['admit_date'] = pd.to_datetime(labels_deaths['admit_time'].dt.date)

# time between the admit date and the death date
labels_deaths['admit_death_delta'] = labels_deaths['death_date'] - labels_deaths['admit_date']

labels_deaths.drop(hidecols, axis=1, errors='ignore').head()

# Check test cohort

Filter down to the test set and check on number of deaths.

In [None]:
# flag (1/0) whether each encounter's csn appears in the saved test-results file
in_results = labels_deaths['pat_enc_csn_id_coded'].isin(results['pat_enc_csn_id_coded'])
labels_deaths['in_test_set'] = in_results.astype('int')

# counts recorded when this was last run:
# csns = 43,207 total
# test = 33,111
# train = 10,096
print(labels_deaths['pat_enc_csn_id_coded'].nunique())
print(labels_deaths['in_test_set'].value_counts())

# write the full table, including the new flag, to file
labels_deaths.to_csv("{}full_cohort_2019_deaths_with_labels.csv".format(datadir))

In [None]:
labels_deaths.drop(hidecols, axis=1, errors='ignore').head()

In [None]:
# filter down to test set
# test_deaths = labels_deaths[labels_deaths.in_test_set]

# print(labels_deaths.pat_enc_csn_id_coded.nunique())
# print(results.pat_enc_csn_id_coded.nunique())
# print(test_deaths.pat_enc_csn_id_coded.nunique())
# test_deaths.head()

# Died within 1 day

Find out which individuals died within 24 hours of being admitted.

If an individual was not in the ICU but died within 24 hours of admission, their label should be changed to 1 to indicate that they were in critical condition.

I will add a column called `died_within_24hrs` which can be used to change the existing label columns downstream.

In [None]:
days = 1

# True where the admit-to-death delta is at most `days` day(s).
# NOTE(review): only an upper bound is applied, so a negative delta
# (death date before admit date) is also flagged - confirm that is intended.
within_window = labels_deaths['admit_death_delta'] <= timedelta(days=days)

# subset of encounters inside the window (taken before the flag column is added)
died_1day = labels_deaths[within_window]

# same mask stored on the full table as a 0/1 column
labels_deaths['died_within_24hrs'] = within_window.astype(int)

labels_deaths.sort_values('admit_death_delta').drop(hidecols, axis=1, errors='ignore').head()

In [None]:
labels_deaths.died.value_counts()

In [None]:
## one individual with negative delta -- may be an entry error? **
# maybe remove this individual later??
# everyone else seems fine
died_1day.admit_death_delta.value_counts()

In [None]:
# how are these individuals distributed as far as admit label goes?
died_1day.admit_label.value_counts()

In [None]:
# how do the recent 24hr labels look?
died_1day.label_24hr_recent.value_counts()

In [None]:
# build the table to save: drop the per-pull death-date columns and store `died` as 0/1
cleaned_labels = labels_deaths.drop(columns=['death_date_2018', 'death_date_2019'], errors='ignore')
cleaned_labels['died'] = cleaned_labels['died'].astype(int)
cleaned_labels.drop(hidecols, axis=1, errors='ignore').head()

# written without the index column
cleaned_labels.to_csv("{}labels_with_death_delta.csv".format(datadir), index=False)

In [None]:
cleaned_labels.pat_enc_csn_id_coded.nunique()

# --- Below this point is for plotting only ---

Will remove this and move to another file later!

# Remake plot for admissions to ICU at each hour

In [None]:
# use this to describe the tables
def describe_df(df):
    """Print the shape of ``df`` and its unique encounter and patient counts.

    ``df`` must have the columns ``pat_enc_csn_id_coded`` and ``anon_id``.
    Nothing is returned.
    """
    print("df shape", df.shape)
    summary_columns = (
        ("unique CSNs: ", "pat_enc_csn_id_coded"),
        ("unique patients: ", "anon_id"),
    )
    for caption, column in summary_columns:
        print(caption, df[column].nunique())

In [None]:
# read in the adt file
adt_file = "{}triage_cohort_adt_2019.csv".format(datadir)
adt = pd.read_csv(adt_file)
describe_df(adt)
adt.head().drop(hidecols, axis=1, errors='ignore')

In [None]:
cohort_adt = adt[adt.pat_enc_csn_id_coded.isin(cohort.pat_enc_csn_id_coded)]
describe_df(cohort_adt)

In [None]:
# change time columns to datetime
# `cohort_adt` is a filtered slice of `adt`; assign() builds a new frame instead of
# setting a column on that slice, which avoids pandas' SettingWithCopyWarning.
# The event times are made timezone-naive with tz_localize(None).
cohort_adt = cohort_adt.assign(
    effective_time_jittered_utc=pd.to_datetime(
        cohort_adt['effective_time_jittered_utc']).dt.tz_localize(None))
cohort['admit_time'] = pd.to_datetime(cohort['admit_time'])

In [None]:
# attach the ADT rows to every cohort encounter; how='left' keeps cohort rows with no ADT match
# NOTE(review): no `on=` is given, so pandas joins on every column name shared by
# `cohort` and `cohort_adt` - confirm that set is the intended key.
joined_cohort_adt = cohort.merge(cohort_adt, how='left')

In [None]:
# cut down on space: keep only the columns needed for the time-since-admit labels
cols = ['anon_id', 'pat_enc_csn_id_coded', 'label_24hr_recent',
        'admit_time', 'pat_class', 'pat_lv_of_care', 'pat_service',
        'effective_time_jittered_utc', 'seq_num_in_enc']
# .copy() makes this an independent frame, so the later in-place sort and the
# new column are not applied to a slice of `joined_cohort_adt`.
joined_cohort_adt_less = joined_cohort_adt[cols].copy()

In [None]:
print(joined_cohort_adt_less.shape)
print(cohort_adt.shape)

In [None]:
# save the adt table in case we crash again
joined_cohort_adt_less.to_csv("{}joined_cohort_adt_2019.csv".format(datadir))

In [None]:
joined_cohort_adt_less.head().drop(hidecols, axis=1, errors='ignore')

In [None]:
# sort the adt table by encounter, then by sequence number within the encounter.
# Rebinding the sorted result (instead of inplace=True) avoids modifying a
# column-subset slice in place.
joined_cohort_adt_less = joined_cohort_adt_less.sort_values(
    ['pat_enc_csn_id_coded', 'seq_num_in_enc'])
joined_cohort_adt_less.head().drop(hidecols, axis=1, errors='ignore')

In [None]:
# compute the time between admission and each ADT event.
# The subtraction is done on whole columns rather than row by row with
# apply(axis=1): same values, far less work on a large table, and it also
# behaves on an empty frame. assign() returns a new frame, so nothing is
# written onto a slice; the result is then ordered by encounter and
# sequence number.
joined_cohort_adt_less = joined_cohort_adt_less.assign(
    time_since_admit=(joined_cohort_adt_less['effective_time_jittered_utc']
                      - joined_cohort_adt_less['admit_time'])
).sort_values(['pat_enc_csn_id_coded', 'seq_num_in_enc'])

In [None]:
# function to get labels
def get_labels(window_hours, adt_events=None):
    """Label each encounter by whether it reached critical care within a window.

    Parameters
    ----------
    window_hours : int or float
        Length of the window in hours. Only events with
        ``0 <= time_since_admit < window_hours`` are considered.
    adt_events : pandas.DataFrame, optional
        Event table with the columns ``anon_id``, ``pat_enc_csn_id_coded``,
        ``admit_time``, ``pat_lv_of_care`` and ``time_since_admit``.
        When omitted, the notebook-level ``joined_cohort_adt_less`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per encounter that has at least one event inside the window,
        with columns ``anon_id``, ``pat_enc_csn_id_coded``, ``admit_time`` and
        ``label_<window_hours>hr``. The label is 1 if any in-window event has
        ``pat_lv_of_care == 'Critical Care'`` and 0 otherwise.

    Also prints the row counts per label value.
    """
    if adt_events is None:
        adt_events = joined_cohort_adt_less

    # Keep only events that fall inside [0, window_hours) after admission.
    elapsed = adt_events['time_since_admit']
    in_window = (elapsed >= timedelta(hours=0)) & (elapsed < timedelta(hours=window_hours))
    df_lofc = adt_events[in_window]

    def was_placed_in_critical_care(levels):
        """Return 1 if any level of care in the group is 'Critical Care', else 0.

        Assumes the time-window filtering has already been applied.
        """
        return int(any(level == 'Critical Care' for level in levels))

    label_name = "label_{}hr".format(window_hours)
    output_columns = ['anon_id', 'pat_enc_csn_id_coded', 'admit_time', label_name]

    per_encounter = df_lofc.groupby('pat_enc_csn_id_coded').agg({
        'anon_id': 'first',
        'admit_time': 'first',
        'pat_lv_of_care': was_placed_in_critical_care})
    df_labels = per_encounter.rename(
        columns={"pat_lv_of_care": label_name}).reset_index()[output_columns]

    print(df_labels.groupby(label_name).count())

    return df_labels

In [None]:
labels_24hr = get_labels(24)

In [None]:
labels_24hr.head()

In [None]:
labels.head()

In [None]:
labels_24hr.rename({'label_24hr':'label_24hr00'}, axis=1, inplace=True)

In [None]:
describe_df(labels)

In [None]:
# restrict the labels table to encounters that are in the cohort
# NOTE(review): `labels` is not defined in the visible notebook - the only read that
# would create it (triage_cohort_2019_all_labels.csv) is commented out in the loading
# cell. Confirm the intended source before running this cell.
cohort_labels = labels[labels.pat_enc_csn_id_coded.isin(cohort.pat_enc_csn_id_coded)]

In [None]:
newlabs = cohort_labels.merge(labels_24hr[['anon_id', 'pat_enc_csn_id_coded', 'label_24hr00']], how='left')
newlabs.head()

In [None]:
newlabs[newlabs.label_max24 != newlabs.label_24hr00]

# Old code

In [None]:
# Plot 1: bar chart of positive cases for each of the 48 hours since admit.
# NOTE(review): `pos_count` is not defined in the visible cells - it must come from
# code that is no longer in this notebook.
names = [x for x in range(48)]
plt.figure(figsize=(15,8))
# Draw the bars in slate blue with white edges
barlist = plt.bar(names, pos_count, color='slateblue', edgecolor='white')
# highlight the bar at hour 24 in red
barlist[24].set_color('r')

# Custom x axis: one tick per hour

plt.xticks(names)
plt.xlabel("Hours since admit")
plt.ylabel("Positive cases")
plt.title("Positive labels X hours after admission")
 
# Show graphic
plt.show()


# Plot 2: the same counts drawn as a line instead of bars
names = [x for x in range(48)]
plt.figure(figsize=(15,8))
# Draw the line (plt.plot returns line objects, despite the `barlist` name)
barlist = plt.plot(names, pos_count, color='cornflowerblue')

# Custom x axis: one tick per hour

plt.xticks(names)
plt.xlabel("Hours since admit")
plt.ylabel("Positive cases")
plt.title("Positive labels X hours after admission")
 
# Show graphic
plt.show()



# Plot 3: line plot again, this time leaving out hour 0
names = [x for x in range(1,48)]
plt.figure(figsize=(15,8))
# Draw the line for hours 1-47 only
barlist = plt.plot(names, pos_count[1:], color='cornflowerblue')

# Custom x axis: one tick per hour

plt.xticks(names)
plt.xlabel("Hours since admit")
plt.ylabel("Positive cases")
plt.title("Positive labels X hours after admission")
 
# Show graphic
plt.show()

In [None]:
# value of `pos_perc` at hour 24, used as the reference point
# NOTE(review): `pos_perc` is not defined in the visible cells.
perc24 = pos_perc[24]
perc24

# difference of every hour's value from the hour-24 value (hour X minus hour 24)
diffs = pos_perc - perc24
diffs

# Plot 1: bar chart of the differences for all 48 hours
names = [x for x in range(48)]
plt.figure(figsize=(15,8))
# Draw the bars in dodger blue with white edges
barlist = plt.bar(names, diffs, color='dodgerblue', edgecolor='white')
# barlist[24].set_color('crimson')

# Custom x axis: one tick per hour

plt.xticks(names)
plt.xlabel("Hours since admit")
# NOTE(review): the label reads "24hrs - X hours" but `diffs` is computed as
# X hours - 24hrs; confirm which sign is intended.
plt.ylabel("Percent positive cases at 24hrs - Percent positive cases at X hours")
plt.title("Difference in percentage of positive labels at X hours after admission vs 24 hours after admission")
 
# Show graphic
plt.show()



# Plot 2: the same bar chart, leaving out hour 0
names = [x for x in range(1,48)]
plt.figure(figsize=(15,8))
# Draw the bars for hours 1-47 only
barlist = plt.bar(names, diffs[1:], color='dodgerblue', edgecolor='white')
# barlist[24].set_color('crimson')

# Custom x axis: one tick per hour

plt.xticks(names)
plt.xlabel("Hours since admit")
# NOTE(review): same sign mismatch between this label and `diffs` as in the plot above.
plt.ylabel("Percent positive cases at 24hrs - Percent positive cases at X hours")
plt.title("Difference in percentage of positive labels at X hours after admission vs 24 hours after admission")
 
# Show graphic
plt.show()

# Admits only during that hour

Look at admits only at that hour to separate the cumulative effect that's going on with the earlier graphs.

In [None]:
# Hour 0 has no earlier hour to compare against, so its column is a straight copy.
all_labels["admit_at_{}hr".format(0)] = all_labels["label_0hr"]

# For every later hour, keep only the encounters that are positive at hour i
# but were not already positive at hour i-1.
for i in range(1, 48):
    key = "admit_at_{}hr".format(i)
    positive_now = all_labels["label_{}hr".format(i)]
    positive_before = all_labels["label_{}hr".format(i-1)]
    all_labels[key] = positive_now & ~positive_before

# show just the newly built per-hour columns
cols = [name for name in all_labels.columns if "admit_at" in name]
all_labels[cols]