## Looking into the CSNs for encounters

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime, timedelta
from dateutil.parser import parse
import os 

## Query the ADT table

I just grab the 2017 data to look at here.

In [None]:
### THIS IS MEANT TO RUN ON NERO - NEEDS TO BE CHANGED IF YOU RUN LOCALLY
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/eulalio/.config/gcloud/application_default_credentials.json' 
# os.environ['GCLOUD_PROJECT'] = 'mining-clinical-decisions' 

from google.cloud import bigquery
from google.cloud.bigquery import dbapi

# Create the BigQuery client, wrap it in a DB-API connection (used by
# pd.read_sql_query below), and open a cursor on that connection.
client = bigquery.Client()
conn = dbapi.connect(client)
cursor = conn.cursor()

In [None]:
# function to query the database
def query_data(query, savefile):
    """Run a SQL query over the notebook's connection and save the result.

    Parameters
    ----------
    query : str
        SQL text handed to ``pd.read_sql_query``.
    savefile : str
        Path of the tab-separated file the result is written to. The
        DataFrame index is written as the first (unnamed) column.

    Returns
    -------
    pandas.DataFrame
        The query result.
    """
    # `conn` is the module-level DB-API connection created in the setup cell
    results_df = pd.read_sql_query(query, conn)

    # save the results to a file so the query does not have to be re-run
    results_df.to_csv(savefile, sep='\t')

    # return the results as data frame
    return results_df

In [None]:
# query the adt (admission, discharge, transfer) table for 2017 data

# columns to extract from the ADT table
adt_fields = [
    "jc_uid", "pat_enc_csn_id_coded", "effective_time_jittered_utc", "event_id_coded", "event_type", "event_subtype",
    "department_id", "pat_class", "base_pat_class", "pat_service", "status_of_bed", "pat_lv_of_care",
    "xfer_event_id_coded", "swap_event_id_coded", "reason_for_admit", "accomodation", "accom_reason_c",
    "adm_event_id_coded", "dis_event_id_coded", "xfer_in_event_id_coded", "next_out_event_id_coded",
    "last_in_event_id_coded", "prev_event_id_coded", "in_event_type", "out_event_type", "from_base_class",
    "to_base_class", "seq_num_in_enc", "seq_num_in_bed_min", "labor_status_c", "first_ip_in_ip_yn",
    "loa_reason_c",
]
fieldstr = ", ".join(adt_fields)

# build query
query = "select {} from `starr_datalake2018.adt` WHERE EXTRACT(YEAR from effective_time_jittered_utc) = 2017".format(fieldstr)

outfile = "adt_selectcolumns_2017.txt"

# Only hit BigQuery when the saved extract is missing, so the notebook still
# runs top-to-bottom on a fresh kernel without re-running the query every time.
if os.path.exists(outfile):
    # first column of the saved file is the index that to_csv wrote out
    adt_df = pd.read_csv(outfile, sep='\t', index_col=0)
else:
    adt_df = query_data(query, outfile)
adt_df.head()

In [None]:
# load the saved ADT extract (tab-separated, written by query_data);
# its first column is the DataFrame index that to_csv wrote, so read it back as the index
adt_df = pd.read_csv("adt_selectcolumns_2017.txt", sep='\t', index_col=0)
adt_df.head()

In [None]:
# add datetime and date columns to the dataframe

# effective_time_jittered_utc is read back from the file as a string, so parse it.
# No explicit format is given: a date-only "%Y-%m-%d" does not match values that
# also carry a time of day (strict parsing in pandas 2.x raises on the remainder).
# NOTE(review): assumes the column holds full timestamps in one consistent layout - confirm.
adt_df['effective_datetime'] = pd.to_datetime(adt_df.effective_time_jittered_utc)

# calendar day of each event (datetime.date objects), via the vectorized accessor
adt_df['effective_date'] = adt_df['effective_datetime'].dt.date

In [None]:
# list the column labels currently on adt_df (including the two date columns added above)
adt_df.columns

## Filter to Inpatient encounters

Checking to see if there are any patients with multiple CSNs within an inpatient encounter.

In [None]:
# filter the table down to only Inpatient pat class since this is how we build the cohort
# (.copy() gives an independent frame rather than re-wrapping a slice of adt_df)
inpatient_adt = adt_df[adt_df.pat_class == 'Inpatient'].copy()
inpatient_adt.head()

### Group by the effective date

We'll group down to the granularity of a single day to see whether we find multiple CSNs in a single day for a patient.

In [None]:
# count the number of unique CSNs that occur on one day
# sort so we can see instances where multiple CSNs occur on the same day

# group by jc_uid and effective_date
dup_counts = (
    inpatient_adt
    .groupby(['jc_uid', 'effective_date'])
    .pat_enc_csn_id_coded
    .nunique()
    .sort_values(ascending=False)
)

# len(dup_counts) is every patient-date; only those with more than one CSN are duplicates
n_multi_csn = int((dup_counts > 1).sum())
print("There are", n_multi_csn, "patient-dates with multiple CSNs out of", len(dup_counts), "patient-dates")
dup_counts[:20]

## Look for instances that affect us

I looked into several of these manually and they were kind of weird, so I wanted to see whether any of them would even fall into our cohort. I checked entries from the list above against the original `adt_df` dataframe that was pulled in the beginning to see if any of them had pat_class "Emergency Services" to "Inpatient" events.

In [None]:
# check to see if any of the multi-CSN patient-dates might fall into our cohort queries

# stop once we have this many examples to inspect by hand
max_examples = 10

# (jc_uid, effective_date) pairs with at least one "Emergency Services" row in the
# full ADT table. Built once so each candidate below is a set lookup instead of a
# fresh scan of adt_df.
er_rows = adt_df[adt_df.pat_class == "Emergency Services"]
er_keys = set(zip(er_rows.jc_uid, er_rows.effective_date))

output = []

# dup_counts is indexed by (jc_uid, effective_date); only consider the
# patient-dates that have multiple CSNs, in the same sorted order
for jid, date in dup_counts[dup_counts > 1].index:
    if (jid, date) in er_keys:
        print(jid, date)
        output.append((jid, date))
        if len(output) == max_examples:
            break


## Manually check the jc_uid and dates 

Check to see what's going on with the entries found above

In [None]:
# this function grabs the rows for the input jc_uid and date + some range of following dates
# sorts them by time and returns the dataframe
# it's used to query those dup dates above
def view_adt(jid, date, n_days=7):
    """Return one patient's ADT rows from ``date`` through ``date + n_days``.

    Parameters
    ----------
    jid : str
        Value matched against ``adt_df.jc_uid``.
    date : datetime.date
        First day of the window, compared against ``adt_df.effective_date``.
    n_days : int, optional
        Length of the window in days; both ends are inclusive. Defaults to 7.

    Returns
    -------
    pandas.DataFrame
        Matching rows of the module-level ``adt_df``, sorted by
        ``effective_time_jittered_utc``, with an added ``time_order`` column
        numbering the rows 0..n-1 in that sorted order.
    """
    # narrow to the patient first so the date comparisons run on a small frame
    dups = adt_df[adt_df.jc_uid == jid]

    # keep entries that occur within n_days of the start date (inclusive)
    window_end = date + timedelta(days=n_days)
    dups = dups[(dups.effective_date >= date) & (dups.effective_date <= window_end)]

    sorted_dups = dups.sort_values('effective_time_jittered_utc')

    # position of each event in time order, independent of the original index
    sorted_dups['time_order'] = range(len(sorted_dups))

    return sorted_dups
    

### After looking through these examples, it seems like

1) `pat_lv_of_care` labels shouldn't be affected by the change in CSN because overlapping CSNs don't seem to change that without this being reflected in the original CSN.

2) They can reflect cases where the patient is already in the hospital before the "Emergency Services --> Inpatient" CSN occurs. This is a little weird and we'd miss any labs/vitals/orders associated with these earlier CSNs. 

3) Overlapping CSNs after the Inpatient status aren't really that important because we don't take any labs/vitals/orders after admission anyways.

## Case 1

Check out the first case. We look at the full dataframe, followed by select columns afterwards.

In [None]:
# Case 1: take the first (jc_uid, date) pair collected in `output` and show its ADT rows
j = 0
jid, date = output[j]
view_adt(jid, date)

### Select columns are viewed here

The CSN that would be pulled through Conor's query would be `131217107843` since it's the one that goes from "Emergency Services" followed by "Inpatient". 

In [None]:
# same patient/date as the cell above, narrowed to the columns of interest
test = view_adt(jid, date)
test.loc[:, ['jc_uid', 'pat_enc_csn_id_coded', 'effective_time_jittered_utc', 'pat_class', 'pat_lv_of_care', 'time_order']]

## Cases 2-10 follow

Check out the next case. We look at the full dataframe, followed by select columns afterwards.

In [None]:
# Case 2: second (jc_uid, date) pair collected in `output`
j = 1
jid, date = output[j]
view_adt(jid, date)

In [None]:
# same patient/date as the cell above, narrowed to the columns of interest
test = view_adt(jid, date)
test.loc[:, ['jc_uid', 'pat_enc_csn_id_coded', 'effective_time_jittered_utc', 'pat_class', 'pat_lv_of_care']]

In [None]:
# Case 3: third (jc_uid, date) pair collected in `output`
j = 2
jid, date = output[j]
view_adt(jid, date)

In [None]:
# same patient/date as the cell above, narrowed to the columns of interest
test = view_adt(jid, date)
test.loc[:, ['jc_uid', 'pat_enc_csn_id_coded', 'effective_time_jittered_utc', 'pat_class', 'pat_lv_of_care']]

In [None]:
# Case 4: fourth (jc_uid, date) pair collected in `output`
j = 3
jid, date = output[j]
view_adt(jid, date)

In [None]:
# same patient/date as the cell above, narrowed to the columns of interest
test = view_adt(jid, date)
test.loc[:, ['jc_uid', 'pat_enc_csn_id_coded', 'effective_time_jittered_utc', 'pat_class', 'pat_lv_of_care']]

In [None]:
# Case 5: fifth (jc_uid, date) pair collected in `output`
j = 4
jid, date = output[j]
view_adt(jid, date)

In [None]:
# same patient/date as the cell above, narrowed to the columns of interest
test = view_adt(jid, date)
test.loc[:, ['jc_uid', 'pat_enc_csn_id_coded', 'effective_time_jittered_utc', 'pat_class', 'pat_lv_of_care']]

In [None]:
# Case 6: sixth (jc_uid, date) pair collected in `output`
j = 5
jid, date = output[j]
view_adt(jid, date)

In [None]:
# same patient/date as the cell above, narrowed to the columns of interest
test = view_adt(jid, date)
test.loc[:, ['jc_uid', 'pat_enc_csn_id_coded', 'effective_time_jittered_utc', 'pat_class', 'pat_lv_of_care']]

In [None]:
# Case 7: seventh (jc_uid, date) pair collected in `output`
j = 6
jid, date = output[j]
view_adt(jid, date)

In [None]:
# same patient/date as the cell above, narrowed to the columns of interest
test = view_adt(jid, date)
test.loc[:, ['jc_uid', 'pat_enc_csn_id_coded', 'effective_time_jittered_utc', 'pat_class', 'pat_lv_of_care']]

In [None]:
# Case 8: eighth (jc_uid, date) pair collected in `output`
j = 7
jid, date = output[j]
view_adt(jid, date)

In [None]:
# Case 9: ninth (jc_uid, date) pair collected in `output`
j = 8
jid, date = output[j]
view_adt(jid, date)

In [None]:
# Case 10: tenth (jc_uid, date) pair collected in `output`
j = 9
jid, date = output[j]
view_adt(jid, date)

## Check the admit time for weird patients

### Order admission time < ADT admission time

This looks like the patient goes into surgery and it's already known that the patient will need to be admitted afterwards. 

**Decision: Use the Order admission time because it is the earliest time when the doctors already knew the patient would be admitted.**

In [None]:
# columns shown when inspecting a single patient's ADT timeline in the cells below
fields = ['jc_uid', 'pat_enc_csn_id_coded', 'effective_time_jittered_utc', 'event_type', 'pat_class', 'pat_lv_of_care']

# patient whose order admission time precedes the ADT admission time
jid = 'JCcb68fc'
adt_df[adt_df.jc_uid == jid].sort_values('effective_time_jittered_utc')[fields]

### More extreme case

The order was placed two days before the patient reached the Inpatient status.

**Decision: ALWAYS take the earliest time**

In [None]:
# pull this patient's rows (selected columns only) and order them by event time
jid = 'JCd282bf'
(
    adt_df
    .loc[adt_df.jc_uid == jid, fields]
    .sort_values('effective_time_jittered_utc')
)

## Check the admit time for weird patients

### Order admission time > ADT admission time

Looks like the order just went in late. The patient was clearly an inpatient.

**Decision: Always take the earliest time, i.e. take the ADT admission time in this case**

In [None]:
# pull this patient's rows (selected columns only) and order them by event time
jid = 'JCcbc391'
(
    adt_df
    .loc[adt_df.jc_uid == jid, fields]
    .sort_values('effective_time_jittered_utc')
)