In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os 
from google.cloud import bigquery

# Point the Google client libraries at the application-default credentials file.
# The path is resolved from the current user's home directory instead of being
# hardcoded to one machine, and a GOOGLE_APPLICATION_CREDENTIALS value that is
# already exported in the environment is respected rather than overwritten.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    os.path.expanduser('~/.config/gcloud/application_default_credentials.json'),
)
# The client below is built without an explicit project and the SQL further down
# references `shc_core.adt` without a project prefix, so the default project matters.
# NOTE(review): presumably the client picks this variable up as its default project — confirm.
os.environ['GCLOUD_PROJECT'] = 'som-nero-phi-jonc101'
client = bigquery.Client()



In [8]:
# Attach a department name to every row of the `cohort_bugs` CTE.
# `adt_dep` keeps one department per encounter: the FIRST_VALUE of department_name
# when the encounter's ADT rows are ordered by effective_time_jittered_utc.
# The final SELECT inner-joins that onto the cohort by pat_enc_csn_id_coded.
# NOTE(review): the body of the `cohort_bugs` CTE looks truncated in this copy of the
# notebook (only "ed)" remains) — restore it before re-running this cell.
query="""
WITH cohort_bugs AS (
ed), 

adt_dep as (
    SELECT DISTINCT
      adt.pat_enc_csn_id_coded, 
      FIRST_VALUE(dm.department_name) OVER 
      (PARTITION BY adt.pat_enc_csn_id_coded ORDER BY adt.effective_time_jittered_utc) department_name,
    FROM 
      `shc_core.adt` adt
    INNER JOIN
      `som-nero-phi-jonc101.shc_core.dep_map` dm
    USING
      (department_id)
)

SELECT 
    c.*, a.department_name
FROM
    cohort_bugs c
INNER JOIN
    adt_dep a
USING
    (pat_enc_csn_id_coded)

"""
# Submit the query, wait for it to finish, and load the result into a DataFrame.
query_job = client.query(query)
df = query_job.result().to_dataframe()
# Preview the first rows only.
df.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,index_time,organism,description,department_name
0,JCe118ef,131264295601,2019-03-29 16:58:00+00:00,SERRATIA MARCESCENS,BLOOD CULTURE (2 AEROBIC BOTTLES),PEDIATRIC EMERGENCY DEPARTMENT
1,JCd8098d,131264669449,2019-02-28 09:52:00+00:00,ESCHERICHIA COLI,BLOOD CULTURE (AEROBIC & ANAEROBIC BOTTLE),PEDIATRIC EMERGENCY DEPARTMENT
2,JCd8098d,131264669449,2019-02-28 09:52:00+00:00,ESCHERICHIA COLI,URINE CULTURE,PEDIATRIC EMERGENCY DEPARTMENT
3,JCd3a17f,131272919007,2019-08-12 19:08:00+00:00,STREPTOCOCCUS SANGUINIS,BLOOD CULTURE (AEROBIC & ANAEROBIC BOTTLE),PEDIATRIC EMERGENCY DEPARTMENT
4,JCd5d810,131273005232,2019-08-05 01:55:00+00:00,KLEBSIELLA OXYTOCA,URINE CULTURE,PEDIATRIC EMERGENCY DEPARTMENT


In [9]:
# Number of distinct encounters (CSNs) per coarse culture group.
# Anything that is neither a blood nor a urine order falls into "Csf or Fluid Culture".
# The match is case-insensitive so this cell gives the same answer whether
# `df.description` still holds the raw order names or has already been collapsed to
# the three title-case group labels (a case-sensitive "BLOOD" test would then send
# every row to the fallback group).
(df
    .assign(description=lambda x: ["Blood Culture" if "BLOOD" in a.upper() else
                                   "Urine Culture" if "URINE" in a.upper() else
                                   "Csf or Fluid Culture" for a in x.description])
    .groupby('description')
    .agg(num_csns=('pat_enc_csn_id_coded', 'nunique'))
)

Unnamed: 0_level_0,num_csns
description,Unnamed: 1_level_1
Blood Culture,3420
Csf or Fluid Culture,486
Urine Culture,5746


In [10]:
### Collapse culture descriptions, filter out coag neg staph, and normalise organism names
# `df` is overwritten here, so the cell must be safe to run more than once:
#   * the culture-group match is case-insensitive, so labels produced by a previous
#     run ("Blood Culture", ...) map back to themselves instead of all falling into
#     the "Csf or Fluid Culture" fallback;
#   * the organism filter/renames only match the raw upper-case names and are no-ops
#     once the names have been lower-cased.
df = (df
    .assign(description=lambda x: ["Blood Culture" if "BLOOD" in a.upper() else
                                   "Urine Culture" if "URINE" in a.upper() else
                                   "Csf or Fluid Culture" for a in x.description])
    .query("organism != 'COAG NEGATIVE STAPHYLOCOCCUS'")
    # Exact-value renames: fold MRSA into Staph aureus and swap the curly braces
    # in the Group B strep label for parentheses, then lower-case every name.
    .assign(organism=lambda x: x.organism
            .replace({
                'STAPH AUREUS {MRSA}': 'STAPHYLOCOCCUS AUREUS',
                'STREPTOCOCCUS AGALACTIAE {GROUP B}': 'STREPTOCOCCUS AGALACTIAE (GROUP B)',
            })
            .str.lower())
)

In [43]:
### Get the top 5 bugs for each culture type by department, counting the number of
### distinct encounters in which each bug grew.
def _top_organisms(cultures, culture_type, valley_care, top_n=5):
    """Return the `top_n` organisms for one culture type at one site.

    Parameters
    ----------
    cultures : DataFrame with `department_name`, `description`, `organism`
        and `pat_enc_csn_id_coded` columns.
    culture_type : value of `description` to keep.
    valley_care : if True keep rows whose department name contains "VCP",
        otherwise keep the rows whose department name does not.
    top_n : number of organisms to return.

    Returns
    -------
    DataFrame with columns `Organism`, `Count` (distinct encounters) and
    `Culture Type`, sorted by `Count` descending. The index is whatever
    position each organism had in the grouped table before sorting.
    """
    site_filter = 'department_name.str.contains("VCP")'
    if not valley_care:
        site_filter = 'not ' + site_filter
    return (cultures
        # `@culture_type` is resolved from this function's local scope.
        .query(site_filter + ' and description==@culture_type', engine='python')
        .groupby('organism')
        .agg({'pat_enc_csn_id_coded' : 'nunique'})
        .reset_index()
        .rename(columns={'organism' : 'Organism', 'pat_enc_csn_id_coded' : "Count"})
        .sort_values('Count', ascending=False)
        .head(top_n)
        .assign(**{'Culture Type': culture_type})
    )


# Build each site's table with a single concat over the culture types instead of
# growing a frame inside a loop starting from an empty DataFrame.
culture_types = ['Blood Culture', 'Urine Culture', 'Csf or Fluid Culture']
df_stanford = pd.concat(
    [_top_organisms(df, ct, valley_care=False) for ct in culture_types])
df_valley_care = pd.concat(
    [_top_organisms(df, ct, valley_care=True) for ct in culture_types])

In [44]:
# Display the top-organism table for the non-"VCP" (Stanford) departments.
df_stanford

Unnamed: 0,Organism,Count,Culture Type
72,escherichia coli,1031,Blood Culture
116,staphylococcus aureus,585,Blood Culture
81,klebsiella pneumoniae,318,Blood Culture
66,enterococcus faecalis,159,Blood Culture
119,streptococcus agalactiae (group b),131,Blood Culture
23,escherichia coli,2927,Urine Culture
22,enterococcus species,877,Urine Culture
27,klebsiella pneumoniae,653,Urine Culture
34,proteus mirabilis,299,Urine Culture
39,pseudomonas aeruginosa,268,Urine Culture


In [45]:
# Display the top-organism table for the "VCP" (Valley Care) departments.
df_valley_care

Unnamed: 0,Organism,Count,Culture Type
10,escherichia coli,98,Blood Culture
22,staphylococcus aureus,49,Blood Culture
13,klebsiella pneumoniae,29,Blood Culture
15,proteus mirabilis,15,Blood Culture
18,pseudomonas aeruginosa,9,Blood Culture
10,escherichia coli,361,Urine Culture
19,proteus mirabilis,90,Urine Culture
13,klebsiella pneumoniae,84,Urine Culture
8,enterococcus faecalis,59,Urine Culture
24,pseudomonas aeruginosa,43,Urine Culture


In [46]:
# Stack the two per-site tables into one long table tagged with the institution.
# `.assign` returns new frames, so df_stanford / df_valley_care are not modified:
# other cells that read them do not inherit an extra `Institution` column, and
# re-running this cell has no cumulative effect.
df_final = pd.concat([
    df_stanford.assign(Institution="Stanford ED"),
    df_valley_care.assign(Institution="Valley Care ED"),
])
df_final.head()

Unnamed: 0,Organism,Count,Culture Type,Institution
72,escherichia coli,1031,Blood Culture,Stanford ED
116,staphylococcus aureus,585,Blood Culture,Stanford ED
81,klebsiella pneumoniae,318,Blood Culture,Stanford ED
66,enterococcus faecalis,159,Blood Culture,Stanford ED
119,streptococcus agalactiae (group b),131,Blood Culture,Stanford ED


In [47]:
# Put the columns in presentation order and capitalise the organism names.
# Done as a single chain so the new `Organism` values are assigned on a fresh frame
# rather than on a column-subset of the previous one (the SettingWithCopyWarning pattern).
df_final = (
    df_final[['Institution', 'Culture Type', 'Organism', 'Count']]
    .assign(Organism=lambda x: x['Organism'].str.capitalize())
)
# NOTE(review): the index is written as the first CSV column here; it is only the
# leftover row position from the per-site tables — confirm whether it is wanted.
df_final.to_csv('Table 2: Organism by culture type long.csv')

In [30]:
# Build the wide, side-by-side table: one row per rank within a culture type,
# with an (Organism, Count) column pair per institution.
# The per-site tables are read but never overwritten, so this cell can be re-run
# and does not depend on which other columns have been added to them.
wide_columns = ['Organism', 'Count']
culture_type = df_stanford['Culture Type'].values

# Rows are paired purely by position, so both tables must list the same culture
# types in the same order (e.g. a site with fewer than 5 organisms for one culture
# type would shift every following row).
assert list(culture_type) == list(df_valley_care['Culture Type']), (
    "Stanford and Valley Care tables are not row-aligned by culture type")

# `keys` adds the institution as the top column level.
df_bugs = pd.concat(
    [df_valley_care[wide_columns].reset_index(drop=True),
     df_stanford[wide_columns].reset_index(drop=True)],
    axis=1,
    keys=['Valley Care ED', 'Stanford ED'],
)
df_bugs['Culture Type'] = culture_type

In [32]:
# Put the culture type first, then the Stanford columns, then the Valley Care columns.
top_level_order = ['Culture Type', 'Stanford ED', 'Valley Care ED']
df_bugs = df_bugs[top_level_order]
df_bugs

Unnamed: 0_level_0,Culture Type,Stanford ED,Stanford ED,Valley Care ED,Valley Care ED
Unnamed: 0_level_1,Unnamed: 1_level_1,Organism,Count,Organism,Count
0,Blood Culture,escherichia coli,1031,escherichia coli,98
1,Blood Culture,staphylococcus aureus,585,staphylococcus aureus,49
2,Blood Culture,klebsiella pneumoniae,318,klebsiella pneumoniae,29
3,Blood Culture,enterococcus faecalis,159,proteus mirabilis,15
4,Blood Culture,streptococcus agalactiae (group b),131,pseudomonas aeruginosa,9
5,Urine Culture,escherichia coli,2927,escherichia coli,361
6,Urine Culture,enterococcus species,877,proteus mirabilis,90
7,Urine Culture,klebsiella pneumoniae,653,klebsiella pneumoniae,84
8,Urine Culture,proteus mirabilis,299,enterococcus faecalis,59
9,Urine Culture,pseudomonas aeruginosa,268,pseudomonas aeruginosa,43


In [14]:
# Write the wide table as HTML without the row index (`index` is a boolean flag).
df_bugs.to_html('table2_top_bugs.html', index=False)

In [33]:
# Write the wide table as CSV without the row index (`index` is a boolean flag).
df_bugs.to_csv('table2_top_bugs_by_culture.csv', index=False)

In [24]:
# Scratch cell: adds a constant 'test' column to `df_bugs_test`.
# NOTE(review): `df_bugs_test` is not defined anywhere in the visible notebook, so
# this only works with leftover kernel state — define it or remove the cell.
df_bugs_test['test'] = ['test' for i in range(len(df_bugs_test))]

In [26]:
# Scratch cell: tries to drop the "Culture Type" column from `df_bugs_test`.
# The recorded output of this cell is `KeyError: 'Culture Type'`.
# NOTE(review): `df_bugs_test` is not defined in the visible notebook — fix or remove
# this cell so the notebook runs cleanly from a fresh kernel.
df_bugs_test.drop("Culture Type", axis=1)

KeyError: 'Culture Type'