In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pulp import *
import os, glob

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/conorcorbin/.config/gcloud/application_default_credentials.json' 
os.environ['GCLOUD_PROJECT'] = 'mining-clinical-decisions' 
%load_ext google.cloud.bigquery

from google.cloud import bigquery
client=bigquery.Client()



In [9]:
# Join culture orders to their sensitivity rows on order_proc_id_coded and
# pull, per order: the order id, its description, the calendar year of the
# order time, and the whole hours between order time and sensitivity result
# time (TIMESTAMP_DIFF ... HOUR).
# NOTE(review): DISTINCT only removes fully identical rows, so an order whose
# sensitivity rows have different result times can presumably still appear
# more than once -- confirm this is intended before interpreting the averages.
query="""
SELECT DISTINCT
    orders.order_proc_id_coded,
    orders.description,
    EXTRACT(YEAR FROM orders.order_time_jittered_utc) year,
    TIMESTAMP_DIFF(cs.result_time_jittered_utc, orders.order_time_jittered_utc, HOUR) hours_to_abx_sensitivities
FROM 
    mining-clinical-decisions.abx.culture_orders_within_24_hrs orders
INNER JOIN
    mining-clinical-decisions.shc_core.culture_sensitivity cs
USING
    (order_proc_id_coded)
"""
# Run the query and materialise the full result set as a pandas DataFrame.
query_job = client.query(query)
df = query_job.result().to_dataframe()
# Preview the first rows.
df.head()

Unnamed: 0,order_proc_id_coded,description,year,hours_to_abx_sensitivities
0,359897857,URINE CULTURE,2009,63
1,361591188,URINE CULTURE,2009,43
2,363724629,URINE CULTURE,2009,108
3,359089263,URINE CULTURE,2009,70
4,359267867,URINE CULTURE,2009,171


In [16]:
# Summarise hours from culture order to sensitivity result, grouped by
# culture type and by train/test split.
train_years = ['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']
test_years = ['2019']
# Explicit year -> split lookup. A year that is in neither list maps to NaN
# and is excluded below, instead of silently being counted as test data.
dataset_by_year = {**dict.fromkeys(train_years, 'Training Set'),
                   **dict.fromkeys(test_years, 'Test Set')}
df_time_to_results = (
    df
    .assign(
        # Collapse raw order descriptions into three culture types. np.select
        # takes the first matching condition, so BLOOD wins over URINE, and
        # anything matching neither falls through to the default.
        description=lambda x: np.select(
            [x.description.str.contains('BLOOD', regex=False, na=False),
             x.description.str.contains('URINE', regex=False, na=False)],
            ['Blood Culture', 'Urine Culture'],
            default='Other Fluid Culture'),
        # Years are compared as strings because the year lists hold strings.
        dataset=lambda x: x.year.astype(str).map(dataset_by_year),
    )
    # Drop rows whose year belongs to neither the training nor the test set.
    .dropna(subset=['dataset'])
    .groupby(['description', 'dataset'])
    .agg(mean_hours=('hours_to_abx_sensitivities', 'mean'),
         median_hours=('hours_to_abx_sensitivities', 'median'),
         std_hours=('hours_to_abx_sensitivities', 'std'))
    .reset_index()
    .sort_values(['dataset', 'description'], ascending=True)
)

In [17]:
# Display the turnaround-time summary (one row per culture type and dataset).
df_time_to_results

Unnamed: 0,description,dataset,mean_hours,median_hours,std_hours
0,Blood Culture,Test Set,103.533827,77.0,115.959405
2,Other Fluid Culture,Test Set,116.48125,92.5,148.3383
4,Urine Culture,Test Set,64.436461,59.0,33.242212
1,Blood Culture,Training Set,119.477792,92.0,136.570805
3,Other Fluid Culture,Training Set,131.84183,96.0,177.164512
5,Urine Culture,Training Set,81.224708,60.0,287.805999


In [18]:
# Write the summary table to CSV; index=False omits the row-index column.
df_time_to_results.to_csv('time_to_results.csv', index=False)

In [20]:
### Extract method info for each positive culture (Kirby Bauer or MIC)
# Keeps only the sensitivity rows whose `antibiotic` field contains "METHOD"
# (matched case-insensitively via UPPER). In the preview below, those rows
# carry the testing method in `sensitivity_value`.
query="""
SELECT DISTINCT
    orders.order_proc_id_coded,
    orders.description,
    antibiotic,
    sensitivity_value,
FROM 
    mining-clinical-decisions.abx.culture_orders_within_24_hrs orders
INNER JOIN
    mining-clinical-decisions.shc_core.culture_sensitivity cs
USING
    (order_proc_id_coded)
WHERE
    UPPER(antibiotic) LIKE "%METHOD%"
"""
# Run the query and materialise the full result set as a pandas DataFrame.
query_job = client.query(query)
df_method = query_job.result().to_dataframe()
# Preview the first rows.
df_method.head()

Unnamed: 0,order_proc_id_coded,description,antibiotic,sensitivity_value
0,367546681,URINE CULTURE,Method,MIC mcg/mL
1,369602977,URINE CULTURE,Method,MIC mcg/mL
2,375709591,BLOOD CULTURE (2 AEROBIC BOTTLES),Method,MIC mcg/mL
3,375712390,URINE CULTURE,Method,MIC mcg/mL
4,378154266,BLOOD CULTURE (AEROBIC & ANAEROBIC BOTTLES),Method,MIC mcg/mL


In [24]:
# Number of distinct culture orders reported under each sensitivity_value.
(
    df_method
    .groupby('sensitivity_value')['order_proc_id_coded']
    .nunique()
    .reset_index(name='num_orders')
)

Unnamed: 0,sensitivity_value,num_orders
0,Beta Lactamase,2
1,KB (Inducible),15
2,Kirby Bauer,312
3,Kirby Bauer (KB4),6
4,MIC mcg/mL,1826
5,MIC mcg/mL by Etest,1


In [28]:
### Prevalence in training set vs. test set for each of the twelve classifiers
# Loads every column of final_ast_labels plus the calendar year extracted
# from index_time.
query="""
SELECT
    *, EXTRACT(YEAR FROM index_time) as year
FROM 
    mining-clinical-decisions.abx.final_ast_labels
"""
# Run the query and materialise the full result set as a pandas DataFrame.
query_job = client.query(query)
df_labels = query_job.result().to_dataframe()
# Preview the first rows.
df_labels.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,index_time,Ampicillin,Ciprofloxacin,Cefazolin,Ceftriaxone,Cefepime,Zosyn,Vancomycin,Meropenem,Vancomycin_Ceftriaxone,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem,year
0,JCda516b,131004896326,2009-07-31 03:49:00+00:00,0,1,0,0,0,1,1,0,1,1,1,1,2009
1,JCdb35f4,131006601177,2009-09-11 11:44:00+00:00,0,0,0,0,0,0,0,0,0,0,0,0,2009
2,JCe36ef7,131006646452,2009-08-12 18:10:00+00:00,0,0,0,0,0,1,0,0,0,1,1,1,2009
3,JCe99abc,131006655405,2009-09-17 22:27:00+00:00,0,0,0,0,0,1,0,1,0,0,1,1,2009
4,JCe5eb0c,131006687926,2009-10-09 22:36:00+00:00,0,0,0,0,0,0,1,0,1,1,1,1,2009


In [29]:
# The twelve label columns: eight single agents, then four
# vancomycin-combination columns.
abx_columns = [
    'Ampicillin', 'Ciprofloxacin', 'Cefazolin', 'Ceftriaxone',
    'Cefepime', 'Zosyn', 'Vancomycin', 'Meropenem',
    'Vancomycin_Ceftriaxone', 'Vancomycin_Cefepime',
    'Vancomycin_Zosyn', 'Vancomycin_Meropenem',
]
# Column-wise mean of each label column over all rows of df_labels.
df_labels.loc[:, abx_columns].mean()

Ampicillin                0.441993
Ciprofloxacin             0.629544
Cefazolin                 0.590988
Ceftriaxone               0.683196
Cefepime                  0.774591
Zosyn                     0.903496
Vancomycin                0.243526
Meropenem                 0.809778
Vancomycin_Ceftriaxone    0.828475
Vancomycin_Cefepime       0.943677
Vancomycin_Zosyn          0.950180
Vancomycin_Meropenem      0.971548
dtype: float64