### Organized Cells

In [1]:
### Imports
import matplotlib.pyplot as plt
import numpy as np
from typing import List, Dict, Optional, Union
import sys
import time
import importlib
import sparse
import datetime
import pandas as pd
import numpy as np
import scipy.sparse
import matplotlib.pyplot as plt
import os
#
sys.path.append("..")
import Utils.dbutils as dbutils
import Utils.data_utils as data_utils
import Generators.CohortGenerator as CohortGenerator
import Generators.FeatureGenerator as FeatureGenerator
import config
# Project-local modules that are re-imported on every run of this cell so that
# edits made to them on disk are picked up without restarting the kernel.
local_imports = (dbutils, data_utils, CohortGenerator, FeatureGenerator, config)
for local_module in local_imports:
    importlib.reload(local_module)

In [2]:
### Define config
# Name of the input cohort table; it is interpolated into the SQL queries below
# as {omop_schema}.{in_name}.
in_name = "manuscript_covariates_5_final"
# Base name (without extension) of the CSV written by the last cell.
out_name = "antibiotic_prevalence_table_final"

In [3]:
### Setting up database
# Peer authentication is used, so no credentials are needed; in theory they
# would be passed into config_path:
# username = config.PG_USERNAME
# password = config.PG_PASSWORD
database_name = config.DB_NAME
print(database_name)
config_path = f'postgresql://{database_name}'
# connect_args to pass to sqlalchemy create_engine function
connect_args = {"host": '/var/run/postgresql/'}

# Schemas
# all created tables will be created using this schema
schema_name = 'eol_test_ncjones'
# the name of the schema housing your OMOP CDM tables
cdm_schema_name = config.OMOP_CDM_SCHEMA
print(f"cdm schema: {cdm_schema_name}")

# Caching: if true, rebuild all data from scratch.
# NOTE(review): only the commented-out statements below read this flag.
reset_schema = False

# Set up the database handle; the schema reset/creation is currently disabled.
db = dbutils.Database(config_path, schema_name, connect_args, cdm_schema_name)
# if reset_schema:
#     db.execute(
#         'drop schema if exists {} cascade'.format(schema_name)
#     )
# db.execute(
#     'create schema if not exists {}'.format(schema_name)
# )

localhost/omop_v6
cdm schema: cdm_6871_21


In [4]:
%%time
### Loading cohort
# Read every row of the input cohort table from the OMOP CDM schema.
sql = f"""
    select
        *
    from
        {config.OMOP_CDM_SCHEMA}.{in_name} c
"""
cohort = db.query(sql)

# Working copy of the cohort without the rows whose antibiotic_type is
# 'inappropriate' (filter first, then copy only the rows that are kept).
keep_rows = cohort.antibiotic_type != 'inappropriate'
filtered_cohort = cohort.loc[keep_rows].copy()

CPU times: user 9.03 s, sys: 1.59 s, total: 10.6 s
Wall time: 12.7 s


In [5]:
### Defining antibiotic lookup dictionaries

# Treatment category -> antibiotic names belonging to that category.
antibiotic_categories = dict(
    first_line=["nitrofurantoin", "trimethoprim-sulfamethoxazole"],
    second_line=["ciprofloxacin", "ofloxacin", "levofloxacin"],
    alternatives=["amoxicillin-clavulanic acid", "cefpodoxime", "cefadroxil"],
)

# Flat list of every antibiotic, in category order.
all_antibiotics = []
for members in antibiotic_categories.values():
    all_antibiotics.extend(members)
# antibiotic_categories.update({'all':all_antibiotics})

# Treatment category -> names of the 6-month history indicator columns.
antibiotic_hist = {
    category: [f"{name}_0_6_months" for name in members]
    for category, members in antibiotic_categories.items()
}

# Indicator column names: "<antibiotic>_0_6_months" for the history variables,
# "<antibiotic>_0_months" for the prevalence variables.
antibiotic_history_columns = [f"{name}_0_6_months" for name in all_antibiotics]
antibiotic_prevalence_columns = [f"{name}_0_months" for name in all_antibiotics]

In [6]:
%%time
### Loading drug and exposure tables
# Load every concept whose domain is 'Drug' from the OMOP concept table.
# NOTE(review): no name filter is applied here; the antibiotic labelling is done
# in pandas in a later cell.
sql = """select * from {omop_schema}.concept
                                 where domain_id = 'Drug'""".format(
    omop_schema=config.OMOP_CDM_SCHEMA,
)
all_drugs = db.query(sql)

# Pair each row of the input cohort table with the drug exposures of the same
# person that started no later than 7 days after the condition start date.
# There is no lower bound on the exposure date, so all earlier exposures of the
# person are returned as well.
sql = """SELECT
                                uti.index,
                                uti.condition_occurrence_id, 
                                uti.person_id,
                                uti.condition_start_date,
                                drug.drug_concept_id,
                                drug.drug_exposure_id,
                                drug.drug_exposure_start_date,
                                drug.drug_exposure_start_datetime
                           FROM {omop_schema}.{in_name} uti
                           JOIN {omop_schema}.drug_exposure drug ON 
                                    drug.person_id = uti.person_id AND
                                    (drug.drug_exposure_start_date <= (uti.condition_start_date + INTERVAL '7' DAY))
                           ORDER BY 
                                    uti.condition_occurrence_id
                        """.format(
    omop_schema=config.OMOP_CDM_SCHEMA,in_name=in_name
)
drug_exposures = db.query(sql)

CPU times: user 39.7 s, sys: 7.04 s, total: 46.7 s
Wall time: 1min 29s


### Creating antibiotic history variables

In [7]:
%%time
### Label every drug concept with the antibiotic named in its concept name
# The concept names are lower-cased once and reused for every rule (the table is
# large, so repeating the lower-casing per rule dominated this cell's runtime).
concept_name_lower = all_drugs["concept_name"].str.lower()

# Ordered labelling rules: (label, substrings that must ALL occur in the name).
# ORDER MATTERS - a later rule overwrites the label set by an earlier one:
#   * the combination rule follows the single-ingredient rules it refines;
#   * "ofloxacin" is a substring of both "levofloxacin" and "ciprofloxacin", so
#     it must come before them.
# Labels that are not in `all_antibiotics` (plain "sulfamethoxazole" /
# "trimethoprim") are kept here and filtered out by a later cell.
antibiotic_label_rules = [
    # First line
    ("nitrofurantoin", ("nitrofurantoin",)),
    ("sulfamethoxazole", ("sulfamethoxazole",)),
    ("trimethoprim", ("trimethoprim",)),
    ("trimethoprim-sulfamethoxazole", ("trimethoprim", "sulfamethoxazole")),
    # Alternatives
    ("cefpodoxime", ("cefpodoxime",)),
    ("cefadroxil", ("cefadroxil",)),
    ("amoxicillin-clavulanic acid", ("amoxicillin", "clavulan")),
    # Second line
    ("ofloxacin", ("ofloxacin",)),
    ("levofloxacin", ("levofloxacin",)),
    ("ciprofloxacin", ("ciprofloxacin",)),
]
# Deliberately not labelled (rules previously commented out): quinolone,
# ceftriaxone, cefuroxime, cefdinir, cefazolin, cefepime, cefixime, plain
# amoxicillin, cephalexin, and the "inappropriate" agents fluconazole,
# doxycycline, clindamycin, linezolid, metronidazole, azithromycin.

# Unlabelled concepts stay NaN so they can be dropped below.
antibiotic_name = np.full(len(all_drugs), np.nan, dtype=object)
for label, required_substrings in antibiotic_label_rules:
    matches = np.ones(len(all_drugs), dtype=bool)
    for substring in required_substrings:
        # regex=False: plain substring search; na=False: a missing concept name
        # counts as "no match" instead of producing an invalid NaN mask.
        matches &= concept_name_lower.str.contains(
            substring, regex=False, na=False
        ).to_numpy(dtype=bool)
    antibiotic_name[matches] = label

all_drugs["antibiotic_name"] = antibiotic_name

all_drugs = all_drugs.rename(columns={"concept_id": "drug_concept_id", "concept_name": "drug_name"})

# Keep only the concepts that received a label.
all_drugs = all_drugs[~all_drugs.antibiotic_name.isna()]

CPU times: user 45.5 s, sys: 3.07 s, total: 48.5 s
Wall time: 48.6 s


In [8]:
%%time
### Attach the antibiotic label to every drug exposure
# Left-join the exposures to the labelled drug concepts on drug_concept_id, then
# discard the exposures whose concept has no antibiotic label.
labelled_concepts = all_drugs[['drug_concept_id', 'drug_name', 'antibiotic_name']]
antibiotic_df = (
    drug_exposures
    .merge(labelled_concepts, how='left', on='drug_concept_id')
    .dropna(subset=['antibiotic_name'])
)


CPU times: user 1.57 s, sys: 828 ms, total: 2.4 s
Wall time: 2.51 s


In [9]:
%%time
### Populate the cohort table with 0/1 indicator columns:
###   <antibiotic>_0_6_months : exposure in the 180 days before the condition start date
###   <antibiotic>_0_months   : exposure on the condition start date or in the days after it
from datetime import timedelta

antibiotic_history_columns = [x + '_0_6_months' for x in all_antibiotics]
antibiotic_prevalence_columns = [x + '_0_months' for x in all_antibiotics]
all_columns = antibiotic_history_columns + antibiotic_prevalence_columns

# Explicit indicator-column -> antibiotic lookup (instead of parsing the column
# name, which would break for an antibiotic name containing an underscore).
column_to_antibiotic = dict(zip(all_columns, all_antibiotics + all_antibiotics))
column_position = {column: position for position, column in enumerate(all_columns)}

# (days before, days after) the condition start date -> columns for that window.
# Each window is half-open: window_start <= exposure date < window_end.
# NOTE(review): the exclusive end means an exposure exactly 7 days after the
# condition start date is not counted, although the SQL that loaded the
# exposures includes it (<= start + 7 days) - confirm which is intended.
antibiotic_days_to_columns = {(180, 0): antibiotic_history_columns, (0, 7): antibiotic_prevalence_columns}

filtered_cohort = cohort.loc[cohort.antibiotic_type != 'inappropriate'].copy()

# Exposures of the antibiotics of interest, restricted to people in the cohort.
temp_anti_df = antibiotic_df.loc[
    antibiotic_df.antibiotic_name.isin(all_antibiotics)
    & antibiotic_df.person_id.isin(filtered_cohort.person_id)
].copy()

# Split the exposures by person ONCE; previously the whole exposure table was
# scanned and copied again for every cohort row.
exposures_by_person = {
    person_id: person_exposures
    for person_id, person_exposures in temp_anti_df.groupby('person_id')
}

# One row of flags per cohort row (by position), one column per indicator.
flags = np.zeros((len(filtered_cohort), len(all_columns)), dtype=np.int64)

# Progress is reported roughly every 10%; the step is at least 1 so that a
# cohort with fewer than 10 rows does not cause a modulo-by-zero.
total = len(filtered_cohort)
progress_step = max(total // 10, 1)

cohort_rows = zip(filtered_cohort['person_id'], filtered_cohort['condition_start_date'])
for row_position, (person, date) in enumerate(cohort_rows):
    person_antibiotic_df = exposures_by_person.get(person)
    if person_antibiotic_df is not None:
        exposure_dates = person_antibiotic_df.drug_exposure_start_date
        for (start_days, end_days), antibiotic_column_names in antibiotic_days_to_columns.items():
            start = date - timedelta(days=start_days)
            end = date + timedelta(days=end_days)
            in_window = (exposure_dates >= start) & (exposure_dates < end)
            received = set(person_antibiotic_df.loc[in_window, 'antibiotic_name'])
            for antibiotic_col in antibiotic_column_names:
                if column_to_antibiotic[antibiotic_col] in received:
                    flags[row_position, column_position[antibiotic_col]] = 1

    cnt = row_position + 1
    if cnt % progress_step == 0:
        # ".0%" prints 10%, 20%, ..., 100% (the old ".1" format printed 1e+00).
        print(f"{cnt / total:.0%}")

# Write the indicators back as integer 0/1 columns.
for column in all_columns:
    filtered_cohort[column] = flags[:, column_position[column]]


0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1e+00
CPU times: user 17min 9s, sys: 3.75 s, total: 17min 13s
Wall time: 17min 14s
Parser   : 421 ms


In [10]:
## Grouping the time-0 indicators into a single antibiotic_type_0 category

# may not need to run this
# .copy() so that the column added below is written to an independent frame
# rather than to a slice of the previous one.
filtered_cohort = filtered_cohort.loc[filtered_cohort.antibiotic_type != ''].copy()

# 0 marks "no antibiotic of interest at time 0". The column is created with
# object dtype because category names (strings) are written into it next;
# writing strings into an integer column is a deprecated dtype upcast in pandas.
filtered_cohort['antibiotic_type_0'] = np.full(len(filtered_cohort), 0, dtype=object)

# Categories are applied in the order of `antibiotic_categories`
# (first_line, second_line, alternatives); a later category overwrites an
# earlier one for rows that received antibiotics from several categories.
for category, antibiotics in antibiotic_categories.items():
    category_columns = [antibiotic + '_0_months' for antibiotic in antibiotics]
    received_category = filtered_cohort[category_columns].any(axis=1)
    filtered_cohort.loc[received_category, 'antibiotic_type_0'] = category

### Create a new cohort based on filtering the prevalence variables. Verification check
# Keep only the rows with at least one time-0 antibiotic indicator set.
filtered_cohort_n = filtered_cohort.loc[
    filtered_cohort[antibiotic_prevalence_columns].any(axis=1)
].copy()

In [11]:
# Rows of the cohort with at least one time-0 antibiotic indicator set.
has_time0_antibiotic = filtered_cohort[antibiotic_prevalence_columns].any(axis=1)
filtered_cohort_n = filtered_cohort.loc[has_time0_antibiotic].copy()

In [17]:
# Number of cohort rows per antibiotic received at time 0. The columns come from
# `antibiotic_prevalence_columns` so this check cannot drift from the antibiotic
# definitions (same columns, same order as the previous hard-coded list).
for antibiotic_indicator in antibiotic_prevalence_columns:
    received_count = int(filtered_cohort_n[[antibiotic_indicator]].any(axis=1).sum())
    print(antibiotic_indicator.replace('_0_months',' at time 0'), " count: ", received_count)

nitrofurantoin at time 0  count:  20064
trimethoprim-sulfamethoxazole at time 0  count:  14954
ciprofloxacin at time 0  count:  18593
ofloxacin at time 0  count:  17
levofloxacin at time 0  count:  2530
amoxicillin-clavulanic acid at time 0  count:  1235
cefpodoxime at time 0  count:  124
cefadroxil at time 0  count:  68


In [20]:
# Drop the leftover index columns. errors='ignore' keeps this cell re-runnable:
# without it a second run (or a table lacking one of the columns) raises KeyError.
filtered_cohort_n = filtered_cohort_n.drop(columns=['index', 'level_0'], errors='ignore')

In [21]:
# Can only be run while the frame has no 'idx' column yet.
# df1: the cohort with its row index exposed as column 'idx'.
# df2: one 0/1 dummy column per antibiotic_type_0 value, keyed by the same 'idx'.
index_to_idx = {"index": 'idx'}
df1 = filtered_cohort_n.reset_index()
df1 = df1.rename(columns=index_to_idx)
antibiotic_type_dummies = filtered_cohort_n.antibiotic_type_0.str.get_dummies()
df2 = antibiotic_type_dummies.reset_index().rename(columns=index_to_idx)

In [22]:
# Attach the antibiotic_type_0 dummy columns to the cohort rows via 'idx'.
filtered_cohort_n2 = pd.merge(df1, df2, how='left', on='idx')

In [23]:
# Constant indicator that selects every row; used as the unconditional ("all")
# base condition when the prevalences are computed.
filtered_cohort_n2 = filtered_cohort_n2.assign(**{'all': 1})

In [24]:
### Compute conditional antibiotic-history prevalences
#
# For every time-0 condition (all rows, each antibiotic category, each
# individual time-0 antibiotic) compute the fraction of rows satisfying the
# condition that also have
#   * each individual 6-month history indicator,
#   * any 6-month history indicator,
#   * no 6-month history indicator.
# The result table `new_df` has one row per history variable and one column per
# time-0 condition.

antibiotic_list = list(antibiotic_hist.keys())
antibiotic_preval_vars = antibiotic_list + antibiotic_prevalence_columns

# Columns of the result table. Built as a NEW list so that
# `antibiotic_preval_vars` is not modified through an alias.
antibiotic_0_month_conditions = ['prevalence_variable', 'all'] + antibiotic_preval_vars

updated_history_columns = antibiotic_history_columns + ['any_antibiotic_0_6_months','no_antibiotic_0_6_months']

new_df = pd.DataFrame(columns=antibiotic_0_month_conditions)
new_df['prevalence_variable'] = updated_history_columns

# The history conditions do not depend on the base condition, so they are
# computed once, in the order in which they are reported.
any_history = filtered_cohort_n2[antibiotic_history_columns].any(axis=1)
history_conditions = {
    history_column: (filtered_cohort_n2[history_column] == 1)
    for history_column in antibiotic_history_columns
}
history_conditions['any_antibiotic_0_6_months'] = any_history
history_conditions['no_antibiotic_0_6_months'] = ~any_history

# BASE CONDITION IS THE 0 month CONDITION
for column in antibiotic_0_month_conditions[1:]:
    base_condition = (filtered_cohort_n2[column] == 1)
    base_count = base_condition.sum()

    for prevalence_variable, compare_condition in history_conditions.items():
        joint_count = (compare_condition & base_condition).sum()
        # No row satisfies the base condition -> the prevalence is undefined.
        prevalence = joint_count / base_count if base_count else np.nan

        print(f"% received {prevalence_variable} given {column} {prevalence * 100:.2f}")
        new_df.loc[new_df['prevalence_variable'] == prevalence_variable,column] = prevalence

    print()


% received nitrofurantoin_0_6_months given all 4.49
% received trimethoprim-sulfamethoxazole_0_6_months given all 4.59
% received ciprofloxacin_0_6_months given all 5.49
% received ofloxacin_0_6_months given all 0.45
% received levofloxacin_0_6_months given all 2.24
% received amoxicillin-clavulanic acid_0_6_months given all 5.23
% received cefpodoxime_0_6_months given all 0.04
% received cefadroxil_0_6_months given all 0.18
% received any_antibiotic_0_6_months given all 19.25
% received no_antibiotic_0_6_months given all 80.75

% received nitrofurantoin_0_6_months given first_line 5.17
% received trimethoprim-sulfamethoxazole_0_6_months given first_line 4.83
% received ciprofloxacin_0_6_months given first_line 4.14
% received ofloxacin_0_6_months given first_line 0.45
% received levofloxacin_0_6_months given first_line 1.60
% received amoxicillin-clavulanic acid_0_6_months given first_line 5.11
% received cefpodoxime_0_6_months given first_line 0.03
% received cefadroxil_0_6_months gi

In [25]:
# Write the prevalence table to "<out_name>.csv" in the current working
# directory (the DataFrame index is written as the first column).
output_csv_path = f"{out_name}.csv"
new_df.to_csv(output_csv_path)