### Organized Cells

In [251]:
### Imports
import matplotlib.pyplot as plt
import numpy as np
from typing import List, Dict, Optional, Union
import sys
import time
import importlib
import sparse
import datetime
import pandas as pd
import numpy as np
import scipy.sparse
import matplotlib.pyplot as plt
import os
#
sys.path.append("..")
import Utils.dbutils as dbutils
import Utils.data_utils as data_utils
import Generators.CohortGenerator as CohortGenerator
import Generators.FeatureGenerator as FeatureGenerator
import config
local_imports = (
    dbutils,
    data_utils,
    CohortGenerator,
    FeatureGenerator,
    config
)
for i in local_imports:
    i = importlib.reload(i)

In [252]:
### Define config
in_name_1 = "manuscript_covariates_4_v4"
#out_name tbd
test_name = "cohort_manuscript_updated_treatments_ncj"
cohort_name = test_name

In [253]:
### Setting up database
# username = config.PG_USERNAME #we use peer authentication so don't need use vars, but in theory would pass them into config_path
# password = config.PG_PASSWORD
database_name = config.DB_NAME
print(database_name)
config_path = 'postgresql://{database_name}'.format(
    database_name = database_name
)
connect_args = {"host": '/var/run/postgresql/'} # connect_args to pass to sqlalchemy create_engine function

# schemas 
schema_name = 'eol_test_ncjones' # all created tables will be created using this schema
cdm_schema_name = config.OMOP_CDM_SCHEMA # the name of the schema housing your OMOP CDM tables
print(f"cdm schema: {cdm_schema_name}")
# caching
reset_schema = False # if true, rebuild all data from scratch

# set up database, reset schemas as needed
db = dbutils.Database(config_path, schema_name, connect_args, cdm_schema_name)
# if reset_schema:
#     db.execute(
#         'drop schema if exists {} cascade'.format(schema_name)
#     )
# db.execute(
#     'create schema if not exists {}'.format(schema_name)
# )

localhost/omop_v6
cdm schema: cdm_6871_21


In [254]:
%%time
### Loading cohort
# Get the full condition item table
sql = """
    select
        *
    from
        {omop_schema}.{cohort_name} c
""".format(
    omop_schema=config.OMOP_CDM_SCHEMA,
    cohort_name=cohort_name
)
cohort = db.query(sql)
filtered_cohort = cohort.copy().loc[cohort.antibiotic_type!='inappropriate']

CPU times: user 1.22 s, sys: 72.5 ms, total: 1.29 s
Wall time: 1.86 s


In [255]:
### Defining antibiotic lookup dictionaries
my_definition = True


if my_definition:
    antibiotic_categories = {
    'first_line' : ["nitrofurantoin","trimethoprim-sulfamethoxazole"],
    "second_line" : ["ciprofloxacin","ofloxacin","levofloxacin"],
    "alternatives" : ["amoxicillin-clavulanic acid","cefpodoxime","cefadroxil"]
    }
    all_antibiotics = [j for x in antibiotic_categories.values() for j in x]
    # antibiotic_categories.update({'all':all_antibiotics})

    antibiotic_hist = dict()
    for category, antibiotics in antibiotic_categories.items():
        antibiotic_list = []
        for antibiotic in antibiotics:
            antibiotic_list.append(antibiotic + '_0_6_months')
        antibiotic_hist[category] = antibiotic_list

    antibiotic_history_columns = [x + '_0_6_months' for x in all_antibiotics]
    antibiotic_prevalence_columns = [x + '_0_months' for x in all_antibiotics]
else:
    cef_antibiotics = ["ceftriaxone","cefuroxime","cefdinir","cefazolin","cefepime","cefpodoxime","cefixime","cefadroxil"]
    new_alternatives = ["amoxicillin-clavulanic acid","amoxicillin"] + cef_antibiotics
    antibiotic_categories = {
    'first_line' : ["nitrofurantoin","trimethoprim-sulfamethoxazole","trimethoprim","sulfamethoxazole"],
    "second_line" : ["ciprofloxacin","ofloxacin","levofloxacin","cephalexin"],
    "alternatives" : new_alternatives
    }
    
    all_antibiotics = [j for x in antibiotic_categories.values() for j in x]
    # antibiotic_categories.update({'all':all_antibiotics})

    antibiotic_hist = dict()
    for category, antibiotics in antibiotic_categories.items():
        antibiotic_list = []
        for antibiotic in antibiotics:
            antibiotic_list.append(antibiotic + '_0_6_months')
        antibiotic_hist[category] = antibiotic_list

    antibiotic_history_columns = [x + '_0_6_months' for x in all_antibiotics]
    antibiotic_prevalence_columns = [x + '_0_months' for x in all_antibiotics]    


In [256]:
%%time
### Loading drug and exposure tables
# Get the full condition item table
sql = """select * from {omop_schema}.concept
                                 where domain_id = 'Drug'""".format(
    omop_schema=config.OMOP_CDM_SCHEMA,
)
all_drugs = db.query(sql)

sql = """SELECT
                                uti.index,
                                uti.condition_occurrence_id, 
                                uti.person_id,
                                uti.condition_start_date,
                                drug.drug_concept_id,
                                drug.drug_exposure_id,
                                drug.drug_exposure_start_date,
                                drug.drug_exposure_start_datetime
                           FROM {omop_schema}.{cohort_name} uti
                           JOIN {omop_schema}.drug_exposure drug ON 
                                    drug.person_id = uti.person_id AND
                                    (drug.drug_exposure_start_date <= (uti.condition_start_date + INTERVAL '7' DAY))
                           ORDER BY 
                                    uti.condition_occurrence_id
                        """.format(
    omop_schema=config.OMOP_CDM_SCHEMA,cohort_name=cohort_name
)
drug_exposures = db.query(sql)

CPU times: user 42.9 s, sys: 5.64 s, total: 48.5 s
Wall time: 1min 39s


### My antibiotic definition

In [257]:
%%time
### Finding all occurences of drug queries
# First Line
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("nitrofurantoin"), 'antibiotic_name'] = "nitrofurantoin"
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("sulfamethoxazole"), 'antibiotic_name'] = "sulfamethoxazole"
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("trimethoprim"), 'antibiotic_name'] = "trimethoprim"
all_drugs.loc[(all_drugs["concept_name"].str.lower().str.contains("trimethoprim")) & \
                             (all_drugs["concept_name"].str.lower().str.contains("sulfamethoxazole")), 'antibiotic_name'] = "trimethoprim-sulfamethoxazole"


#alternatives
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("quinolone"), 'antibiotic_name'] = "quinolone"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("ceftriaxone"), 'antibiotic_name'] = "ceftriaxone"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefuroxime"), 'antibiotic_name'] = "cefuroxime"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefdinir"), 'antibiotic_name'] = "cefdinir"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefazolin"), 'antibiotic_name'] = "cefazolin"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefepime"), 'antibiotic_name'] = "cefepime"
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefpodoxime"), 'antibiotic_name'] = "cefpodoxime"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefixime"), 'antibiotic_name'] = "cefixime"
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefadroxil"), 'antibiotic_name'] = "cefadroxil"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("amoxicillin"), 'antibiotic_name'] = "amoxicillin"
all_drugs.loc[(all_drugs["concept_name"].str.lower().str.contains("amoxicillin")) & \
                             (all_drugs["concept_name"].str.lower().str.contains("clavulan")), 'antibiotic_name'] = "amoxicillin-clavulanic acid"


# # second line
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("ofloxacin"), 'antibiotic_name'] = "ofloxacin"
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("levofloxacin"), 'antibiotic_name'] = "levofloxacin"
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("ciprofloxacin"), 'antibiotic_name'] = "ciprofloxacin"

# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cephalexin"), 'antibiotic_name'] = "cephalexin"

# # Inappropriate

# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("fluconazole"), 'antibiotic_name'] = "fluconazole"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("doxycycline"), 'antibiotic_name'] = "doxycycline"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("clindamycin"), 'antibiotic_name'] = "clindamycin"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("linezolid"), 'antibiotic_name'] = "linezolid"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("metronidazole"), 'antibiotic_name'] = "metronidazole"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("azithromycin"), 'antibiotic_name'] = "azithromycin"

if not my_definition:
    #THE DRUGS UNCOMMENTED IN ELIZABETH'S CODE
    #second line
    all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cephalexin"), 'antibiotic_name'] = "cephalexin"

    #alternatives
    all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("ceftriaxone"), 'antibiotic_name'] = "ceftriaxone"
    all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefuroxime"), 'antibiotic_name'] = "cefuroxime"
    all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefdinir"), 'antibiotic_name'] = "cefdinir"
    all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefazolin"), 'antibiotic_name'] = "cefazolin"
    all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefepime"), 'antibiotic_name'] = "cefepime"
    all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefpodoxime"), 'antibiotic_name'] = "cefpodoxime"
    all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefixime"), 'antibiotic_name'] = "cefixime"
    all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefadroxil"), 'antibiotic_name'] = "cefadroxil"
    all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("amoxicillin"), 'antibiotic_name'] = "amoxicillin"



all_drugs = all_drugs.rename(columns={"concept_id": "drug_concept_id", "concept_name": "drug_name"})

all_drugs = all_drugs[~all_drugs.antibiotic_name.isna()]

CPU times: user 46.1 s, sys: 2.93 s, total: 49.1 s
Wall time: 49.1 s


In [258]:
%%time
### Join the data frame with the drug concept id to get all concept ids


antibiotic_df = drug_exposures.merge(all_drugs[['drug_concept_id','drug_name','antibiotic_name']],how='left',on='drug_concept_id')
antibiotic_df = antibiotic_df.loc[~antibiotic_df.antibiotic_name.isnull()]


CPU times: user 2.63 s, sys: 657 ms, total: 3.29 s
Wall time: 3.29 s


In [259]:
%%time
### Populate cohort table with new variables indicating presence of 6 month drug history or \
### receipt of antibiotic within 7 days of antibiotic
from datetime import timedelta

antibiotic_history_columns = [x + '_0_6_months' for x in all_antibiotics]
antibiotic_prevalence_columns = [x + '_0_months' for x in all_antibiotics]
all_columns = antibiotic_history_columns + antibiotic_prevalence_columns

#for history variables we are searching for past 6 months before condition start date and searching antibiotic prevalence variables 
# at start date as well as within 7 days of original date

if my_definition:
    antibiotic_days_to_columns = {(180,0):antibiotic_history_columns,(0,7):antibiotic_prevalence_columns}
else:
    antibiotic_days_to_columns = {(180,0):antibiotic_history_columns,(0,8):antibiotic_prevalence_columns}

    

filtered_cohort = cohort.copy().loc[cohort.antibiotic_type!='inappropriate']

temp_anti_df = antibiotic_df.copy().loc[antibiotic_df.antibiotic_name.isin(all_antibiotics) & (antibiotic_df.person_id.isin(filtered_cohort.person_id))]

#populate dataframe with the columns
for column in all_columns:
    filtered_cohort[column] = 0


#counter for progress 
total = len(filtered_cohort)
cnt = 0 
for idx,row in filtered_cohort.iterrows():

    #filter the dataframe by person
    person = row['person_id']
    person_antibiotic_df = temp_anti_df.copy().loc[(temp_anti_df.person_id == person)]
    for antibiotic_column_days, antibiotic_column_names in antibiotic_days_to_columns.items():
        # get all antibiotic_events between a start and end time
        date = row['condition_start_date']
        start_days,end_days = antibiotic_column_days
        start = date - timedelta(days=start_days)
        end = date + timedelta(days=end_days)
        query_df = person_antibiotic_df.copy().loc[(person_antibiotic_df.drug_exposure_start_date >= start) &\
                                            (person_antibiotic_df.drug_exposure_start_date < end)]
        
#         if antibiotic_column_names == antibiotic_prevalence_columns:
#                 print(start_days,end_days)
#                 print(start)
#                 print(end)
        for antibiotic_col in antibiotic_column_names:
            antibiotic = antibiotic_col.split('_')[0]
            if antibiotic in np.unique(query_df.antibiotic_name.values):
                filtered_cohort.loc[idx,antibiotic_col] = 1
    cnt+=1
    if cnt % (total // 10) == 0:
        print(f"{cnt/total:.1}")


0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1e+00
CPU times: user 35min 43s, sys: 5.42 s, total: 35min 49s
Wall time: 35min 49s


## Can comment out

### Changed antibiotic_table (using Elizabeth's antibiotic definitions) and Added the longer period

In [59]:
# ### Create a new cohort based on filtering the prevalence variables. Verification check
# filtered_cohort_n = filtered_cohort.copy().loc[np.any(filtered_cohort[antibiotic_prevalence_columns],axis=1)]
# print(f"old shape  {filtered_cohort.shape}, new shape {filtered_cohort_n.shape}")

old shape  (65881, 432), new shape (65881, 432)


In [60]:
# filtered_cohort_n[antibiotic_prevalence_columns].sum()

nitrofurantoin_0_months                   19959
trimethoprim-sulfamethoxazole_0_months    14901
trimethoprim_0_months                        58
sulfamethoxazole_0_months                     0
ciprofloxacin_0_months                    18603
ofloxacin_0_months                           19
levofloxacin_0_months                      2529
cephalexin_0_months                        4203
amoxicillin-clavulanic acid_0_months          0
amoxicillin_0_months                       2438
ceftriaxone_0_months                       1081
cefuroxime_0_months                        1394
cefdinir_0_months                           328
cefazolin_0_months                          114
cefepime_0_months                            39
cefpodoxime_0_months                        121
cefixime_0_months                            29
cefadroxil_0_months                          65
dtype: int64

In [61]:
# e_counts = filtered_cohort_n[antibiotic_prevalence_columns].sum()
# for cat,antibiotic_vals in antibiotic_categories.items():
#     temp_sum = 0
#     for antibiotic_val in antibiotic_vals:
#         new_name = antibiotic_val + '_0_months'
#         temp_sum+= e_counts[new_name]
    
#     print(cat," sum is: ",temp_sum)

first_line  sum is:  34918
second_line  sum is:  25354
alternatives  sum is:  5609


### Changed antibiotic_table (using Elizabeth's antibiotic definitions)

In [27]:
# ### Create a new cohort based on filtering the prevalence variables. Verification check
# filtered_cohort_n = filtered_cohort.copy().loc[np.any(filtered_cohort[antibiotic_prevalence_columns],axis=1)]
# print(f"old shape  {filtered_cohort.shape}, new shape {filtered_cohort_n.shape}")

old shape  (65881, 432), new shape (64777, 432)


In [45]:
# filtered_cohort_n[antibiotic_prevalence_columns].sum()

nitrofurantoin_0_months                   19709
trimethoprim-sulfamethoxazole_0_months    14720
trimethoprim_0_months                        52
sulfamethoxazole_0_months                     0
ciprofloxacin_0_months                    18315
ofloxacin_0_months                           17
levofloxacin_0_months                      2460
cephalexin_0_months                        4119
amoxicillin-clavulanic acid_0_months          0
amoxicillin_0_months                       2297
ceftriaxone_0_months                       1066
cefuroxime_0_months                        1356
cefdinir_0_months                           320
cefazolin_0_months                           99
cefepime_0_months                            38
cefpodoxime_0_months                        118
cefixime_0_months                            26
cefadroxil_0_months                          65
dtype: int64

In [35]:
# e_counts = filtered_cohort_n[antibiotic_prevalence_columns].sum()
# for cat,antibiotic_vals in antibiotic_categories.items():
#     temp_sum = 0
#     for antibiotic_val in antibiotic_vals:
#         new_name = antibiotic_val + '_0_months'
#         temp_sum+= e_counts[new_name]
    
#     print(cat," sum is: ",temp_sum)

first_line  sum is:  34481
second_line  sum is:  24911
alternatives  sum is:  5385


In [None]:
# 34481 + 24911 + 5385

In [46]:
# 56601/65881

0.8591399644814135

In [None]:
# 1 - (56601/65881)

## End of comment out

In [270]:
## grouping the antibiotic_type_0 variables

#may not need to run this
filtered_cohort = filtered_cohort.loc[filtered_cohort.antibiotic_type != '']

filtered_cohort['antibiotic_type_0'] = 0

filtered_cohort.loc[np.any(filtered_cohort[['nitrofurantoin_0_months', 'trimethoprim-sulfamethoxazole_0_months']],axis=1),'antibiotic_type_0'] = 'first_line'
filtered_cohort.loc[np.any(filtered_cohort[['ciprofloxacin_0_months', 'ofloxacin_0_months','levofloxacin_0_months']],axis=1),'antibiotic_type_0'] = 'second_line'
filtered_cohort.loc[np.any(filtered_cohort[['amoxicillin-clavulanic acid_0_months', 'cefpodoxime_0_months','cefadroxil_0_months']],axis=1),'antibiotic_type_0'] = 'alternatives'

### Create a new cohort based on filtering the prevalence variables. Verification check
filtered_cohort_n = filtered_cohort.copy().loc[np.any(filtered_cohort[antibiotic_prevalence_columns],axis=1)]

In [269]:
filtered_cohort_n = filtered_cohort.copy().loc[np.any(filtered_cohort[antibiotic_prevalence_columns],axis=1)]

### Sanity checks (optional)

In [263]:
filtered_cohort.antibiotic_type.value_counts().sum() - 8388

57585

In [264]:
filtered_cohort[antibiotic_prevalence_columns].sum()

nitrofurantoin_0_months                   20064
trimethoprim-sulfamethoxazole_0_months    14954
ciprofloxacin_0_months                    18593
ofloxacin_0_months                           17
levofloxacin_0_months                      2530
amoxicillin-clavulanic acid_0_months       1235
cefpodoxime_0_months                        124
cefadroxil_0_months                          68
dtype: int64

In [271]:
print(f"old shape  {filtered_cohort.shape}, new shape {filtered_cohort_n.shape}")

old shape  (57585, 54), new shape (57585, 54)


In [261]:
# print(f"old shape  {filtered_cohort.shape}, new shape {filtered_cohort_n.shape}")

old shape  (65973, 54), new shape (57585, 54)


In [239]:
# print(f"old shape  {filtered_cohort.shape}, new shape {filtered_cohort_n.shape}")

old shape  (65881, 413), new shape (56601, 413)


In [240]:
# print("old cohort numbers")
# counts = filtered_cohort.antibiotic_type.value_counts()
# first_line_count = 0
# for index_value in counts.index:
#     if index_value in ['nitrofurantoin','trimethoprim-sulfamethoxazole']:
#         first_line_count+= counts[index_value]
#     else:
#         print(index_value,"  ",counts[index_value])
        
# print("first_line","  ",first_line_count)        

old cohort numbers
second_line    25354
alternatives    5609
first_line    34918


In [241]:
# print("new_cohort_numbers")
# filtered_cohort_n.antibiotic_type_0.value_counts()

new_cohort_numbers


first_line      34429
second_line     20792
alternatives     1380
Name: antibiotic_type_0, dtype: int64

In [248]:
# filtered_cohort_n[antibiotic_prevalence_columns].sum()

nitrofurantoin_0_months                   19709
trimethoprim-sulfamethoxazole_0_months    14720
ciprofloxacin_0_months                    18315
ofloxacin_0_months                           17
levofloxacin_0_months                      2460
amoxicillin-clavulanic acid_0_months       1197
cefpodoxime_0_months                        118
cefadroxil_0_months                          65
dtype: int64

In [243]:
### Verification check
    
# filtered_cohort[all_columns].sum().T

nitrofurantoin_0_6_months                    2843
trimethoprim-sulfamethoxazole_0_6_months     2972
ciprofloxacin_0_6_months                     3574
ofloxacin_0_6_months                          306
levofloxacin_0_6_months                      1442
amoxicillin-clavulanic acid_0_6_months       3371
cefpodoxime_0_6_months                         26
cefadroxil_0_6_months                         116
nitrofurantoin_0_months                     19709
trimethoprim-sulfamethoxazole_0_months      14720
ciprofloxacin_0_months                      18315
ofloxacin_0_months                             17
levofloxacin_0_months                        2460
amoxicillin-clavulanic acid_0_months         1197
cefpodoxime_0_months                          118
cefadroxil_0_months                            65
dtype: int64

In [272]:
print("old cohort numbers")
counts = filtered_cohort.antibiotic_type.value_counts()
first_line_count = 0
for index_value in counts.index:
    if index_value in ['nitrofurantoin','trimethoprim-sulfamethoxazole']:
        first_line_count+= counts[index_value]
    else:
        print(index_value,"  ",counts[index_value])
        
print("first_line","  ",first_line_count)        

old cohort numbers
second_line    21140
alternatives    1427
first_line    35018


In [273]:
print("new_cohort_numbers")
filtered_cohort_n.antibiotic_type_0.value_counts()

new_cohort_numbers


first_line      35018
second_line     21140
alternatives     1427
Name: antibiotic_type_0, dtype: int64

In [276]:
filtered_cohort_n = filtered_cohort_n.drop('index',axis=1)
filtered_cohort_n = filtered_cohort_n.drop('level_0',axis=1)

In [277]:
#can only run this if you don't have an idx variable
df1 = filtered_cohort_n.reset_index().rename(columns={"index":'idx'})
df2 = filtered_cohort_n.antibiotic_type_0.str.get_dummies().reset_index().rename(columns={"index":'idx'})

In [278]:
filtered_cohort_n2 = df1.merge(df2,how='left',on='idx')

In [279]:
filtered_cohort_n2['all'] = 1

In [280]:
### Compute conditional antibiotic prevalencies

# Pseudocode:

#make two dictionaries one for prevalencies and another for column conditions
#prevalencies
#make dataframe with columns

#fill out prevalencies column

#loop through the antibiotic type variables
    #nested loop through the filtered_cohort columns

#loop through the prevalencies column


antibiotic_list = list(antibiotic_hist.keys())

antibiotic_preval_vars = antibiotic_list + antibiotic_prevalence_columns

antibiotic_columns = ['prevalence_variable'] + antibiotic_list + antibiotic_history_columns + ['any_antibiotic_0_6_months']
antibiotic_preval = pd.DataFrame(columns=antibiotic_columns)

# antibiotic_preval_vars.extend(['any_antibiotic_0_6_months','no_antibiotic_0_6_months'])
antibiotic_preval['prevalence_variable'] = antibiotic_preval_vars

antibiotic_columns = antibiotic_columns[1:]

#treatment vs antibiotic 6 month history 
# for antibiotic_prevalence_var in antibiotic_list:
#     for antibiotic_type in antibiotic_list:    
#         base_condition = (np.any(filtered_cohort_n[antibiotic_hist[antibiotic_type]], axis=1))
        
        
#         print(f"% received {antibiotic_prevalence_var} given {antibiotic_type} 0_to_6 months {(filtered_cohort_n.antibiotic_type_0.isin([antibiotic_prevalence_var]) & base_condition).sum() / base_condition.sum() * 100:.4f}")
#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == antibiotic_prevalence_var,antibiotic_type] = (filtered_cohort_n.antibiotic_type_0.isin([antibiotic_prevalence_var]) & base_condition).sum() / base_condition.sum()

        
antibiotic_0_month_conditions = antibiotic_preval_vars

updated_history_columns = antibiotic_history_columns + ['any_antibiotic_0_6_months','no_antibiotic_0_6_months']

antibiotic_0_month_conditions.insert(0,'prevalence_variable')
antibiotic_0_month_conditions.insert(1,'all')
new_df = pd.DataFrame(columns=antibiotic_0_month_conditions)
new_df['prevalence_variable'] = updated_history_columns
#BASE CONDITION IS 0 month CONDITION

for column in antibiotic_0_month_conditions[1:]:
           
#     if column in antibiotic_list: #you can ignore these two lines
#         continue
        
#         print("condition", column)

    base_condition = (filtered_cohort_n2[column] == 1)

    for prevalence_variable in antibiotic_history_columns:


    #     print("prevalence ",prevalence_variable)
    #         compare_condition = (filtered_cohort_n.antibiotic_type_0.isin([prevalence_variable]))
    #     else:



        #any condition
    #     base_condition = (np.any(filtered_cohort_n[antibiotic_history_columns], axis=1))
    #     prevalence = (compare_condition & base_condition).sum() / base_condition.sum()


    #     print(f"% received {prevalence_variable} given any 0_to_6 months {prevalence * 100:.4f}")
    #     antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,"any_antibiotic_0_6_months"] = prevalence
        #NEW CONDITIONS compare_condition  == Condition    
        compare_condition = (filtered_cohort_n2[prevalence_variable] == 1)
        # drug condition 

        #ANY PREVALENCE
    #     print("any prevalence")
        #OTHER PREVALENCIES base condition == prevalency  

    #         print("base",column,base_condition.sum())
    #         print("compare",prevalence_variable,compare_condition.sum())
        prevalence = (compare_condition & base_condition).sum() / base_condition.sum()
        

        print(f"% received {prevalence_variable} given {column} {prevalence * 100:.2f}")
#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,column] = prevalence
        new_df.loc[new_df['prevalence_variable'] == prevalence_variable,column] = prevalence
    #         break
    #     for antibiotic_type in antibiotic_list:    
    #         base_condition = (np.any(filtered_cohort[antibiotic_hist[antibiotic_type]], axis=1))
    #         print(f"% received {prevalence_variable} given {antibiotic_type} 0_to_6 months {(compare_condition & base_condition).sum() / base_condition.sum() * 100:.4f}")
    #         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,antibiotic_type] = (compare_condition & base_condition).sum() / base_condition.sum()
    


    for prevalence_variable in ['any_antibiotic_0_6_months','no_antibiotic_0_6_months']:
        if prevalence_variable.startswith('any'):
            compare_condition = (np.any(filtered_cohort_n2[antibiotic_history_columns], axis=1))
        else:
            compare_condition = ~(np.any(filtered_cohort_n2[antibiotic_history_columns], axis=1))

        # drug condition 

        #ANY PREVALENCE
    #     print("any prevalence")
        #OTHER PREVALENCIES base condition == prevalency  
        
        prevalence = (compare_condition & base_condition).sum() / base_condition.sum()


        print(f"% received {prevalence_variable} given {column} {prevalence * 100:.2f}")
        new_df.loc[new_df['prevalence_variable'] == prevalence_variable,column] = prevalence

#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,column] = prevalence
    #         break
    print()
    
    #new conditions to add
    #Add all
    
    #add first line
    
    
    #add second line
    
    #add alternatives

% received nitrofurantoin_0_6_months given all 4.49
% received trimethoprim-sulfamethoxazole_0_6_months given all 4.59
% received ciprofloxacin_0_6_months given all 5.49
% received ofloxacin_0_6_months given all 0.45
% received levofloxacin_0_6_months given all 2.24
% received amoxicillin-clavulanic acid_0_6_months given all 5.23
% received cefpodoxime_0_6_months given all 0.04
% received cefadroxil_0_6_months given all 0.18
% received any_antibiotic_0_6_months given all 19.25
% received no_antibiotic_0_6_months given all 80.75

% received nitrofurantoin_0_6_months given first_line 5.17
% received trimethoprim-sulfamethoxazole_0_6_months given first_line 4.83
% received ciprofloxacin_0_6_months given first_line 4.14
% received ofloxacin_0_6_months given first_line 0.45
% received levofloxacin_0_6_months given first_line 1.60
% received amoxicillin-clavulanic acid_0_6_months given first_line 5.11
% received cefpodoxime_0_6_months given first_line 0.03
% received cefadroxil_0_6_months gi

In [281]:
new_df.to_csv("antibiotic_prevalence_table_v7.csv")

In [217]:
### Compute conditional antibiotic prevalencies

# Pseudocode:

#make two dictionaries one for prevalencies and another for column conditions
#prevalencies
#make dataframe with columns

#fill out prevalencies column

#loop through the antibiotic type variables
    #nested loop through the filtered_cohort columns

#loop through the prevalencies column


antibiotic_list = list(antibiotic_hist.keys())

antibiotic_preval_vars = antibiotic_list + antibiotic_prevalence_columns

antibiotic_columns = ['prevalence_variable'] + antibiotic_list + antibiotic_history_columns + ['any_antibiotic_0_6_months']
antibiotic_preval = pd.DataFrame(columns=antibiotic_columns)

# antibiotic_preval_vars.extend(['any_antibiotic_0_6_months','no_antibiotic_0_6_months'])
antibiotic_preval['prevalence_variable'] = antibiotic_preval_vars

antibiotic_columns = antibiotic_columns[1:]

#treatment vs antibiotic 6 month history 
# for antibiotic_prevalence_var in antibiotic_list:
#     for antibiotic_type in antibiotic_list:    
#         base_condition = (np.any(filtered_cohort_n[antibiotic_hist[antibiotic_type]], axis=1))
        
        
#         print(f"% received {antibiotic_prevalence_var} given {antibiotic_type} 0_to_6 months {(filtered_cohort_n.antibiotic_type_0.isin([antibiotic_prevalence_var]) & base_condition).sum() / base_condition.sum() * 100:.4f}")
#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == antibiotic_prevalence_var,antibiotic_type] = (filtered_cohort_n.antibiotic_type_0.isin([antibiotic_prevalence_var]) & base_condition).sum() / base_condition.sum()

        
antibiotic_0_month_conditions = antibiotic_preval_vars

updated_history_columns = antibiotic_history_columns + ['any_antibiotic_0_6_months','no_antibiotic_0_6_months']

antibiotic_0_month_conditions.insert(0,'prevalence_variable')
antibiotic_0_month_conditions.insert(1,'all')
new_df = pd.DataFrame(columns=antibiotic_0_month_conditions)
new_df['prevalence_variable'] = updated_history_columns
#BASE CONDITION IS 0 month CONDITION

for column in antibiotic_0_month_conditions[1:]:
           
#     if column in antibiotic_list: #you can ignore these two lines
#         continue
        
#         print("condition", column)

    base_condition = (filtered_cohort_n2[column] == 1)

    for prevalence_variable in antibiotic_history_columns:


    #     print("prevalence ",prevalence_variable)
    #         compare_condition = (filtered_cohort_n.antibiotic_type_0.isin([prevalence_variable]))
    #     else:



        #any condition
    #     base_condition = (np.any(filtered_cohort_n[antibiotic_history_columns], axis=1))
    #     prevalence = (compare_condition & base_condition).sum() / base_condition.sum()


    #     print(f"% received {prevalence_variable} given any 0_to_6 months {prevalence * 100:.4f}")
    #     antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,"any_antibiotic_0_6_months"] = prevalence
        #NEW CONDITIONS compare_condition  == Condition    
        compare_condition = (filtered_cohort_n2[prevalence_variable] == 1)
        # drug condition 

        #ANY PREVALENCE
    #     print("any prevalence")
        #OTHER PREVALENCIES base condition == prevalency  

    #         print("base",column,base_condition.sum())
    #         print("compare",prevalence_variable,compare_condition.sum())
        prevalence = (compare_condition & base_condition).sum() / base_condition.sum()
        

        print(f"% received {prevalence_variable} given {column} {prevalence * 100:.2f}")
#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,column] = prevalence
        new_df.loc[new_df['prevalence_variable'] == prevalence_variable,column] = prevalence
    #         break
    #     for antibiotic_type in antibiotic_list:    
    #         base_condition = (np.any(filtered_cohort[antibiotic_hist[antibiotic_type]], axis=1))
    #         print(f"% received {prevalence_variable} given {antibiotic_type} 0_to_6 months {(compare_condition & base_condition).sum() / base_condition.sum() * 100:.4f}")
    #         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,antibiotic_type] = (compare_condition & base_condition).sum() / base_condition.sum()
    


    for prevalence_variable in ['any_antibiotic_0_6_months','no_antibiotic_0_6_months']:
        if prevalence_variable.startswith('any'):
            compare_condition = (np.any(filtered_cohort_n2[antibiotic_history_columns], axis=1))
        else:
            compare_condition = ~(np.any(filtered_cohort_n2[antibiotic_history_columns], axis=1))

        # drug condition 

        #ANY PREVALENCE
    #     print("any prevalence")
        #OTHER PREVALENCIES base condition == prevalency  
        
        prevalence = (compare_condition & base_condition).sum() / base_condition.sum()


        print(f"% received {prevalence_variable} given {column} {prevalence * 100:.2f}")
        new_df.loc[new_df['prevalence_variable'] == prevalence_variable,column] = prevalence

#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,column] = prevalence
    #         break
    print()
    
    #new conditions to add
    #Add all
    
    #add first line
    
    
    #add second line
    
    #add alternatives

% received nitrofurantoin_0_6_months given all 4.40
% received trimethoprim-sulfamethoxazole_0_6_months given all 4.52
% received ciprofloxacin_0_6_months given all 5.39
% received ofloxacin_0_6_months given all 0.45
% received levofloxacin_0_6_months given all 2.20
% received amoxicillin-clavulanic acid_0_6_months given all 5.19
% received cefpodoxime_0_6_months given all 0.03
% received cefadroxil_0_6_months given all 0.17
% received any_antibiotic_0_6_months given all 19.02
% received no_antibiotic_0_6_months given all 80.98

% received nitrofurantoin_0_6_months given first_line 5.08
% received trimethoprim-sulfamethoxazole_0_6_months given first_line 4.76
% received ciprofloxacin_0_6_months given first_line 4.09
% received ofloxacin_0_6_months given first_line 0.45
% received levofloxacin_0_6_months given first_line 1.57
% received amoxicillin-clavulanic acid_0_6_months given first_line 5.07
% received cefpodoxime_0_6_months given first_line 0.03
% received cefadroxil_0_6_months gi

In [220]:
filtered_cohort_n2[['all','first_line','second_line','alternatives']].sum()

all             56601
first_line      34429
second_line     20792
alternatives     1380
dtype: int64

In [218]:
new_df

Unnamed: 0,prevalence_variable,all,first_line,second_line,alternatives,nitrofurantoin_0_months,trimethoprim-sulfamethoxazole_0_months,ciprofloxacin_0_months,ofloxacin_0_months,levofloxacin_0_months,amoxicillin-clavulanic acid_0_months,cefpodoxime_0_months,cefadroxil_0_months
0,nitrofurantoin_0_6_months,0.043957,0.050771,0.031839,0.056522,0.061343,0.036617,0.030794,0.117647,0.039024,0.05848,0.016949,0.092308
1,trimethoprim-sulfamethoxazole_0_6_months,0.045229,0.047634,0.041025,0.048551,0.036836,0.062092,0.040295,0.0,0.046748,0.050961,0.033898,0.030769
2,ciprofloxacin_0_6_months,0.053939,0.040896,0.074885,0.063768,0.042519,0.038723,0.075566,0.117647,0.069512,0.061821,0.076271,0.076923
3,ofloxacin_0_6_months,0.00447,0.004473,0.004329,0.006522,0.004769,0.004076,0.004368,0.0,0.004065,0.007519,0.0,0.0
4,levofloxacin_0_6_months,0.021978,0.015713,0.031647,0.032609,0.014004,0.018003,0.024843,0.0,0.08252,0.033417,0.025424,0.030769
5,amoxicillin-clavulanic acid_0_6_months,0.05189,0.050655,0.049971,0.111594,0.051449,0.049592,0.049904,0.117647,0.05,0.123642,0.033898,0.030769
6,cefpodoxime_0_6_months,0.000336,0.00029,0.000289,0.002174,0.000304,0.000272,0.000164,0.0,0.00122,0.0,0.025424,0.0
7,cefadroxil_0_6_months,0.001696,0.001859,0.001299,0.003623,0.001776,0.00197,0.001201,0.0,0.002033,0.000835,0.0,0.061538
8,any_antibiotic_0_6_months,0.190156,0.180778,0.20075,0.264493,0.180121,0.181658,0.195359,0.294118,0.240244,0.274018,0.169492,0.261538
9,no_antibiotic_0_6_months,0.809844,0.819222,0.79925,0.735507,0.819879,0.818342,0.804641,0.705882,0.759756,0.725982,0.830508,0.738462


In [222]:
new_df.to_csv("antibiotic_prevalence_table_v6.csv")

In [133]:
### Compute conditional antibiotic prevalencies

# Pseudocode:

#make two dictionaries one for prevalencies and another for column conditions
#prevalencies
#make dataframe with columns

#fill out prevalencies column

#loop through the antibiotic type variables
    #nested loop through the filtered_cohort columns

#loop through the prevalencies column


antibiotic_list = list(antibiotic_hist.keys())

antibiotic_preval_vars = antibiotic_list + antibiotic_prevalence_columns

antibiotic_columns = ['prevalence_variable'] + antibiotic_list + antibiotic_history_columns + ['any_antibiotic_0_6_months']
antibiotic_preval = pd.DataFrame(columns=antibiotic_columns)
antibiotic_preval['prevalence_variable'] = antibiotic_preval_vars

antibiotic_columns = antibiotic_columns[1:]

#treatment vs antibiotic 6 month history 
# for antibiotic_prevalence_var in antibiotic_list:
#     for antibiotic_type in antibiotic_list:    
#         base_condition = (np.any(filtered_cohort_n[antibiotic_hist[antibiotic_type]], axis=1))
        
        
#         print(f"% received {antibiotic_prevalence_var} given {antibiotic_type} 0_to_6 months {(filtered_cohort_n.antibiotic_type_0.isin([antibiotic_prevalence_var]) & base_condition).sum() / base_condition.sum() * 100:.4f}")
#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == antibiotic_prevalence_var,antibiotic_type] = (filtered_cohort_n.antibiotic_type_0.isin([antibiotic_prevalence_var]) & base_condition).sum() / base_condition.sum()

        

#NEW CONDITIONS compare_condition  == Condition    

for prevalence_variable in antibiotic_history_columns:
    

#     print("prevalence ",prevalence_variable)
#         compare_condition = (filtered_cohort_n.antibiotic_type_0.isin([prevalence_variable]))
#     else:



    #any condition
#     base_condition = (np.any(filtered_cohort_n[antibiotic_history_columns], axis=1))
#     prevalence = (compare_condition & base_condition).sum() / base_condition.sum()
    
    
#     print(f"% received {prevalence_variable} given any 0_to_6 months {prevalence * 100:.4f}")
#     antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,"any_antibiotic_0_6_months"] = prevalence

    compare_condition = (filtered_cohort_n[prevalence_variable] == 1)
    # drug condition 
    
    #ANY PREVALENCE
#     print("any prevalence")
    #OTHER PREVALENCIES base condition == prevalency  
    for column in antibiotic_preval_vars:
        
        
        
        if column in antibiotic_list: #you can ignore these two lines
            continue
        
#         print("condition", column)

        base_condition = (filtered_cohort_n[column] == 1)
#         print("base",column,base_condition.sum())
#         print("compare",prevalence_variable,compare_condition.sum())
        prevalence = (compare_condition & base_condition).sum() / base_condition.sum()
                            
        
        print(f"% received {prevalence_variable} given {column} {prevalence * 100:.2f}")
        antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,column] = prevalence
#         break
#     for antibiotic_type in antibiotic_list:    
#         base_condition = (np.any(filtered_cohort[antibiotic_hist[antibiotic_type]], axis=1))
#         print(f"% received {prevalence_variable} given {antibiotic_type} 0_to_6 months {(compare_condition & base_condition).sum() / base_condition.sum() * 100:.4f}")
#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,antibiotic_type] = (compare_condition & base_condition).sum() / base_condition.sum()
    print()


for prevalence_variable in ['any 6 month history','no 6 month history']:
    if prevalence_variable.startswith('any'):
        compare_condition = (np.any(filtered_cohort_n[antibiotic_history_columns], axis=1))
    else:
        compare_condition = ~(np.any(filtered_cohort_n[antibiotic_history_columns], axis=1))

    # drug condition 
    
    #ANY PREVALENCE
#     print("any prevalence")
    #OTHER PREVALENCIES base condition == prevalency  
    for column in antibiotic_preval_vars:



        if column in antibiotic_list: #you can ignore these two lines
            continue

    #         print("condition", column)

        base_condition = (filtered_cohort_n[column] == 1)
    #         print("base",column,base_condition.sum())
    #         print("compare",prevalence_variable,compare_condition.sum())
        prevalence = (compare_condition & base_condition).sum() / base_condition.sum()


        print(f"% received {prevalence_variable} given {column} {prevalence * 100:.2f}")
#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,column] = prevalence
#         break
    print()

% received nitrofurantoin_0_6_months given nitrofurantoin_0_months 6.13
% received trimethoprim-sulfamethoxazole_0_6_months given nitrofurantoin_0_months 3.68
% received ciprofloxacin_0_6_months given nitrofurantoin_0_months 4.25
% received ofloxacin_0_6_months given nitrofurantoin_0_months 0.48
% received levofloxacin_0_6_months given nitrofurantoin_0_months 1.40
% received amoxicillin-clavulanic acid_0_6_months given nitrofurantoin_0_months 5.14
% received cefpodoxime_0_6_months given nitrofurantoin_0_months 0.03
% received cefadroxil_0_6_months given nitrofurantoin_0_months 0.18
% received any 6 month history given nitrofurantoin_0_months 18.01
% received no 6 month history given nitrofurantoin_0_months 81.99


In [118]:
filtered_cohort_n['nitrofurantoin_0_months'].sum()

19709

In [101]:
### Compute conditional antibiotic prevalencies

# Pseudocode:

#make two dictionaries one for prevalencies and another for column conditions
#prevalencies
#make dataframe with columns

#fill out prevalencies column

#loop through the antibiotic type variables
    #nested loop through the filtered_cohort columns

#loop through the prevalencies column


antibiotic_list = list(antibiotic_hist.keys())

antibiotic_preval_vars = antibiotic_list + antibiotic_prevalence_columns

antibiotic_columns = ['prevalence_variable'] + antibiotic_list + antibiotic_history_columns + ['any_antibiotic_0_6_months']
antibiotic_preval = pd.DataFrame(columns=antibiotic_columns)
antibiotic_preval['prevalence_variable'] = antibiotic_preval_vars

antibiotic_columns = antibiotic_columns[1:]

#treatment vs antibiotic 6 month history 
# for antibiotic_prevalence_var in antibiotic_list:
#     for antibiotic_type in antibiotic_list:    
#         base_condition = (np.any(filtered_cohort_n[antibiotic_hist[antibiotic_type]], axis=1))
        
        
#         print(f"% received {antibiotic_prevalence_var} given {antibiotic_type} 0_to_6 months {(filtered_cohort_n.antibiotic_type_0.isin([antibiotic_prevalence_var]) & base_condition).sum() / base_condition.sum() * 100:.4f}")
#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == antibiotic_prevalence_var,antibiotic_type] = (filtered_cohort_n.antibiotic_type_0.isin([antibiotic_prevalence_var]) & base_condition).sum() / base_condition.sum()

        

        
for prevalence_variable in antibiotic_preval_vars:
    
    
    if prevalence_variable in antibiotic_list: #you can ignore these two lines
        continue
#         compare_condition = (filtered_cohort_n.antibiotic_type_0.isin([prevalence_variable]))
#     else:

    #any condition
#     compare_condition = (filtered_cohort_n[prevalence_variable] == 1)
#     base_condition = (np.any(filtered_cohort_n[antibiotic_history_columns], axis=1))
#     prevalence = (compare_condition & base_condition).sum() / base_condition.sum()
    
    
#     print(f"% received {prevalence_variable} given any 0_to_6 months {prevalence * 100:.4f}")
#     antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,"any_antibiotic_0_6_months"] = prevalence
    
    # drug condition 
    for column in antibiotic_history_columns:
        base_condition = (filtered_cohort_n[column] == 1)
        prevalence = (compare_condition & base_condition).sum() / base_condition.sum()
                                       
        
        print(f"% received {prevalence_variable} given {column} {prevalence * 100:.2f}")
        antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,column] = prevalence

#     for antibiotic_type in antibiotic_list:    
#         base_condition = (np.any(filtered_cohort[antibiotic_hist[antibiotic_type]], axis=1))
#         print(f"% received {prevalence_variable} given {antibiotic_type} 0_to_6 months {(compare_condition & base_condition).sum() / base_condition.sum() * 100:.4f}")
#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,antibiotic_type] = (compare_condition & base_condition).sum() / base_condition.sum()
    break
    print()
    


% received nitrofurantoin_0_months given nitrofurantoin_0_6_months 0.24
% received nitrofurantoin_0_months given trimethoprim-sulfamethoxazole_0_6_months 0.08
% received nitrofurantoin_0_months given ciprofloxacin_0_6_months 0.16
% received nitrofurantoin_0_months given ofloxacin_0_6_months 0.00
% received nitrofurantoin_0_months given levofloxacin_0_6_months 0.16
% received nitrofurantoin_0_months given amoxicillin-clavulanic acid_0_6_months 0.07
% received nitrofurantoin_0_months given cefpodoxime_0_6_months 0.00
% received nitrofurantoin_0_months given cefadroxil_0_6_months 4.17


In [123]:
### Compute conditional antibiotic prevalencies

# Pseudocode:

#make two dictionaries one for prevalencies and another for column conditions
#prevalencies
#make dataframe with columns

#fill out prevalencies column

#loop through the antibiotic type variables
    #nested loop through the filtered_cohort columns

#loop through the prevalencies column


antibiotic_list = list(antibiotic_hist.keys())

antibiotic_preval_vars = antibiotic_list + antibiotic_prevalence_columns

antibiotic_columns = ['prevalence_variable'] + antibiotic_list + antibiotic_history_columns + ['any_antibiotic_0_6_months']
antibiotic_preval = pd.DataFrame(columns=antibiotic_columns)
antibiotic_preval['prevalence_variable'] = antibiotic_preval_vars

antibiotic_columns = antibiotic_columns[1:]

#treatment vs antibiotic 6 month history 
for antibiotic_prevalence_var in antibiotic_list:
    for antibiotic_type in antibiotic_list:    
        base_condition = (np.any(filtered_cohort_n[antibiotic_hist[antibiotic_type]], axis=1))
        
        
        print(f"% received {antibiotic_prevalence_var} given {antibiotic_type} 0_to_6 months {(filtered_cohort_n.antibiotic_type_0.isin([antibiotic_prevalence_var]) & base_condition).sum() / base_condition.sum() * 100:.4f}")
        antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == antibiotic_prevalence_var,antibiotic_type] = (filtered_cohort_n.antibiotic_type_0.isin([antibiotic_prevalence_var]) & base_condition).sum() / base_condition.sum()

        

        
for prevalence_variable in antibiotic_preval_vars:
    if prevalence_variable in antibiotic_list:
        compare_condition = (filtered_cohort_n.antibiotic_type_0.isin([prevalence_variable]))
    else:
        compare_condition = (filtered_cohort_n[prevalence_variable] == 1)
    base_condition = (np.any(filtered_cohort_n[antibiotic_history_columns], axis=1))
    prevalence = (compare_condition & base_condition).sum() / base_condition.sum()
    
    
    print(f"% received {prevalence_variable} given any 0_to_6 months {prevalence * 100:.4f}")
    antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,"any_antibiotic_0_6_months"] = prevalence
    print(antibiotic_history_columns)
    for column in antibiotic_history_columns:
        base_condition = (filtered_cohort_n[column] == 1)
        prevalence = (compare_condition & base_condition).sum() / base_condition.sum()
                                       
        
        print(f"% received {prevalence_variable} given {column} {prevalence * 100:.2f}")
        antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,column] = prevalence

#     for antibiotic_type in antibiotic_list:    
#         base_condition = (np.any(filtered_cohort[antibiotic_hist[antibiotic_type]], axis=1))
#         print(f"% received {prevalence_variable} given {antibiotic_type} 0_to_6 months {(compare_condition & base_condition).sum() / base_condition.sum() * 100:.4f}")
#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,antibiotic_type] = (compare_condition & base_condition).sum() / base_condition.sum()
    print()
    


% received first_line given first_line 0_to_6 months 66.7085
% received first_line given second_line 0_to_6 months 46.3119
% received first_line given alternatives 0_to_6 months 59.5661
% received second_line given first_line 0_to_6 months 30.4039
% received second_line given second_line 0_to_6 months 50.5823
% received second_line given alternatives 0_to_6 months 35.1085
% received alternatives given first_line 0_to_6 months 2.8876
% received alternatives given second_line 0_to_6 months 3.1057
% received alternatives given alternatives 0_to_6 months 5.3254
% received first_line given any 0_to_6 months 57.8277
% received first_line given nitrofurantoin_0_6_months 70.26
% received first_line given trimethoprim-sulfamethoxazole_0_6_months 64.06
% received first_line given ciprofloxacin_0_6_months 46.12
% received first_line given ofloxacin_0_6_months 60.87
% received first_line given levofloxacin_0_6_months 43.49
% received first_line given amoxicillin-clavulanic acid_0_6_months 59.38
% 

In [124]:
### Save to CSV
antibiotic_preval.to_csv("antibiotic_prevalence_table_v5.csv",index=False)

### End of Organized Cells

## Look at how Elizabeth generated nitrofurantoin_0_to_6_mo maybe?

In [14]:
in_name_1 = "manuscript_covariates_4_v4"
#out_name tbd
cohort_name = in_name_1

## Loading Dataframes

In [15]:
import matplotlib.pyplot as plt
import numpy as np
from typing import List, Dict, Optional, Union
import sys
import time
import importlib
import sparse
import datetime
import pandas as pd
import numpy as np
import scipy.sparse
import matplotlib.pyplot as plt
import os
#
sys.path.append("..")
import Utils.dbutils as dbutils
import Utils.data_utils as data_utils
import Generators.CohortGenerator as CohortGenerator
import Generators.FeatureGenerator as FeatureGenerator
import config
local_imports = (
    dbutils,
    data_utils,
    CohortGenerator,
    FeatureGenerator,
    config
)
for i in local_imports:
    i = importlib.reload(i)

In [5]:
## database connection parameters
# username = config.PG_USERNAME #we use peer authentication so don't need use vars, but in theory would pass them into config_path
# password = config.PG_PASSWORD
database_name = config.DB_NAME
print(database_name)
config_path = 'postgresql://{database_name}'.format(
    database_name = database_name
)
connect_args = {"host": '/var/run/postgresql/'} # connect_args to pass to sqlalchemy create_engine function

# schemas 
schema_name = 'eol_test_ncjones' # all created tables will be created using this schema
cdm_schema_name = config.OMOP_CDM_SCHEMA # the name of the schema housing your OMOP CDM tables
print(f"cdm schema: {cdm_schema_name}")
# caching
reset_schema = False # if true, rebuild all data from scratch

# set up database, reset schemas as needed
db = dbutils.Database(config_path, schema_name, connect_args, cdm_schema_name)
# if reset_schema:
#     db.execute(
#         'drop schema if exists {} cascade'.format(schema_name)
#     )
# db.execute(
#     'create schema if not exists {}'.format(schema_name)
# )

localhost/omop_v6
cdm schema: cdm_6871_21


In [6]:
%%time
# Get the full condition item table
sql = """
    select
        *
    from
        {omop_schema}.{cohort_name} c
""".format(
    omop_schema=config.OMOP_CDM_SCHEMA,
    cohort_name=cohort_name
)
cohort = db.query(sql)

CPU times: user 10.3 s, sys: 500 ms, total: 10.8 s
Wall time: 13.1 s


In [7]:
filtered_cohort = cohort.copy().loc[cohort.antibiotic_type!='inappropriate']

#### End of clean version

### Old way of doing it

In [16]:
%%time
# Get the full condition item table
sql = """select * from {omop_schema}.concept
                                 where domain_id = 'Drug'""".format(
    omop_schema=config.OMOP_CDM_SCHEMA,
)
all_drugs = db.query(sql)

CPU times: user 16.9 s, sys: 1.85 s, total: 18.7 s
Wall time: 20.7 s


In [137]:
%%time


sql = """SELECT
                                uti.index,
                                uti.condition_occurrence_id, 
                                uti.person_id,
                                uti.condition_start_date,
                                drug.drug_concept_id,
                                drug.drug_exposure_id,
                                drug.drug_exposure_start_date,
                                drug.drug_exposure_start_datetime
                           FROM {omop_schema}.manuscript_covariates_8 uti
                           JOIN {omop_schema}.drug_exposure drug ON 
                                    drug.person_id = uti.person_id AND
                                    (uti.condition_start_date >= drug.drug_exposure_start_date)
                           ORDER BY 
                                    uti.condition_occurrence_id
                        """.format(
    omop_schema=config.OMOP_CDM_SCHEMA,
)
drug_exposures = db.query(sql)

CPU times: user 23.3 s, sys: 1.55 s, total: 24.9 s
Wall time: 1min 16s


In [145]:
drug_exposures

Unnamed: 0,index,condition_occurrence_id,person_id,condition_start_date,drug_concept_id,drug_exposure_id,drug_exposure_start_date,drug_exposure_start_datetime
0,115318,3463305,492555,2012-12-02,1797515,11827113,2012-12-02,2012-12-02
1,115318,3463305,492555,2012-12-02,19049106,11826991,2012-12-02,2012-12-02
2,115318,4246893,11335603,2012-11-17,1797515,89928629,2012-11-17,2012-11-17
3,115318,4246893,11335603,2012-11-17,1549218,89928622,2012-11-17,2012-11-17
4,115318,4304030,838549,2013-02-18,1797515,18396999,2013-02-18,2013-02-18
...,...,...,...,...,...,...,...,...
4534298,115391,566217433,1737091,2020-12-11,19080217,32724060,2020-03-12,2020-03-12
4534299,115391,566217433,1737091,2020-12-11,19080217,32724046,2020-10-01,2020-10-01
4534300,115391,566217433,1737091,2020-12-11,46275349,32724058,2017-12-29,2017-12-29
4534301,115391,566217433,1737091,2020-12-11,1713694,32724059,2016-06-20,2016-06-20


In [18]:
## I noticed a problem with the presence of quinolone, it was counted as cephalexin, which means it was in alternatives and second line

In [138]:
%%time
# First Line
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("nitrofurantoin"), 'antibiotic_name'] = "nitrofurantoin"
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("sulfamethoxazole"), 'antibiotic_name'] = "sulfamethoxazole"
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("trimethoprim"), 'antibiotic_name'] = "trimethoprim"
all_drugs.loc[(all_drugs["concept_name"].str.lower().str.contains("trimethoprim")) & \
                             (all_drugs["concept_name"].str.lower().str.contains("sulfamethoxazole")), 'antibiotic_name'] = "trimethoprim-sulfamethoxazole"


#alternatives
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cephalexin"), 'antibiotic_name'] = "cephalexin"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("quinolone"), 'antibiotic_name'] = "quinolone"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("ceftriaxone"), 'antibiotic_name'] = "ceftriaxone"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefuroxime"), 'antibiotic_name'] = "cefuroxime"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefdinir"), 'antibiotic_name'] = "cefdinir"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefazolin"), 'antibiotic_name'] = "cefazolin"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefepime"), 'antibiotic_name'] = "cefepime"
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefpodoxime"), 'antibiotic_name'] = "cefpodoxime"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefixime"), 'antibiotic_name'] = "cefixime"
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("cefadroxil"), 'antibiotic_name'] = "cefadroxil"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("amoxicillin"), 'antibiotic_name'] = "amoxicillin"
all_drugs.loc[(all_drugs["concept_name"].str.lower().str.contains("amoxicillin")) & \
                             (all_drugs["concept_name"].str.lower().str.contains("clavulan")), 'antibiotic_name'] = "amoxicillin-clavulanic acid"


        



# # second line
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("ofloxacin"), 'antibiotic_name'] = "ofloxacin"
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("levofloxacin"), 'antibiotic_name'] = "levofloxacin"
all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("ciprofloxacin"), 'antibiotic_name'] = "ciprofloxacin"

# # Inappropriate

# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("fluconazole"), 'antibiotic_name'] = "fluconazole"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("doxycycline"), 'antibiotic_name'] = "doxycycline"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("clindamycin"), 'antibiotic_name'] = "clindamycin"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("linezolid"), 'antibiotic_name'] = "linezolid"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("metronidazole"), 'antibiotic_name'] = "metronidazole"
# all_drugs.loc[all_drugs["concept_name"].str.lower().str.contains("azithromycin"), 'antibiotic_name'] = "azithromycin"


all_drugs = all_drugs.rename(columns={"concept_id": "drug_concept_id", "concept_name": "drug_name"})

all_drugs = all_drugs[~all_drugs.antibiotic_name.isna()]

KeyError: 'concept_name'

In [139]:
antibiotic_hist = dict()
for category, antibiotics in antibiotic_categories.items():
    antibiotic_list = []
    for antibiotic in antibiotics:
        antibiotic_list.append(antibiotic + '_0_6_months')
    antibiotic_hist[category] = antibiotic_list
antibiotic_hist

{'first_line': ['nitrofurantoin_0_6_months',
  'trimethoprim-sulfamethoxazole_0_6_months'],
 'second_line': ['ciprofloxacin_0_6_months',
  'ofloxacin_0_6_months',
  'levofloxacin_0_6_months'],
 'alternatives': ['amoxicillin-clavulanic acid_0_6_months',
  'cefpodoxime_0_6_months',
  'cefadroxil_0_6_months']}

In [140]:
# 1st line: nitrofurantoin, trimethoprim-sulfamethoxazole
# 2nd line: ofloxacin, ciprofloxacin and levofloxacin
# Alternative: amoxicillin-clavulanate, cefadroxil, and cefpodoxime
antibiotic_categories = {
'first_line' : ["nitrofurantoin","trimethoprim-sulfamethoxazole"],
"second_line" : ["ciprofloxacin","ofloxacin","levofloxacin"],
"alternatives" : ["amoxicillin-clavulanic acid","cefpodoxime","cefadroxil"]
}
all_antibiotics = [j for x in antibiotic_categories.values() for j in x]
# antibiotic_categories.update({'all':all_antibiotics})

In [146]:
%%time
#psuedocode
# Join the data frame on the drug concept id 

antibiotic_df = drug_exposures.merge(all_drugs[['drug_concept_id','drug_name','antibiotic_name']],how='left',on='drug_concept_id')
antibiotic_df = antibiotic_df.loc[~antibiotic_df.antibiotic_name.isnull()]
  

CPU times: user 1.71 s, sys: 598 ms, total: 2.31 s
Wall time: 2.31 s


In [142]:
all_antibiotics

['nitrofurantoin',
 'trimethoprim-sulfamethoxazole',
 'ciprofloxacin',
 'ofloxacin',
 'levofloxacin',
 'amoxicillin-clavulanic acid',
 'cefpodoxime',
 'cefadroxil']

In [115]:
%%time
from datetime import timedelta
## create columns for 0_to_6_months for each drug except the first line treatments and iterate through person

antibiotic_column_names = [x + '_0_6_months' for x in all_antibiotics]
filtered_cohort = cohort.copy().loc[cohort.antibiotic_type!='inappropriate']
# filtered_cohort = cohort.copy()

temp_anti_df = antibiotic_df.loc[antibiotic_df.antibiotic_name.isin(all_antibiotics) & (antibiotic_df.person_id.isin(filtered_cohort.person_id))]
for column in antibiotic_column_names:
    filtered_cohort[column] = 0


##populate them with the value of interest 
total = len(filtered_cohort)
cnt = 0
for idx,row in filtered_cohort.iterrows():
    date = row['condition_start_date']
    date_query = date - timedelta(days=180)
    person = row['person_id']
    person_antibiotic_df = temp_anti_df.loc[(temp_anti_df.person_id == person)]
    person_antibiotic_df = person_antibiotic_df.loc[(person_antibiotic_df.drug_exposure_start_date >= date_query) &\
                                        (person_antibiotic_df.drug_exposure_start_date < date)]
    for antibiotic_col in antibiotic_column_names:
        antibiotic = antibiotic_col.split('_')[0]
        if antibiotic in np.unique(person_antibiotic_df.antibiotic_name.values):
            filtered_cohort.loc[idx,antibiotic_col] = 1

#     return filtered_cohort
    cnt+=1
    if cnt % (total // 10) == 0:
        print(f"{cnt/total:.1}")

0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1e+00
CPU times: user 2min 21s, sys: 578 ms, total: 2min 22s
Wall time: 2min 22s


In [147]:
import pdb
from datetime import timedelta
## create columns for 0_to_6_months for each drug except the first line treatments and iterate through person

antibiotic_column_names = [x + '_0_months' for x in all_antibiotics]
filtered_cohort = filtered_cohort.copy().loc[cohort.antibiotic_type!='inappropriate']
# filtered_cohort = cohort.copy()

temp_anti_df = antibiotic_df.loc[antibiotic_df.antibiotic_name.isin(all_antibiotics) & (antibiotic_df.person_id.isin(filtered_cohort.person_id))]
for column in antibiotic_column_names:
    filtered_cohort[column] = 0


##populate them with the value of interest 
total = len(filtered_cohort)
cnt = 0
for idx,row in filtered_cohort.iterrows():
    date = row['condition_start_date']
    date_query = date - timedelta(days=180)
    person = row['person_id']
    person_antibiotic_df = temp_anti_df.loc[(temp_anti_df.person_id == person)]
#     person_antibiotic_df = person_antibiotic_df.loc[(person_antibiotic_df.drug_exposure_start_date >= date_query) &\
#                                         (person_antibiotic_df.drug_exposure_start_date < date)]
    person_antibiotic_df = person_antibiotic_df.loc[(person_antibiotic_df.drug_exposure_start_date == date)]
    for antibiotic_col in antibiotic_column_names:
        antibiotic = antibiotic_col.split('_')[0]
        if antibiotic in np.unique(person_antibiotic_df.antibiotic_name.values):
            filtered_cohort.loc[idx,antibiotic_col] = 1

#     return filtered_cohort
    cnt+=1
    if cnt % (total // 10) == 0:
        print(f"{cnt/total:.1}")
antibiotic_column_names = [x + '_0_months' for x in all_antibiotics] + [x + '_0_6_months' for x in all_antibiotics]


0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1e+00


In [143]:
filtered_cohort['nitrofurantoin_0_months'].sum()

2349

In [150]:
#after updating the drug exposures table to be inclusive of day 0
filtered_cohort['trimethoprim-sulfamethoxazole_0_months'].sum()

12622

In [26]:
filtered_cohort

Unnamed: 0,level_0,index,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,visit_occurrence_id,...,less_30,less_90,nitrofurantoin_0_6_months,trimethoprim-sulfamethoxazole_0_6_months,ciprofloxacin_0_6_months,ofloxacin_0_6_months,levofloxacin_0_6_months,amoxicillin-clavulanic acid_0_6_months,cefpodoxime_0_6_months,cefadroxil_0_6_months
0,0,3828990,465004090,101,81902,2019-04-28,2019-04-28,0,0,914389,...,0,0,0,0,0,0,0,0,0,0
1,1,3828990,148592964,481,81902,2015-06-09,2015-06-09,0,0,925277,...,0,0,0,0,0,0,0,0,0,0
2,2,3828990,247776427,481,37018854,2016-09-13,2016-09-13,0,0,925268,...,0,0,0,0,0,0,0,0,0,0
3,3,3828990,455153289,658,81902,2019-06-01,2019-06-01,0,0,929640,...,0,0,1,0,0,0,0,0,0,0
4,4,115391,565954572,658,37018854,2021-01-14,2021-01-14,0,0,929764,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68497,68497,3828990,410361010,34041930,81902,2018-10-30,2018-10-30,0,0,160758917,...,0,0,0,0,0,0,0,0,0,0
68498,68498,3828990,444880147,34177663,37018854,2019-02-23,2019-02-23,0,0,160795192,...,0,0,0,0,0,0,0,0,0,0
68499,68499,115391,547287974,34234398,37018854,2020-08-24,2020-08-24,0,0,160796494,...,0,0,0,0,0,0,0,0,0,0
68501,68501,3828990,515586779,34507232,37018854,2019-12-07,2019-12-07,0,0,160803610,...,0,0,0,0,0,0,0,0,0,0


In [32]:
antibiotic_column_names += ['nitrofurantoin_0_to_6_mo','trimethoprim-sulfamethoxazole_0_to_6_mo']

## New way

In [27]:
antibiotic_categories = {
'first_line' : ["nitrofurantoin","trimethoprim-sulfamethoxazole"],
"second_line" : ["ciprofloxacin","ofloxacin","levofloxacin"],
"alternatives" : ["amoxicillin-clavulanic acid","cefpodoxime","cefadroxil"]
}
all_antibiotics = [j for x in antibiotic_categories.values() for j in x]

antibiotic_column_names = [x + '_0_to_6_mo' for x in all_antibiotics]


In [29]:
antibiotic_column_names = [x + '_0_6_months' for x in all_antibiotics]

In [19]:
list(filtered_cohort.columns)

['level_0',
 'index',
 'condition_occurrence_id',
 'person_id',
 'condition_concept_id',
 'condition_start_date',
 'condition_start_datetime',
 'condition_end_date',
 'condition_end_datetime',
 'visit_occurrence_id',
 'visit_detail_id',
 'drug_concept_id',
 'drug_name',
 'antibiotic_name',
 'antibiotic_type',
 'visit_provider_id',
 'drug_exposure_id',
 'drug_exposure_start_date',
 'drug_exposure_start_datetime',
 'provider_id',
 'provider_name',
 'npi',
 'no_previous_180_day_event',
 'no_two_previous_365_day_event',
 'post_UTI_codes',
 'recurrent_uti',
 'first_uti',
 'previous_uti',
 'previous_utis',
 'days_since_previous_uti',
 'previous_uti_condition_occurence_id',
 'previous_uti_recurrent',
 'multi',
 'year_of_birth',
 'age',
 'no_previous_180_excluded_event',
 'no_previous_excluded_event_ever',
 'nitrofurantoin_switch_ever',
 'fosfomycin_switch_ever',
 'trimethoprim-sulfamethoxazole_switch_ever',
 'second_line_switch_ever',
 'alternatives_switch_ever',
 'inappropriate_switch_ever',

In [30]:
for col_name in antibiotic_column_names:
    try:
        filtered_cohort[col_name]
        print(col_name,"is in columns")
    except KeyError:
        print(col_name,"is not in columns")

nitrofurantoin_0_6_months is in columns
trimethoprim-sulfamethoxazole_0_6_months is in columns
ciprofloxacin_0_6_months is in columns
ofloxacin_0_6_months is in columns
levofloxacin_0_6_months is in columns
amoxicillin-clavulanic acid_0_6_months is in columns
cefpodoxime_0_6_months is in columns
cefadroxil_0_6_months is in columns


In [33]:
filtered_cohort[antibiotic_column_names].sum().T

nitrofurantoin_0_6_months                   2843
trimethoprim-sulfamethoxazole_0_6_months    2972
ciprofloxacin_0_6_months                    3574
ofloxacin_0_6_months                         306
levofloxacin_0_6_months                     1442
amoxicillin-clavulanic acid_0_6_months      3371
cefpodoxime_0_6_months                        26
cefadroxil_0_6_months                        116
nitrofurantoin_0_to_6_mo                    2889
trimethoprim-sulfamethoxazole_0_to_6_mo     3261
dtype: int64

In [25]:
filtered_cohort[antibiotic_column_names].sum().T

nitrofurantoin_0_6_months                   2843
trimethoprim-sulfamethoxazole_0_6_months    2972
ciprofloxacin_0_6_months                    3574
ofloxacin_0_6_months                         306
levofloxacin_0_6_months                     1442
amoxicillin-clavulanic acid_0_6_months      3371
cefpodoxime_0_6_months                        26
cefadroxil_0_6_months                        116
dtype: int64

In [10]:
# filtered_cohort[antibiotic_column_names].sum().T

In [9]:
# filtered_cohort[antibiotic_column_names].sum().T

In [8]:
# cohort[cols_0_to_6_mo].sum().T

In [26]:
# cols_0_to_6_mo = [column for column in cohort.columns if '0_to_6_mo' in column]

In [None]:
# num_f_line = cohort.loc[((cohort.antibiotic_type=='nitrofurantoin') | (cohort.antibiotic_type=='trimethoprim-sulfamethoxazole'))].shape[0]
# print("% received first line given first line 0_to_6 months {:.2f}".format(cohort.loc[((cohort.antibiotic_type=='nitrofurantoin') | (cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & ((cohort.nitrofurantoin_0_to_6_mo == 1) | (cohort["trimethoprim-sulfamethoxazole_0_to_6_mo"] == 1))].shape[0]/num_f_line*100))
# print(f"% received first line given any 0_to_6 months {cohort.loc[((cohort.antibiotic_type=='nitrofurantoin') | (cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (np.any(cohort[cols_0_to_6_mo],axis=1))].shape[0]/num_f_line*100:.3}")
# for category,antibiotics in antibiotic_categories.items():
#     print(category)
#     if category == 'first_line':
#         columns = ['nitrofurantoin_0_to_6_mo','trime']
#     else:
#         columns = [x + '_0_6_months' for x in antibiotics]
#     for antibiotic in antibiotics:

#     print("% received first line given {0} {1:.2f}".format(column, filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (filtered_cohort[column] == 1)].shape[0]/num_f_line*100))
# # print()
# # for column in cols_0_to_6_mo:
# #     print("% received first line given {0} {1:.2f}".format(column, filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (filtered_cohort[column] == 1)].shape[0]/num_f_line*100))

# print()
# # num_s_line = cohort.loc[(cohort.antibiotic_type=='second_line')].shape[0]
# # print(f"% received second line given second line 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='second_line') & (cohort.second_line_0_to_6_mo == 1)].shape[0]/num_s_line*100:.3}")
# # print(f"% received second line given any 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='second_line') & (np.any(cohort[cols_0_to_6_mo],axis=1))].shape[0]/num_s_line*100:.3}")
# # print()
# # num_alt = cohort.loc[(cohort.antibiotic_type=='alternatives')].shape[0]
# # print(f"% received alternatives given alternatives 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='alternatives') & (cohort.alternatives_0_to_6_mo == 1)].shape[0]/num_alt*100:.3}")
# print(f"% received alternatives given any 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='alternatives') & (np.any(cohort[cols_0_to_6_mo],axis=1))].shape[0]/num_alt*100:.3}")

In [39]:
# 1823+1832

3655

In [40]:
# antibiotic_column_names

['ciprofloxacin_0_6_months',
 'ofloxacin_0_6_months',
 'levofloxacin_0_6_months',
 'amoxicillin-clavulanic acid_0_6_months',
 'cefpodoxime_0_6_months',
 'cefadroxil_0_6_months',
 'nitrofurantoin_0_to_6_mo',
 'trimethoprim-sulfamethoxazole_0_to_6_mo']

In [43]:
# num_f_line = cohort.loc[((cohort.antibiotic_type=='nitrofurantoin') | (cohort.antibiotic_type=='trimethoprim-sulfamethoxazole'))].shape[0]
# print("% received first line given first line 0_to_6 months {:.2f}".format(filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & ((filtered_cohort.nitrofurantoin_0_to_6_mo == 1) | (filtered_cohort["trimethoprim-sulfamethoxazole_0_to_6_mo"] == 1))].shape[0]))
# print("% received first line given trimethoprim 0_to_6 months {:.2f}".format(filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & ((filtered_cohort["trimethoprim-sulfamethoxazole_0_to_6_mo"] == 1))].shape[0]))
# print("% received first line given nitrofurantoin 0_to_6 months {:.2f}".format(filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & ((filtered_cohort.nitrofurantoin_0_to_6_mo == 1))].shape[0]))



# print(f"% received first line given any 0_to_6 months {cohort.loc[((cohort.antibiotic_type=='nitrofurantoin') | (cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (np.any(cohort[cols_0_to_6_mo],axis=1))].shape[0]}")
# for column in antibiotic_column_names:
#     print("% received first line given {0} {1:.2f}".format(column, filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (filtered_cohort[column] == 1)].shape[0]))
# print("% received first line given {0} {1:.2f}".format(column, filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (filtered_cohort[column] == 1)].shape[0]))

# print()
# num_s_line = cohort.loc[(cohort.antibiotic_type=='second_line')].shape[0]
# print(f"% received second line given second line 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='second_line') & (cohort.second_line_0_to_6_mo == 1)].shape[0]/num_s_line*100:.3}")
# print(f"% received second line given any 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='second_line') & (np.any(cohort[cols_0_to_6_mo],axis=1))].shape[0]/num_s_line*100:.3}")
# print()
# num_alt = cohort.loc[(cohort.antibiotic_type=='alternatives')].shape[0]
# print(f"% received alternatives given alternatives 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='alternatives') & (cohort.alternatives_0_to_6_mo == 1)].shape[0]/num_alt*100:.3}")
# print(f"% received alternatives given any 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='alternatives') & (np.any(cohort[cols_0_to_6_mo],axis=1))].shape[0]/num_alt*100:.3}")

% received first line given first line 0_to_6 months 3439.00
% received first line given trimethoprim 0_to_6 months 1832.00
% received first line given nitrofurantoin 0_to_6 months 1823.00


In [None]:
# antibiotic_column_names = [x + '_0_6_months' for x in all_antibiotics]


In [44]:
# print("% received first line given first line 0_to_6 months {:.2f}".format(filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & ((filtered_cohort.nitrofurantoin_0_to_6_mo == 1) & (filtered_cohort["trimethoprim-sulfamethoxazole_0_to_6_mo"] == 1))].shape[0]))


% received first line given first line 0_to_6 months 216.00


In [45]:
# 1832+1823 - 216

3439

In [51]:
# 1454+156+558

2168

In [None]:
## cluster it into groups and compute it for each group and any or just redo and exclude inappropriates 

## confirm that my own computed features approximates the first line feature...

In [31]:
antibiotic_column_names

['nitrofurantoin_0_6_months',
 'trimethoprim-sulfamethoxazole_0_6_months',
 'ciprofloxacin_0_6_months',
 'ofloxacin_0_6_months',
 'levofloxacin_0_6_months',
 'amoxicillin-clavulanic acid_0_6_months',
 'cefpodoxime_0_6_months',
 'cefadroxil_0_6_months']

In [65]:
antibiotic_hist

{'first_line': ['nitrofurantoin_0_6_months',
  'trimethoprim-sulfamethoxazole_0_6_months'],
 'second_line': ['ciprofloxacin_0_6_months',
  'ofloxacin_0_6_months',
  'levofloxacin_0_6_months'],
 'alternatives': ['amoxicillin-clavulanic acid_0_6_months',
  'cefpodoxime_0_6_months',
  'cefadroxil_0_6_months']}

In [None]:
#pseudocode:
Inputs: antibiotic_treat + antibiotic_history grouped and a separate list of lists called any and antibiotic group

In [123]:
antibiotic_preval_values

['first_line',
 'second_line',
 'alternatives',
 'nitrofurantoin_0_6_months',
 'trimethoprim-sulfamethoxazole_0_6_months',
 'ciprofloxacin_0_6_months',
 'ofloxacin_0_6_months',
 'levofloxacin_0_6_months',
 'amoxicillin-clavulanic acid_0_6_months',
 'cefpodoxime_0_6_months',
 'cefadroxil_0_6_months']

In [120]:
#prevalencies
# first_line_treatment, second_line_treatment, alternatives_treatment (indexing antibiotic_type)

# all drug history, nitrofurantoin history (indexing )

# # conditional #6 month
# first_line history, second_line history, alternatives history, any drug history, nitrofurantoin all_drug_history,


# #columns
# Prevalencies, conditional data

# #rows
# len(prevalencies_column) + 1


# Pseudocode:

#make two dictionaries one for prevalencies and another for column conditions
#prevalencies
#make dataframe with columns

#fill out prevalencies column

#loop through the antibiotic type variables
    #nested loop through the filtered_cohort columns

#loop through the prevalencies column
    
    
#psuedocode:

#create an empty pandas dataframe


#create dataframe columns for the conditionals, keys, values into a dictionary
antibiotic_hist = {'first_line': (['nitrofurantoin','trimethoprim-sulfamethoxazole'],['nitrofurantoin_0_6_months',
  'trimethoprim-sulfamethoxazole_0_6_months']),
 'second_line': (['second_line'],['ciprofloxacin_0_6_months',
  'ofloxacin_0_6_months',
  'levofloxacin_0_6_months']),
 'alternatives': (['alternatives'],['amoxicillin-clavulanic acid_0_6_months',
  'cefpodoxime_0_6_months',
  'cefadroxil_0_6_months'])}

antibiotic_list = list(antibiotic_hist.keys())
a_6_month_hist_names = ['nitrofurantoin_0_6_months','trimethoprim-sulfamethoxazole_0_6_months','ciprofloxacin_0_6_months','ofloxacin_0_6_months','levofloxacin_0_6_months','amoxicillin-clavulanic acid_0_6_months','cefpodoxime_0_6_months','cefadroxil_0_6_months']
antibiotic_preval_vars = antibiotic_list + a_6_month_hist_names 

antibiotic_columns = ['prevalence_variable'] + antibiotic_preval_values + ['any_antibiotic_0_6_months']
antibiotic_preval = pd.DataFrame(columns=antibiotic_columns)
antibiotic_preval['prevalence_variable'] =antibiotic_preval_vars

antibiotic_columns = antibiotic_columns[1:]
# antibiotic_columns = [x + '0_6_months' if x in antibiotic_list else x for x in antibiotic_columns]

# { (some_key if condition else default_key):(something_if_true if condition
#           else something_if_false) for key, value in dict_.items() }

#treatment vs antibiotic 6 month history 
for antibiotic_type in antibiotic_list:    
    base_condition = (np.any(filtered_cohort[antibiotic_hist[antibiotic_type][1]], axis=1))
    print(f"% received {antibiotic_type} given {antibiotic_type} 0_to_6 months {(filtered_cohort.antibiotic_type.isin(antibiotic_hist[antibiotic_type][0]) & base_condition).sum() / base_condition.sum() * 100:.4f}")
    antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == antibiotic_type,antibiotic_type] = (filtered_cohort.antibiotic_type.isin(antibiotic_hist[antibiotic_type][0]) & base_condition).sum() / base_condition.sum()
for prevalence_variable in antibiotic_preval_vars:
    if prevalence_variable in antibiotic_list:
        compare_condition = (filtered_cohort.antibiotic_type.isin(antibiotic_hist[prevalence_variable][0]))
    else:
        compare_condition = (filtered_cohort[prevalence_variable] == 1)
    base_condition = (np.any(filtered_cohort[a_6_month_hist_names], axis=1))
    print(f"% received {prevalence_variable} given any 0_to_6 months {(compare_condition & base_condition).sum() / base_condition.sum() * 100:.4f}")
    antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,"any_antibiotic_0_6_months"] = (compare_condition & base_condition).sum() / base_condition.sum()
    
    for column in a_6_month_hist_names:
        base_condition = (filtered_cohort[column] == 1)
        print(f"% received {prevalence_variable} given {column} {(compare_condition & base_condition).sum() / base_condition.sum() * 100:.2f}")
        antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,column] = (compare_condition & base_condition).sum() / base_condition.sum()

#     for antibiotic_type in antibiotic_list:    
#         base_condition = (np.any(filtered_cohort[antibiotic_hist[antibiotic_type][1]], axis=1))
#         print(f"% received {prevalence_variable} given {antibiotic_type} 0_to_6 months {(compare_condition & base_condition).sum() / base_condition.sum() * 100:.4f}")
#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,antibiotic_type] = (compare_condition & base_condition).sum() / base_condition.sum()
    print()
    


% received first_line given first_line 0_to_6 months 59.3744
% received second_line given second_line 0_to_6 months 50.3423
% received alternatives given alternatives 0_to_6 months 10.5684
% received first_line given any 0_to_6 months 51.2280
% received first_line given nitrofurantoin_0_6_months 63.14
% received first_line given trimethoprim-sulfamethoxazole_0_6_months 56.29
% received first_line given ciprofloxacin_0_6_months 40.68
% received first_line given ofloxacin_0_6_months 50.98
% received first_line given levofloxacin_0_6_months 38.70
% received first_line given amoxicillin-clavulanic acid_0_6_months 52.39
% received first_line given cefpodoxime_0_6_months 38.46
% received first_line given cefadroxil_0_6_months 56.90

% received second_line given any 0_to_6 months 39.9227
% received second_line given nitrofurantoin_0_6_months 28.46
% received second_line given trimethoprim-sulfamethoxazole_0_6_months 35.70
% received second_line given ciprofloxacin_0_6_months 50.53
% received 

In [134]:
((filtered_cohort['nitrofurantoin_0_months'] == 1) & (filtered_cohort['nitrofurantoin_0_6_months'] == 1)).sum()

174

In [155]:
antibiotic_hist

{'first_line': (['nitrofurantoin', 'trimethoprim-sulfamethoxazole'],
  ['nitrofurantoin_0_6_months', 'trimethoprim-sulfamethoxazole_0_6_months']),
 'second_line': (['second_line'],
  ['ciprofloxacin_0_6_months',
   'ofloxacin_0_6_months',
   'levofloxacin_0_6_months']),
 'alternatives': (['alternatives'],
  ['amoxicillin-clavulanic acid_0_6_months',
   'cefpodoxime_0_6_months',
   'cefadroxil_0_6_months'])}

In [161]:
filtered_cohort['antibiotic_type_0'] = 0

filtered_cohort.loc[np.any(filtered_cohort[['nitrofurantoin_0_months', 'trimethoprim-sulfamethoxazole_0_months']],axis=1),'antibiotic_type_0'] = 'first_line'
filtered_cohort.loc[np.any(filtered_cohort[['ciprofloxacin_0_months', 'ofloxacin_0_months','levofloxacin_0_months']],axis=1),'antibiotic_type_0'] = 'second_line'
filtered_cohort.loc[np.any(filtered_cohort[['amoxicillin-clavulanic acid_0_months', 'cefpodoxime_0_months','cefadroxil_0_months']],axis=1),'antibiotic_type_0'] = 'alternatives'


In [157]:
antibiotic_list

['first_line', 'second_line', 'alternatives']

In [169]:
filtered_cohort.shape

(65881, 413)

In [164]:
#prevalencies
# first_line_treatment, second_line_treatment, alternatives_treatment (indexing antibiotic_type)

# all drug history, nitrofurantoin history (indexing )

# # conditional #6 month
# first_line history, second_line history, alternatives history, any drug history, nitrofurantoin all_drug_history,


# #columns
# Prevalencies, conditional data

# #rows
# len(prevalencies_column) + 1


# Pseudocode:

#make two dictionaries one for prevalencies and another for column conditions
#prevalencies
#make dataframe with columns

#fill out prevalencies column

#loop through the antibiotic type variables
    #nested loop through the filtered_cohort columns

#loop through the prevalencies column
    
    
#psuedocode:

#create an empty pandas dataframe


#create dataframe columns for the conditionals, keys, values into a dictionary
antibiotic_hist = {'first_line': (['nitrofurantoin','trimethoprim-sulfamethoxazole'],['nitrofurantoin_0_6_months',
  'trimethoprim-sulfamethoxazole_0_6_months']),
 'second_line': (['second_line'],['ciprofloxacin_0_6_months',
  'ofloxacin_0_6_months',
  'levofloxacin_0_6_months']),
 'alternatives': (['alternatives'],['amoxicillin-clavulanic acid_0_6_months',
  'cefpodoxime_0_6_months',
  'cefadroxil_0_6_months'])}

antibiotic_list = list(antibiotic_hist.keys())
a_6_month_hist_names = ['nitrofurantoin_0_6_months','trimethoprim-sulfamethoxazole_0_6_months','ciprofloxacin_0_6_months','ofloxacin_0_6_months','levofloxacin_0_6_months','amoxicillin-clavulanic acid_0_6_months','cefpodoxime_0_6_months','cefadroxil_0_6_months']
a_0_month_hist_names = [x.replace('_6','') for x in a_6_month_hist_names]
antibiotic_preval_vars = antibiotic_list + a_0_month_hist_names

antibiotic_columns = ['prevalence_variable'] + antibiotic_list + a_6_month_hist_names + ['any_antibiotic_0_6_months']
antibiotic_preval = pd.DataFrame(columns=antibiotic_columns)
antibiotic_preval['prevalence_variable'] =antibiotic_preval_vars

antibiotic_columns = antibiotic_columns[1:]
# antibiotic_columns = [x + '0_6_months' if x in antibiotic_list else x for x in antibiotic_columns]

# { (some_key if condition else default_key):(something_if_true if condition
#           else something_if_false) for key, value in dict_.items() }

#treatment vs antibiotic 6 month history 
for antibiotic_type in antibiotic_list:    
    base_condition = (np.any(filtered_cohort[antibiotic_hist[antibiotic_type][1]], axis=1))
    print(f"% received {antibiotic_type} given {antibiotic_type} 0_to_6 months {(filtered_cohort.antibiotic_type_0.isin([antibiotic_type]) & base_condition).sum() / base_condition.sum() * 100:.4f}")
    antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == antibiotic_type,antibiotic_type] = (filtered_cohort.antibiotic_type_0.isin([antibiotic_type]) & base_condition).sum() / base_condition.sum()
for prevalence_variable in antibiotic_preval_vars:
    if prevalence_variable in antibiotic_list:
        compare_condition = (filtered_cohort.antibiotic_type_0.isin([prevalence_variable]))
    else:
        compare_condition = (filtered_cohort[prevalence_variable] == 1)
    base_condition = (np.any(filtered_cohort[a_6_month_hist_names], axis=1))
    print(f"% received {prevalence_variable} given any 0_to_6 months {(compare_condition & base_condition).sum() / base_condition.sum() * 100:.4f}")
    antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,"any_antibiotic_0_6_months"] = (compare_condition & base_condition).sum() / base_condition.sum()
    
    for column in a_6_month_hist_names:
        base_condition = (filtered_cohort[column] == 1)
        print(f"% received {prevalence_variable} given {column} {(compare_condition & base_condition).sum() / base_condition.sum() * 100:.2f}")
        antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,column] = (compare_condition & base_condition).sum() / base_condition.sum()

#     for antibiotic_type in antibiotic_list:    
#         base_condition = (np.any(filtered_cohort[antibiotic_hist[antibiotic_type][1]], axis=1))
#         print(f"% received {prevalence_variable} given {antibiotic_type} 0_to_6 months {(compare_condition & base_condition).sum() / base_condition.sum() * 100:.4f}")
#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,antibiotic_type] = (compare_condition & base_condition).sum() / base_condition.sum()
    print()
    


% received first_line given first_line 0_to_6 months 45.4264
% received second_line given second_line 0_to_6 months 33.5224
% received alternatives given alternatives 0_to_6 months 2.7421
% received first_line given any 0_to_6 months 40.6152
% received first_line given nitrofurantoin_0_6_months 48.58
% received first_line given trimethoprim-sulfamethoxazole_0_6_months 42.93
% received first_line given ciprofloxacin_0_6_months 30.69
% received first_line given ofloxacin_0_6_months 41.83
% received first_line given levofloxacin_0_6_months 28.92
% received first_line given amoxicillin-clavulanic acid_0_6_months 45.06
% received first_line given cefpodoxime_0_6_months 23.08
% received first_line given cefadroxil_0_6_months 49.14

% received second_line given any 0_to_6 months 26.5400
% received second_line given nitrofurantoin_0_6_months 18.50
% received second_line given trimethoprim-sulfamethoxazole_0_6_months 22.07
% received second_line given ciprofloxacin_0_6_months 33.49
% received s

In [170]:
filtered_cohort_n = filtered_cohort.copy().loc[np.any(filtered_cohort[a_0_month_hist_names],axis=1)]
filtered_cohort_n.shape

(47237, 413)

In [171]:
#prevalencies
# first_line_treatment, second_line_treatment, alternatives_treatment (indexing antibiotic_type)

# all drug history, nitrofurantoin history (indexing )

# # conditional #6 month
# first_line history, second_line history, alternatives history, any drug history, nitrofurantoin all_drug_history,


# #columns
# Prevalencies, conditional data

# #rows
# len(prevalencies_column) + 1


# Pseudocode:

#make two dictionaries one for prevalencies and another for column conditions
#prevalencies
#make dataframe with columns

#fill out prevalencies column

#loop through the antibiotic type variables
    #nested loop through the filtered_cohort columns

#loop through the prevalencies column
    
    
#psuedocode:

#create an empty pandas dataframe


#create dataframe columns for the conditionals, keys, values into a dictionary
antibiotic_hist = {'first_line': (['nitrofurantoin','trimethoprim-sulfamethoxazole'],['nitrofurantoin_0_6_months',
  'trimethoprim-sulfamethoxazole_0_6_months']),
 'second_line': (['second_line'],['ciprofloxacin_0_6_months',
  'ofloxacin_0_6_months',
  'levofloxacin_0_6_months']),
 'alternatives': (['alternatives'],['amoxicillin-clavulanic acid_0_6_months',
  'cefpodoxime_0_6_months',
  'cefadroxil_0_6_months'])}

antibiotic_list = list(antibiotic_hist.keys())
a_6_month_hist_names = ['nitrofurantoin_0_6_months','trimethoprim-sulfamethoxazole_0_6_months','ciprofloxacin_0_6_months','ofloxacin_0_6_months','levofloxacin_0_6_months','amoxicillin-clavulanic acid_0_6_months','cefpodoxime_0_6_months','cefadroxil_0_6_months']
a_0_month_hist_names = [x.replace('_6','') for x in a_6_month_hist_names]
antibiotic_preval_vars = antibiotic_list + a_0_month_hist_names

antibiotic_columns = ['prevalence_variable'] + antibiotic_list + a_6_month_hist_names + ['any_antibiotic_0_6_months']
antibiotic_preval = pd.DataFrame(columns=antibiotic_columns)
antibiotic_preval['prevalence_variable'] =antibiotic_preval_vars

antibiotic_columns = antibiotic_columns[1:]
# antibiotic_columns = [x + '0_6_months' if x in antibiotic_list else x for x in antibiotic_columns]

# { (some_key if condition else default_key):(something_if_true if condition
#           else something_if_false) for key, value in dict_.items() }

#treatment vs antibiotic 6 month history 
for antibiotic_type in antibiotic_list:    
    base_condition = (np.any(filtered_cohort_n[antibiotic_hist[antibiotic_type][1]], axis=1))
    print(f"% received {antibiotic_type} given {antibiotic_type} 0_to_6 months {(filtered_cohort_n.antibiotic_type_0.isin([antibiotic_type]) & base_condition).sum() / base_condition.sum() * 100:.4f}")
    antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == antibiotic_type,antibiotic_type] = (filtered_cohort_n.antibiotic_type_0.isin([antibiotic_type]) & base_condition).sum() / base_condition.sum()
for prevalence_variable in antibiotic_preval_vars:
    if prevalence_variable in antibiotic_list:
        compare_condition = (filtered_cohort_n.antibiotic_type_0.isin([prevalence_variable]))
    else:
        compare_condition = (filtered_cohort_n[prevalence_variable] == 1)
    base_condition = (np.any(filtered_cohort_n[a_6_month_hist_names], axis=1))
    print(f"% received {prevalence_variable} given any 0_to_6 months {(compare_condition & base_condition).sum() / base_condition.sum() * 100:.4f}")
    antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,"any_antibiotic_0_6_months"] = (compare_condition & base_condition).sum() / base_condition.sum()
    
    for column in a_6_month_hist_names:
        base_condition = (filtered_cohort_n[column] == 1)
        print(f"% received {prevalence_variable} given {column} {(compare_condition & base_condition).sum() / base_condition.sum() * 100:.2f}")
        antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,column] = (compare_condition & base_condition).sum() / base_condition.sum()

#     for antibiotic_type in antibiotic_list:    
#         base_condition = (np.any(filtered_cohort[antibiotic_hist[antibiotic_type][1]], axis=1))
#         print(f"% received {prevalence_variable} given {antibiotic_type} 0_to_6 months {(compare_condition & base_condition).sum() / base_condition.sum() * 100:.4f}")
#         antibiotic_preval.loc[antibiotic_preval['prevalence_variable'] == prevalence_variable,antibiotic_type] = (compare_condition & base_condition).sum() / base_condition.sum()
    print()
    


% received first_line given first_line 0_to_6 months 67.3679
% received second_line given second_line 0_to_6 months 50.8907
% received alternatives given alternatives 0_to_6 months 3.7559
% received first_line given any 0_to_6 months 59.0771
% received first_line given nitrofurantoin_0_6_months 70.78
% received first_line given trimethoprim-sulfamethoxazole_0_6_months 64.84
% received first_line given ciprofloxacin_0_6_months 46.88
% received first_line given ofloxacin_0_6_months 64.65
% received first_line given levofloxacin_0_6_months 44.22
% received first_line given amoxicillin-clavulanic acid_0_6_months 61.52
% received first_line given cefpodoxime_0_6_months 66.67
% received first_line given cefadroxil_0_6_months 67.06

% received second_line given any 0_to_6 months 38.6039
% received second_line given nitrofurantoin_0_6_months 26.96
% received second_line given trimethoprim-sulfamethoxazole_0_6_months 33.33
% received second_line given ciprofloxacin_0_6_months 51.15
% received s

In [173]:
antibiotic_preval.to_csv("antibiotic_prevalence_table_v3.csv",index=False)

In [107]:
antibiotic_preval.to_csv("antibiotic_prevalence_table_v2.csv",index=False)

In [103]:
antibiotic_preval

Unnamed: 0,prevalence_variable,first_line,second_line,alternatives,nitrofurantoin_0_6_months,trimethoprim-sulfamethoxazole_0_6_months,ciprofloxacin_0_6_months,ofloxacin_0_6_months,levofloxacin_0_6_months,amoxicillin-clavulanic acid_0_6_months,cefpodoxime_0_6_months,cefadroxil_0_6_months,any_antibiotic_0_6_months
0,first_line,0.593744,,,0.631375,0.562921,0.406827,0.509804,0.386963,0.52388,0.384615,0.568966,0.51228
1,second_line,,0.503423,,0.284559,0.356999,0.505316,0.415033,0.520804,0.37259,0.423077,0.293103,0.399227
2,alternatives,,,0.105684,0.084066,0.080081,0.087857,0.075163,0.092233,0.10353,0.192308,0.137931,0.088493
3,nitrofurantoin_0_6_months,,,,1.0,0.106326,0.101847,0.058824,0.064494,0.063779,0.192308,0.077586,0.228923
4,trimethoprim-sulfamethoxazole_0_6_months,,,,0.11115,1.0,0.105484,0.058824,0.094313,0.060516,0.153846,0.094828,0.239311
5,ciprofloxacin_0_6_months,,,,0.128034,0.126851,1.0,0.088235,0.119279,0.078612,0.115385,0.077586,0.287785
6,ofloxacin_0_6_months,,,,0.006331,0.006057,0.007555,1.0,0.009015,0.012163,0.0,0.008621,0.02464
7,levofloxacin_0_6_months,,,,0.032712,0.04576,0.048125,0.042484,1.0,0.048057,0.115385,0.043103,0.116112
8,amoxicillin-clavulanic acid_0_6_months,,,,0.075624,0.068641,0.074147,0.133987,0.112344,1.0,0.153846,0.068966,0.271439
9,cefpodoxime_0_6_months,,,,0.001759,0.001346,0.000839,0.0,0.00208,0.001187,1.0,0.008621,0.002094


In [111]:
antibiotic_preval.to_csv("antibiotic_prevalence_table_v1.csv",index=False)

### Antibiotic Prevalence

In [55]:
num_f_line = cohort.loc[((cohort.antibiotic_type=='nitrofurantoin') | (cohort.antibiotic_type=='trimethoprim-sulfamethoxazole'))].shape[0]
print("% received first line given first line 0_to_6 months {:.2f}".format(filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (np.any(filtered_cohort[antibiotic_hist['first_line']],axis=1))].shape[0]/num_f_line*100))
print(f"% received first line given any 0_to_6 months {filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (np.any(filtered_cohort[antibiotic_column_names],axis=1))].shape[0]/num_f_line*100:.4}")
for column in antibiotic_column_names:
    print("% received first line given {0} {1:.2f}".format(column, filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & ((filtered_cohort[column].any(axis=1)))].shape[0]/num_f_line*100))
print()
num_s_line = cohort.loc[(cohort.antibiotic_type=='second_line')].shape[0]
print(f"% received second line given second line 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='second_line') & (np.any(filtered_cohort[antibiotic_hist['second_line']],axis=1))].shape[0]/num_s_line*100:.4}")
print(f"% received second line given any 0_to_6 months {filtered_cohort.loc[(filtered_cohort.antibiotic_type=='second_line') & (np.any(filtered_cohort[antibiotic_column_names],axis=1))].shape[0]/num_s_line*100:.4}")
for column in antibiotic_column_names:
    print("% received second line given {0} {1:.2f}".format(column, filtered_cohort.loc[(filtered_cohort.antibiotic_type=='second_line') & ((filtered_cohort[column].any(axis=1)))].shape[0]/num_s_line*100))
print()
num_alt = cohort.loc[(cohort.antibiotic_type=='alternatives')].shape[0]
print(f"% received alternatives given alternatives 0_to_6 months {filtered_cohort.loc[(filtered_cohort.antibiotic_type=='alternatives') & (np.any(filtered_cohort[antibiotic_hist['alternatives']]))].shape[0]/num_alt*100:.3}")
print(f"% received alternatives given any 0_to_6 months {filtered_cohort.loc[(filtered_cohort.antibiotic_type=='alternatives') & (np.any(filtered_cohort[antibiotic_column_names],axis=1))].shape[0]/num_alt*100:.4}")
for column in antibiotic_column_names:
    print("% received alternatives given {0} {1:.2f}".format(column, filtered_cohort.loc[(filtered_cohort.antibiotic_type=='alternatives') & ((filtered_cohort[column].any(axis=1)))].shape[0]/num_alt*100))

% received first line given first line 0_to_6 months 9.35
% received first line given any 0_to_6 months 18.64


ValueError: No axis named 1 for object type Series

In [52]:

num_f_line = cohort.loc[((cohort.antibiotic_type=='nitrofurantoin') | (cohort.antibiotic_type=='trimethoprim-sulfamethoxazole'))].shape[0]
print("% received first line given first line 0_to_6 months {:.2f}".format(filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (np.any(filtered_cohort[antibiotic_hist['first_line']],axis=1))].shape[0]/num_f_line*100))
print(f"% received first line given any 0_to_6 months {filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (np.any(filtered_cohort[antibiotic_column_names],axis=1))].shape[0]/num_f_line*100:.4}")
for column in antibiotic_column_names:
    print("% received first line given {0} {1:.2f}".format(column, filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (filtered_cohort[column] == 1)].shape[0]/num_f_line*100))
print()
num_s_line = cohort.loc[(cohort.antibiotic_type=='second_line')].shape[0]
print(f"% received second line given second line 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='second_line') & (np.any(filtered_cohort[antibiotic_hist['second_line']],axis=1))].shape[0]/num_s_line*100:.4}")
print(f"% received second line given any 0_to_6 months {filtered_cohort.loc[(filtered_cohort.antibiotic_type=='second_line') & (np.any(filtered_cohort[antibiotic_column_names],axis=1))].shape[0]/num_s_line*100:.4}")
for column in antibiotic_column_names:
    print("% received second line given {0} {1:.2f}".format(column, filtered_cohort.loc[(filtered_cohort.antibiotic_type=='second_line') & (filtered_cohort[column] == 1)].shape[0]/num_s_line*100))
print()
num_alt = cohort.loc[(cohort.antibiotic_type=='alternatives')].shape[0]
print(f"% received alternatives given alternatives 0_to_6 months {filtered_cohort.loc[(filtered_cohort.antibiotic_type=='alternatives') & (np.any(filtered_cohort[antibiotic_hist['alternatives']],axis=1))].shape[0]/num_alt*100:.3}")
print(f"% received alternatives given any 0_to_6 months {filtered_cohort.loc[(filtered_cohort.antibiotic_type=='alternatives') & (np.any(filtered_cohort[antibiotic_column_names],axis=1))].shape[0]/num_alt*100:.4}")
for column in antibiotic_column_names:
    print("% received alternatives given {0} {1:.2f}".format(column, filtered_cohort.loc[(filtered_cohort.antibiotic_type=='alternatives') & (filtered_cohort[column] == 1)].shape[0]/num_alt*100))

% received first line given first line 0_to_6 months 9.35
% received first line given any 0_to_6 months 18.64
% received first line given nitrofurantoin_0_6_months 5.14
% received first line given trimethoprim-sulfamethoxazole_0_6_months 4.79
% received first line given ciprofloxacin_0_6_months 4.16
% received first line given ofloxacin_0_6_months 0.45
% received first line given levofloxacin_0_6_months 1.60
% received first line given amoxicillin-clavulanic acid_0_6_months 5.06
% received first line given cefpodoxime_0_6_months 0.03
% received first line given cefadroxil_0_6_months 0.19
% received first line given nitrofurantoin_0_to_6_mo 5.22
% received first line given trimethoprim-sulfamethoxazole_0_to_6_mo 5.25

% received second line given second line 0_to_6 months 10.15
% received second line given any 0_to_6 months 19.89
% received second line given nitrofurantoin_0_6_months 3.19
% received second line given trimethoprim-sulfamethoxazole_0_6_months 4.18
% received second line g

In [61]:
def compute_and_print_percentage(cohort_type, history_columns, cohort_df, filtered_df, column_names):
    cohort_count = cohort_df.loc[cohort_df.antibiotic_type == cohort_type].shape[0]
    history_columns = antibiotic_hist[cohort_type]
    if cohort_count == 0:
        print(f"No data available for {cohort_type}")
        return
    
    base_condition = (np.any(filtered_df[history_columns], axis=1))
    print(f"% received {cohort_type} given {cohort_type} 0_to_6 months {filtered_df.loc[(filtered_df.antibiotic_type==cohort_type) & base_condition].shape[0] / base_condition.sum() * 100:.4f}")
    base_condition = (np.any(filtered_df[column_names], axis=1))
    print(f"% received {cohort_type} given any 0_to_6 months {filtered_df.loc[(filtered_df.antibiotic_type==cohort_type) & (np.any(filtered_df[column_names], axis=1))].shape[0] / base_condition.sum() * 100:.4f}")

    for column in column_names:
        base_condition = (filtered_df[column] == 1)
        print(f"% received {cohort_type} given {column} {filtered_df.loc[(filtered_df.antibiotic_type == cohort_type) & (filtered_df[column] == 1)].shape[0] / base_condition.sum() * 100:.2f}")

    print()


# For first line
first_line_conditions = ['nitrofurantoin', 'trimethoprim-sulfamethoxazole']
compute_and_print_percentage("first_line", antibiotic_hist['first_line'], cohort.loc[cohort.antibiotic_type.isin(first_line_conditions)], filtered_cohort, antibiotic_column_names)

# For second line
compute_and_print_percentage("second_line", antibiotic_hist['second_line'], cohort, filtered_cohort, antibiotic_column_names)

# For alternatives
compute_and_print_percentage("alternatives", antibiotic_hist['alternatives'], cohort, filtered_cohort, antibiotic_column_names)


No data available for first_line
% received second_line given second_line 0_to_6 months 50.3423
% received second_line given any 0_to_6 months 39.8263
% received second_line given nitrofurantoin_0_6_months 28.46
% received second_line given trimethoprim-sulfamethoxazole_0_6_months 35.70
% received second_line given ciprofloxacin_0_6_months 50.53
% received second_line given ofloxacin_0_6_months 41.50
% received second_line given levofloxacin_0_6_months 52.08
% received second_line given amoxicillin-clavulanic acid_0_6_months 37.26
% received second_line given cefpodoxime_0_6_months 42.31
% received second_line given cefadroxil_0_6_months 29.31
% received second_line given nitrofurantoin_0_to_6_mo 28.49
% received second_line given trimethoprim-sulfamethoxazole_0_to_6_mo 36.06

% received alternatives given alternatives 0_to_6 months 10.5684
% received alternatives given any 0_to_6 months 8.7801
% received alternatives given nitrofurantoin_0_6_months 8.41
% received alternatives given t

In [51]:
def compute_and_print_percentage(cohort_type, history_columns, cohort_df, filtered_df, column_names):
    
    if cohort_count == 0:
        print(f"No data available for {cohort_type}")
        return
    
    base_condition = (np.any(filtered_df[history_columns], axis=1))
    print(f"% received {cohort_type} given {cohort_type} 0_to_6 months {filtered_df.loc[(filtered_df.antibiotic_type==cohort_type) & base_condition)].shape[0] / base_condition.sum() * 100:.4f}")
    base_condition = (np.any(filtered_df[column_names], axis=1))
    print(f"% received {cohort_type} given any 0_to_6 months {filtered_df.loc[(filtered_df.antibiotic_type==cohort_type) & (np.any(filtered_df[column_names], axis=1))].shape[0] / base_condition.sum() * 100:.4f}")
    
    for column in column_names:
        base_condition = (filtered_df[column] == 1)
        print(f"% received {cohort_type} given {column} {filtered_df.loc[(filtered_df.antibiotic_type == cohort_type) & (filtered_df[column] == 1)].shape[0] / base_condition.sum() * 100:.2f}")

    print()


# For first line
first_line_conditions = ['nitrofurantoin', 'trimethoprim-sulfamethoxazole']
compute_and_print_percentage("first_line", antibiotic_hist['first_line'], cohort.loc[cohort.antibiotic_type.isin(first_line_conditions)], filtered_cohort, antibiotic_column_names)

# For second line
compute_and_print_percentage("second_line", antibiotic_hist['second_line'], cohort, filtered_cohort, antibiotic_column_names)

# For alternatives
compute_and_print_percentage("alternatives", antibiotic_hist['alternatives'], cohort, filtered_cohort, antibiotic_column_names)


No data available for first_line
% received second_line given second_line 0_to_6 months 10.1522
% received second_line given any 0_to_6 months 19.8943
% received second_line given nitrofurantoin_0_6_months 3.19
% received second_line given trimethoprim-sulfamethoxazole_0_6_months 4.18
% received second_line given ciprofloxacin_0_6_months 7.12
% received second_line given ofloxacin_0_6_months 0.50
% received second_line given levofloxacin_0_6_months 2.96
% received second_line given amoxicillin-clavulanic acid_0_6_months 4.95
% received second_line given cefpodoxime_0_6_months 0.04
% received second_line given cefadroxil_0_6_months 0.13
% received second_line given nitrofurantoin_0_to_6_mo 3.25
% received second_line given trimethoprim-sulfamethoxazole_0_to_6_mo 4.64

% received alternatives given alternatives 0_to_6 months 6.5965
% received alternatives given any 0_to_6 months 19.8253
% received alternatives given nitrofurantoin_0_6_months 4.26
% received alternatives given trimethopri

In [None]:
## marginalizing over the bottom number /P(D)

In [None]:
def prob():
    
def conditional_prob():

['nitrofurantoin_0_6_months',
 'trimethoprim-sulfamethoxazole_0_6_months',
 'ciprofloxacin_0_6_months',
 'ofloxacin_0_6_months',
 'levofloxacin_0_6_months',
 'amoxicillin-clavulanic acid_0_6_months',
 'cefpodoxime_0_6_months',
 'cefadroxil_0_6_months',
 'nitrofurantoin_0_to_6_mo',
 'trimethoprim-sulfamethoxazole_0_to_6_mo']

TypeError: '<' not supported between instances of 'NoneType' and 'int'

In [48]:
def compute_checks(df,col,values):
    #either check multiple values or single value is in column
    if len(conds) > 1:
        final_cond = df[col].isin(values)
    else:
        final_cond = df[col] == values[0]
    #returns conditionally filtered dataframe
    return final_cond


def compute_checks(arr,values):
    
    #values is a list 
    #single column checks
    # df[cols]
    if arr.shape[1] == 1:
        if values:
            #check if a column has that value
            if values < 2:
                return arr == values[0] 
            #check if a column has multiple values
            else values > 2:
                return (arr.isin(values))
        else:
            #check if a column is equal to 1
            return (arr == 1)
    else:
        #check if any out of multiple columns are 1s
        return np.any(arr,axis=1)
    
    
def compute_antibiotic_prevalency_info(antibiotic_info,antibiotic_columns,df):
    #antibiotic_info: key, value
    #antibiotic_col_name, 
    for column in antibiotic_column_names:
        print(f"second line and {column} #",filtered_cohort.loc[compute_checks(filtered_cohort.antibiotic_type,['second_line']) & compute_checks(filtered_cohort[column])].shape[0])
        print(f"{column} #",filtered_cohort.loc[(filtered_cohort[column] == 1)].shape[0])
        print("% received second line given {0} {1:.2f}".format(column, filtered_cohort.loc[(filtered_cohort.antibiotic_type=='second_line') & (filtered_cohort[column] == 1)].shape[0]/filtered_cohort.loc[(filtered_cohort[column] == 1)].shape[0]*100))

# num_f_line = cohort.loc[((cohort.antibiotic_type=='nitrofurantoin') | (cohort.antibiotic_type=='trimethoprim-sulfamethoxazole'))].shape[0]
# print("% received first line given first line 0_to_6 months {:.2f}".format(filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (np.any(filtered_cohort[antibiotic_hist['first_line']],axis=1))].shape[0]/num_f_line*100))
# print(f"% received first line given any 0_to_6 months {filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (np.any(filtered_cohort[antibiotic_column_names],axis=1))].shape[0]/num_f_line*100:.4}")
# for column in antibiotic_column_names:
#     print("% received first line given {0} {1:.2f}".format(column, filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (filtered_cohort[column] == 1)].shape[0]/num_f_line*100))
# print()
print(f"% received second line given second line 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='second_line') & (np.any(filtered_cohort[antibiotic_hist['second_line']],axis=1))].shape[0]/num_s_line*100:.4}")
print(f"% received second line given any 0_to_6 months {filtered_cohort.loc[(filtered_cohort.antibiotic_type=='second_line') & (np.any(filtered_cohort[antibiotic_column_names],axis=1))].shape[0]/num_s_line*100:.4}")
# print("above are incorrect I think")
for column in antibiotic_column_names:
    print(f"second line and {column} #",filtered_cohort.loc[(filtered_cohort.antibiotic_type=='second_line') & (filtered_cohort[column] == 1)].shape[0])
    print(f"{column} #",filtered_cohort.loc[(filtered_cohort[column] == 1)].shape[0])
    print("% received second line given {0} {1:.2f}".format(column, filtered_cohort.loc[(filtered_cohort.antibiotic_type=='second_line') & (filtered_cohort[column] == 1)].shape[0]/filtered_cohort.loc[(filtered_cohort[column] == 1)].shape[0]*100))

    print()
# num_alt = cohort.loc[(cohort.antibiotic_type=='alternatives')].shape[0]
# print(f"% received alternatives given alternatives 0_to_6 months {filtered_cohort.loc[(filtered_cohort.antibiotic_type=='alternatives') & (np.any(filtered_cohort[antibiotic_hist['alternatives']],axis=1))].shape[0]/num_alt*100:.3}")
# print(f"% received alternatives given any 0_to_6 months {filtered_cohort.loc[(filtered_cohort.antibiotic_type=='alternatives') & (np.any(filtered_cohort[antibiotic_column_names],axis=1))].shape[0]/num_alt*100:.4}")
# for column in antibiotic_column_names:
#     print("% received alternatives given {0} {1:.2f}".format(column, filtered_cohort.loc[(filtered_cohort.antibiotic_type=='alternatives') & (filtered_cohort[column] == 1)].shape[0]/num_alt*100))

second line # 25354
second line and nitrofurantoin_0_6_months # 809
nitrofurantoin_0_6_months # 2843
% received second line given nitrofurantoin_0_6_months 28.46
% received second line given nitrofurantoin_0_6_months 3.19

second line # 25354
second line and trimethoprim-sulfamethoxazole_0_6_months # 1061
trimethoprim-sulfamethoxazole_0_6_months # 2972
% received second line given trimethoprim-sulfamethoxazole_0_6_months 35.70
% received second line given trimethoprim-sulfamethoxazole_0_6_months 4.18

second line # 25354
second line and ciprofloxacin_0_6_months # 1806
ciprofloxacin_0_6_months # 3574
% received second line given ciprofloxacin_0_6_months 50.53
% received second line given ciprofloxacin_0_6_months 7.12

second line # 25354
second line and ofloxacin_0_6_months # 127
ofloxacin_0_6_months # 306
% received second line given ofloxacin_0_6_months 41.50
% received second line given ofloxacin_0_6_months 0.50

second line # 25354
second line and levofloxacin_0_6_months # 751
levof

In [40]:
filtered_cohort.loc[(filtered_cohort[antibiotic_group] == 1) & (filtered_cohort[column] == 1)]

Unnamed: 0,level_0,index,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,visit_occurrence_id,...,less_30,less_90,nitrofurantoin_0_6_months,trimethoprim-sulfamethoxazole_0_6_months,ciprofloxacin_0_6_months,ofloxacin_0_6_months,levofloxacin_0_6_months,amoxicillin-clavulanic acid_0_6_months,cefpodoxime_0_6_months,cefadroxil_0_6_months
5,5,103649,329480918,744,81902,2017-10-21,2017-10-21,0,0,931274,...,0,1,0,1,1,0,0,0,0,0
31,31,115318,115217371,2666,81902,2014-10-25,2014-10-25,0,0,978523,...,0,0,0,1,1,0,0,0,0,0
34,34,115318,255374951,3137,81902,2016-09-13,2016-09-13,0,0,990030,...,0,0,0,1,0,0,0,0,0,0
48,48,115391,453491575,4033,194081,2019-05-22,2019-05-22,0,0,1014148,...,0,0,0,1,0,0,0,0,0,0
62,62,115318,162983711,5637,81902,2015-07-02,2015-07-02,0,0,1053374,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68392,68392,243383,535896648,30317912,194081,2020-04-26,2020-04-26,0,0,158408533,...,0,0,0,1,0,0,0,0,0,0
68403,68403,243383,558534098,30398462,81902,2020-08-17,2020-08-17,0,0,158492049,...,0,0,0,1,1,0,0,0,0,0
68411,68411,115391,550905658,30438956,81902,2020-08-04,2020-08-04,0,0,158541150,...,0,0,0,0,0,0,0,0,0,0
68465,68465,115391,558244369,30904264,81902,2020-10-25,2020-10-25,0,0,159060233,...,0,0,1,1,0,0,0,0,0,0


['nitrofurantoin_0_6_months',
 'trimethoprim-sulfamethoxazole_0_6_months',
 'ciprofloxacin_0_6_months',
 'ofloxacin_0_6_months',
 'levofloxacin_0_6_months',
 'amoxicillin-clavulanic acid_0_6_months',
 'cefpodoxime_0_6_months',
 'cefadroxil_0_6_months',
 'nitrofurantoin_0_to_6_mo',
 'trimethoprim-sulfamethoxazole_0_to_6_mo']

In [37]:
for antibiotic_group in antibiotic_column_names:
    for column in antibiotic_column_names:
        print("% received {0} given {1} {2:.2f}".format(antibiotic_group,column, filtered_cohort.loc[(filtered_cohort[antibiotic_group] == 1) & (filtered_cohort[column] == 1)].shape[0]/num_alt*100))

% received nitrofurantoin_0_6_months given nitrofurantoin_0_6_months 50.69
% received nitrofurantoin_0_6_months given trimethoprim-sulfamethoxazole_0_6_months 5.63
% received nitrofurantoin_0_6_months given ciprofloxacin_0_6_months 6.49
% received nitrofurantoin_0_6_months given ofloxacin_0_6_months 0.32
% received nitrofurantoin_0_6_months given levofloxacin_0_6_months 1.66
% received nitrofurantoin_0_6_months given amoxicillin-clavulanic acid_0_6_months 3.83
% received nitrofurantoin_0_6_months given cefpodoxime_0_6_months 0.09
% received nitrofurantoin_0_6_months given cefadroxil_0_6_months 0.16
% received nitrofurantoin_0_6_months given nitrofurantoin_0_to_6_mo 50.69
% received nitrofurantoin_0_6_months given trimethoprim-sulfamethoxazole_0_to_6_mo 6.01
% received trimethoprim-sulfamethoxazole_0_6_months given nitrofurantoin_0_6_months 5.63
% received trimethoprim-sulfamethoxazole_0_6_months given trimethoprim-sulfamethoxazole_0_6_months 52.99
% received trimethoprim-sulfamethoxazo

In [50]:
num_f_line = cohort.loc[((cohort.antibiotic_type=='nitrofurantoin') | (cohort.antibiotic_type=='trimethoprim-sulfamethoxazole'))].shape[0]
print("% received first line given first line 0_to_6 months {:.2f}".format(filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & ((filtered_cohort.nitrofurantoin_0_to_6_mo == 1) | (filtered_cohort["trimethoprim-sulfamethoxazole_0_to_6_mo"] == 1))].shape[0]))
print(f"% received first line given any 0_to_6 months {cohort.loc[((cohort.antibiotic_type=='nitrofurantoin') | (cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (np.any(cohort[antibiotic_column_names],axis=1))].shape[0]}")
for column in antibiotic_column_names:
    print("% received first line given {0} {1:.2f}".format(column, filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (filtered_cohort[column] == 1)].shape[0]))
print()


# for column in cols_0_to_6_mo:
#     print("% received first line given {0} {1:.2f}".format(column, filtered_cohort.loc[((filtered_cohort.antibiotic_type=='nitrofurantoin') | (filtered_cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & (filtered_cohort[column] == 1)].shape[0]))

print()
# num_s_line = cohort.loc[(cohort.antibiotic_type=='second_line')].shape[0]
# print(f"% received second line given second line 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='second_line') & (cohort.second_line_0_to_6_mo == 1)].shape[0]/num_s_line*100:.3}")
# print(f"% received second line given any 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='second_line') & (np.any(cohort[cols_0_to_6_mo],axis=1))].shape[0]/num_s_line*100:.3}")
# print()
# num_alt = cohort.loc[(cohort.antibiotic_type=='alternatives')].shape[0]
# print(f"% received alternatives given alternatives 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='alternatives') & (cohort.alternatives_0_to_6_mo == 1)].shape[0]/num_alt*100:.3}")
# print(f"% received alternatives given any 0_to_6 months {cohort.loc[(cohort.antibiotic_type=='alternatives') & (np.any(cohort[cols_0_to_6_mo],axis=1))].shape[0]/num_alt*100:.3}")

% received first line given first line 0_to_6 months 3439.00
% received first line given any 0_to_6 months 12807
% received first line given ciprofloxacin_0_6_months 1454.00
% received first line given ofloxacin_0_6_months 156.00
% received first line given levofloxacin_0_6_months 558.00
% received first line given amoxicillin-clavulanic acid_0_6_months 1766.00
% received first line given cefpodoxime_0_6_months 10.00
% received first line given cefadroxil_0_6_months 66.00
% received first line given nitrofurantoin_0_to_6_mo 1823.00
% received first line given trimethoprim-sulfamethoxazole_0_to_6_mo 1832.00

% received first line given nitrofurantoin_0_to_6_mo 1823.00
% received first line given fosfomycin_0_to_6_mo 0.00
% received first line given trimethoprim-sulfamethoxazole_0_to_6_mo 1832.00
% received first line given second_line_0_to_6_mo 2918.00
% received first line given alternatives_0_to_6_mo 4555.00
% received first line given inappropriate_0_to_6_mo 5924.00



In [22]:
filtered_cohort.loc[(filtered_cohort['antibiotic_type'] == 'nitrofurantoin') | (filtered_cohort['antibiotic_type'] == 'trimethoprim-sulfamethoxazole')]

Unnamed: 0,level_0,index,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,visit_occurrence_id,...,Thyroid Disorder_new0_6_months,Thyroid Disorder_new6_months_1_yr,Thyroid Disorder_new1_2_yr,Thyroid Disorder_new_full_condition_name,ciprofloxacin_0_6_months,ofloxacin_0_6_months,levofloxacin_0_6_months,amoxicillin-clavulanic acid_0_6_months,cefpodoxime_0_6_months,cefadroxil_0_6_months
0,0,3828990,465004090,101,81902,2019-04-28,2019-04-28,0,0,914389,...,0,0,0,,0,0,0,0,0,0
1,1,3828990,148592964,481,81902,2015-06-09,2015-06-09,0,0,925277,...,0,0,0,,0,0,0,0,0,0
2,2,3828990,247776427,481,37018854,2016-09-13,2016-09-13,0,0,925268,...,0,0,0,,0,0,0,0,0,0
3,3,3828990,455153289,658,81902,2019-06-01,2019-06-01,0,0,929640,...,0,0,0,,0,0,0,0,0,0
4,4,115391,565954572,658,37018854,2021-01-14,2021-01-14,0,0,929764,...,0,0,0,,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68497,68497,3828990,410361010,34041930,81902,2018-10-30,2018-10-30,0,0,160758917,...,0,0,0,,0,0,0,0,0,0
68498,68498,3828990,444880147,34177663,37018854,2019-02-23,2019-02-23,0,0,160795192,...,0,0,0,,0,0,0,0,0,0
68499,68499,115391,547287974,34234398,37018854,2020-08-24,2020-08-24,0,0,160796494,...,0,0,0,,0,0,0,0,0,0
68501,68501,3828990,515586779,34507232,37018854,2019-12-07,2019-12-07,0,0,160803610,...,0,0,0,,0,0,0,0,0,0


# second line | second line 0_to_6 months 0.13224737713970183
# second line | any 0_to_6 months 0.38064999605584915


In [105]:
from datetime import timedelta
## create columns for 0_to_6_months for each drug except the first line treatments and iterate through person
antibiotic_column_names = ['amoxicillin' + '_0_6_months']
filtered_cohort = cohort.copy().loc[cohort.antibiotic_type!='inappropriate'][:3]
for column in antibiotic_column_names:
    filtered_cohort[column] = 0


##populate them with the value of interest 
for idx,row in filtered_cohort.iterrows():
    date = row['condition_start_date']
    person = row['person_id']
    for antibiotic_col in antibiotic_column_names:
        antibiotic = antibiotic_col.split('_')[0]
        date_query = date - timedelta(days=180)
        print(date_query)
        print(1*np.any((antibiotic_df.antibiotic_name == antibiotic) & \
                                        (antibiotic_df.person_id == person)\
                                       & (antibiotic_df.drug_exposure_start_date >= date_query) &\
                                        (antibiotic_df.drug_exposure_start_date < date)))


2018-10-30 00:00:00
0
2014-12-11 00:00:00
1
2016-03-17 00:00:00
0


In [198]:
temp_anti_df = antibiotic_df.loc[antibiotic_df.antibiotic_name.isin(all_antibiotics) & (antibiotic_df.person_id.isin(filtered_cohort.person_id))]

# alternatives | alternatives 0_to_6 months 0.17721518987341772
# alternatives | any 0_to_6 months 0.37029773578177927


In [None]:
cohort.loc[cohort.antibiotic_type.isin(antibiotic_categories['first_line'])]

In [44]:
cohort.loc[((cohort.antibiotic_type=='nitrofurantoin') | (cohort.antibiotic_type=='trimethoprim-sulfamethoxazole')) & ((cohort.nitrofurantoin_0_to_6_mo == 1) | (cohort["trimethoprim-sulfamethoxazole_0_to_6_mo"] == 1))].shape[0]

3439

In [None]:
# loop requires cohort table, and drug table

#filter cohort for non inappropriates
cohort = cohort.loc[cohort.antibiotic_type!= 'inappropriate']
#create condition_name_dictionary

#use keys of condition_name_dictionary as columns in the cohort
# for row  in cohort.iterrows: 
#     for p_name in condition_name:
#          Date
#          Needed_date = Date - 6 months

#          Filtered_Drug_table = antibiotic_df.isin(antibiotic_categories[p_name]
         
#          Cohort[p_name] = 1*np.any(filtered_drug_table)

In [21]:
all_drugs.loc[all_drugs.antibiotic_name.isin(all_antibiotics)]

Unnamed: 0,drug_concept_id,drug_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason,antibiotic_name
1717,581784,CEFDINIR - cefdinir capsule,Drug,SPL,Prescription Drug,C,e296387e-eb20-4451-a184-f91748a4e9c7,2020-04-07,2099-12-31,n,cefdinir
1835,581902,"AMOXICILLIN - amoxicillin powder, for suspension",Drug,SPL,Prescription Drug,C,85916ec0-0a9e-fad0-e053-2a91aa0aeca4,2019-04-02,2099-12-31,n,amoxicillin
1898,581965,CEPHALEXIN ORAL SUSP - cephalexin oral susp po...,Drug,SPL,Prescription Drug,C,85a53325-1306-7e6a-e053-2991aa0a2db5,2019-04-03,2099-12-31,n,cephalexin
1907,581974,CEPHALEXIN - cephalexin capsule,Drug,SPL,Prescription Drug,C,2d12e6a9-f7dd-25ed-e054-00144ff8d46c,2020-01-17,2099-12-31,n,cephalexin
2083,582151,amoxicillin 500mg/1 ORAL CAPSULE,Drug,SPL,Prescription Drug,C,0d23f5b4-8c04-4f8e-ba74-857864601d92,2007-02-05,2099-12-31,n,amoxicillin
...,...,...,...,...,...,...,...,...,...,...,...
3934585,46368657,cephalexin 500mg/1 ORAL CAPSULE,Drug,NDC,9-digit NDC,n,617860434,2015-10-12,2099-12-31,n,cephalexin
3934592,46368664,"amoxicillin powder, for suspension 125mg/5mL O...",Drug,NDC,9-digit NDC,n,619190018,2015-01-01,2099-12-31,n,amoxicillin
3934620,46368692,sulfamethoxazole and trimethoprim 160mg/1 / 80...,Drug,NDC,9-digit NDC,n,619190766,2015-01-01,2099-12-31,n,trimethoprim-sulfamethoxazole
3934825,46368897,amoxicillin and clavulanate potassium 200mg/5m...,Drug,NDC,9-digit NDC,n,631870542,2004-08-13,2099-12-31,n,amoxicillin-clavulanic acid


In [10]:
all_drugs.head()

Unnamed: 0,drug_concept_id,drug_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason,antibiotic_name
1487,581554,POLYMYXIN B SULFATE AND TRIMETHOPRIM - polymyx...,Drug,SPL,Prescription Drug,C,4ac9725b-ce6b-3410-e054-00144ff8d46c,2017-03-15,2099-12-31,n,trimethoprim
1492,581559,POLYMYXIN B SULFATE AND TRIMETHOPRIM - polymyx...,Drug,SPL,Prescription Drug,C,50bdf240-8b95-6032-e054-00144ff88e88,2017-05-30,2099-12-31,n,trimethoprim
1717,581784,CEFDINIR - cefdinir capsule,Drug,SPL,Prescription Drug,C,e296387e-eb20-4451-a184-f91748a4e9c7,2020-04-07,2099-12-31,n,cefdinir
1835,581902,"AMOXICILLIN - amoxicillin powder, for suspension",Drug,SPL,Prescription Drug,C,85916ec0-0a9e-fad0-e053-2a91aa0aeca4,2019-04-02,2099-12-31,n,amoxicillin
1898,581965,CEPHALEXIN ORAL SUSP - cephalexin oral susp po...,Drug,SPL,Prescription Drug,C,85a53325-1306-7e6a-e053-2991aa0a2db5,2019-04-03,2099-12-31,n,cephalexin


In [9]:
drug_exposures.head()

Unnamed: 0,index,condition_occurrence_id,person_id,condition_start_date,drug_concept_id,drug_exposure_id,drug_exposure_start_date,drug_exposure_start_datetime
0,243383,4522439,8888650,2012-11-08,21176265,81248734,2011-06-27,2011-06-27
1,243383,4522439,8888650,2012-11-08,19023551,81248645,2012-11-06,2012-11-06
2,243383,4522439,8888650,2012-11-08,40221904,81248817,2012-11-06,2012-11-06
3,243383,4522439,8888650,2012-11-08,36249642,81248668,2012-11-06,2012-11-06
4,3828990,4575833,739531,2013-01-10,19022749,16599187,2013-01-09,2013-01-09


In [None]:


# if drug =='nitrofurantoin':
#         anti_new.loc[idx, "nitrofurantoin"] = 1
#         anti_new.loc[idx, "antibiotic_type"] ="nitrofurantoin"
#     elif drug =='trimethoprim-sulfamethoxazole' or drug =='trimethoprim' or drug =='sulfamethoxazole':
#         anti_new.loc[idx, 'trimethoprim-sulfamethoxazole'] = 1
#         anti_new.loc[idx, "antibiotic_type"] ='trimethoprim-sulfamethoxazole'

       
#     elif ("cef" in drug.lower()) or drug =='cephalexin' or ("amoxicillin" in drug.lower()):
#         anti_new.loc[idx, 'alternatives'] = 1
#         anti_new.loc[idx, "antibiotic_type"] ='alternatives'
        
#     elif drug == "quinolone":
#         anti_new.loc[idx, 'second_line'] = 1
#         anti_new.loc[idx, "antibiotic_type"] ='second_line'
#     elif 'ofloxacin' in drug.lower() :
#         anti_new.loc[idx, 'second_line'] = 1
#         anti_new.loc[idx, "antibiotic_type"] ='second_line'

### Viewing the antibiotic list and filtering by non first line, second line and alternatives

## Config and Helper Functions

In [11]:
group_to_cols = {"Arthritis" : ["arthritis","Arthritis_new"],
                 "Autoimmune" : ["sjögren",  "rheumatoid_arthritis", "reactive_arthritis", "lupus_erythematosus", "dermatomyositis","Autoimmune_new"],
                 "Cancer" : ["Cancer_new","cancer","carcinoma","malignan","leukemia","lymphoma","sarcoma"],
                 "Chronic Kidney" : ["Chronic Kidney_new","chronic_kidney","chronic_renal_failure"],
            "Diabetes Mellitus" : ["Diabetes Mellitus_new","diabetes_mellitus"], "Thyroid Disorder" : ["Thyroid Disorder_new","hashimoto_thyroiditis", "graves"], "HIV" : ["HIV_new","hiv"], "Hypertension": ["Hypertension_new","hypertension"], "Menopause":["Menopause_new","menopause"]}

In [12]:
year_buckets = {"UTI_2012_14":[2012,2013,2014],"UTI_2015_17":[2015,2016,2017],"UTI_2018_21":[2018,2019,2020,2021]}

for group_time, years in year_buckets.items():
    cohort[group_time] = 1*(cohort.condition_start_date.dt.year.isin(years))


In [13]:
specialties = ['specialty_family_medicine_group',
 'specialty_internal_medicine_group',
 'specialty_emergency/acute_group',
 'specialty_advanced_specialist_group',
 'specialty_OBGYN_group',
 'specialty_other_group',
 'specialty_urology_group']

specialty_groups = {' '.join(x.replace("specialty_","").replace("_group","").split("_")) : [x] for x in specialties}

In [14]:
cohort["uti_1_year"] = 1*((cohort.days_since_previous_uti <= 365) & (cohort.days_since_previous_uti > 0))


In [15]:
for condition_cat, condition_cols in group_to_cols.items():
    group_to_cols[condition_cat] = [x + '1_2_yr' for x in condition_cols] #to convert back to the nomenclature in the pd dataframe getting all events:
    group_to_cols[condition_cat] += [x + '0_6_months' for x in condition_cols]
    group_to_cols[condition_cat] += [x + '6_months_1_yr' for x in condition_cols]
group_to_cols.update({"UTI history in 1 year" : ["uti_1_year"], "Fever at presentation" : ['fever'], "Urinalysis ordered" : ['urine_test_present'], 'Blood test ordered' : ['cbc_present'], "Last UTI in 2012-2014":["UTI_2012_14"], "Last UTI in 2015-2017":["UTI_2015_17"], "Last UTI in 2018-2021":["UTI_2018_21"]})

group_to_cols.update(specialty_groups)
ages = cohort.age.dropna()

In [38]:
list(cohort.columns)

['level_0',
 'index',
 'condition_occurrence_id',
 'person_id',
 'condition_concept_id',
 'condition_start_date',
 'condition_start_datetime',
 'condition_end_date',
 'condition_end_datetime',
 'visit_occurrence_id',
 'visit_detail_id',
 'drug_concept_id',
 'drug_name',
 'antibiotic_name',
 'antibiotic_type',
 'visit_provider_id',
 'drug_exposure_id',
 'drug_exposure_start_date',
 'drug_exposure_start_datetime',
 'provider_id',
 'provider_name',
 'npi',
 'no_previous_180_day_event',
 'no_two_previous_365_day_event',
 'post_UTI_codes',
 'recurrent_uti',
 'first_uti',
 'previous_uti',
 'previous_utis',
 'days_since_previous_uti',
 'previous_uti_condition_occurence_id',
 'previous_uti_recurrent',
 'multi',
 'year_of_birth',
 'age',
 'no_previous_180_excluded_event',
 'no_previous_excluded_event_ever',
 'nitrofurantoin_switch_ever',
 'fosfomycin_switch_ever',
 'trimethoprim-sulfamethoxazole_switch_ever',
 'second_line_switch_ever',
 'alternatives_switch_ever',
 'inappropriate_switch_ever',

In [13]:
group_to_cols

{'Arthritis': ['arthritis1_2_yr',
  'Arthritis_new1_2_yr',
  'arthritis0_6_months',
  'Arthritis_new0_6_months',
  'arthritis6_months_1_yr',
  'Arthritis_new6_months_1_yr'],
 'Autoimmune': ['sjögren1_2_yr',
  'rheumatoid_arthritis1_2_yr',
  'reactive_arthritis1_2_yr',
  'lupus_erythematosus1_2_yr',
  'dermatomyositis1_2_yr',
  'Autoimmune_new1_2_yr',
  'sjögren0_6_months',
  'rheumatoid_arthritis0_6_months',
  'reactive_arthritis0_6_months',
  'lupus_erythematosus0_6_months',
  'dermatomyositis0_6_months',
  'Autoimmune_new0_6_months',
  'sjögren6_months_1_yr',
  'rheumatoid_arthritis6_months_1_yr',
  'reactive_arthritis6_months_1_yr',
  'lupus_erythematosus6_months_1_yr',
  'dermatomyositis6_months_1_yr',
  'Autoimmune_new6_months_1_yr'],
 'Cancer': ['Cancer_new1_2_yr',
  'cancer1_2_yr',
  'carcinoma1_2_yr',
  'malignan1_2_yr',
  'leukemia1_2_yr',
  'lymphoma1_2_yr',
  'sarcoma1_2_yr',
  'Cancer_new0_6_months',
  'cancer0_6_months',
  'carcinoma0_6_months',
  'malignan0_6_months',
  '

In [16]:
# cohort['temp_chronic_kidney_series'] = np.where(cohort[group_to_cols["Chronic Kidney"]].sum(axis=1) > 0, 1, 0)

In [20]:
def compute_condition_metrics(df : pd.DataFrame ,name_dict : Dict[str, List[str]] =group_to_cols) -> Dict[str, List[Union[int,np.ndarray]]]:
    '''Computes the mean of a condition column and computes the standard deviation'''
    new_df = df.copy()
    means = []
    stdevs = []
    counts = []
    group_names = []
    for group_name, group_items in name_dict.items():
        new_df["temp_series"] = np.where(df[group_items].sum(axis=1) > 0, 1, 0)
        mean, std, = np.mean(new_df["temp_series"]), np.std(new_df["temp_series"])
        means.append(mean)
        stdevs.append(std)
        counts.append(len(new_df.loc[new_df["temp_series"] == 1]))
        group_names.append(group_name)
    return {"Condition Category" : group_names, "Mean" : means, "Standard Deviation" : stdevs, "Agg_count" : len(new_df), "Counts" : counts}


def get_columns_of_interest(df : pd.DataFrame ,name_dict : Dict[str, List[str]] =group_to_cols) -> Dict[str, List[Union[int,np.ndarray]]]:
    '''Computes the mean of a condition column and computes the standard deviation'''
    new_df = df.copy()
    group_names = []
    for group_name, group_items in name_dict.items():
        new_df[group_name.upper()] = np.where(df[group_items].sum(axis=1) > 0, 1, 0)
        group_names.append(group_name)
    subset_columns = [x.upper() for x in group_names] + ['antibiotic_type']
    subset_df = new_df[subset_columns]
    return subset_df

def compute_age_metrics(ages: pd.Series) -> Dict[str, List[Union[int,np.ndarray]]]:
    
                                                       
                                                       
    # First quartile (Q1)
    Q1 = np.percentile(ages, 25, interpolation ='midpoint')
  
    # Third quartile (Q3)
    Q3 = np.percentile(ages, 75, interpolation ='midpoint')

    # Interquaritle range (IQR)
    IQR = Q3 - Q1
    
    age_metrics = {'Max' : [np.max(ages)],
    'Min' : [np.min(ages)],
    'Mean' : [np.mean(ages)],
    'Median' : [np.median(ages)],
    'Standard Deviation': [np.std(ages)],
    'IQR' : [IQR]}


    return age_metrics

def to_df(data : Dict[str,List[object]]) -> pd.DataFrame :
    return pd.DataFrame(data=data)
                                 
def log_csv_table(data : Dict[str,List[object]],name : str ="table_1_agg_conditions") -> None:
    df = to_df(data=data)
    return df.to_csv(f"{name}.csv",index=False)


### Compute P Values

In [34]:

from collections import defaultdict
import scipy
import numpy as np
from pprint import pprint

np.random.seed(seed=42)
#initialize empty dictionary
comparison_to_p_value = dict()

atypes_to_cols = {'first_line': ['nitrofurantoin','trimethoprim-sulfamethoxazole'],'second_line':['second_line'],'alternatives':['alternatives']}

#for second line and alternatives dfs
for condition_query in ['MENOPAUSE','UTI HISTORY IN 1 YEAR','HYPERTENSION','DIABETES MELLITUS','ARTHRITIS','CANCER','CHRONIC KIDNEY','AUTOIMMUNE','THYROID DISORDER']:
    for atype in ['second_line','alternatives']:
        first_line_df = temp_df.loc[temp_df.antibiotic_type.isin(atypes_to_cols['first_line'])]
        atype_df = temp_df.loc[temp_df.antibiotic_type.isin(atypes_to_cols[atype])]
        #for every single condition
        #populate ditionary with ttest comparison and p value
        ttest_name = f"{condition_query}_{atype}"
        comparison_to_p_value[ttest_name] = scipy.stats.ttest_ind(first_line_df[condition_query].values,atype_df[condition_query].values)[1]

#print out the dictionary
pprint(comparison_to_p_value,sort_dicts=False)

{'MENOPAUSE_second_line': 0.2578565473119409,
 'MENOPAUSE_alternatives': 0.3462776798692885,
 'UTI HISTORY IN 1 YEAR_second_line': 0.000826890022034254,
 'UTI HISTORY IN 1 YEAR_alternatives': 0.07016806719316435,
 'HYPERTENSION_second_line': 3.0755403781309966e-08,
 'HYPERTENSION_alternatives': 7.86258337317837e-05,
 'DIABETES MELLITUS_second_line': 8.142289209964599e-07,
 'DIABETES MELLITUS_alternatives': 1.9353592706318785e-06,
 'ARTHRITIS_second_line': 0.0365642805330792,
 'ARTHRITIS_alternatives': 0.25797415638231685,
 'CANCER_second_line': 0.050519521248556666,
 'CANCER_alternatives': 0.8280592335856661,
 'CHRONIC KIDNEY_second_line': 3.346024967713792e-07,
 'CHRONIC KIDNEY_alternatives': 1.351986091889897e-07,
 'AUTOIMMUNE_second_line': 0.7635159570851222,
 'AUTOIMMUNE_alternatives': 0.6767078818549699,
 'THYROID DISORDER_second_line': 0.4980512456366324,
 'THYROID DISORDER_alternatives': 0.3625048188380927}


## Condition and Age Information

In [11]:
# #loop over modified dataframes that include a single condition and append it to dictionary
atype_to_metrics = {'all_antibiotics': [cohort.antibiotic_type.value_counts().index.tolist(),None],'first_line': [['nitrofurantoin','trimethoprim-sulfamethoxazole'],None],'second_line':[['second_line'],None],'alternatives':[['alternatives'],None]}

condition_writer = pd.ExcelWriter('Logs/excel_table1/' + 'table_1_conditions_v7' + '.xlsx', engine = 'xlsxwriter')
age_writer = pd.ExcelWriter('Logs/excel_table1/' + 'table_1_age_v7' + '.xlsx', engine = 'xlsxwriter')

for a_type, a_names in atype_to_metrics.items():
    temp_c_metrics = compute_condition_metrics(cohort.loc[cohort.antibiotic_type.isin(a_names[0])])
#     atype_to_metrics[a_type][1] = temp_c_metrics
    to_df(temp_c_metrics).to_excel(condition_writer, sheet_name = a_type)
#     log_csv_table(temp_metrics, f"table_1_{a_type}_conditions")

    temp_a_metrics = compute_age_metrics(cohort.loc[cohort.antibiotic_type.isin(a_names[0])].age)
    to_df(temp_a_metrics).to_excel(age_writer, sheet_name = a_type)


condition_writer.save()
condition_writer.close()
age_writer.save()
age_writer.close()


  warn("Calling close() on already closed file.")
