### Generates lab, lab existence features, censor and outcome variables

In [28]:
in_name = "manuscript_covariates_3_final"
out_name = "manuscript_covariates_4_final"

## Imports

In [29]:
#requires covariate_config
import config
import covariate_config as cov_cfg
import sys
import time
import importlib
import datetime
import pandas as pd
import numpy as np
import scipy.sparse
import matplotlib.pyplot as plt

import Utils.dbutils as dbutils
import Utils.data_utils as data_utils
import Generators.CohortGenerator as CohortGenerator
import Generators.FeatureGenerator as FeatureGenerator
import config
local_imports = (
    dbutils,
    data_utils,
    CohortGenerator,
    FeatureGenerator,
    config
)
for i in local_imports:
    i = importlib.reload(i)
import psycopg2
import pandas as pd
import numpy
import sparse

## Setup config

In [30]:
## database connection parameters
# username = config.PG_USERNAME #we use peer authentication so don't need use vars, but in theory would pass them into config_path
# password = config.PG_PASSWORD
database_name = config.DB_NAME
print(database_name)

config_path = 'postgresql://{database_name}'.format(
    database_name = database_name
)
connect_args = {"host": '/var/run/postgresql/'} # connect_args to pass to sqlalchemy create_engine function

# schemas 
schema_name = 'eol_test_noah' # all created tables will be created using this schema
cdm_schema_name = config.OMOP_CDM_SCHEMA # the name of the schema housing your OMOP CDM tables
print(f"cdm schema: {cdm_schema_name}")
# caching
reset_schema = False # if true, rebuild all data from scratch

# set up database, reset schemas as needed
db = dbutils.Database(config_path, schema_name, connect_args, cdm_schema_name)
if reset_schema:
    db.execute(
        'drop schema if exists {} cascade'.format(schema_name)
    )
db.execute(
    'create schema if not exists {}'.format(schema_name)
)

localhost/omop_v6
cdm schema: cdm_6871_21
Executed 1 SQLs


## Define covariate variables

In [31]:
uti = cov_cfg.uti
neph = cov_cfg.neph
sepsis = cov_cfg.sepsis

T_eff = cov_cfg.T_eff

labid_map = cov_cfg.labid_map

### Adverse Event Variables
AE_c_diff = cov_cfg.AE_c_diff #90 days
AE_GI = cov_cfg.AE_GI #15 days
AE_AKI = cov_cfg.AE_AKI #30 days
AE_skin = cov_cfg.AE_skin #30 days
adverse = cov_cfg.adverse
AE_other = cov_cfg.AE_other #30 days

# Fever
fever = cfg.fever

#abnormal body temp
ab_temp = cfg.ab_temp



## Cohort Integration**

In [32]:
username='cdm_6871_21'
password='MLHC2021'
database_name = 'omop_v6'
conn_string = "dbname="+database_name + " host=/var/run/postgresql"
conn=psycopg2.connect(conn_string)
print('Connected!')



sql_command = f"select * from cdm_6871_21.{in_name};"
cohort = pd.read_sql(sql_command,conn)

Connected!


## Database Imports to generate features**

In [33]:
username='cdm_6871_21'
password='MLHC2021'
database_name = 'omop_v6'
conn_string = "dbname="+database_name + " host=/var/run/postgresql"
conn=psycopg2.connect(conn_string)
print('Connected!')



sql_command = "select concept_name, concept_id from cdm_6871_21.concept;"
concept = pd.read_sql(sql_command,conn)

Connected!


In [34]:
%%time
sql_cmd = '''
    select
        v.visit_detail_id,
        v.visit_detail_concept_id,
        v.person_id
    from {omop_schema}.visit_detail v 
    where v.person_id in (select person_id from cdm_6871_21.{in_name})'''.format(
    omop_schema=config.OMOP_CDM_SCHEMA,in_name=in_name)
visits = pd.read_sql(sql_cmd, conn)
visits.head()

CPU times: user 15.3 s, sys: 2.27 s, total: 17.6 s
Wall time: 3min 1s


Unnamed: 0,visit_detail_id,visit_detail_concept_id,person_id
0,19828375,581477,1850532
1,406283058,581477,4781602
2,406283059,581477,4781602
3,406283060,581477,4781602
4,406283061,581477,4781602


In [35]:
%%time
sql_cmd = """
    select 
        a.measurement_concept_id as concept_id,
        a.measurement_datetime as measurement_start_date,
        a.value_as_number as measurement_value,
        a.person_id
    from {omop_schema}.measurement a
    where a.person_id in (select person_id from cdm_6871_21.{in_name})""".format(
    omop_schema=config.OMOP_CDM_SCHEMA,in_name=in_name
)
measurement_df = pd.read_sql(sql_cmd, conn)
measurement_df.head()

CPU times: user 49 s, sys: 8.47 s, total: 57.5 s
Wall time: 6min 57s


Unnamed: 0,concept_id,measurement_start_date,measurement_value,person_id
0,3018311,2016-09-16,18.0,658
1,3015746,2013-11-12,,744
2,2212630,2017-02-11,,906
3,3021601,2015-03-23,,976
4,3024461,2019-05-11,,1624


In [36]:
%%time
sql = """
    select 
        a.condition_source_value as concept_code,
        a.visit_detail_id,
        a.condition_concept_id as concept_id,
        a.condition_start_date as condition_start_date,
        a.condition_occurrence_id as condition_occurrence_id,
        a.person_id
    from
        {omop_schema}.condition_occurrence a
    where a.person_id in (select person_id from cdm_6871_21.{in_name})""".format(
    omop_schema=config.OMOP_CDM_SCHEMA,in_name=in_name
)
condition_df = db.query(sql)
display(condition_df.head())

Unnamed: 0,concept_code,visit_detail_id,concept_id,condition_start_date,condition_occurrence_id,person_id
0,ICD9CM: 462,56552635,25297,2013-04-17,21225018,1830
1,ICD9CM: 193,41784164,133424,2013-11-26,52779811,25534
2,ICD9CM: 496,27028624,255573,2013-09-09,36329287,25630
3,ICD9CM: 340,24529963,374919,2013-04-24,27269674,40749
4,ICD9CM: 193,123636441,133424,2015-05-11,149264787,43593


CPU times: user 58.2 s, sys: 16.2 s, total: 1min 14s
Wall time: 7min 10s


## Augmenting UTI Conditions with visit name

In [37]:
%%time
uti_conditions = condition_df
uti_conditions = uti_conditions.merge(visits[['visit_detail_id','visit_detail_concept_id']], how='left', on='visit_detail_id')
uti_conditions['visit_name'] = uti_conditions.merge(concept.rename(columns={'concept_id':'visit_detail_concept_id'}), how='left',on='visit_detail_concept_id').concept_name

CPU times: user 17.4 s, sys: 3.53 s, total: 20.9 s
Wall time: 21 s


## Create the condition tables

In [38]:
t_e = uti_conditions[uti_conditions.concept_id.isin(T_eff)]
f_e = uti_conditions[uti_conditions.concept_id.isin(fever)]
b_e = uti_conditions[uti_conditions.concept_id.isin(ab_temp)]
a_e = uti_conditions[uti_conditions.concept_id.isin(adverse)]
t_ie = uti_conditions[(uti_conditions.concept_id.isin(T_eff)) & (uti_conditions.visit_detail_concept_id == 9201)]
measurement_df = measurement_df.loc[measurement_df.measurement_value.notnull()].reset_index()

## Create the variables

In [39]:
t_e['t_uti'] = 0
t_e['t_neph'] = 0
t_e['t_sepsis'] = 0
t_e.loc[t_e['concept_id'].isin(uti),'t_uti'] = 1
t_e.loc[t_e['concept_id'].isin(neph),'t_neph'] = 1
t_e.loc[t_e['concept_id'].isin(sepsis),'t_sepsis'] = 1


t_ie['t_uti'] = 0
t_ie['t_neph'] = 0
t_ie['t_sepsis'] = 0
t_ie.loc[t_ie['concept_id'].isin(uti),'t_uti'] = 1
t_ie.loc[t_ie['concept_id'].isin(neph),'t_neph'] = 1
t_ie.loc[t_ie['concept_id'].isin(sepsis),'t_sepsis'] = 1

a_e['AE_c_diff'] = 0
a_e['AE_skin'] = 0
a_e['AE_other'] = 0
a_e['AE_GI'] = 0
a_e['AE_AKI'] = 0


a_e.loc[a_e['concept_id'].isin(AE_c_diff),'AE_c_diff'] = 1
a_e.loc[a_e['concept_id'].isin(AE_skin),'AE_skin'] = 1
a_e.loc[a_e['concept_id'].isin(AE_other),'AE_other'] = 1
a_e.loc[a_e['concept_id'].isin(AE_GI),'AE_GI'] = 1
a_e.loc[a_e['concept_id'].isin(AE_AKI),'AE_AKI'] = 1

f_e['fever'] = 0
f_e.loc[f_e['concept_id'].isin(fever),'fever'] = 1

b_e['ab_temp'] = 0
b_e.loc[b_e['concept_id'].isin(ab_temp), 'ab_temp'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t_e['t_uti'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t_e['t_neph'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t_e['t_sepsis'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc

## Update cohort table with new outcome variables

In [40]:
uti_temp1 = cohort.copy()

In [41]:
%%time
adverse_temp = a_e.copy()
adverse_temp['occurs_30_days_after_uti'] = False
adverse_temp['occurs_90_days_after_uti'] = False
adverse_temp['occurs_90_days_before_uti'] = False

index_name = 'condition_occurrence_id_new'
utis_all = uti_temp1.copy()
c= 0
for index, row in adverse_temp.iterrows():
    p = row.person_id
    d = row.condition_start_date
    uti_person = utis_all[utis_all.person_id == p]
    dates = uti_person.condition_start_date
    c += 1
    l = len(adverse_temp)
    if c % 1000 == 0:
        print(f"Iter: {c} / {l}")
    for uti_date in dates:
        uti_event_index = int(uti_person[uti_person.condition_start_date == uti_date]['condition_occurrence_id'])
        if (d - uti_date).days < 30 and (d - uti_date).days > 0:
            adverse_temp.loc[index, 'occurs_30_days_after_uti'] = True
            adverse_temp.loc[index, index_name]  =uti_event_index 
        if (d - uti_date).days < 90 and (d - uti_date).days > 0:
            adverse_temp.loc[index, 'occurs_90_days_after_uti'] = True 
            adverse_temp.loc[index, index_name]  =uti_event_index 
        if (uti_date-d).days < 180 and (uti_date-d).days > 0:
            adverse_temp.loc[index, 'occurs_90_days_before_uti'] = True          
            
a_e = adverse_temp[adverse_temp['occurs_90_days_before_uti']==False]   

Iter: 1000 / 305941
Iter: 2000 / 305941
Iter: 3000 / 305941
Iter: 4000 / 305941
Iter: 5000 / 305941
Iter: 6000 / 305941
Iter: 7000 / 305941
Iter: 8000 / 305941
Iter: 9000 / 305941
Iter: 10000 / 305941
Iter: 11000 / 305941
Iter: 12000 / 305941
Iter: 13000 / 305941
Iter: 14000 / 305941
Iter: 15000 / 305941
Iter: 16000 / 305941
Iter: 17000 / 305941
Iter: 18000 / 305941
Iter: 19000 / 305941
Iter: 20000 / 305941
Iter: 21000 / 305941
Iter: 22000 / 305941
Iter: 23000 / 305941
Iter: 24000 / 305941
Iter: 25000 / 305941
Iter: 26000 / 305941
Iter: 27000 / 305941
Iter: 28000 / 305941
Iter: 29000 / 305941
Iter: 30000 / 305941
Iter: 31000 / 305941
Iter: 32000 / 305941
Iter: 33000 / 305941
Iter: 34000 / 305941
Iter: 35000 / 305941
Iter: 36000 / 305941
Iter: 37000 / 305941
Iter: 38000 / 305941
Iter: 39000 / 305941
Iter: 40000 / 305941
Iter: 41000 / 305941
Iter: 42000 / 305941
Iter: 43000 / 305941
Iter: 44000 / 305941
Iter: 45000 / 305941
Iter: 46000 / 305941
Iter: 47000 / 305941
Iter: 48000 / 305941
I

In [None]:
from datetime import timedelta
start = datetime.datetime.now()

uti_temp1['t_uti_sum'] = 0
uti_temp1['t_neph_sum'] = 0
uti_temp1['t_sepsis_sum'] = 0


uti_temp1['t_i_sepsis_sum'] = 0
uti_temp1['t_i_uti_sum'] = 0
uti_temp1['t_i_neph_sum'] = 0


uti_temp1['AE_c_diff'] = 0
uti_temp1['AE_skin'] = 0
uti_temp1['AE_other'] = 0
uti_temp1['AE_GI'] = 0
uti_temp1['AE_AKI'] = 0


uti_temp1['prev_t_uti'] = 0
uti_temp1['prev_AE_c_diff'] = 0
uti_temp1['prev_AE_skin'] = 0
uti_temp1['prev_AE_other'] = 0
uti_temp1['prev_AE_GI'] = 0
uti_temp1['prev_AE_AKI'] = 0


uti_temp1['fever'] = 0
uti_temp1['ab_temp'] = 0

for col_name, _ in labid_map.items():
    col = {f'{col_name}_0': None}
    uti_temp1  = uti_temp1.assign(**col)

index_name = "index"
c = 0
for index, row in uti_temp1.iterrows():
    uti_index = row[index_name]
    person = row.person_id
    d = row.condition_start_date
    previous_uti = row.previous_uti
    pid = row.previous_uti_condition_occurence_id
    prev_d = condition_df.loc[condition_df.condition_occurrence_id == pid,'condition_start_date']

    
    l = len(uti_temp1)
    c+= 1
    if c % 1000 == 0:
        print(f"Iter: {c} / {l}")
        print(f"Time elapsed: {(datetime.datetime.now() - start)}")
    


    #AB_TEMP
    
    temp_df = b_e[b_e['person_id'] == person]
    final_date = d + timedelta(days=3)
    cond_ct = ((temp_df.condition_start_date <= final_date) & (temp_df.condition_start_date >= d))
    filtered_be = temp_df.loc[cond_ct].drop_duplicates(['concept_id','person_id','condition_start_date'], keep='last')    


    #FEVER
    
    temp_df = f_e[f_e['person_id'] == person]
    final_date = d + timedelta(days=3)
    cond_ct = ((temp_df.condition_start_date <= final_date) & (temp_df.condition_start_date >= d))
    filtered_fe = temp_df.loc[cond_ct].drop_duplicates(['concept_id','person_id','condition_start_date'], keep='last')    



    uti_temp1.loc[index,'fever'] = 1 if filtered_fe.fever.sum() > 0 else 0

    
# LAB MEASUREMENTS (URINE TEST STRIP + Body temp) 

    temp_df = measurement_df[measurement_df['person_id'] == person]

   

    for lab_measurement, concept_ids in labid_map.items():

        num = 0
        order = 0
        final_date = d + timedelta(days=num)
        cond_cc = ((temp_df.measurement_start_date.dt.date <= final_date) & (temp_df.measurement_start_date.dt.date >= d) & (temp_df.concept_id.isin(concept_ids)))
        filtered_me = temp_df.loc[cond_cc].drop_duplicates(['concept_id','person_id','measurement_start_date','measurement_value'], keep='last')
        filtered_me = filtered_me.sort_values('measurement_start_date')        

        if len(filtered_me) < 1:
            pass
        else:
            uti_temp1.loc[index,f'{lab_measurement}_0'] = filtered_me.measurement_value.values[order]

            
#TREATMENT EFFICACY UTI
    
    #current uti cond
    temp_df = t_e[t_e['person_id'] == person]
    final_date = d + timedelta(days=31)
    cond_ct = ((temp_df.condition_start_date < final_date) & (temp_df.condition_start_date > d))
    filtered_te = temp_df.loc[cond_ct].drop_duplicates(['concept_id','person_id','condition_start_date'], keep='last')    



    uti_temp1.loc[index,'t_uti_sum'] = filtered_te.t_uti.sum()
    uti_temp1.loc[index,'t_neph_sum'] = filtered_te.t_neph.sum()
    
    if previous_uti:
        #previous uti cond

        final_date = prev_d + timedelta(days=31)
        cond_pt = ((temp_df.condition_start_date < final_date.item()) & (temp_df.condition_start_date > prev_d.item()))
        filtered_te = temp_df.loc[cond_pt].drop_duplicates(['concept_id','person_id','condition_start_date'], keep='last')  
        uti_temp1.loc[index,'prev_t_uti'] = 1 if filtered_te.t_uti.sum() > 0 else 0

# Treatment Efficacy (Sepsis)
    temp_df = t_e[t_e['person_id'] == person]
    final_date = d + timedelta(days=31)
    cond_ct = ((temp_df.condition_start_date < final_date) & (temp_df.condition_start_date > d))
    filtered_te = temp_df.loc[cond_ct].drop_duplicates(['concept_id','person_id','condition_start_date'], keep='last')    

    uti_temp1.loc[index,'t_sepsis_sum'] = filtered_te.t_sepsis.sum()     
    
    
#  TREATMENT EFFICACY of Intakes
    
#  current uti cond
    temp_df = t_ie[t_ie['person_id'] == person]
    final_date = d + timedelta(days=31)
    cond_ct = ((temp_df.condition_start_date < final_date) & (temp_df.condition_start_date > d))
    filtered_tie = temp_df.loc[cond_ct].drop_duplicates(['concept_id','person_id','condition_start_date'], keep='last')    



    uti_temp1.loc[index,'t_i_uti_sum'] = filtered_tie.t_uti.sum()
    uti_temp1.loc[index,'t_i_neph_sum'] = filtered_tie.t_neph.sum()
    uti_temp1.loc[index,'t_i_sepsis_sum'] = filtered_tie.t_sepsis.sum()

        
       

# ADVERSE EVENTS
    
#     curr adverse cond
    temp_df = a_e[a_e['person_id'] == person]
    final_date = d + timedelta(days=16)
    cond_a15 = ((temp_df.condition_start_date < final_date) & (temp_df.condition_start_date > d))
    final_date = d + timedelta(days=31)
    cond_a30 = ((temp_df.condition_start_date < final_date) & (temp_df.condition_start_date > d))
    final_date = d + timedelta(days=91)
    cond_a90 = ((temp_df.condition_start_date < final_date) & (temp_df.condition_start_date > d))

    filtered_ae_15 = temp_df.loc[cond_a15].drop_duplicates(['concept_id','person_id','condition_start_date'], keep='last')
    filtered_ae_30 = temp_df.loc[cond_a30].drop_duplicates(['concept_id','person_id','condition_start_date'], keep='last')    
    filtered_ae_90 = temp_df.loc[cond_a90].drop_duplicates(['concept_id','person_id','condition_start_date'], keep='last')   

    uti_temp1.loc[index,'AE_c_diff'] = 1 if filtered_ae_90.AE_c_diff.sum() > 0 else 0
    uti_temp1.loc[index,'AE_skin'] = 1 if filtered_ae_30.AE_skin.sum() > 0 else 0
    uti_temp1.loc[index,'AE_other'] = 1 if filtered_ae_30.AE_other.sum() > 0 else 0
    uti_temp1.loc[index,'AE_GI'] = 1 if filtered_ae_15.AE_GI.sum() > 0 else 0
    uti_temp1.loc[index,'AE_AKI'] = 1 if filtered_ae_30.AE_AKI.sum() > 0 else 0

    if previous_uti:
        
#         #previous adverse cond
        temp_df = a_e[a_e['person_id'] == person]
        
        final_date = prev_d + timedelta(days=16)
        cond_pa15 = ((temp_df.condition_start_date < final_date.item()) & (temp_df.condition_start_date > prev_d.item()))
        final_date = prev_d + timedelta(days=31)
        cond_pa30 = ((temp_df.condition_start_date < final_date.item()) & (temp_df.condition_start_date > prev_d.item()))
        final_date = prev_d + timedelta(days=91)
        cond_pa90 = ((temp_df.condition_start_date < final_date.item()) & (temp_df.condition_start_date > prev_d.item()))

        filtered_ae_15 = temp_df.loc[cond_pa15].drop_duplicates(['concept_id','person_id','condition_start_date'], keep='last') 
        filtered_ae_30 = temp_df.loc[cond_pa30].drop_duplicates(['concept_id','person_id','condition_start_date'], keep='last')    
        filtered_ae_90 = temp_df.loc[cond_pa90].drop_duplicates(['concept_id','person_id','condition_start_date'], keep='last')   

        uti_temp1.loc[index,'prev_AE_c_diff'] = 1 if filtered_ae_90.AE_c_diff.sum() > 0 else 0
        uti_temp1.loc[index,'prev_AE_skin'] = 1 if filtered_ae_30.AE_skin.sum() > 0 else 0
        uti_temp1.loc[index,'prev_AE_other'] = 1 if filtered_ae_30.AE_other.sum() > 0 else 0
        uti_temp1.loc[index,'prev_AE_GI'] = 1 if filtered_ae_15.AE_GI.sum() > 0 else 0
        uti_temp1.loc[index,'prev_AE_AKI'] = 1 if filtered_ae_30.AE_AKI.sum() > 0 else 0
    
    
    
    
    
    
    
    
    



for lab_measurement, concept_ids in labid_map.items():
    uti_temp1.loc[uti_temp1[f'{lab_measurement}_0'].isnull(),f"{lab_measurement}_0_exists"] = 0
    uti_temp1.loc[uti_temp1[f'{lab_measurement}_0'].notnull(),f"{lab_measurement}_0_exists"] = 1

    
uti_temp1['t_sum'] = uti_temp1['t_uti_sum'] + uti_temp1['t_neph_sum'] + uti_temp1['t_sepsis_sum']
uti_temp1['t_bin'] = 0
uti_temp1.loc[uti_temp1['t_sum'] > 0, 't_bin'] = 1

uti_temp1['t_sepsis_bin'] = 0
uti_temp1.loc[uti_temp1['t_sepsis_sum'] > 0, 't_sepsis_bin'] = 1

uti_temp1['t_i_uti_bin'] = 0
uti_temp1.loc[uti_temp1['t_i_uti_sum'] > 0, 't_i_uti_bin'] = 1

uti_temp1['t_i_neph_bin'] = 0
uti_temp1.loc[uti_temp1['t_i_neph_sum'] > 0, 't_i_neph_bin'] = 1

uti_temp1['t_i_sepsis_bin'] = 0
uti_temp1.loc[uti_temp1['t_i_sepsis_sum'] > 0, 't_i_sepsis_bin'] = 1

uti_temp1['t_i_sum'] = uti_temp1['t_i_uti_sum'] + uti_temp1['t_i_neph_sum'] + uti_temp1['t_i_sepsis_sum']
uti_temp1['t_i_bin'] = 0
uti_temp1.loc[uti_temp1['t_i_sum'] > 0, 't_i_bin'] = 1

uti_temp1['t_uti_bin'] = 0
uti_temp1.loc[uti_temp1['t_uti_sum'] > 0, 't_uti_bin'] = 1

uti_temp1['t_neph_bin'] = 0
uti_temp1.loc[uti_temp1['t_neph_sum'] > 0, 't_neph_bin'] = 1
#adding final AE outcome
adverse_event_outcomes = ['AE_c_diff',
 'AE_skin', 'AE_GI', 'AE_AKI',
 'AE_other']

uti_temp1['AE_any'] = uti_temp1[adverse_event_outcomes].any(axis='columns').astype(int)

Iter: 1000 / 60118
Time elapsed: 0:02:45.095592
Iter: 2000 / 60118
Time elapsed: 0:05:28.130602
Iter: 3000 / 60118
Time elapsed: 0:08:09.216625
Iter: 4000 / 60118
Time elapsed: 0:10:50.822551
Iter: 5000 / 60118
Time elapsed: 0:13:33.647998
Iter: 6000 / 60118
Time elapsed: 0:16:15.945029
Iter: 7000 / 60118
Time elapsed: 0:18:57.891178
Iter: 8000 / 60118
Time elapsed: 0:21:38.576564
Iter: 9000 / 60118
Time elapsed: 0:24:18.799139
Iter: 10000 / 60118
Time elapsed: 0:27:01.117472
Iter: 11000 / 60118
Time elapsed: 0:29:42.667505
Iter: 12000 / 60118
Time elapsed: 0:32:23.783943
Iter: 13000 / 60118
Time elapsed: 0:35:05.160452


## Add Lab count features and years since diagnosis

In [None]:
#urine and blood existence parameters
existence_features = [f for f in uti_temp1.columns if f.endswith('_exists')]
cbc_keywords = ['band',
 'basophil',
 'blast',
 'eosinophil',
 'granulocyte',
 'lymphocyte',
 'metamyelocte',
 'monocyte',
 'myelocyte',
 'polymorphonuclear',
 'neutrophil',
 'segmented_nphil',
 'variant_lympho']

cbc_existence_features = [f for f in existence_features if any(k in f for k in cbc_keywords)]
cbc_existence_features = [f for f in cbc_existence_features if not 'promyelocyte' in f]


urine_features = [f for f in existence_features if '_urine_' in f]

In [None]:
uti_temp1['cbc_present'] = uti_temp1.apply(lambda row: any(row[f]==1 for f in cbc_existence_features), axis=1)
uti_temp1['urine_test_present'] = uti_temp1.apply(lambda row: any(row[f]==1 for f in urine_features), axis=1)

In [None]:
uti_temp1['condition_start_date'] = pd.to_datetime(uti_temp1.condition_start_date)
uti_temp1['years_since_diagnosis'] = 2021 - uti_temp1.condition_start_date.dt.year

## Preprocess features

In [None]:
#fill in missing features for days_since_previous_uti
feat_null = pd.Series(uti_temp1.isna().any())
cols_to_fill = list(feat_null.loc[feat_null == True].index)
uti_temp1[cols_to_fill] = uti_temp1[cols_to_fill].fillna(0)

#turn all bool to 1 and 0
bool_cols = uti_temp1.select_dtypes(include=['bool']).columns
uti_temp1[bool_cols] = uti_temp1[bool_cols]*1

In [None]:
uti_temp1['body_temp_0'] = uti_temp1['body_temp_0'].apply(lambda x: (x-32)/1.8 if x>90 else x)

In [None]:
sql_cmd = 'select * from cdm_6871_21.observation_period where person_id in (select person_id from cdm_6871_21.{in_name})'.format(
    person_id=tuple(set(cohort.person_id)),in_name=in_name)
obs = pd.read_sql(sql_cmd, conn)
obs.head()
last_obs = obs.groupby('person_id')[['observation_period_end_date']].max()
last_obs = last_obs.reset_index()
last_obs.rename(columns={'observation_period_end_date':'last_visit_date'}, inplace=True)
final = last_obs.join(cohort.sort_values(by='condition_start_date').drop_duplicates(subset=['person_id']).set_index('person_id'), on='person_id')
final['last_visit_date'] = pd.to_datetime(final['last_visit_date'])
final['condition_start_date'] = pd.to_datetime(final['condition_start_date'])
final['followup_time'] = final['last_visit_date'] - final['condition_start_date']
uti_temp1 = uti_temp1.merge(final[['followup_time','person_id']],on="person_id")

uti_temp1['less_15'] = (uti_temp1['followup_time'] < pd.Timedelta(15, unit='d')).astype(int)
uti_temp1['less_30'] = (uti_temp1['followup_time'] < pd.Timedelta(30, unit='d')).astype(int)
uti_temp1['less_90'] = (uti_temp1['followup_time'] < pd.Timedelta(90, unit='d')).astype(int)

## Save to Dataframe

In [56]:
#spot check columns
sorted(list(uti_temp1.columns))

['AE_AKI',
 'AE_GI',
 'AE_any',
 'AE_c_diff',
 'AE_other',
 'AE_skin',
 'ab_temp',
 'addison0_6_months',
 'addison1_2_yr',
 'addison6_months_1_yr',
 'addison_full_condition_name',
 'age',
 'alternatives_0_to_6_mo',
 'alternatives_12_to_24_mo',
 'alternatives_1_to_7_days',
 'alternatives_6_to_12_mo',
 'alternatives_most_recent',
 'alternatives_switch_ever',
 'alternatives_switch_recent',
 'antibiotic_name',
 'antibiotic_type',
 'arthritis0_6_months',
 'arthritis1_2_yr',
 'arthritis6_months_1_yr',
 'arthritis_concept_ancestor0_6_months',
 'arthritis_concept_ancestor1_2_yr',
 'arthritis_concept_ancestor6_months_1_yr',
 'arthritis_concept_ancestor_full_condition_name',
 'arthritis_full_condition_name',
 'autoimmune_concept_ancestor0_6_months',
 'autoimmune_concept_ancestor1_2_yr',
 'autoimmune_concept_ancestor6_months_1_yr',
 'autoimmune_concept_ancestor_full_condition_name',
 'band_nphil_0',
 'band_nphil_0_exists',
 'band_nphil_p100_0',
 'band_nphil_p100_0_exists',
 'basophil_p100_0',
 'b

In [65]:
uti_treatment = uti_temp1.copy()
uti_treatment = uti_treatment.drop(columns=['level_0'])
uti_treatment.to_sql(out_name,con=db.engine, if_exists="replace", schema="cdm_6871_21")
cmd = f'grant select on table cdm_6871_21.{out_name} to cdm_6871_21'
db.execute(cmd)

  sql.to_sql(


Executed 1 SQLs


## Grant access to Ming-Chieh (optional)

In [66]:
cmd = f"grant select on table cdm_6871_21.{out_name} to littlecanargie"
db.execute(cmd)

Executed 1 SQLs
