### Load HIV Cohort + Diagnoses + E&M Visits

In [61]:
import pandas as pd

# Cohort: persons with 3 or more HIV dx codes after 2018-01-01.
cohort = pd.read_csv("HIV_cohort.csv")

# Target conditions (HCV/obesity/depression/anxiety/etc.): one-hot encode the
# raw query result, then add the indicator columns up per person so that each
# person_id ends up on a single row.
conditions = pd.read_csv("condition_query.csv")
conditions = pd.get_dummies(conditions, prefix=None)
conditions = conditions.groupby('person_id').sum().reset_index()

# Number of E&M visits for established patients (CPT 99211-99215).
visits = pd.read_csv("outpatient_visits.csv")

# Left-join visits and then conditions onto the cohort so every cohort member
# is kept. fillna(0) runs after each join and applies to the whole frame, so
# any missing value (not only the newly joined columns) becomes 0.
dx_visit_df = cohort.merge(visits, how='left', on='person_id')
dx_visit_df = dx_visit_df.fillna(0)
dx_visit_df = dx_visit_df.merge(conditions, how='left', on='person_id')
dx_visit_df = dx_visit_df.fillna(0)

### Normalize VL Laboratory Tests

In [94]:
# All viral load labs starting 2018-01-01.
vl_labs = pd.read_csv("viral_loads.csv")

# 'Reference data frame': one row per distinct raw result string together with
# how often it occurs ('value'), most frequent first.
vl_summary = (
    vl_labs.groupby('value_source_value')
    .size()
    .reset_index(name='value')
    .sort_values('value', ascending=False)
)

# Non-numeric result strings and the number each one is treated as.
# Only the comparison against 200 below matters, so any stand-in >= 200 marks
# the result as not suppressed.
# NOTE(review): 'BT' and 'TNP' are mapped to 0 and therefore count as
# suppressed -- confirm that this is the intended handling.
dmap = {'Not Detected': 0, '<20': 0, '>10000000': 10000, 'BT': 0, 'TNP': 0}


def _vl_to_float(result):
    """Return the numeric viral load for one raw result value.

    Numeric values/strings are parsed directly; anything else is looked up in
    ``dmap`` (surrounding whitespace is ignored for the lookup).

    Raises:
        KeyError: if the result is neither numeric nor listed in ``dmap``.
    """
    try:
        return float(result)
    except ValueError:
        # Not a plain number: fall through to the explicit mapping.
        pass
    key = result.strip() if isinstance(result, str) else result
    try:
        return float(dmap[key])
    except KeyError:
        raise KeyError(
            f"unmapped viral load result {result!r}; add it to dmap"
        ) from None


# Normalize VL labs: one float per distinct raw result, aligned with vl_summary.
floats = [_vl_to_float(result) for result in vl_summary['value_source_value']]

# Binary 'viral load suppression' flag: True when the result is below 200.
vl_summary['vls'] = [x < 200 for x in floats]

# Attach the flag to every lab row. Only the lookup key and the flag are merged
# so that no other vl_summary column can collide with a column of vl_labs.
VLS_df = vl_labs.merge(
    vl_summary[['value_source_value', 'vls']],
    on='value_source_value',
)[['person_id', 'measurement_date', 'vls']]

# Keep exactly one row per patient: the most recent lab.
# Dates are parsed so that ordering is chronological rather than dependent on
# file order or string format (an unparseable date raises here). The sort is
# stable and the last row is kept, so when a patient has several labs on the
# latest date the one appearing last in the file wins; rows without a date
# sort first and are only used when the patient has no dated lab at all.
VLS_df = (
    VLS_df.assign(_lab_date=pd.to_datetime(VLS_df['measurement_date']))
    .sort_values('_lab_date', kind='mergesort', na_position='first')
    .drop_duplicates('person_id', keep='last')
    .drop(columns='_lab_date')
    .sort_index()
    .reset_index(drop=True)
)

### Create Final Dataframe

In [97]:
# Attach the suppression flag to the diagnosis/visit frame. The join is pandas'
# default inner join, so cohort members without a row in VLS_df are dropped.
df = pd.merge(dx_visit_df, VLS_df[['person_id', 'vls']], on='person_id')

# Export the analysis dataset (the row index is written as the first column).
df.to_csv("omop_hiv_jan11.csv")