### Load Diagnoses

In [33]:
import pandas as pd

# Cohort: persons with 3 or more HIV dx codes after 2018-01-01, joined to
# their demographic rows on person_id (inner join, so only persons present
# in both files are kept).
cohort = pd.read_csv("demographic_info.csv").\
                merge(pd.read_csv("HIV_cohort.csv"), on ='person_id')

# Flag cohort members whose person_id appears in the newly-diagnosed file.
# Series.isin does a hashed lookup instead of scanning the id list once per
# cohort row, which the previous `x in list` comprehension did.
new_HIV_dx = pd.read_csv("newly_diagnosed_HIV.csv").person_id.tolist()
cohort['new_dx'] = cohort['person_id'].isin(new_HIV_dx)

# target conditions like HCV/obesity/depression/anxiety/etc.
# get_dummies one-hot encodes the non-numeric columns of the query result;
# summing per person_id collapses multiple rows into one row of counts.
conditions = pd.get_dummies(pd.read_csv("condition_query.csv"),
                           prefix=None).groupby('person_id').sum().reset_index()

# Left-join the condition indicators onto the cohort. fillna(0) is applied to
# the whole frame, so persons with no condition rows get 0 in every indicator.
# NOTE(review): this also turns any missing demographic value into 0 - confirm
# that is acceptable for the demographic columns.
dx_df = cohort.merge(conditions,
            how = 'left', on = 'person_id').fillna(0)

### Outpatient Visits, Emergency Department Encounters, and Inpatient Admissions

In [34]:
# Outpatient: count of E&M visits for established patients (CPT 99211-99215)
# between 6-2018 and 6-2019. The raw 'visits' column is kept and duplicated
# under the more descriptive name 'office_visits'.
outpatient = pd.read_csv("outpatient_visits.csv")
outpatient = outpatient.assign(office_visits=outpatient['visits'])

# Hospital: ED and inpatient encounter counts over the same 6-2018 to 6-2019 window.
hospital = pd.read_csv("hospital_visits.csv")

# Outer join so that a person present in only one of the two sources is kept;
# the counts missing from the other source are filled with 0.
office_counts = outpatient[['person_id', 'office_visits']]
visits_df = pd.merge(office_counts, hospital,
                     how='outer', on='person_id').fillna(0)

### Laboratory Values: HIV Viral Load and HbA1c

In [35]:
######################
# HIV RNA viral load #
######################

# all viral load labs starting 2018-01-01
vl_labs = pd.read_csv("viral_loads.csv")

# 'Reference data frame': one row per distinct raw result string, ordered by
# how often it occurs. The 'value' column holds that occurrence count.
vl_summary = vl_labs.groupby('value_source_value').size().reset_index().sort_values([0], ascending = False)
vl_summary.columns = ['value_source_value', 'value']

# Numeric stand-ins for result strings that float() cannot parse.
# NOTE(review): 'BT' and 'TNP' are mapped to 0 and therefore count as
# suppressed below - confirm that is the intended reading of those codes.
# NOTE(review): '>10000000' is mapped to 10000; any value >= 200 yields the
# same vls flag, so this does not affect the result.
dmap = {'Not Detected':0, '<20':0, '>10000000':10000, 'BT':0, 'TNP':0}

# normalize VL labs: parse each distinct result as a float, falling back to
# dmap for the known non-numeric codes. Only parse failures are caught; an
# unknown code still raises KeyError so it cannot be silently misclassified.
floats = []
for result in vl_summary['value_source_value'].tolist():
    try:
        floats.append(float(result))
    except (TypeError, ValueError):
        floats.append(float(dmap[result]))

# vls column to create binary variable for 'viral load suppression'
# (True when the normalized viral load is below 200).
vl_summary['vls'] = [x < 200 for x in floats]

# Attach the vls flag to every individual lab row via its raw result string.
VLS_df = vl_labs.merge(vl_summary,
                       on = 'value_source_value')[['person_id', 'measurement_date', 'vls']]

# get most recent lab for each patient
# The previous groupby(...).first() returned the first row in file order,
# which is only the most recent lab if the file happens to be sorted newest
# first. Sort explicitly and keep the last row per person, as the HbA1c
# section below does. Rows with a missing date sort first so they are never
# preferred over a dated lab.
# NOTE(review): measurement_date is sorted as read from the CSV; this assumes
# a lexicographically sortable format such as YYYY-MM-DD - confirm.
VLS_df = (VLS_df
          .sort_values(['person_id', 'measurement_date'], na_position='first')
          .drop_duplicates('person_id', keep='last')
          .reset_index(drop=True))



#########
# HbA1c #
#########

hba1c_df = pd.read_csv('hba1c_values.csv').sort_values(['person_id','measurement_date'])
# get most recent lab for each patient
# (groupby.last takes the last non-null entry of each column per person and
# moves person_id into the index).
hba1c_df = hba1c_df.groupby('person_id').last()
hba1c_df['hba1c'] = hba1c_df['value_as_number']

#############
# join labs #
#############

# Left join: every person with a viral load row is kept; hba1c is NaN for
# persons without an HbA1c lab.
lab_df = VLS_df.merge(hba1c_df, on = 'person_id', how = 'left')

### Create Final Dataframe

In [36]:
# Assemble the analysis frame:
#   1. left-join visit counts onto the diagnosis frame (persons without any
#      visit row get 0 via fillna);
#   2. inner-join the lab values, which drops persons with no viral load lab.
# The join key is spelled out for the first merge as well, so an accidental
# column-name overlap between dx_df and visits_df cannot silently become part
# of the key.
df = dx_df.merge(visits_df, how = 'left', on = 'person_id').fillna(0).merge(
    lab_df[['person_id', 'vls', 'hba1c']], on = 'person_id')

In [37]:
# Keep only rows that have an HbA1c value, then reduce to a single row per
# person_id (drop_duplicates keeps the first occurrence).
has_hba1c = df.hba1c.notna()
df = df[has_hba1c].drop_duplicates(subset='person_id')

### Add unstable housing and CD4

In [31]:
import random

# Simulated CD4 column: rows flagged as virally suppressed (vls) draw an
# integer in [350, 500], all other rows draw an integer in [0, 300].
# A dedicated, seeded generator makes the column identical on every
# Restart & Run All and leaves the global `random` state untouched.
CD4_SEED = 0
cd4_rng = random.Random(CD4_SEED)
df['cd4'] = [cd4_rng.randint(350,500) if x == True else cd4_rng.randint(0,300) for x in df.vls]

### Truncate dataframe and add names

In [49]:
# Display names attached to the rows of the final dataset, one per row.
# The list is defined BEFORE it is used to size the slice below; previously
# the slice referenced `names` first, which raises NameError on a fresh kernel.
names = ['Adah Labarge','Adrian Milhorn','Agueda Peachey','Akilah Trezza','Al Ratcliff','Alan Holdsworth','Albert Ayler',
'Alejandrina Amaro','Alejandro Fite','Aleshia Elmendorf','Alfonso Rinker','Alice Coltrane','Alysa Lapine',
'Amado Knaack','Amalia Ambriz','Amanda Townsend','Angeles Whitesel','Angie Ellery','Anja Kubacki',
'Anjanette Neiman','Annalisa Lillard','Annamarie Patterson','Annelle Dubrey','Annetta Saxton',
'Anthony Braxton','Antonetta Bangert','Antonio Carlos Jobim','Antwan Atchinson','Arline Popp','Art Blakey',
'Art Pepper','Art Tatum','Artie Shaw','Ashlie Monteiro','Audrey Naron','Aura Truman','Aurelio Haigh','Austin Taranto',
'Bailey Bixby','Bao Whitehead','Becky Craig','Belkis Elizalde','Ben Webster','Benedict Elders','Bennie Moten','Benny Carter',
'Benny Goodman','Bernard Parkes','Bessie Blackwell','Bessie Smith','Betsy Hargrave','Beverley Derosia',
'Bill Evans','Billy Strayhorn','Bix Beiderbecke','Blue Shepherd','Brandon Nguyen','Bridget Clodfelter',
'Britney Siegler','Bruna Averett','Bryanna Kalina','Bud Powell','Buddy Bolden','Buddy Rich','Candra Fine',
'Candyce Kidwell','Cannonball Adderley','Carlos Milian','Carlotta Bialek','Carlton Lachapelle','Carmen Binford',
'Catarina Amerman','Catharine Galan','Catherine Hayes','Cathi Sumrell','Cathleen Hovey','Catina Diez',
'Cayla Isaacs','Cecil Taylor','Cecila Robert','Cecily Schwartzman','Chan Helbert','Charita Soule','Charles Abucus',
'Charles Mingus','Charlie Christian','Charlie Haden','Charlie Laguna','Cheryl Schell','Chet Baker','Chi Dawley',
'Chick Corea','Chieko Nale','Clarence Davis','Clemmie Tinney','Cleora Lagrange','Cletus Bendixen','Clifford Brown',
'Coleman Hawkins','Coreen Fleisher','Corinna Goodspeed','Cortney Kelsch','Count Basie','Cruz Cogdill',
'Danette Mountjoy','Danny Portwood','Daria Royer','Dave Brubeck','Dave Holland','David Murray','Deadra Garfinkel',
'Deanne Deitch','Dell Mcpherson','Della Bartkowiak','Deloise Spalding','Denice Agtarap','Denis Clapper',
'Desire Bracy','Despina Corder','Dexter Gordon','Dianne Bass','Dixie Belgarde','Django Reinhardt',
'Doreatha Stpeter','Dorothy Ashby','Dovie Yarnall','Daniel Wells','Doyle Wells','Dustin Belvin',
'Earl Hines','Ed Oshea','Eddie Lang','Edward Huggett','Edwin Brooke','Ehtel Zeolla','Elaine Justus','Elba Debellis',
'Elda Huls','Eldora Lett','Elia Vanvliet','Ella Fitzgerald','Elsa Maxwell','Sally Maxwell','Elton Dearborn',
'Elvis Tarwater','Elvis Wiseman','Emanuel Howell','Emery Koger','Emilio Strawser','Emily Barrows',
'Emmett Ferguson','Enola Drane','Eric Dolphy','Erma Vasquez','Erroll Garner','Ettie Schmalz','Evalyn Stroman',
'Elsie Danna','Evita Pulver','Ezekiel Diederich','Fae Wickersham','Fannie Madrid','Fats Navarro','Fats Waller',
'Fawn Mccarthy','Felipe Girard','Fernando Fernandez','Rodney Fernandez','Fidelia Klingman','Fletcher Henderson',
'Floyd Scalia','Fonda Hosey','Francina Lummus','Frank Sinatra','Freda Cowart','Freddie Hubbard','Frida Newquist',
'Gala Terrazas','Galina Searle','Gene Ammons','Geoffrey Peaxach','George Joseph','George Russell',
'Gerry Mulligan','Gertude Brautigam','Gil Evans','Ginette Hooks','Glynis Twitty','Gracia Trejo','Graciela Fett',
'Graig Payeur','Guadalupe Allen','Guadalupe Booz','Hae Voorhis','Harris Pixler','Herbie Hancock',
'Herta Kash','Hoa Sherron','Homer Mclawhorn','Horace Silver','Hung Mcquillen','Hwa Center','Ila Yohe',
'Ion Camel','Ileen Carnes','In Schacherer','Inger Collazo','Inocencia Giltner','Ira Dejong','Isiah Hoffman',
'Isaac Hoffman','Isaura Robeson','Isis Alsup','Israel Delgado','Ivory Mcginley','J. Johnson','Jacelyn Pullin',
'Jacob Laurie','Jacquelyne Detty','Jaime Craney','James Moody','Jana Magnani','Janeen Osman','Janette Seale',
'Janey Fritz','Jann Cubbage','Jean Hale','Jeannie Barnett','Jeannie Simo','Jefferey Straughter',
'Jeffrey Reed','Jelly Roll Morton','Jerrold Carlow','Jetta Viars','Jimmie Lunceford','Jimmy Smith',
'Joe Henderson','Joe Zawinul','John Aster','John Coltrane','John McLaughlin','Johnathan Matthews',
'Johnie Teran','Jolyn Diggins','Jone Metz','Joselyn Bultman','Joye Kalman','Judy Lippert','Kamala Arvidson',
'Kara Mccarthy','Karri Litchfield','Kary Belding','Karyn Luby','Kasha Hatton','Kassie Dimuzio','Kathleen James',
'Kathlyn Hulings','Keith Jarrett','Keitha Aaronson','Keitha Guida','Kerry Watt','Kid Ory','Kiley Gamino',
'Kimberley Wever','King Oliver','Kristen Goodspeed','Kristin Oreilly','Krysta Vogan','Kurtis Lasky','Lady Caulkins',
'Laine Oberle','Lance Borst','Larue Peavey','Lashawnda Shortt','Lashon Arana','Latashia Galli','Lauren Simonson',
'Laurence Andrews','Laurice Savino','Leah Fraga','Lee Konitz','Lee Morgan','Leeanne Babich','Leia Mapes',
'Lemuel Cecena','Lennie Tristano','Lenny Spiro','Lenora Xu','Leonila Glen','Leora Mattera','Leslee Silva',
'Lester Young','Lien Pharris','Lilliam Thornell','Lillian Richardson','Larry Richardson','Lilliana Respass',
'Linda Smith','Linwood Huss','Lionel Hampton','Lisa Obrien','Lisabeth Briski','Lisbeth Ackerman',
'Lizbeth Dannenberg','Loida Coloma','Lora Fortes','Loria Carver','Lorine Demas','Louanne Eggleton','Louis Armstrong',
'Louis Jordan','Lovetta Duford','Loyd Chachere','Lucas Rexford','Mabel Falconer','Mark Hubbard','Mable Hubbard',
'Madelaine Ruzicka','Magen Wass','Maia Funnell','Majorie Sage','Margret Ontiveros','Maria Cameron',
'Marianela Thrash','Marilynn Sprague','Marquetta Whicker','Mary Lou Williams','Maurita Kluender','Mauro Napier',
'Max Roach','McCoy Tyner','Mee Kole','Melia Kirklin','Mia Hersom','Miles Davis','Misti Greenfield',
'Mitzi Darr','Monica Stefanik','Monika Maisonet','Monique Rosenzweig','Mora Hoar','Morton Spano','Muhal Richard Adams',
'Myra Larabee','Naida Vanhorn','Nakisha Showalter','Nancy Wise','Nannie Sulser','Narcisa Branscum','Nat King Cole','Nenita Barkman',
'Nettie Mccullen','Nevada Harkey','Nichole Gorley','Nicola Richeson','Nat Garcia','Nicole Garcia',
'Nikki Sherron','Ninfa Dane','Noemi Marek','Nohemi Oconnell','Noreen Babin','Odis Fults','Ola Stumbaugh',
'Olin Runyan','Oliva Berndt','Oliver Nelson','Oralee Krizan','Ornette Coleman','Orpha Larger',
'Orval Regan','Oscar Peterson','Oswaldo Velazco','Ouida July','Ozell Kerschner','Pa Ganey',
'Paulita Sheehan','Pearl Kahle','Peg Loken','Percy Welch','Perry Graf','Phil Palmateer',
'Philly Joe Jones','Rahsaan Roland Kirk','Rasheeda Mackey','Raymond Dock','Rea Sommerfeld','Regenia Redman',
'Reginia Simons','Reid Slama','Reyna Criado','Richard Hodges','Riley Vath','Rita Rorie','Rolanda Schulz',
'Roma Dorey','Romana Galvan','Rona Trenholm','Ronnie Rolins','Rosalba Weddell','Rosalee Copp',
'Rosalind Thiry','Rosamond Lema','Rosaura Domer','Roselle Raybon','Rosendo Stookey','Roy Eldridge',
'Royce Bergquist','Royce Viands','Rubye Vancuren','Rufina Cronkhite','Sabine Haddad','Sabra Dantonio','Sabra Wier',
'Sadie Watts','Salome Filmore','Sam Andrews','Sandee Bronson','Sanjuanita Zang','Santiago Barris',
'Santiago Johnston','Sarah Vaughan','Sarah Gonzales','Scott Gonzales','Scottie Wolanski','Serita Cordoba',
'Shane Zingaro','Shanel Bhakta','Shantel Aye','Sharla Jacquez','Shelba Gutierrez','Sheri Milhorn',
'Sherwood Oatis','Shirley Arispe','Shoshana Astin','Sidney Bechet','Soila Belote','Solange Railey',
'Sonja Mcbride','Sonny Rollins','Stan Getz','Stan Kenton','Stefani Jacoby','Summer Pickrell','Sun Ra',
'Susan Sherman','Susan Weaver','Susann Revis','Susy Jones','Sylvie Emmer','Tajuana Yeates',
'Tambra Lewis','Tamesha Peng','Tanja Uren','Tatum Rielly','Taylor Hodges','Teddy Wilson','Tegan Shreffler',
'Teisha Hartwig','Thaddeus Kelch','Theolonious Monk','Tiana Fettig','Tiffaney Fraser','Tomika Flor',
'Tomiko Sass','Tori Feinstein','Trish Hachey','Tyesha Adkisson','Tyesha Giltner','Tyra Tijerina','Tyrell Reaves',
'Tyrone Kling','Un Gibby','Ute Romig','Vena Vossen','Verda Werley','Verlie Mullin','Vi Batz',
'Vicky Hilty','Vincenzo Saffell','Viola Hart','Wade Floyd','Wallace Fuller','Wally Barile','Wanda Goold',
'Wava Magness','Wayne Shorter','Wes Montgomery','Wiley Maben','Winona Mckinnis','Winston Marsalis',
'Woody Herman','Wynton Marsalis','Xochitl Basnett','Yahaira Plummer','Ying Schlenker','Zenobia Rojas',
'Zina Vandenburg','Zulema Mudd','Zela Zedane']

# Truncate to exactly len(names) rows. The slice starts at position 1, so the
# first row of df is skipped.
# NOTE(review): confirm that skipping row 0 is intentional.
# .copy() makes the result an independent frame, so the column assignment
# below does not write into a slice of the previous df.
df = df.iloc[1:len(names)+1,].copy()

# One name per remaining row; pandas raises ValueError here if df had fewer
# than len(names) + 1 rows before the slice.
df['name'] = names

In [53]:
# Write the assembled dataset to disk. The path is relative to the working
# directory, and to_csv's defaults apply (the row index is written as well).
ehr_output_path = "../ehr_dataset.csv"
df.to_csv(ehr_output_path)

In [54]:
# Display the column labels of the final frame.
# NOTE(review): the recorded output below does not list 'cd4' even though an
# earlier cell assigns df['cd4']. That cell's execution count (In [31]) is
# lower than the cell that rebuilds df (In [36]), which suggests it ran before
# df was rebuilt - re-run the notebook top to bottom to confirm the schema.
df.columns

Index(['person_id', 'age', 'gender', 'new_dx', 'dx_alcoholism', 'dx_anxiety',
       'dx_bipolar_disorder', 'dx_cardiovascular_disease', 'dx_ckd',
       'dx_depression', 'dx_diabetes', 'dx_drug_abuse', 'dx_hcv',
       'dx_hypertension', 'dx_schizophrenia', 'office_visits', 'er_visits',
       'inpatient_admissions', 'vls', 'hba1c', 'name'],
      dtype='object')