# Deidentify/merge data

## Setup

### Imports & config

In [79]:
import datetime
import os
import uuid

import numpy as np
from pandashape import PandaShaper
import pandas as pd

from group_maps.lab_groups import get_lab_group
from group_maps.med_groups import get_is_ibd_med, meds_dict
from group_maps.get_med_group import get_med_group

In [80]:
data_in_path = './data/'
data_in_registry_path = f'./data/registry/De-identified with linkers 05-15-2020/'
data_in_r3_path = f'./data/r3/R3_1646_BINION_DATA_2020_05_12/'
data_out_path = './out/'

In [81]:
def merge_project_patient_id(df_left, df_patients, data_source):
    left_column_name = 'STUDY_ID' if data_source == 'r3' else 'AUTO_ID'
    patients_column_name = 'R3_STUDY_ID' if data_source == 'r3' else 'REGISTRY_AUTO_ID'

    df_merged = df_left.merge(df_patients[[patients_column_name, 'PROJECT_PATIENT_ID']], left_on=left_column_name, right_on=patients_column_name, how='left')
    df_merged.drop(columns=[left_column_name, patients_column_name], inplace=True)
    return df_merged

## Deidentify/Merge

### Patients

#### Load R3 data

In [82]:
df_patients_r3 = pd.read_csv(f'{data_in_r3_path}R3_1646_BINION_DEMOGRAPHICS_2020_05_12.csv')
df_patients_r3.head()

Unnamed: 0,STUDY_ID,BIRTH_YEAR,DEATH_DATE,GENDER
0,6999994666,1958,,MALE
1,6999993266,1928,09/24/2017,FEMALE
2,6999997423,1958,,FEMALE
3,6999947820,1954,,FEMALE
4,6999947304,1988,,MALE


In [83]:
df_patients_r3['DATA_SOURCE'] = 'r3'
df_patients_r3['MARITAL STATUS'] = ''
df_patients_r3['RACE'] = ''
df_patients_r3['ETHNIC_GROUP'] = ''
df_patients_r3['EMPLOYMENT_STATUS'] = ''
df_patients_r3['PATIENT_STATUS'] = df_patients_r3['DEATH_DATE'].apply(lambda x: 'Alive' if x == '' else 'Deceased')
df_patients_r3.drop('DEATH_DATE', axis=1, inplace=True)
df_patients_r3.rename(columns={ 'STUDY_ID': 'R3_STUDY_ID'}, inplace=True)
df_patients_r3.head()

Unnamed: 0,R3_STUDY_ID,BIRTH_YEAR,GENDER,DATA_SOURCE,MARITAL STATUS,RACE,ETHNIC_GROUP,EMPLOYMENT_STATUS,PATIENT_STATUS
0,6999994666,1958,MALE,r3,,,,,Deceased
1,6999993266,1928,FEMALE,r3,,,,,Deceased
2,6999997423,1958,FEMALE,r3,,,,,Deceased
3,6999947820,1954,FEMALE,r3,,,,,Deceased
4,6999947304,1988,MALE,r3,,,,,Deceased


#### Load registry data

In [84]:
df_patients_registry = pd.read_excel(f'{data_in_registry_path}deid_patient_master_2020.xlsx')
df_patients_registry.drop([
    'Unnamed: 0',
    'FYI_FLAG_NAME',
    'ZIP',
    'DATE_FLAG_CREATED',
    'PAT_ID',
    'PAT_MRN_ID'
], axis=1, inplace=True)
df_patients_registry.rename(columns={'AUTO_ID': 'REGISTRY_AUTO_ID'}, inplace=True)
df_patients_registry['DATA_SOURCE'] = 'registry'
df_patients_registry.head()

Unnamed: 0,REGISTRY_AUTO_ID,GENDER,BIRTH_YEAR,MARITAL STATUS,PATIENT_STATUS,RACE,ETHNIC_GROUP,EMPLOYMENT_STATUS,DATA_SOURCE
0,0,M,1963,Married,Alive,White,Not Hispanic or Latino,Full Time,registry
1,1,M,1952,Married,Alive,White,Not Hispanic or Latino,Full Time,registry
2,2,M,1987,Single,Alive,White,Not Hispanic or Latino,Not Employed,registry
3,3,F,1984,Married,Alive,White,Not Hispanic or Latino,Not Employed,registry
4,4,M,1993,Single,Alive,Filipino,Not Hispanic or Latino,Not Employed,registry


#### Merge R3/registry

For now I'm ignoring the big demographic file because it has no STUDY_ID and it's hard to relate to everything else.

In [85]:
df_patients_merged = df_patients_r3.append(df_patients_registry)
print(df_patients_merged.shape)
df_patients_merged.head()

(6235, 10)


Unnamed: 0,R3_STUDY_ID,BIRTH_YEAR,GENDER,DATA_SOURCE,MARITAL STATUS,RACE,ETHNIC_GROUP,EMPLOYMENT_STATUS,PATIENT_STATUS,REGISTRY_AUTO_ID
0,6999995000.0,1958,MALE,r3,,,,,Deceased,
1,6999993000.0,1928,FEMALE,r3,,,,,Deceased,
2,6999997000.0,1958,FEMALE,r3,,,,,Deceased,
3,6999948000.0,1954,FEMALE,r3,,,,,Deceased,
4,6999947000.0,1988,MALE,r3,,,,,Deceased,


Note that a patient will have exactly one of `REGISTRY_AUTO_ID` or `R3_STUDY_ID`.

#### Standardize categoricals

In [86]:
df_patients_merged['GENDER'].unique()

array(['MALE', 'FEMALE', 'M', 'F'], dtype=object)

In [87]:
df_patients_merged['MARITAL STATUS'].unique()

array(['', 'Married', 'Single', 'Divorced', 'Widowed', 'Unknown',
       'Legally Separated', nan, 'Committed relationship',
       'Significant other'], dtype=object)

In [88]:
df_patients_merged['MARITAL STATUS'].replace([np.nan, 'Unknown'], '', inplace=True)
df_patients_merged['MARITAL STATUS'].unique()

array(['', 'Married', 'Single', 'Divorced', 'Widowed',
       'Legally Separated', 'Committed relationship', 'Significant other'],
      dtype=object)

In [89]:
df_patients_merged['RACE'].unique()

array(['', 'White', 'Filipino', nan, 'Declined', 'Black', 'Vietnamese',
       'Not Specified', 'Indian (Asian)', 'Chinese', 'Other Asian',
       'Alaska Native', 'American Indian', 'Japanese', 'Korean',
       'Other Pacific Islander'], dtype=object)

In [90]:
df_patients_merged['RACE'].replace([np.nan, 'Declined', 'Not Specified'], '', inplace=True)
df_patients_merged['RACE'].unique()

array(['', 'White', 'Filipino', 'Black', 'Vietnamese', 'Indian (Asian)',
       'Chinese', 'Other Asian', 'Alaska Native', 'American Indian',
       'Japanese', 'Korean', 'Other Pacific Islander'], dtype=object)

In [91]:
df_patients_merged['EMPLOYMENT_STATUS'].unique()

array(['', 'Full Time', 'Not Employed', 'Student - Full Time', nan,
       'Part Time', 'Retired', 'Self Employed', 'Unknown',
       'Student - Part Time'], dtype=object)

In [92]:
df_patients_merged['EMPLOYMENT_STATUS'].replace([np.nan, 'Unknown'], '', inplace=True)
df_patients_merged['EMPLOYMENT_STATUS'].unique()

array(['', 'Full Time', 'Not Employed', 'Student - Full Time',
       'Part Time', 'Retired', 'Self Employed', 'Student - Part Time'],
      dtype=object)

In [93]:
df_patients_merged['ETHNIC_GROUP'].unique()

array(['', 'Not Hispanic or Latino', nan, 'Declined', 'Not Specified',
       'Hispanic or Latino'], dtype=object)

In [94]:
df_patients_merged['ETHNIC_GROUP'].replace([np.nan, 'Declined', 'Not Specified'], '', inplace=True)
df_patients_merged['ETHNIC_GROUP'].unique()

array(['', 'Not Hispanic or Latino', 'Hispanic or Latino'], dtype=object)

In [95]:
df_patients_merged['IS_ALIVE'] = df_patients_merged['PATIENT_STATUS'].apply(lambda x: 'TRUE' if x == 'ALIVE' else 'FALSE')
df_patients_merged.drop(columns='PATIENT_STATUS', inplace=True)

#### Create our own unique ID

In [96]:
df_patients_merged['PROJECT_PATIENT_ID'] = df_patients_merged.apply(lambda x: str(uuid.uuid4()), axis=1)
df_patients_merged.head()

Unnamed: 0,R3_STUDY_ID,BIRTH_YEAR,GENDER,DATA_SOURCE,MARITAL STATUS,RACE,ETHNIC_GROUP,EMPLOYMENT_STATUS,REGISTRY_AUTO_ID,IS_ALIVE,PROJECT_PATIENT_ID
0,6999995000.0,1958,MALE,r3,,,,,,False,64a0ac31-091d-45c5-86e3-86516ee4299a
1,6999993000.0,1928,FEMALE,r3,,,,,,False,7da2e9db-ae3b-41bc-a467-fa2d9afe25ea
2,6999997000.0,1958,FEMALE,r3,,,,,,False,524c0db3-6ef0-47c6-9b99-098d0bf998cd
3,6999948000.0,1954,FEMALE,r3,,,,,,False,ed3a3f1a-0c26-402b-b922-159589036b46
4,6999947000.0,1988,MALE,r3,,,,,,False,4d3ea8f4-15b6-477f-805e-7932c6aabc74


#### Write out

In [97]:
# one version that relates the PAT_IDs and STUDY_IDs we do have to the original record
df_patients_merged.to_csv(f'{data_out_path}patients_master_deid.csv')

# one in the format needed by the preprocessing script
df_patients_merged.drop(columns=['R3_STUDY_ID', 'REGISTRY_AUTO_ID']).to_csv(f'{data_out_path}patients_merged.csv')

### Encounters

We're only interested in encounters with the following departments:

In [98]:
encounter_dept_list = ['COLON RECTAL DDC HBC', 'GAS HBC OAKLAND DDC PH', 'GASTRO DDC', 'GASTRO IBD MED HOME', 'GI SURG DDC HBC']

#### Load R3

In [99]:
df_enc_r3 = pd.read_csv(f'{data_in_r3_path}R3_1646_BINION_ENCOUNTERS_2020_05_12.csv')
df_enc_r3.head()

Unnamed: 0,STUDY_ID,VISIT_ID,START_DATE,END_DATE,ENC_TYPE,LOCATION,APPT_STATUS,ADMIT_SOURCE,HOSPITAL_SERVICE,PATIENT_TYPE,PATIENT_CLASS,CHIEF_COMPLAINT,CHIEF_COMPLAINT_ONSET_DATE
0,6999994666,452902438,09/21/2011 00:00:00,,OFFICE VISIT,PIMA GREENTREE,COMPLETED,,,,,,
1,6999994666,450398659,10/10/2011 00:00:00,,OFFICE VISIT,RAVI GI GREENTREE,COMPLETED,,,,,,
2,6999994666,452908020,09/07/2011 00:00:00,,TELEPHONE,PIMA GREENTREE,,,,,,,
3,6999994666,452509743,09/12/2011 00:00:00,,APPOINTMENT,PIMA GREENTREE,CANCELED,,,,,,
4,6999994666,451199413,09/21/2011 00:00:00,,HISTORY,PIMA GREENTREE,,,,,,,


Filter for the departments of interest.

In [100]:
df_enc_r3 = df_enc_r3[df_enc_r3['LOCATION'].isin(encounter_dept_list)]

**TODO:** New filter? 'CANCELED' seems like something we'd want to avoid in APPT_STATUS.

In [101]:
df_enc_r3['APPT_STATUS'].unique()

array([nan, 'COMPLETED', 'CANCELED', 'NO SHOW', 'UNRESOLVED'],
      dtype=object)

In [102]:
df_enc_r3.drop(columns=[
    'END_DATE', 
    'ADMIT_SOURCE', 
    'HOSPITAL_SERVICE', 
    'PATIENT_TYPE', 
    'PATIENT_CLASS', 
    'CHIEF_COMPLAINT', 
    'CHIEF_COMPLAINT_ONSET_DATE',
    'APPT_STATUS',
    'VISIT_ID'
], inplace=True)
df_enc_r3.rename(columns = { 
    'START_DATE': 'ENCOUNTER_DATE',
    'ENC_TYPE': 'ENCOUNTER_TYPE',
    'LOCATION': 'DEPT_NAME'
}, inplace=True)
df_enc_r3['ICD9_CODE'] = ''
df_enc_r3['ICD10_CODE'] = ''
df_enc_r3['PRIMARY_DX'] = ''

df_enc_r3.head()

Unnamed: 0,STUDY_ID,ENCOUNTER_DATE,ENCOUNTER_TYPE,DEPT_NAME,ICD9_CODE,ICD10_CODE,PRIMARY_DX
504,6999968167,08/01/2011 00:00:00,TELEPHONE,GI SURG DDC HBC,,,
507,6999968167,08/05/2011 00:00:00,TELEPHONE,GI SURG DDC HBC,,,
625,6999968167,04/20/2011 00:00:00,TELEPHONE,GAS HBC OAKLAND DDC PH,,,
627,6999968167,04/21/2011 00:00:00,SCAN,GAS HBC OAKLAND DDC PH,,,
628,6999968167,04/21/2011 00:00:00,SCAN,GI SURG DDC HBC,,,


In [103]:
df_enc_r3[~df_enc_r3['STUDY_ID'].isin(df_patients_merged['R3_STUDY_ID'])]

Unnamed: 0,STUDY_ID,ENCOUNTER_DATE,ENCOUNTER_TYPE,DEPT_NAME,ICD9_CODE,ICD10_CODE,PRIMARY_DX


In [104]:
df_enc_r3 = merge_project_patient_id(df_enc_r3, df_patients_merged, data_source = 'r3')
df_enc_r3.head()

Unnamed: 0,ENCOUNTER_DATE,ENCOUNTER_TYPE,DEPT_NAME,ICD9_CODE,ICD10_CODE,PRIMARY_DX,PROJECT_PATIENT_ID
0,08/01/2011 00:00:00,TELEPHONE,GI SURG DDC HBC,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9
1,08/05/2011 00:00:00,TELEPHONE,GI SURG DDC HBC,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9
2,04/20/2011 00:00:00,TELEPHONE,GAS HBC OAKLAND DDC PH,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9
3,04/21/2011 00:00:00,SCAN,GAS HBC OAKLAND DDC PH,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9
4,04/21/2011 00:00:00,SCAN,GI SURG DDC HBC,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9


#### Load registry

In [105]:
df_enc_registry = pd.read_excel(f'{data_in_registry_path}deid_IBD_Registry_BA1951_Office_Phone_Email_Encs_2020-01-05-09-43-50.xlsx')

In [106]:
# filter departments
df_enc_registry = df_enc_registry[df_enc_registry['DEPT_NAME'].isin(encounter_dept_list)]
print(df_enc_registry.shape)
df_enc_registry.head()

(137227, 12)


Unnamed: 0.1,Unnamed: 0,AUTO_ID,ENC_TYPE_C,ENC_TYPE_NAME,CONTACT_DATE,DEPT_ID,DEPT_NAME,VISIT_PROV_ID,VISIT_PROV_NAME,ICD9_CODE,ICD10_CODE,PRIMARY_DX
0,0,0,101,Office Visit,2010-10-14 00:00:00,1045102.0,GAS HBC OAKLAND DDC PH,58718,"SCHWARTZ, MARC B",556.9,K51.90,"Ulcerative colitis, unspecified"
1,1,0,101,Office Visit,2011-04-07 00:00:00,1045102.0,GAS HBC OAKLAND DDC PH,58718,"SCHWARTZ, MARC B",556.9,K51.90,"Ulcerative colitis, unspecified"
2,2,0,101,Office Visit,2011-07-07 00:00:00,1045102.0,GAS HBC OAKLAND DDC PH,58718,"SCHWARTZ, MARC B",556.9,K51.90,"Ulcerative colitis, unspecified"
3,3,0,101,Office Visit,2012-01-12 00:00:00,1045102.0,GAS HBC OAKLAND DDC PH,58718,"SCHWARTZ, MARC B",556.9,K51.90,"Ulcerative colitis, unspecified"
4,4,0,101,Office Visit,2012-07-12 00:00:00,1045102.0,GAS HBC OAKLAND DDC PH,58718,"SCHWARTZ, MARC B",556.9,K51.90,"Ulcerative colitis, unspecified"


In [107]:
df_enc_registry = df_enc_registry[['AUTO_ID', 'ENC_TYPE_NAME', 'CONTACT_DATE', 'DEPT_NAME', 'ICD9_CODE', 'ICD10_CODE', 'PRIMARY_DX']]
df_enc_registry.rename(columns={
    'ENC_TYPE_NAME': 'ENCOUNTER_TYPE',
    'CONTACT_DATE': 'ENCOUNTER_DATE'
}, inplace=True)
df_enc_registry.head()

Unnamed: 0,AUTO_ID,ENCOUNTER_TYPE,ENCOUNTER_DATE,DEPT_NAME,ICD9_CODE,ICD10_CODE,PRIMARY_DX
0,0,Office Visit,2010-10-14 00:00:00,GAS HBC OAKLAND DDC PH,556.9,K51.90,"Ulcerative colitis, unspecified"
1,0,Office Visit,2011-04-07 00:00:00,GAS HBC OAKLAND DDC PH,556.9,K51.90,"Ulcerative colitis, unspecified"
2,0,Office Visit,2011-07-07 00:00:00,GAS HBC OAKLAND DDC PH,556.9,K51.90,"Ulcerative colitis, unspecified"
3,0,Office Visit,2012-01-12 00:00:00,GAS HBC OAKLAND DDC PH,556.9,K51.90,"Ulcerative colitis, unspecified"
4,0,Office Visit,2012-07-12 00:00:00,GAS HBC OAKLAND DDC PH,556.9,K51.90,"Ulcerative colitis, unspecified"


In [108]:
df_enc_registry = merge_project_patient_id(df_enc_registry, df_patients_merged, data_source='registry')
print(df_enc_registry.shape)
df_enc_registry.head()

(137227, 7)


Unnamed: 0,ENCOUNTER_TYPE,ENCOUNTER_DATE,DEPT_NAME,ICD9_CODE,ICD10_CODE,PRIMARY_DX,PROJECT_PATIENT_ID
0,Office Visit,2010-10-14 00:00:00,GAS HBC OAKLAND DDC PH,556.9,K51.90,"Ulcerative colitis, unspecified",b8d4e6bd-e1b7-4a36-8df1-911926190d53
1,Office Visit,2011-04-07 00:00:00,GAS HBC OAKLAND DDC PH,556.9,K51.90,"Ulcerative colitis, unspecified",b8d4e6bd-e1b7-4a36-8df1-911926190d53
2,Office Visit,2011-07-07 00:00:00,GAS HBC OAKLAND DDC PH,556.9,K51.90,"Ulcerative colitis, unspecified",b8d4e6bd-e1b7-4a36-8df1-911926190d53
3,Office Visit,2012-01-12 00:00:00,GAS HBC OAKLAND DDC PH,556.9,K51.90,"Ulcerative colitis, unspecified",b8d4e6bd-e1b7-4a36-8df1-911926190d53
4,Office Visit,2012-07-12 00:00:00,GAS HBC OAKLAND DDC PH,556.9,K51.90,"Ulcerative colitis, unspecified",b8d4e6bd-e1b7-4a36-8df1-911926190d53


#### Merge data sources

In [109]:
df_enc_merged = df_enc_r3.append(df_enc_registry)
print(df_enc_merged.shape)
df_enc_merged.head()

(149741, 7)


Unnamed: 0,ENCOUNTER_DATE,ENCOUNTER_TYPE,DEPT_NAME,ICD9_CODE,ICD10_CODE,PRIMARY_DX,PROJECT_PATIENT_ID
0,08/01/2011 00:00:00,TELEPHONE,GI SURG DDC HBC,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9
1,08/05/2011 00:00:00,TELEPHONE,GI SURG DDC HBC,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9
2,04/20/2011 00:00:00,TELEPHONE,GAS HBC OAKLAND DDC PH,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9
3,04/21/2011 00:00:00,SCAN,GAS HBC OAKLAND DDC PH,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9
4,04/21/2011 00:00:00,SCAN,GI SURG DDC HBC,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9


#### Standardized categoricals

In [110]:
df_enc_merged['ENCOUNTER_TYPE'].unique()

array(['TELEPHONE', 'SCAN', 'OFFICE VISIT', 'HISTORY', 'REFILL',
       'INFORMATIONAL', 'PROCEDURE NOTE', 'APPOINTMENT',
       'TREATMENT PROTOCOL', 'LETTER', 'ERRONEOUS ENCOUNTER',
       'RELEASE OF INFORMATION', 'OP REPORT', 'PATIENT EMAIL',
       'ORDERS ONLY', 'ABSTRACT', 'LETTER (OUT)', 'LAB RESULTS', 'BPA',
       'HOSPITAL RESERVATION', 'IMAGING', 'LAB VISIT',
       'NEW PATIENT VISIT', 'IP CONSULT', 'TRANSCRIPTION', 'EKG',
       'PROCEDURE VISIT', 'TESTING VISIT', 'TRANSFER SUMMARY',
       'INPATIENT H&P', 'NUTRITION', 'SURGERY SCHEDULING', 'GI',
       'Office Visit', 'New Patient Visit', 'Patient Message',
       'Telephone', 'Procedure Visit', 'Consult'], dtype=object)

In [111]:
df_enc_merged['ENCOUNTER_TYPE'] = df_enc_merged['ENCOUNTER_TYPE'].str.upper()
df_enc_merged['ENCOUNTER_TYPE'].unique()

array(['TELEPHONE', 'SCAN', 'OFFICE VISIT', 'HISTORY', 'REFILL',
       'INFORMATIONAL', 'PROCEDURE NOTE', 'APPOINTMENT',
       'TREATMENT PROTOCOL', 'LETTER', 'ERRONEOUS ENCOUNTER',
       'RELEASE OF INFORMATION', 'OP REPORT', 'PATIENT EMAIL',
       'ORDERS ONLY', 'ABSTRACT', 'LETTER (OUT)', 'LAB RESULTS', 'BPA',
       'HOSPITAL RESERVATION', 'IMAGING', 'LAB VISIT',
       'NEW PATIENT VISIT', 'IP CONSULT', 'TRANSCRIPTION', 'EKG',
       'PROCEDURE VISIT', 'TESTING VISIT', 'TRANSFER SUMMARY',
       'INPATIENT H&P', 'NUTRITION', 'SURGERY SCHEDULING', 'GI',
       'PATIENT MESSAGE', 'CONSULT'], dtype=object)

#### Write out

In [112]:
df_enc_merged.to_csv(f'{data_out_path}encounters_merged.csv')

### Labs

#### R3

In [113]:
df_labs_r3 = pd.read_csv(f'{data_in_r3_path}R3_1646_BINION_LAB_RESULTS_2020_05_12.csv')
df_labs_r3.head()

Unnamed: 0,STUDY_ID,VISIT_ID,ORDER_PROC_ID,RESULT_DATE,COMPONENT_NAME,ORD_VALUE,ORD_NUM_VALUE,REFERENCE_UNIT,REFERENCE_LOW,REFERENCE_HIGH,RESULT_FLAG,LAB_RESULT_STATUS,SPECIMEN_COLLECTED_DATE,SPECIMEN_RECEIVED_DATE,SPECIMEN_TYPE,SPECIMEN_SOURCE
0,6999997423,351489117,203602100,03/27/2015,SPECIMEN DESCRIPTION,Stool,,,,,,FINAL,03/25/2015,03/25/2015,STOOL,
1,6999997423,351489117,203602100,03/27/2015,SPECIAL REQUESTS,,,,,,,FINAL,03/25/2015,03/25/2015,STOOL,
2,6999997423,351489117,203602100,03/27/2015,CULTURE,,,,,,,FINAL,03/25/2015,03/25/2015,STOOL,
3,6999997423,351489117,203602100,03/27/2015,REPORT,Final Result 03/27/2015,,,,,,FINAL,03/25/2015,03/25/2015,STOOL,
4,6999997423,351489117,114670251,03/23/2015,LIPASE,164,164.0,U/L,50.0,393.0,,FINAL,03/23/2015,03/23/2015,,


In [114]:
df_labs_r3 = df_labs_r3[[
    'STUDY_ID', 
    'RESULT_DATE', 
    'ORDER_PROC_ID', 
    'COMPONENT_NAME', 
    'ORD_VALUE', 
    'ORD_NUM_VALUE', 
    'REFERENCE_UNIT',
    'REFERENCE_LOW', 
    'REFERENCE_HIGH', 
    'RESULT_FLAG'
]]
df_labs_r3['GROUP'] = ''
df_labs_r3.head()

Unnamed: 0,STUDY_ID,RESULT_DATE,ORDER_PROC_ID,COMPONENT_NAME,ORD_VALUE,ORD_NUM_VALUE,REFERENCE_UNIT,REFERENCE_LOW,REFERENCE_HIGH,RESULT_FLAG,GROUP
0,6999997423,03/27/2015,203602100,SPECIMEN DESCRIPTION,Stool,,,,,,
1,6999997423,03/27/2015,203602100,SPECIAL REQUESTS,,,,,,,
2,6999997423,03/27/2015,203602100,CULTURE,,,,,,,
3,6999997423,03/27/2015,203602100,REPORT,Final Result 03/27/2015,,,,,,
4,6999997423,03/23/2015,114670251,LIPASE,164,164.0,U/L,50.0,393.0,,


In [115]:
df_labs_r3 = merge_project_patient_id(df_labs_r3, df_patients_merged, 'r3')
df_labs_r3.head()

Unnamed: 0,RESULT_DATE,ORDER_PROC_ID,COMPONENT_NAME,ORD_VALUE,ORD_NUM_VALUE,REFERENCE_UNIT,REFERENCE_LOW,REFERENCE_HIGH,RESULT_FLAG,GROUP,PROJECT_PATIENT_ID
0,03/27/2015,203602100,SPECIMEN DESCRIPTION,Stool,,,,,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd
1,03/27/2015,203602100,SPECIAL REQUESTS,,,,,,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd
2,03/27/2015,203602100,CULTURE,,,,,,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd
3,03/27/2015,203602100,REPORT,Final Result 03/27/2015,,,,,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd
4,03/23/2015,114670251,LIPASE,164,164.0,U/L,50.0,393.0,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd


#### Registry

In [116]:
df_labs_registry = pd.read_excel(f'{data_in_registry_path}labs_filtered.xlsx')
df_labs_registry.head()

Unnamed: 0.1,Unnamed: 0,AUTO_ID,ORDER_DATE,PROC_CODE,PROC_NAME,ORDER_STATUS,CPT_CODE,LAB_COMP_ID,LAB_COMP_NAME,RESULT_DATE,ORD_VALUE,ORD_NUM_VALUE,REF_LOW,REF_HIGH,REF_NORMAL_VALS,REF_UNIT,RESULT_FLAG,GROUP
0,2,0,2010-08-12 00:00:00,8502502,CBC & DIFF INC PLATELET,Completed,85025,74,ABS EOSINOPHILS,2010-08-12 00:00:00,0.0,0.0,0.00,0.4,,X10E+09/L,,eos
1,4,0,2010-08-12 00:00:00,8502502,CBC & DIFF INC PLATELET,Completed,85025,72,ABS MONOCYTES,2010-08-12 00:00:00,0.5,0.5,0.30,0.9,,X10E+09/L,,monocytes
2,7,0,2010-08-12 00:00:00,8007602,HEPATIC FUNCTION PANEL,Completed,HFPA,20,ALBUMIN,2010-08-12 00:00:00,2.9,2.9,3.4,5.0,,g/dL,Low,albumin
3,20,0,2010-08-12 00:00:00,8614001,CRP QUANTITATION,Completed,86140,530,CRP QUANTITATION,2010-08-12 00:00:00,10.085,10.085,<0.748,,,mg/dL,High,crp
4,23,0,2010-08-12 00:00:00,8502502,CBC & DIFF INC PLATELET,Completed,85025,73,EOSINOPHILS,2010-08-12 00:00:00,0.0,0.0,0,6.0,,%,,eos


In [117]:
df_labs_registry = df_labs_registry[[
    'AUTO_ID', 
    'RESULT_DATE', 
    'LAB_COMP_ID', 
    'LAB_COMP_NAME', 
    'ORD_VALUE',
    'ORD_NUM_VALUE', 
    'REF_UNIT', 
    'REF_LOW', 
    'REF_HIGH', 
    'RESULT_FLAG', 
    'GROUP'
]]

df_labs_registry.rename(columns={
    'LAB_COMP_ID': 'ORDER_PROC_ID',
    'LAB_COMP_NAME': 'COMPONENT_NAME',
    'REF_UNIT': 'REFERENCE_UNIT',
    'REF_LOW': 'REFERENCE_LOW',
    'REF_HIGH': 'REFERENCE_HIGH',
}, inplace=True)

df_labs_registry = merge_project_patient_id(df_labs_registry, df_patients_merged, 'registry')
df_labs_registry.head()

Unnamed: 0,RESULT_DATE,ORDER_PROC_ID,COMPONENT_NAME,ORD_VALUE,ORD_NUM_VALUE,REFERENCE_UNIT,REFERENCE_LOW,REFERENCE_HIGH,RESULT_FLAG,GROUP,PROJECT_PATIENT_ID
0,2010-08-12 00:00:00,74,ABS EOSINOPHILS,0.0,0.0,X10E+09/L,0.00,0.4,,eos,b8d4e6bd-e1b7-4a36-8df1-911926190d53
1,2010-08-12 00:00:00,72,ABS MONOCYTES,0.5,0.5,X10E+09/L,0.30,0.9,,monocytes,b8d4e6bd-e1b7-4a36-8df1-911926190d53
2,2010-08-12 00:00:00,20,ALBUMIN,2.9,2.9,g/dL,3.4,5.0,Low,albumin,b8d4e6bd-e1b7-4a36-8df1-911926190d53
3,2010-08-12 00:00:00,530,CRP QUANTITATION,10.085,10.085,mg/dL,<0.748,,High,crp,b8d4e6bd-e1b7-4a36-8df1-911926190d53
4,2010-08-12 00:00:00,73,EOSINOPHILS,0.0,0.0,%,0,6.0,,eos,b8d4e6bd-e1b7-4a36-8df1-911926190d53


#### Merge data sources

In [118]:
df_labs_merged = df_labs_r3.append(df_labs_registry)
df_labs_merged.shape

(1486967, 11)

#### Cleanup

Set the `GROUP` column for all entries.

In [119]:
df_labs_merged['GROUP'] = df_labs_merged['GROUP'].replace(np.nan, '')
df_labs_merged['GROUP'].unique()

array(['', 'eos', 'monocytes', 'albumin', 'crp', 'hemoglobin', 'esr',
       'vitamin_d'], dtype=object)

In [120]:
df_labs_merged['GROUP'] = df_labs_merged.apply(lambda x: x['GROUP'] if x['GROUP'] != '' else get_lab_group(x['COMPONENT_NAME']), axis='columns')
print('total labs', df_labs_merged.shape[0])
print('labs without a group', df_labs_merged[df_labs_merged['GROUP'] == ''].shape[0])

total labs 1486967
labs without a group 945527


**TODO:** Confirm these result flag cleanup decisions.

In [121]:
# standardize casing
df_labs_merged['RESULT_FLAG'] = df_labs_merged['RESULT_FLAG'].str.upper()
# set default blank
df_labs_merged['RESULT_FLAG'] = df_labs_merged['RESULT_FLAG'].replace(['(NONE)', np.nan], '')
# ignore "panic" for now - everything is either high, low, abnormal, or nothing
df_labs_merged['RESULT_FLAG'] = df_labs_merged['RESULT_FLAG'].replace('PANIC', 'ABNORMAL')
df_labs_merged['RESULT_FLAG'] = df_labs_merged['RESULT_FLAG'].str.replace(' PANIC', '')
df_labs_merged['RESULT_FLAG'].unique()

array(['', 'HIGH', 'LOW', 'ABNORMAL'], dtype=object)

#### Write out

In [122]:
df_labs_merged.to_csv(f'{data_out_path}labs_merged.csv')

### Meds

#### R3

In [123]:
df_meds_r3 = pd.read_csv(f'{data_in_r3_path}R3_1646_BINION_MED_ORDERS_2020_05_12.csv')
df_meds_r3 = df_meds_r3[['STUDY_ID', 'MED_ORDER_ID', 'MEDICATION_NAME', 'SIMPLE_GENERIC', 'ORDER_DATE', 'START_DATE', 'END_DATE', 'PHARM_CLASS']]
df_meds_r3.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,STUDY_ID,MED_ORDER_ID,MEDICATION_NAME,SIMPLE_GENERIC,ORDER_DATE,START_DATE,END_DATE,PHARM_CLASS
0,6999994666,367023442,LISINOPRIL 10 MG TABLET,LISINOPRIL,08/13/2018,08/13/2018,,"ANTIHYPERTENSIVES, ACE INHIBITORS"
1,6999994666,367023441,LISINOPRIL 5 MG TABLET,LISINOPRIL,08/13/2018,08/13/2018,,"ANTIHYPERTENSIVES, ACE INHIBITORS"
2,6999994666,367023443,LEVOTHYROXINE 100 MCG TABLET,LEVOTHYROXINE SODIUM,08/13/2018,08/13/2018,,THYROID HORMONES
3,6999994666,367023438,VARICELLA-ZOSTER GLYCOE VACC-AS01B ADJ(PF) 50 ...,VARICELLA-ZOSTER GE/AS01B/PF,08/13/2018,08/13/2018,08/13/2018,VIRAL/TUMORIGENIC VACCINES
4,6999994666,146936464,SIMVASTATIN 20 MG TABLET,SIMVASTATIN,01/16/2014,01/16/2014,04/25/2014,ANTIHYPERLIPIDEMIC - HMG COA REDUCTASE INHIBITORS


In [124]:
df_meds_r3.rename(columns={ 
    'MED_ORDER_ID': 'ORDER_ID',
    'MEDICATION_NAME': 'MED_NAME',
    'SIMPLE_GENERIC': 'SIMPLE_GENERIC_NAME',
    'ORDER_DATE': 'ORDERING_DATE'
}, inplace=True)
df_meds_r3['GROUP'] = ''
df_meds_r3 = merge_project_patient_id(df_meds_r3, df_patients_merged, 'r3')
df_meds_r3.head()

Unnamed: 0,ORDER_ID,MED_NAME,SIMPLE_GENERIC_NAME,ORDERING_DATE,START_DATE,END_DATE,PHARM_CLASS,GROUP,PROJECT_PATIENT_ID
0,367023442,LISINOPRIL 10 MG TABLET,LISINOPRIL,08/13/2018,08/13/2018,,"ANTIHYPERTENSIVES, ACE INHIBITORS",,64a0ac31-091d-45c5-86e3-86516ee4299a
1,367023441,LISINOPRIL 5 MG TABLET,LISINOPRIL,08/13/2018,08/13/2018,,"ANTIHYPERTENSIVES, ACE INHIBITORS",,64a0ac31-091d-45c5-86e3-86516ee4299a
2,367023443,LEVOTHYROXINE 100 MCG TABLET,LEVOTHYROXINE SODIUM,08/13/2018,08/13/2018,,THYROID HORMONES,,64a0ac31-091d-45c5-86e3-86516ee4299a
3,367023438,VARICELLA-ZOSTER GLYCOE VACC-AS01B ADJ(PF) 50 ...,VARICELLA-ZOSTER GE/AS01B/PF,08/13/2018,08/13/2018,08/13/2018,VIRAL/TUMORIGENIC VACCINES,,64a0ac31-091d-45c5-86e3-86516ee4299a
4,146936464,SIMVASTATIN 20 MG TABLET,SIMVASTATIN,01/16/2014,01/16/2014,04/25/2014,ANTIHYPERLIPIDEMIC - HMG COA REDUCTASE INHIBITORS,,64a0ac31-091d-45c5-86e3-86516ee4299a


#### Registry

In [125]:
df_meds_registry = pd.read_excel(f'{data_in_registry_path}filtered_meds.xlsx')
df_meds_registry = df_meds_registry[['AUTO_ID', 'ORDER_ID', 'MED_NAME', 'SIMPLE_GENERIC_NAME', 'ORDERING_DATE', 'START_DATE', 'END_DATE', 'PHARM_CLASS', 'GROUP']]
df_meds_registry = merge_project_patient_id(df_meds_registry, df_patients_merged, 'registry')
df_meds_registry.head()

Unnamed: 0,ORDER_ID,MED_NAME,SIMPLE_GENERIC_NAME,ORDERING_DATE,START_DATE,END_DATE,PHARM_CLASS,GROUP,PROJECT_PATIENT_ID
0,41818775,AZATHIOPRINE 50 MG TABLET,AZATHIOPRINE,2010-08-12 00:00:00,,2010-08-20 00:00:00,IMMUNOSUPPRESSIVES,Immunomodulators,b8d4e6bd-e1b7-4a36-8df1-911926190d53
1,41818777,"MESALAMINE 400 MG TABLET,DELAYED RELEASE",MESALAMINE,2010-08-12 00:00:00,,2011-04-07 00:00:00,"DRUG TX-CHRONIC INFLAM. COLON DX,5-AMINOSALICYLAT",5 ASA,b8d4e6bd-e1b7-4a36-8df1-911926190d53
2,41818779,MESALAMINE RECT,MESALAMINE,2010-08-12 00:00:00,,2011-04-07 00:00:00,"CHRONIC INFLAM. COLON DX, 5-A-SALICYLAT,RECTAL TX",5 ASA,b8d4e6bd-e1b7-4a36-8df1-911926190d53
3,54197256,PREDNISONE 20 MG TABLET,PREDNISONE,2010-08-12 00:00:00,2010-08-12 00:00:00,2011-04-07 00:00:00,GLUCOCORTICOIDS,Systemic steroids,b8d4e6bd-e1b7-4a36-8df1-911926190d53
4,54424224,AZATHIOPRINE 50 MG TABLET,AZATHIOPRINE,2010-08-20 00:00:00,2010-08-20 00:00:00,2011-01-07 00:00:00,IMMUNOSUPPRESSIVES,Immunomodulators,b8d4e6bd-e1b7-4a36-8df1-911926190d53


#### Merge data sources

In [126]:
df_meds_merged = df_meds_r3.append(df_meds_registry)
df_meds_merged.head()

Unnamed: 0,ORDER_ID,MED_NAME,SIMPLE_GENERIC_NAME,ORDERING_DATE,START_DATE,END_DATE,PHARM_CLASS,GROUP,PROJECT_PATIENT_ID
0,367023442,LISINOPRIL 10 MG TABLET,LISINOPRIL,08/13/2018,08/13/2018,,"ANTIHYPERTENSIVES, ACE INHIBITORS",,64a0ac31-091d-45c5-86e3-86516ee4299a
1,367023441,LISINOPRIL 5 MG TABLET,LISINOPRIL,08/13/2018,08/13/2018,,"ANTIHYPERTENSIVES, ACE INHIBITORS",,64a0ac31-091d-45c5-86e3-86516ee4299a
2,367023443,LEVOTHYROXINE 100 MCG TABLET,LEVOTHYROXINE SODIUM,08/13/2018,08/13/2018,,THYROID HORMONES,,64a0ac31-091d-45c5-86e3-86516ee4299a
3,367023438,VARICELLA-ZOSTER GLYCOE VACC-AS01B ADJ(PF) 50 ...,VARICELLA-ZOSTER GE/AS01B/PF,08/13/2018,08/13/2018,08/13/2018,VIRAL/TUMORIGENIC VACCINES,,64a0ac31-091d-45c5-86e3-86516ee4299a
4,146936464,SIMVASTATIN 20 MG TABLET,SIMVASTATIN,01/16/2014,01/16/2014,04/25/2014,ANTIHYPERLIPIDEMIC - HMG COA REDUCTASE INHIBITORS,,64a0ac31-091d-45c5-86e3-86516ee4299a


#### Cleanup

In [127]:
# filter non-ibd meds
df_meds_merged = df_meds_merged[df_meds_merged.apply(lambda x: get_is_ibd_med(x['SIMPLE_GENERIC_NAME']), axis='columns')]
df_meds_merged.shape

(88402, 9)

In [128]:
# set group name
df_meds_merged['GROUP'] = df_meds_merged.apply(lambda x: x['GROUP'] if x['GROUP'] != '' else get_med_group(x['SIMPLE_GENERIC_NAME'], x['MED_NAME']), axis='columns')
df_meds_merged_nogroup = df_meds_merged[df_meds_merged['GROUP'] == '']

print('Total meds:', len(df_meds_merged))
print('Meds (no group):', len(df_meds_merged_nogroup))
df_meds_merged = df_meds_merged[df_meds_merged['GROUP'] != '']

Total meds: 88402
Meds (no group): 13492


**TODO:** Do we need to deal with these generics/med names with no group? Many of these have generic names that are related to groups in the meds dict. Basically, confirm Dmitriy's logic for this.

In [129]:
df_meds_merged_nogroup['SIMPLE_GENERIC_NAME'].unique()

array(['HYDROCORTISONE', 'HYDROCORTISONE SOD SUCCINATE',
       'CHOLECALCIFEROL (VITAMIN D3)', 'PREDNISONE', 'MERCAPTOPURINE',
       'MESALAMINE', 'PREDNISOLONE', 'ADALIMUMAB',
       'DEXAMETHASONE SOD PHOSPHATE', 'CALCIUM CARBONATE/VITAMIN D3',
       'ERGOCALCIFEROL (VITAMIN D2)', 'AZATHIOPRINE',
       'CERTOLIZUMAB PEGOL', 'DEXAMETHASONE', 'METHYLPREDNISOLONE',
       'METHOTREXATE SODIUM/PF', 'METHOTREXATE SODIUM', 'METHOTREXATE/PF',
       'SULFASALAZINE', 'METHYLPREDNISOLONE SOD SUCC',
       'PREDNISOLONE SOD PHOSPHATE', 'METHOTREXATE', 'ETANERCEPT',
       'GOLIMUMAB', 'USTEKINUMAB', 'LEFLUNOMIDE',
       'METHYLPREDNISOLONE SOD SUCC/PF', 'METHYLPREDNISOLONE ACETATE',
       'MESALAMINE W/CLEANSING WIPES', 'HYDROCORTISONE SOD PHOSPHATE'],
      dtype=object)

#### Write out

In [130]:
df_meds_merged.to_csv(f'{data_out_path}meds_merged.csv')

### Diagnoses

#### R3

##### Problem list

In [131]:
df_problems_r3 = pd.read_csv(f'{data_in_r3_path}R3_1646_BINION_PROB_LIST_2020_05_12.csv')
df_problems_r3.head()

Unnamed: 0,STUDY_ID,DX_CODE_TYPE,DX_CODE,DX_NAME,REPORTED_DATE,ONSET_DATE,RESOLVED_DATE,RESOLVED_REASON,PROBLEM_STATUS
0,6999994666,ICD9,535.50,Unspecified gastritis and gastroduodenitis wit...,11/15/2011,11/15/2011,,,ACTIVE
1,6999994666,ICD9,578.1,Blood in stool,10/10/2011,10/10/2011,,,ACTIVE
2,6999994666,ICD9,555.2,Regional enteritis of small intestine with lar...,04/04/2012,04/04/2012,,,ACTIVE
3,6999994666,ICD10,H26.9,Cataract,06/20/2016,06/20/2016,,,ACTIVE
4,6999994666,ICD10,H04.129,Dry eye,06/20/2016,06/20/2016,,,ACTIVE


In [132]:
df_problems_r3 = pd.read_csv(f'{data_in_r3_path}R3_1646_BINION_PROB_LIST_2020_05_12.csv')
df_problems_r3 = df_problems_r3[['STUDY_ID', 'DX_CODE_TYPE', 'DX_CODE', 'DX_NAME','ONSET_DATE']]
df_problems_r3.rename(columns={ 'ONSET_DATE': 'DX_DATE' }, inplace=True)
df_problems_r3 = merge_project_patient_id(df_problems_r3, df_patients_merged, 'r3')
df_problems_r3.head()

Unnamed: 0,DX_CODE_TYPE,DX_CODE,DX_NAME,DX_DATE,PROJECT_PATIENT_ID
0,ICD9,535.50,Unspecified gastritis and gastroduodenitis wit...,11/15/2011,64a0ac31-091d-45c5-86e3-86516ee4299a
1,ICD9,578.1,Blood in stool,10/10/2011,64a0ac31-091d-45c5-86e3-86516ee4299a
2,ICD9,555.2,Regional enteritis of small intestine with lar...,04/04/2012,64a0ac31-091d-45c5-86e3-86516ee4299a
3,ICD10,H26.9,Cataract,06/20/2016,64a0ac31-091d-45c5-86e3-86516ee4299a
4,ICD10,H04.129,Dry eye,06/20/2016,64a0ac31-091d-45c5-86e3-86516ee4299a


##### Dx

In [133]:
df_dx_r3 = pd.read_csv(f'{data_in_r3_path}R3_1646_BINION_DIAGNOSES_2020_05_12.csv')
df_dx_r3.head()

Unnamed: 0,STUDY_ID,VISIT_ID,DIAGNOSIS_TYPE,DX_CODE,DIAGNOSIS_NAME,PRIMARY_DX_IND,DX_FROM_DATE,DX_TO_DATE
0,6999994666,463260127,ICD9CM,790.29,Other abnormal glucose,Y,11/11/2014,
1,6999994666,476176269,ICD9CM,543.9,Other and unspecified diseases of appendix,N,11/05/2014,
2,6999994666,476176269,ICD9CM,455.6,Unspecified hemorrhoids without mention of com...,N,11/05/2014,
3,6999994666,463591352,ICD9CM,V81.1,Screening for hypertension,N,11/07/2014,
4,6999994666,463591352,ICD10CM,V81.1,Occupant of railway train or railway vehicle i...,N,11/07/2014,


In [134]:
df_dx_r3 = df_dx_r3[['STUDY_ID', 'DIAGNOSIS_TYPE', 'DX_CODE', 'DIAGNOSIS_NAME','DX_FROM_DATE']]
df_dx_r3.rename(columns={
    'DIAGNOSIS_TYPE': 'DX_CODE_TYPE',
    'DIAGNOSIS_NAME': 'DX_NAME',
    'DX_FROM_DATE': 'DX_DATE'
}, inplace=True)
df_dx_r3 = merge_project_patient_id(df_dx_r3, df_patients_merged, 'r3')
df_dx_r3.head()

Unnamed: 0,DX_CODE_TYPE,DX_CODE,DX_NAME,DX_DATE,PROJECT_PATIENT_ID
0,ICD9CM,790.29,Other abnormal glucose,11/11/2014,64a0ac31-091d-45c5-86e3-86516ee4299a
1,ICD9CM,543.9,Other and unspecified diseases of appendix,11/05/2014,64a0ac31-091d-45c5-86e3-86516ee4299a
2,ICD9CM,455.6,Unspecified hemorrhoids without mention of com...,11/05/2014,64a0ac31-091d-45c5-86e3-86516ee4299a
3,ICD9CM,V81.1,Screening for hypertension,11/07/2014,64a0ac31-091d-45c5-86e3-86516ee4299a
4,ICD10CM,V81.1,Occupant of railway train or railway vehicle i...,11/07/2014,64a0ac31-091d-45c5-86e3-86516ee4299a


#### Registry

**TODO:** registry has `PROBLEM_STATUS` field which, combined with `UPDATE_DATE`, might give us higher fidelity on some of these.

In [135]:
df_dx_registry = pd.read_excel(f'{data_in_registry_path}deid_IBD_Registry_BA1951_Problem_List_2020-01-05-09-30-19.xls')
df_dx_registry.head()

Unnamed: 0.1,Unnamed: 0,AUTO_ID,DATE_OF_ENTRY,NOTED_DATE,RESOLVED_DATE,UPDATE_DATE,PROBLEM_DESCRIPTION,ICD9_CODE,ICD10_CODE,DIAGNOSIS_NAME,CLASS_OF_PROBLEM,PROBLEM_TYPE,PROBLEM_STATUS,PRIORITY
0,0,0,2018-11-14 00:00:00,2009-08-27 00:00:00,,2018-11-14 14:06:00,,715.26,M17.31,Post-traumatic osteoarthritis of right knee,,,ACTIVE,
1,1,0,2018-11-14 00:00:00,2009-08-27 00:00:00,,2018-11-14 14:06:00,,719.46,M25.569,"Pain in joint, lower leg",,,DELETED,
2,2,0,2014-04-03 00:00:00,2010-04-07 00:00:00,,2015-07-28 12:14:00,,836.1,S83.289A,"Tear of lateral cartilage or meniscus of knee,...",,,ACTIVE,
3,3,0,2014-04-03 00:00:00,2010-04-07 00:00:00,,2015-07-28 12:14:00,,727.09,M65.80,Other synovitis and tenosynovitis,,,ACTIVE,
4,4,0,2014-04-03 00:00:00,2010-11-29 00:00:00,,2015-07-28 12:14:00,,727.05,"M65.849, M65.839",Other tenosynovitis of hand and wrist,,,ACTIVE,


In [136]:
df_dx_registry = df_dx_registry[['AUTO_ID', 'ICD10_CODE', 'DIAGNOSIS_NAME', 'DATE_OF_ENTRY']]
df_dx_registry.rename(columns={
    'DIAGNOSIS_NAME': 'DX_NAME',
    'DATE_OF_ENTRY': 'DX_DATE',
    'ICD10_CODE': 'DX_CODE'
}, inplace=True)
df_dx_registry['DX_CODE_TYPE'] = 'ICD10'
df_dx_registry = merge_project_patient_id(df_dx_registry, df_patients_merged, 'registry')
df_dx_registry.head()

Unnamed: 0,DX_CODE,DX_NAME,DX_DATE,DX_CODE_TYPE,PROJECT_PATIENT_ID
0,M17.31,Post-traumatic osteoarthritis of right knee,2018-11-14 00:00:00,ICD10,b8d4e6bd-e1b7-4a36-8df1-911926190d53
1,M25.569,"Pain in joint, lower leg",2018-11-14 00:00:00,ICD10,b8d4e6bd-e1b7-4a36-8df1-911926190d53
2,S83.289A,"Tear of lateral cartilage or meniscus of knee,...",2014-04-03 00:00:00,ICD10,b8d4e6bd-e1b7-4a36-8df1-911926190d53
3,M65.80,Other synovitis and tenosynovitis,2014-04-03 00:00:00,ICD10,b8d4e6bd-e1b7-4a36-8df1-911926190d53
4,"M65.849, M65.839",Other tenosynovitis of hand and wrist,2014-04-03 00:00:00,ICD10,b8d4e6bd-e1b7-4a36-8df1-911926190d53


#### Merge data sources

In [137]:
df_problems_merged = pd.concat([df_problems_r3, df_dx_r3, df_dx_registry])
df_problems_merged.head()

Unnamed: 0,DX_CODE_TYPE,DX_CODE,DX_NAME,DX_DATE,PROJECT_PATIENT_ID
0,ICD9,535.50,Unspecified gastritis and gastroduodenitis wit...,11/15/2011,64a0ac31-091d-45c5-86e3-86516ee4299a
1,ICD9,578.1,Blood in stool,10/10/2011,64a0ac31-091d-45c5-86e3-86516ee4299a
2,ICD9,555.2,Regional enteritis of small intestine with lar...,04/04/2012,64a0ac31-091d-45c5-86e3-86516ee4299a
3,ICD10,H26.9,Cataract,06/20/2016,64a0ac31-091d-45c5-86e3-86516ee4299a
4,ICD10,H04.129,Dry eye,06/20/2016,64a0ac31-091d-45c5-86e3-86516ee4299a


In [138]:
# standardize code types
df_problems_merged['DX_CODE_TYPE'].unique()

array(['ICD9', 'ICD10', 'ICD9CM', 'ICD10CM', nan], dtype=object)

In [139]:
df_problems_merged['DX_CODE_TYPE'] = df_problems_merged['DX_CODE_TYPE'].str.replace('CM', '')
df_problems_merged['DX_CODE_TYPE'].unique()

array(['ICD9', 'ICD10', nan], dtype=object)

In [140]:
df_problems_merged[df_problems_merged['DX_CODE_TYPE'].isna()].shape

(24, 5)

**TODO**: 24 without a dx code type. remove these?

In [141]:
df_problems_merged = df_problems_merged[df_problems_merged['DX_CODE_TYPE'].notna()]
len(df_problems_merged)

506038

#### Write out

In [142]:
df_problems_merged.to_csv(f'{data_out_path}problem_list_merged.csv')

### Procedures

#### R3

In [143]:
df_procedures_r3 = pd.read_csv(f'{data_in_r3_path}R3_1646_BINION_PROCS_2020_05_12.csv')
df_procedures_r3.head()

Unnamed: 0,STUDY_ID,VISIT_ID,ORDER_PROC_ID,PROC_DATE,PROC_CODE,PROC_TYPE,PROC_NAME,PROC_LOCATION,ORDER_DATE
0,6999994666,452902400.0,74962671.0,,71260,CUSTOM,CT CHEST WITH CONTRAST,,09/21/2011
1,6999994666,452902400.0,74962676.0,,74160,CPT(R),CT ABDOMEN WITH CONTRAST,,10/01/2011
2,6999994666,450398700.0,74962677.0,,43235,CPT(R),"UPPER GI ENDOSCOPY,DIAGNOSIS",,10/10/2011
3,6999994666,5210964000000.0,,02/06/2018,29823,CPT/HCPCS,"Arthroscopy, shoulder, surgical; debridement, ...",OR,
4,6999994666,5210964000000.0,,02/06/2018,29827,CPT/HCPCS,"Arthroscopy, shoulder, surgical; with rotator ...",OR,


In [144]:
df_procedures_r3 = df_procedures_r3[['STUDY_ID', 'PROC_CODE',  'PROC_NAME', 'ORDER_DATE', 'PROC_DATE']]
df_procedures_r3 = merge_project_patient_id(df_procedures_r3, df_patients_merged, 'r3')
df_procedures_r3.head()

Unnamed: 0,PROC_CODE,PROC_NAME,ORDER_DATE,PROC_DATE,PROJECT_PATIENT_ID
0,71260,CT CHEST WITH CONTRAST,09/21/2011,,64a0ac31-091d-45c5-86e3-86516ee4299a
1,74160,CT ABDOMEN WITH CONTRAST,10/01/2011,,64a0ac31-091d-45c5-86e3-86516ee4299a
2,43235,"UPPER GI ENDOSCOPY,DIAGNOSIS",10/10/2011,,64a0ac31-091d-45c5-86e3-86516ee4299a
3,29823,"Arthroscopy, shoulder, surgical; debridement, ...",,02/06/2018,64a0ac31-091d-45c5-86e3-86516ee4299a
4,29827,"Arthroscopy, shoulder, surgical; with rotator ...",,02/06/2018,64a0ac31-091d-45c5-86e3-86516ee4299a


#### Registry

In [145]:
df_procedures_registry = pd.read_excel(f'{data_in_registry_path}deid_IBD_Registry_BA1951_Rad_Diagnostic_Tests_2020-01-05-12-18-06.xlsx')
print(df_procedures_registry.shape)
df_procedures_registry.head()

(342232, 20)


Unnamed: 0.1,Unnamed: 0,AUTO_ID,VISIT DEPT,VISIT_DATE,ORDERING_DATE,CPT_CODE,DESCRIPTION,QUANTITY,PROC_START_DATE,PAT_STATUS,PROC_CAT_NAME,ORDER_TYPE,ORDER_CLASS,ORDER_STATUS,UPDATE_DATE,AUTH_PROV_NAME,RESULT_DATE,REVIEW_DATE,CENTER,STUDY_RESULT
0,0,2,,2009-07-23 00:00:00,2009-07-23 00:00:00,7699905,GI ENDOSCOPIC PROCEDURE,1,,Alive,PROCEDURES,GI PROCEDURES,,Completed,2015-11-17 15:34:00,"BELLICINI, NICHOLAS A",2009-09-03 15:27:00,,,
1,1,2,,2010-03-17 00:00:00,2010-03-17 00:00:00,7699905,GI ENDOSCOPIC PROCEDURE,1,,Alive,PROCEDURES,GI PROCEDURES,,Completed,2015-11-25 11:25:00,"GLORIOSO, DAVID V",2010-03-17 09:52:00,,,
2,2,95,,2009-10-05 00:00:00,2009-10-05 00:00:00,7699905,GI ENDOSCOPIC PROCEDURE,1,,Alive,PROCEDURES,GI PROCEDURES,,Completed,2015-11-19 15:29:00,"LIMAURO, DAVID L",2009-10-05 14:54:00,,,
3,3,108,,2010-02-02 00:00:00,2010-02-02 00:00:00,93005,"ELECTROCARDIOGRAM, TRACING",1,,Alive,MEDICINE,EKG,,Completed,2015-11-24 09:19:00,"HANDELSMAN, GORDON L",2010-02-02 07:45:00,,,
4,4,123,,2009-04-28 00:00:00,2009-04-28 00:00:00,93005,"ELECTROCARDIOGRAM, TRACING",1,,Alive,MEDICINE,EKG,,Completed,2015-11-13 12:54:00,"JACOBS, RICHARD P",2009-04-28 22:55:00,,,


In [146]:
df_procedures_registry = df_procedures_registry[['AUTO_ID', 'CPT_CODE', 'DESCRIPTION', 'VISIT_DATE', 'PROC_START_DATE']]
df_procedures_registry.rename(columns={
    'CPT_CODE': 'PROC_CODE',
    'DESCRIPTION': 'PROC_NAME',
    'VISIT_DATE': 'ORDER_DATE',
    'PROC_START_DATE': 'PROC_DATE'
}, inplace=True)
df_procedures_registry = merge_project_patient_id(df_procedures_registry, df_patients_merged, 'registry')
df_procedures_registry.head()

Unnamed: 0,PROC_CODE,PROC_NAME,ORDER_DATE,PROC_DATE,PROJECT_PATIENT_ID
0,7699905,GI ENDOSCOPIC PROCEDURE,2009-07-23 00:00:00,,865adc34-4b87-4747-b192-dc7301c6f362
1,7699905,GI ENDOSCOPIC PROCEDURE,2010-03-17 00:00:00,,865adc34-4b87-4747-b192-dc7301c6f362
2,7699905,GI ENDOSCOPIC PROCEDURE,2009-10-05 00:00:00,,9006db1e-95f2-4d04-847d-2257d0c4a214
3,93005,"ELECTROCARDIOGRAM, TRACING",2010-02-02 00:00:00,,05bd182a-6b98-4c29-9a48-d3e6335344c6
4,93005,"ELECTROCARDIOGRAM, TRACING",2009-04-28 00:00:00,,48949aa3-ef57-406d-b545-1ca8b6796285


#### Merge data sources

In [147]:
df_procedures_merged = df_procedures_r3.append(df_procedures_registry)
print(df_procedures_merged.shape)
df_procedures_merged.head()

(503497, 5)


Unnamed: 0,PROC_CODE,PROC_NAME,ORDER_DATE,PROC_DATE,PROJECT_PATIENT_ID
0,71260,CT CHEST WITH CONTRAST,09/21/2011,,64a0ac31-091d-45c5-86e3-86516ee4299a
1,74160,CT ABDOMEN WITH CONTRAST,10/01/2011,,64a0ac31-091d-45c5-86e3-86516ee4299a
2,43235,"UPPER GI ENDOSCOPY,DIAGNOSIS",10/10/2011,,64a0ac31-091d-45c5-86e3-86516ee4299a
3,29823,"Arthroscopy, shoulder, surgical; debridement, ...",,02/06/2018,64a0ac31-091d-45c5-86e3-86516ee4299a
4,29827,"Arthroscopy, shoulder, surgical; with rotator ...",,02/06/2018,64a0ac31-091d-45c5-86e3-86516ee4299a


In [148]:
df_procedures_merged[df_procedures_merged['PROJECT_PATIENT_ID'].isna()].shape

(0, 5)

#### Cleanup

In [149]:
df_procedures_merged['PROC_NAME'] = df_procedures_merged['PROC_NAME'].str.upper()

**TODO: CONFIRM** Screening for only a few kinds of procedures (as long as `PROC_NAME` contains this string, it counts):

In [150]:
def proc_list_filter(x):
    for item in ['MR CHOLANG', 'CT ABD', 'CT PELVIS', 'CT ABDOMEN']:
        if item in str(x['PROC_NAME']):
            return True
        
    return False

df_procedures_merged = df_procedures_merged[df_procedures_merged.apply(proc_list_filter, axis='columns')]
print(df_procedures_merged.shape)

(21561, 5)


**TODO:** 
- Do the (OO) suffix or the NM prefix make distinct cases?
- Is ABDOMEN/PELVIS different from ABDOMEN AND PELVIS?
- IN NUCLEAR MED important?

In [151]:
sorted(df_procedures_merged['PROC_NAME'].unique())

['CT ABD AND PELVIS WITH CONTRAST PANEL',
 'CT ABD AND PELVIS WITHOUT CONTRAST PANEL',
 'CT ABD WITH CT LOWER EXTREMITIES',
 'CT ABD/PELVIS W/O CONTRAST NC',
 'CT ABDOMEN AND PELVIS WITH AND WITHOUT CONTRAST',
 'CT ABDOMEN AND PELVIS WITH CONTRAST',
 'CT ABDOMEN AND PELVIS WITHOUT CONTRAST',
 'CT ABDOMEN W/CONTRST',
 'CT ABDOMEN WITH  & WITHOUT  CONTRAST',
 'CT ABDOMEN WITH AND WITHOUT CONTRAST',
 'CT ABDOMEN WITH AND WITHOUT CONTRAST (OO)',
 'CT ABDOMEN WITH CONTRAST',
 'CT ABDOMEN WITHOUT CONTRAST',
 'CT ABDOMEN WITHOUT CONTRAST (OO)',
 'CT ABDOMEN/PELVIS WITH CONTRAST DONE IN NUCLEAR MED',
 'CT ABDOMEN/PELVIS WITHOUT CONTRAST DONE IN NUC MED',
 'CT PELVIS WITH  & WITHOUT  CONTRAST',
 'CT PELVIS WITH AND WITHOUT CONTRAST',
 'CT PELVIS WITH AND WITHOUT CONTRAST (OO)',
 'CT PELVIS WITH CONTRAST',
 'CT PELVIS WITH CONTRAST (OO)',
 'CT PELVIS WITHOUT  CONTRAST',
 'CT PELVIS WITHOUT CONTRAST',
 'CT PELVIS WITHOUT CONTRAST (OO)',
 'MR CHOLANGIOPANCREATOGRAPHY (MRCP)',
 'NM CT ABDOMEN WITH 

In [152]:
df_procedures_merged['PROC_NAME'] = df_procedures_merged['PROC_NAME'].replace('CT ABDOMEN WITH  & WITHOUT  CONTRAST', 'CT ABDOMEN WITH AND WITHOUT CONTRAST')
df_procedures_merged['PROC_NAME'] = df_procedures_merged['PROC_NAME'].replace('CT ABDOMEN W/CONTRST', 'CT ABDOMEN WITH CONTRAST')
df_procedures_merged['PROC_NAME'] = df_procedures_merged['PROC_NAME'].replace('CT PELVIS WITH  & WITHOUT  CONTRAST', 'CT PELVIS WITH AND WITHOUT CONTRAST')
sorted(df_procedures_merged['PROC_NAME'].unique())

['CT ABD AND PELVIS WITH CONTRAST PANEL',
 'CT ABD AND PELVIS WITHOUT CONTRAST PANEL',
 'CT ABD WITH CT LOWER EXTREMITIES',
 'CT ABD/PELVIS W/O CONTRAST NC',
 'CT ABDOMEN AND PELVIS WITH AND WITHOUT CONTRAST',
 'CT ABDOMEN AND PELVIS WITH CONTRAST',
 'CT ABDOMEN AND PELVIS WITHOUT CONTRAST',
 'CT ABDOMEN WITH AND WITHOUT CONTRAST',
 'CT ABDOMEN WITH AND WITHOUT CONTRAST (OO)',
 'CT ABDOMEN WITH CONTRAST',
 'CT ABDOMEN WITHOUT CONTRAST',
 'CT ABDOMEN WITHOUT CONTRAST (OO)',
 'CT ABDOMEN/PELVIS WITH CONTRAST DONE IN NUCLEAR MED',
 'CT ABDOMEN/PELVIS WITHOUT CONTRAST DONE IN NUC MED',
 'CT PELVIS WITH AND WITHOUT CONTRAST',
 'CT PELVIS WITH AND WITHOUT CONTRAST (OO)',
 'CT PELVIS WITH CONTRAST',
 'CT PELVIS WITH CONTRAST (OO)',
 'CT PELVIS WITHOUT  CONTRAST',
 'CT PELVIS WITHOUT CONTRAST',
 'CT PELVIS WITHOUT CONTRAST (OO)',
 'MR CHOLANGIOPANCREATOGRAPHY (MRCP)',
 'NM CT ABDOMEN WITH CONTRAST',
 'NM CT ABDOMEN WITHOUT CONTRAST']

#### Write out

In [153]:
df_procedures_merged.to_csv(f'{data_out_path}procedures_merged.csv')

### Hospitalizations & ER visits

#### R3

We also read the hospitalizations/ER visits from the encounters file, but this time we don't restrict them by department (presumably because they might not fall under the departments we filter by when we pull in encounters above).

In [154]:
df_hosper_r3 = pd.read_csv(f'{data_in_r3_path}R3_1646_BINION_ENCOUNTERS_2020_05_12.csv')
df_hosper_r3.head()

Unnamed: 0,STUDY_ID,VISIT_ID,START_DATE,END_DATE,ENC_TYPE,LOCATION,APPT_STATUS,ADMIT_SOURCE,HOSPITAL_SERVICE,PATIENT_TYPE,PATIENT_CLASS,CHIEF_COMPLAINT,CHIEF_COMPLAINT_ONSET_DATE
0,6999994666,452902438,09/21/2011 00:00:00,,OFFICE VISIT,PIMA GREENTREE,COMPLETED,,,,,,
1,6999994666,450398659,10/10/2011 00:00:00,,OFFICE VISIT,RAVI GI GREENTREE,COMPLETED,,,,,,
2,6999994666,452908020,09/07/2011 00:00:00,,TELEPHONE,PIMA GREENTREE,,,,,,,
3,6999994666,452509743,09/12/2011 00:00:00,,APPOINTMENT,PIMA GREENTREE,CANCELED,,,,,,
4,6999994666,451199413,09/21/2011 00:00:00,,HISTORY,PIMA GREENTREE,,,,,,,


In [155]:
# drop unnecessary columns
df_hosper_r3 = df_hosper_r3[[
    'STUDY_ID',
    'START_DATE',
    'ENC_TYPE'
]]

# rename columns
df_hosper_r3.rename(columns={'START_DATE': 'CONTACT_DATE'}, inplace=True)

# filter encounter types of interest
df_hosper_r3 = df_hosper_r3[(df_hosper_r3['ENC_TYPE'] == 'DISCHARGE SUMMARY') | (df_hosper_r3['ENC_TYPE'] == 'ER REPORT')]
df_hosper_r3['IS_HOSPITALIZATION'] = df_hosper_r3['ENC_TYPE'].apply(lambda x: 1 if x == 'DISCHARGE SUMMARY' else 0)
df_hosper_r3['IS_ER_VISIT'] = df_hosper_r3['ENC_TYPE'].apply(lambda x: 1 if x == 'ER REPORT' else 0)
df_hosper_r3.drop(columns=['ENC_TYPE'], inplace=True)
df_hosper_r3 = merge_project_patient_id(df_hosper_r3, df_patients_merged, 'r3')
df_hosper_r3.head()

Unnamed: 0,CONTACT_DATE,IS_HOSPITALIZATION,IS_ER_VISIT,PROJECT_PATIENT_ID
0,07/31/2011 00:00:00,0,1,683be0ab-8f8c-44c5-8024-b5f660ca3e9b
1,02/18/2011 00:00:00,0,1,b807eb20-ef60-4c6a-899c-bb033fa6339e
2,08/14/2011 00:00:00,1,0,c1c79322-d705-43c5-b5d2-d17689bbb5a9
3,07/21/2011 00:00:00,1,0,c1c79322-d705-43c5-b5d2-d17689bbb5a9
4,12/08/2012 00:00:00,0,1,f8a58e23-0822-4dc8-bda3-82fedc214e5c


#### Registry

This logic is directly taken from our old merge logic (`merge_registry_with_r3.ipynb`), but I include it here so our current logic is all in one place (and so I can produce the file with `PROJECT_PATIENT_ID`s instead of `AUTO_ID`s).

**TODO**: If I'm understanding the old merge logic correctly, we don't have this data for R3 patients. This seems like a big liability for our model.

In [156]:
df_hosper_registry = pd.read_excel(f'{data_in_registry_path}deid_IBD_Registry_BA1951_ER_OP_DC_Reports_2020-01-05-09-11-16.xls')
df_hosper_registry.head()

Unnamed: 0.1,Unnamed: 0,AUTO_ID,ENC_TYPE_C,ENC_TYPE_NAME,CONTACT_DATE,DEPT_ID,DEPT_NAME
0,0,0,203,OP Report,2010-01-06 00:00:00,,
1,1,0,203,OP Report,2010-04-07 00:00:00,11116101.0,XBEDFORD ORTHO
2,2,0,203,OP Report,2010-08-04 00:00:00,,
3,3,0,203,OP Report,2018-04-20 00:00:00,9999.0,EXTERNAL DEPARTMENT
4,4,1,203,OP Report,2009-03-19 00:00:00,9999.0,EXTERNAL DEPARTMENT


In [157]:
df_hosper_registry = df_hosper_registry[[
    'AUTO_ID',
    'ENC_TYPE_NAME',
    'CONTACT_DATE'
]]

df = df_hosper_registry.groupby(['AUTO_ID','CONTACT_DATE']).size().reset_index().rename(columns={0:'count'})
df['ER'] = 0
df['DS'] = 0
df['OP'] = 0

for idx, row in df.iterrows():
    q = 'AUTO_ID == ' + str(row['AUTO_ID']) + ' and CONTACT_DATE == "' + row['CONTACT_DATE'] + '"'
    temp = df_hosper_registry.query(q)
    enc_type_counts = temp['ENC_TYPE_NAME'].value_counts().to_dict()
    if 'OP Report' in enc_type_counts.keys():
        df.at[idx, 'OP'] = enc_type_counts['OP Report']
    if 'Discharge Summary' in enc_type_counts.keys():
        df.at[idx, 'DS'] = enc_type_counts['Discharge Summary']
    if 'ER Report' in enc_type_counts.keys():
        df.at[idx, 'ER'] = enc_type_counts['ER Report']
        
df['HOSP'] = 0
df['ER_VISITS'] = 0

for idx, row in df.iterrows():
    hosp = row['DS'] - row['ER']
    er = 0
    if hosp < 0:
        hosp = 0
    if row['ER'] != 0:
        er = 1
    
    df.at[idx, 'HOSP'] = hosp
    df.at[idx, 'ER_VISITS'] = er
    
# drop unnecessary columns
df = df[['AUTO_ID', 'CONTACT_DATE', 'HOSP', 'ER_VISITS']]
df.rename(columns={ 'HOSP': 'IS_HOSPITALIZATION', 'ER_VISITS': 'IS_ER_VISIT' }, inplace=True)

# merge project patient id
df = merge_project_patient_id(df, df_patients_merged, 'registry')
df.head()

Unnamed: 0,CONTACT_DATE,IS_HOSPITALIZATION,IS_ER_VISIT,PROJECT_PATIENT_ID
0,2010-01-06 00:00:00,0,0,b8d4e6bd-e1b7-4a36-8df1-911926190d53
1,2010-04-07 00:00:00,0,0,b8d4e6bd-e1b7-4a36-8df1-911926190d53
2,2010-08-04 00:00:00,0,0,b8d4e6bd-e1b7-4a36-8df1-911926190d53
3,2018-04-20 00:00:00,0,0,b8d4e6bd-e1b7-4a36-8df1-911926190d53
4,2009-03-17 00:00:00,0,1,a9c3cee8-9f41-47cb-8f0d-af62ef0f31f9


#### Merge data sources

In [158]:
df_hosper_merged = df_hosper_r3.append(df)
df_hosper_merged.head()

Unnamed: 0,CONTACT_DATE,IS_HOSPITALIZATION,IS_ER_VISIT,PROJECT_PATIENT_ID
0,07/31/2011 00:00:00,0,1,683be0ab-8f8c-44c5-8024-b5f660ca3e9b
1,02/18/2011 00:00:00,0,1,b807eb20-ef60-4c6a-899c-bb033fa6339e
2,08/14/2011 00:00:00,1,0,c1c79322-d705-43c5-b5d2-d17689bbb5a9
3,07/21/2011 00:00:00,1,0,c1c79322-d705-43c5-b5d2-d17689bbb5a9
4,12/08/2012 00:00:00,0,1,f8a58e23-0822-4dc8-bda3-82fedc214e5c


#### Write out

In [159]:
df_hosper_merged.to_csv(f'{data_out_path}hospitalizations_and_er_visits_merged.csv')