In [1]:
import pandas as pd
import numpy as np
from datetime import date

In [2]:
source_path = '/Users/dmitriyb/OneDrive - University of Pittsburgh/Research/IBD Research/Inflammatory Bowel Disease DOD Grant/data/Merged Data 2021/'
source_files = {
        'encounters': 'encounters_merged.csv',
        'labs': 'labs_merged.csv',
        'meds': 'meds_merged.csv',
        'patients': 'patients_merged.csv',
        'problems': 'problem_list_merged.csv',
        'procedures': 'procedures_merged.csv',
        'targets': 'hospitalizations_and_er_visits_merged.csv'
}

## Patients

In [3]:
patients = pd.read_csv(source_path + source_files['patients'])
patients.head()

Unnamed: 0.1,Unnamed: 0,BIRTH_YEAR,GENDER,DATA_SOURCE,MARITAL STATUS,RACE,ETHNIC_GROUP,EMPLOYMENT_STATUS,IS_ALIVE,PROJECT_PATIENT_ID
0,0,1958,MALE,r3,,,,,False,64a0ac31-091d-45c5-86e3-86516ee4299a
1,1,1928,FEMALE,r3,,,,,False,7da2e9db-ae3b-41bc-a467-fa2d9afe25ea
2,2,1958,FEMALE,r3,,,,,False,524c0db3-6ef0-47c6-9b99-098d0bf998cd
3,3,1954,FEMALE,r3,,,,,False,ed3a3f1a-0c26-402b-b922-159589036b46
4,4,1988,MALE,r3,,,,,False,4d3ea8f4-15b6-477f-805e-7932c6aabc74


In [4]:
patients.drop(['Unnamed: 0', 'IS_ALIVE', 'DATA_SOURCE', 'ETHNIC_GROUP'], axis=1, inplace=True)
patients.head()

Unnamed: 0,BIRTH_YEAR,GENDER,MARITAL STATUS,RACE,EMPLOYMENT_STATUS,PROJECT_PATIENT_ID
0,1958,MALE,,,,64a0ac31-091d-45c5-86e3-86516ee4299a
1,1928,FEMALE,,,,7da2e9db-ae3b-41bc-a467-fa2d9afe25ea
2,1958,FEMALE,,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd
3,1954,FEMALE,,,,ed3a3f1a-0c26-402b-b922-159589036b46
4,1988,MALE,,,,4d3ea8f4-15b6-477f-805e-7932c6aabc74


In [5]:
patients.fillna('Unknown', inplace=True)
patients['GENDER'] = patients['GENDER'].map({'MALE': 0, 'FEMALE': 1, 'M' : 0, 'F' : 1})
patients['AGE'] = date.today().year - patients['BIRTH_YEAR']
patients.head()

Unnamed: 0,BIRTH_YEAR,GENDER,MARITAL STATUS,RACE,EMPLOYMENT_STATUS,PROJECT_PATIENT_ID,AGE
0,1958,0,Unknown,Unknown,Unknown,64a0ac31-091d-45c5-86e3-86516ee4299a,63
1,1928,1,Unknown,Unknown,Unknown,7da2e9db-ae3b-41bc-a467-fa2d9afe25ea,93
2,1958,1,Unknown,Unknown,Unknown,524c0db3-6ef0-47c6-9b99-098d0bf998cd,63
3,1954,1,Unknown,Unknown,Unknown,ed3a3f1a-0c26-402b-b922-159589036b46,67
4,1988,0,Unknown,Unknown,Unknown,4d3ea8f4-15b6-477f-805e-7932c6aabc74,33


In [6]:
patients['MARITAL STATUS'] = patients['MARITAL STATUS'].map(
    {'Married': 'Relationship', 
     'Single' : 'Single',
     'Divorced': 'Single',
     'Widowed' : 'Single',
     'Legally Separated' : 'Single',
     'Committed relationship' : 'Relationship',
     'Significant other' : 'Relationship',
     'Unknown' : 'Unknown'})
     
patients['RACE'] = patients['RACE'].map(
    {'White': 'White', 
     'Unknown' : 'Unknown',
     'Black': 'Black',
     'Indian (Asian)' : 'Other',
     'Other Asian' : 'Other',
     'Chinese' : 'Other',
     'American Indian' : 'Other',
     'Japanese' : 'Other',
     'Other Pacific Islander' : 'Other', 
     'Filipino' : 'Other',
     'Korean' : 'Other',
     'Vietnamese' : 'Other',
     'Alaska Native' : 'Other'
    })

patients['EMPLOYMENT_STATUS'] = patients['EMPLOYMENT_STATUS'].map(
    {'Unknown': 'Unknown', 
     'Full Time': 'Employed',
     'Not Employed' : 'Unemployed',
     'Retired' : 'Unemployed',
     'Student - Full Time' : 'Student',
     'Part Time' : 'Employed',
     'Self Employed' : 'Employed',
     'Student - Part Time' : 'Student' 
    })

In [7]:
df_dummies = pd.get_dummies(patients[['MARITAL STATUS','RACE','EMPLOYMENT_STATUS']], prefix=['MARITAL','RACE','EMPL'])
patients = pd.concat([patients, df_dummies], axis=1)
patients.drop(['BIRTH_YEAR','MARITAL STATUS','RACE','EMPLOYMENT_STATUS'], axis=1, inplace=True)
patients.head()

Unnamed: 0,GENDER,PROJECT_PATIENT_ID,AGE,MARITAL_Relationship,MARITAL_Single,MARITAL_Unknown,RACE_Black,RACE_Other,RACE_Unknown,RACE_White,EMPL_Employed,EMPL_Student,EMPL_Unemployed,EMPL_Unknown
0,0,64a0ac31-091d-45c5-86e3-86516ee4299a,63,0,0,1,0,0,1,0,0,0,0,1
1,1,7da2e9db-ae3b-41bc-a467-fa2d9afe25ea,93,0,0,1,0,0,1,0,0,0,0,1
2,1,524c0db3-6ef0-47c6-9b99-098d0bf998cd,63,0,0,1,0,0,1,0,0,0,0,1
3,1,ed3a3f1a-0c26-402b-b922-159589036b46,67,0,0,1,0,0,1,0,0,0,0,1
4,0,4d3ea8f4-15b6-477f-805e-7932c6aabc74,33,0,0,1,0,0,1,0,0,0,0,1


## Encounters

In [8]:
encounters = pd.read_csv(source_path + source_files['encounters'])
encounters.head()

Unnamed: 0.1,Unnamed: 0,ENCOUNTER_DATE,ENCOUNTER_TYPE,DEPT_NAME,ICD9_CODE,ICD10_CODE,PRIMARY_DX,PROJECT_PATIENT_ID
0,0,08/01/2011 00:00:00,TELEPHONE,GI SURG DDC HBC,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9
1,1,08/05/2011 00:00:00,TELEPHONE,GI SURG DDC HBC,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9
2,2,04/20/2011 00:00:00,TELEPHONE,GAS HBC OAKLAND DDC PH,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9
3,3,04/21/2011 00:00:00,SCAN,GAS HBC OAKLAND DDC PH,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9
4,4,04/21/2011 00:00:00,SCAN,GI SURG DDC HBC,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9


In [9]:
encounters['ENCOUNTER_YEAR'] = pd.DatetimeIndex(encounters['ENCOUNTER_DATE']).year
encounters.head()

Unnamed: 0.1,Unnamed: 0,ENCOUNTER_DATE,ENCOUNTER_TYPE,DEPT_NAME,ICD9_CODE,ICD10_CODE,PRIMARY_DX,PROJECT_PATIENT_ID,ENCOUNTER_YEAR
0,0,08/01/2011 00:00:00,TELEPHONE,GI SURG DDC HBC,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9,2011
1,1,08/05/2011 00:00:00,TELEPHONE,GI SURG DDC HBC,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9,2011
2,2,04/20/2011 00:00:00,TELEPHONE,GAS HBC OAKLAND DDC PH,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9,2011
3,3,04/21/2011 00:00:00,SCAN,GAS HBC OAKLAND DDC PH,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9,2011
4,4,04/21/2011 00:00:00,SCAN,GI SURG DDC HBC,,,,c1c79322-d705-43c5-b5d2-d17689bbb5a9,2011


In [10]:
#encounters['ENCOUNTER_TYPE'].value_counts()
q = "ENCOUNTER_TYPE == 'TELEPHONE' or ENCOUNTER_TYPE == 'OFFICE VISIT'"
table = encounters.query(q).groupby(['PROJECT_PATIENT_ID', 'ENCOUNTER_YEAR', 'ENCOUNTER_TYPE']).agg({'ENCOUNTER_TYPE' : 'count'})
table.columns = ['ENCOUNTER_COUNTS']
table.reset_index(inplace=True)
table.head()

Unnamed: 0,PROJECT_PATIENT_ID,ENCOUNTER_YEAR,ENCOUNTER_TYPE,ENCOUNTER_COUNTS
0,001ebb31-0d52-4814-8f19-39d88bee3b29,2009,OFFICE VISIT,2
1,001ebb31-0d52-4814-8f19-39d88bee3b29,2009,TELEPHONE,5
2,001ebb31-0d52-4814-8f19-39d88bee3b29,2010,OFFICE VISIT,1
3,001ebb31-0d52-4814-8f19-39d88bee3b29,2010,TELEPHONE,1
4,001ebb31-0d52-4814-8f19-39d88bee3b29,2011,OFFICE VISIT,1


In [11]:
df_enc = pd.pivot_table(table, index=['PROJECT_PATIENT_ID', 'ENCOUNTER_YEAR'], 
                       values = 'ENCOUNTER_COUNTS', 
                       columns='ENCOUNTER_TYPE')

df_enc.reset_index(inplace=True)
df_enc.fillna(0, inplace=True)
df_enc.head()

ENCOUNTER_TYPE,PROJECT_PATIENT_ID,ENCOUNTER_YEAR,OFFICE VISIT,TELEPHONE
0,001ebb31-0d52-4814-8f19-39d88bee3b29,2009,2.0,5.0
1,001ebb31-0d52-4814-8f19-39d88bee3b29,2010,1.0,1.0
2,001ebb31-0d52-4814-8f19-39d88bee3b29,2011,1.0,10.0
3,001ebb31-0d52-4814-8f19-39d88bee3b29,2012,0.0,1.0
4,001ebb31-0d52-4814-8f19-39d88bee3b29,2014,0.0,3.0


In [12]:
# Join patients and encounters

df = pd.merge(patients, df_enc, how='inner', left_on='PROJECT_PATIENT_ID', right_on='PROJECT_PATIENT_ID')
df.head()

Unnamed: 0,GENDER,PROJECT_PATIENT_ID,AGE,MARITAL_Relationship,MARITAL_Single,MARITAL_Unknown,RACE_Black,RACE_Other,RACE_Unknown,RACE_White,EMPL_Employed,EMPL_Student,EMPL_Unemployed,EMPL_Unknown,ENCOUNTER_YEAR,OFFICE VISIT,TELEPHONE
0,0,4d3ea8f4-15b6-477f-805e-7932c6aabc74,33,0,0,1,0,0,1,0,0,0,0,1,2011,4.0,8.0
1,0,4d3ea8f4-15b6-477f-805e-7932c6aabc74,33,0,0,1,0,0,1,0,0,0,0,1,2012,1.0,4.0
2,0,4d3ea8f4-15b6-477f-805e-7932c6aabc74,33,0,0,1,0,0,1,0,0,0,0,1,2013,0.0,2.0
3,0,4d3ea8f4-15b6-477f-805e-7932c6aabc74,33,0,0,1,0,0,1,0,0,0,0,1,2014,2.0,8.0
4,0,4d3ea8f4-15b6-477f-805e-7932c6aabc74,33,0,0,1,0,0,1,0,0,0,0,1,2015,2.0,5.0


## Labs

In [13]:
labs = pd.read_csv(source_path + source_files['labs'])
labs.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0.1,Unnamed: 0,RESULT_DATE,ORDER_PROC_ID,COMPONENT_NAME,ORD_VALUE,ORD_NUM_VALUE,REFERENCE_UNIT,REFERENCE_LOW,REFERENCE_HIGH,RESULT_FLAG,GROUP,PROJECT_PATIENT_ID
0,0,03/27/2015,203602100,SPECIMEN DESCRIPTION,Stool,,,,,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd
1,1,03/27/2015,203602100,SPECIAL REQUESTS,,,,,,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd
2,2,03/27/2015,203602100,CULTURE,,,,,,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd
3,3,03/27/2015,203602100,REPORT,Final Result 03/27/2015,,,,,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd
4,4,03/23/2015,114670251,LIPASE,164,164.0,U/L,50.0,393.0,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd


In [14]:
labs['RESULT_DATE'] = labs['RESULT_DATE'].astype('datetime64[ns]')
labs['RESULT_YEAR'] = pd.DatetimeIndex(labs['RESULT_DATE']).year
labs.head()

Unnamed: 0.1,Unnamed: 0,RESULT_DATE,ORDER_PROC_ID,COMPONENT_NAME,ORD_VALUE,ORD_NUM_VALUE,REFERENCE_UNIT,REFERENCE_LOW,REFERENCE_HIGH,RESULT_FLAG,GROUP,PROJECT_PATIENT_ID,RESULT_YEAR
0,0,2015-03-27,203602100,SPECIMEN DESCRIPTION,Stool,,,,,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd,2015.0
1,1,2015-03-27,203602100,SPECIAL REQUESTS,,,,,,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd,2015.0
2,2,2015-03-27,203602100,CULTURE,,,,,,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd,2015.0
3,3,2015-03-27,203602100,REPORT,Final Result 03/27/2015,,,,,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd,2015.0
4,4,2015-03-23,114670251,LIPASE,164,164.0,U/L,50.0,393.0,,,524c0db3-6ef0-47c6-9b99-098d0bf998cd,2015.0


In [15]:
table = labs.groupby(['PROJECT_PATIENT_ID', 'RESULT_YEAR', 'GROUP']).agg({'ORD_NUM_VALUE' : ['min', 'max']})
#table.columns = ['LAB_MIN', 'LAB_MAX']
table.reset_index(inplace=True)
#table.columns = table.columns.droplevel()
table.columns = ['_'.join(col).strip().upper() for col in table.columns.values]
table.columns = ['PROJECT_PATIENT_ID', 'RESULT_YEAR', 'GROUP', 
                 'MIN', 'MAX']
table.head()

Unnamed: 0,PROJECT_PATIENT_ID,RESULT_YEAR,GROUP,MIN,MAX
0,000b3bdc-50c8-490b-acd0-64754abb91df,2010.0,albumin,,
1,000b3bdc-50c8-490b-acd0-64754abb91df,2010.0,eos,,
2,000b3bdc-50c8-490b-acd0-64754abb91df,2010.0,hemoglobin,,
3,000b3bdc-50c8-490b-acd0-64754abb91df,2010.0,monocytes,,
4,001ebb31-0d52-4814-8f19-39d88bee3b29,2017.0,albumin,3.8,3.8


In [16]:
df_labs = pd.pivot_table(table, index=['PROJECT_PATIENT_ID', 'RESULT_YEAR'], 
                       values = ['MIN', 'MAX'], 
                       columns='GROUP')
#df_labs.fillna(0, inplace=True)
df_labs.reset_index(inplace=True)
df_labs.columns = ['_'.join(col).strip().upper() for col in df_labs.columns.values]
df_labs.rename(columns={'PROJECT_PATIENT_ID_':'PROJECT_PATIENT_ID', 'RESULT_YEAR_': 'RESULT_YEAR'}, inplace=True)
df_labs.head()



Unnamed: 0,PROJECT_PATIENT_ID,RESULT_YEAR,MAX_ALBUMIN,MAX_CRP,MAX_EOS,MAX_ESR,MAX_HEMOGLOBIN,MAX_MONOCYTES,MAX_VITAMIN_D,MIN_ALBUMIN,MIN_CRP,MIN_EOS,MIN_ESR,MIN_HEMOGLOBIN,MIN_MONOCYTES,MIN_VITAMIN_D
0,001ebb31-0d52-4814-8f19-39d88bee3b29,2017.0,3.8,,7.0,,13.7,10.0,,3.8,,0.4,,13.7,0.5,
1,001ebb31-0d52-4814-8f19-39d88bee3b29,2018.0,4.1,,5.0,,13.7,12.0,8.0,4.1,,0.3,,13.7,0.7,8.0
2,002feddb-9223-4c1e-b56c-9bcee72592a3,2019.0,,,2.0,,12.1,4.0,,,,0.2,,8.7,0.5,
3,00418a04-d98f-4569-af9f-624fb7c0d6c0,2017.0,3.8,0.3,140.0,56.0,10.6,530.0,,3.8,0.3,2.0,56.0,9.7,7.0,
4,00418a04-d98f-4569-af9f-624fb7c0d6c0,2018.0,3.5,,90.0,,10.0,550.0,,3.5,,1.0,,10.0,9.0,


### To-do - replace missing values with normal range
* Albumin: 3.4 to 5.4 g/dL (34 to 54 g/L)
* CRP: normal reading is less than 10 milligram per liter (mg/L)
* EOS: normal range is usually between 30 and 350
* ESR: 
    * Women under age 50 should have an ESR between 0 and 20 mm/hr.
    * Men under age 50 should have an ESR between 0 and 15 mm/hr.
    * Women over age 50 should have an ESR between 0 and 30 mm/hr.
    * Men over age 50 should have an ESR between 0 and 20 mm/hr.
    * Children should have an ESR between 0 and 10 mm/hr.
* Hemoglobin: 
    * For men, 13.5 to 17.5 grams per deciliter. 
    * For women, 12.0 to 15.5 grams per deciliter.
* Monocytes:
    * Monocytes: 80 to 800

   

In [17]:
# Impute albumin: 
df_labs['MAX_ALBUMIN'].fillna(4, inplace=True)
df_labs['MIN_ALBUMIN'].fillna(4, inplace=True)

# Impute CRP
df_labs['MAX_CRP'].fillna(df_labs['MAX_CRP'].median(), inplace=True)
df_labs['MIN_CRP'].fillna(df_labs['MIN_CRP'].median(), inplace=True)

# Impute EOS
df_labs['MAX_EOS'].fillna(30, inplace=True)
df_labs['MIN_EOS'].fillna(30, inplace=True)

# Impute ESR
df_labs['MAX_ESR'].fillna(10, inplace=True)
df_labs['MIN_ESR'].fillna(10, inplace=True)

# Impute HEMOGLOBIN
df_labs['MAX_HEMOGLOBIN'].fillna(14, inplace=True)
df_labs['MIN_HEMOGLOBIN'].fillna(14, inplace=True)

# MAX_MONOCYTES
df_labs['MAX_MONOCYTES'].fillna(100, inplace=True)
df_labs['MIN_MONOCYTES'].fillna(100, inplace=True)


# MIN_VITAMIN_D
df_labs['MAX_VITAMIN_D'].fillna(30, inplace=True)
df_labs['MIN_VITAMIN_D'].fillna(30, inplace=True)

In [18]:
# Join labs to master dataframe

df = pd.merge(df, df_labs, how='inner', left_on=['PROJECT_PATIENT_ID','ENCOUNTER_YEAR'], right_on=['PROJECT_PATIENT_ID','RESULT_YEAR'])
df.head()

Unnamed: 0,GENDER,PROJECT_PATIENT_ID,AGE,MARITAL_Relationship,MARITAL_Single,MARITAL_Unknown,RACE_Black,RACE_Other,RACE_Unknown,RACE_White,...,MAX_HEMOGLOBIN,MAX_MONOCYTES,MAX_VITAMIN_D,MIN_ALBUMIN,MIN_CRP,MIN_EOS,MIN_ESR,MIN_HEMOGLOBIN,MIN_MONOCYTES,MIN_VITAMIN_D
0,0,4d3ea8f4-15b6-477f-805e-7932c6aabc74,33,0,0,1,0,0,1,0,...,15.4,11.0,57.0,3.6,2.541,0.0,3.0,12.0,0.2,33.0
1,0,4d3ea8f4-15b6-477f-805e-7932c6aabc74,33,0,0,1,0,0,1,0,...,16.2,9.0,21.0,4.5,0.147,0.1,7.0,16.1,0.7,18.0
2,0,4d3ea8f4-15b6-477f-805e-7932c6aabc74,33,0,0,1,0,0,1,0,...,16.0,10.0,30.0,4.2,0.38,0.0,10.0,12.5,0.6,30.0
3,0,4d3ea8f4-15b6-477f-805e-7932c6aabc74,33,0,0,1,0,0,1,0,...,16.1,11.0,28.0,4.5,0.029,0.1,5.0,16.1,0.6,28.0
4,0,4d3ea8f4-15b6-477f-805e-7932c6aabc74,33,0,0,1,0,0,1,0,...,15.8,100.0,30.0,4.7,0.38,30.0,5.0,15.8,100.0,30.0


## Medications

In [19]:
meds = pd.read_csv(source_path + source_files['meds'])
meds['ORDERING_DATE'] = meds['ORDERING_DATE'].astype('datetime64[ns]')
meds['ORDERING_YEAR'] = pd.DatetimeIndex(meds['ORDERING_DATE']).year
meds.head()

Unnamed: 0.1,Unnamed: 0,ORDER_ID,MED_NAME,SIMPLE_GENERIC_NAME,ORDERING_DATE,START_DATE,END_DATE,PHARM_CLASS,GROUP,PROJECT_PATIENT_ID,ORDERING_YEAR
0,9544,269515878,VEDOLIZUMAB 300 MG INTRAVENOUS SOLUTION,VEDOLIZUMAB,2016-08-25,,,"INTEGRIN RECEPTOR ANTAGONIST, MONOCLONAL ANTIBODY",ANTI INTEGRIN,8a3ffbbd-920b-4fc3-98d1-333d1f0c4421,2016
1,12397,363569478,ENTYVIO IV,VEDOLIZUMAB,2018-08-01,,,"INTEGRIN RECEPTOR ANTAGONIST, MONOCLONAL ANTIBODY",ANTI INTEGRIN,aafb936f-c01d-42b9-af39-d7d42565bd94,2018
2,13542,360250929,USTEKINUMAB 90 MG/ML SUBCUTANEOUS SYRINGE,USTEKINUMAB,2018-06-26,06/26/2018,07/26/2018,MONOCLONAL ANTIBODY-HUMAN INTERLEUKIN 12/23 INHIB,ANTI IL12,b46226ef-d27a-400a-bcaf-162da03beacf,2018
3,13552,360250936,USTEKINUMAB 90 MG/ML SUBCUTANEOUS SYRINGE,USTEKINUMAB,2018-07-27,07/27/2018,,MONOCLONAL ANTIBODY-HUMAN INTERLEUKIN 12/23 INHIB,ANTI IL12,b46226ef-d27a-400a-bcaf-162da03beacf,2018
4,15757,358016766,USTEKINUMAB 90 MG/ML SUBCUTANEOUS SYRINGE,USTEKINUMAB,2018-06-13,06/13/2018,06/13/2018,MONOCLONAL ANTIBODY-HUMAN INTERLEUKIN 12/23 INHIB,ANTI IL12,c97774af-47f3-4493-890f-9e9e5baeb6e3,2018


In [20]:
table = meds.groupby(['PROJECT_PATIENT_ID', 'ORDERING_YEAR', 'GROUP']).agg({'GROUP' : 'count'})
table.columns = ['GROUP_COUNT']
table.reset_index(inplace=True)
#table.columns = table.columns.droplevel()
table.columns = ['_'.join(col).strip().upper() for col in table.columns.values]
table.columns = ['PROJECT_PATIENT_ID', 'ORDERING_YEAR', 'GROUP', 
                 'COUNT']
table.head()

Unnamed: 0,PROJECT_PATIENT_ID,ORDERING_YEAR,GROUP,COUNT
0,00069eef-b59e-459b-b7b4-b9be111a14c2,2009,5 ASA,2
1,000b3bdc-50c8-490b-acd0-64754abb91df,2009,Immunomodulators,1
2,000b3bdc-50c8-490b-acd0-64754abb91df,2009,Vitamin D,1
3,001ebb31-0d52-4814-8f19-39d88bee3b29,2009,5 ASA,3
4,001ebb31-0d52-4814-8f19-39d88bee3b29,2009,Systemic steroids,1


In [21]:
# To-do: Change MED_COUNT to 0/1

In [22]:
df_meds = pd.pivot_table(table, index=['PROJECT_PATIENT_ID', 'ORDERING_YEAR'], 
                       values = ['COUNT'], 
                       columns='GROUP')
df_meds.fillna(0, inplace=True)
df_meds.reset_index(inplace=True)
df_meds.columns = ['_'.join(col).strip().upper() for col in df_meds.columns.values]
df_meds.rename(columns={'PROJECT_PATIENT_ID_':'PROJECT_PATIENT_ID', 'ORDERING_YEAR_': 'ORDERING_YEAR'}, inplace=True)
df_meds.head()



Unnamed: 0,PROJECT_PATIENT_ID,ORDERING_YEAR,COUNT_5 ASA,COUNT_ANTI IL12,COUNT_ANTI INTEGRIN,COUNT_ANTI TNF,COUNT_IMMUNOMODULATORS,COUNT_SYSTEMIC STEROIDS,COUNT_VITAMIN D
0,00069eef-b59e-459b-b7b4-b9be111a14c2,2009,2.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000b3bdc-50c8-490b-acd0-64754abb91df,2009,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,001ebb31-0d52-4814-8f19-39d88bee3b29,2009,3.0,0.0,0.0,0.0,0.0,1.0,0.0
3,001ebb31-0d52-4814-8f19-39d88bee3b29,2010,2.0,0.0,0.0,0.0,0.0,1.0,0.0
4,001ebb31-0d52-4814-8f19-39d88bee3b29,2011,5.0,0.0,0.0,0.0,0.0,1.0,0.0


In [23]:
# Join medications to master dataframe

df = pd.merge(df, df_meds, how='inner', left_on=['PROJECT_PATIENT_ID', 'ENCOUNTER_YEAR'], right_on=['PROJECT_PATIENT_ID','ORDERING_YEAR'])
df.head()

Unnamed: 0,GENDER,PROJECT_PATIENT_ID,AGE,MARITAL_Relationship,MARITAL_Single,MARITAL_Unknown,RACE_Black,RACE_Other,RACE_Unknown,RACE_White,...,MIN_MONOCYTES,MIN_VITAMIN_D,ORDERING_YEAR,COUNT_5 ASA,COUNT_ANTI IL12,COUNT_ANTI INTEGRIN,COUNT_ANTI TNF,COUNT_IMMUNOMODULATORS,COUNT_SYSTEMIC STEROIDS,COUNT_VITAMIN D
0,1,3fa42a37-41a5-4995-8085-cc9bf1b259e4,70,0,0,1,0,0,1,0,...,0.2,57.0,2016,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0,70c8c891-183c-4355-bde7-350434fb6e4a,60,0,0,1,0,0,1,0,...,5.0,30.0,2015,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0,b8d4e6bd-e1b7-4a36-8df1-911926190d53,58,1,0,0,0,0,0,1,...,0.5,38.0,2010,2.0,0.0,0.0,0.0,2.0,2.0,0.0
3,0,b8d4e6bd-e1b7-4a36-8df1-911926190d53,58,1,0,0,0,0,0,1,...,0.6,30.0,2011,4.0,0.0,0.0,0.0,4.0,0.0,0.0
4,0,b8d4e6bd-e1b7-4a36-8df1-911926190d53,58,1,0,0,0,0,0,1,...,0.3,42.0,2012,0.0,0.0,0.0,0.0,2.0,0.0,0.0


## Problems

In [24]:
problems = pd.read_csv(source_path + source_files['problems'])
problems.head()

Unnamed: 0.1,Unnamed: 0,DX_CODE_TYPE,DX_CODE,DX_NAME,DX_DATE,PROJECT_PATIENT_ID
0,0,ICD9,535.50,Unspecified gastritis and gastroduodenitis wit...,11/15/2011,64a0ac31-091d-45c5-86e3-86516ee4299a
1,1,ICD9,578.1,Blood in stool,10/10/2011,64a0ac31-091d-45c5-86e3-86516ee4299a
2,2,ICD9,555.2,Regional enteritis of small intestine with lar...,04/04/2012,64a0ac31-091d-45c5-86e3-86516ee4299a
3,3,ICD10,H26.9,Cataract,06/20/2016,64a0ac31-091d-45c5-86e3-86516ee4299a
4,4,ICD10,H04.129,Dry eye,06/20/2016,64a0ac31-091d-45c5-86e3-86516ee4299a


In [25]:
# Only psych comorbidities

## Procedures

In [26]:
procs = pd.read_csv(source_path + source_files['procedures'])
procs['ORDER_DATE'] = procs['ORDER_DATE'].astype('datetime64[ns]')
procs['ORDER_YEAR'] = pd.DatetimeIndex(procs['ORDER_DATE']).year
procs.head()

Unnamed: 0.1,Unnamed: 0,PROC_CODE,PROC_NAME,ORDER_DATE,PROC_DATE,PROJECT_PATIENT_ID,ORDER_YEAR
0,1,74160,CT ABDOMEN WITH CONTRAST,2011-10-01,,64a0ac31-091d-45c5-86e3-86516ee4299a,2011
1,13,74160,CT ABDOMEN WITH CONTRAST,2009-09-15,,64a0ac31-091d-45c5-86e3-86516ee4299a,2009
2,15,72193,CT PELVIS WITH CONTRAST,2009-09-15,,64a0ac31-091d-45c5-86e3-86516ee4299a,2009
3,16,74177,CT ABDOMEN AND PELVIS WITH CONTRAST,2014-11-05,,64a0ac31-091d-45c5-86e3-86516ee4299a,2014
4,29,74170,CT ABDOMEN WITH AND WITHOUT CONTRAST,2011-09-21,,64a0ac31-091d-45c5-86e3-86516ee4299a,2011


In [27]:
table = procs.groupby(['PROJECT_PATIENT_ID', 'ORDER_YEAR']).agg({'PROC_NAME':'count'})
table.reset_index(inplace=True)
table.columns = ['PROJECT_PATIENT_ID', 'ORDER_YEAR','PROC_COUNT']
table.head()

Unnamed: 0,PROJECT_PATIENT_ID,ORDER_YEAR,PROC_COUNT
0,00418a04-d98f-4569-af9f-624fb7c0d6c0,2009,2
1,00418a04-d98f-4569-af9f-624fb7c0d6c0,2010,2
2,004977aa-bdab-4a53-a820-d053281b93b0,2009,2
3,004977aa-bdab-4a53-a820-d053281b93b0,2010,2
4,004977aa-bdab-4a53-a820-d053281b93b0,2014,3


In [28]:
# Join procedures to master dataframe

df = pd.merge(df, table, how='inner', left_on=['PROJECT_PATIENT_ID', 'ENCOUNTER_YEAR'], right_on=['PROJECT_PATIENT_ID','ORDER_YEAR'])
df.head()

Unnamed: 0,GENDER,PROJECT_PATIENT_ID,AGE,MARITAL_Relationship,MARITAL_Single,MARITAL_Unknown,RACE_Black,RACE_Other,RACE_Unknown,RACE_White,...,ORDERING_YEAR,COUNT_5 ASA,COUNT_ANTI IL12,COUNT_ANTI INTEGRIN,COUNT_ANTI TNF,COUNT_IMMUNOMODULATORS,COUNT_SYSTEMIC STEROIDS,COUNT_VITAMIN D,ORDER_YEAR,PROC_COUNT
0,1,3fa42a37-41a5-4995-8085-cc9bf1b259e4,70,0,0,1,0,0,1,0,...,2016,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2016,2
1,0,70c8c891-183c-4355-bde7-350434fb6e4a,60,0,0,1,0,0,1,0,...,2015,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2015,1
2,0,a9c3cee8-9f41-47cb-8f0d-af62ef0f31f9,69,1,0,0,0,0,0,1,...,2009,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2009,3
3,0,a9c3cee8-9f41-47cb-8f0d-af62ef0f31f9,69,1,0,0,0,0,0,1,...,2012,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2012,1
4,0,a9c3cee8-9f41-47cb-8f0d-af62ef0f31f9,69,1,0,0,0,0,0,1,...,2015,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2015,1


In [29]:
df.shape

(2699, 42)

## Targets

In [30]:
targets = pd.read_csv(source_path + source_files['targets'])
targets['CONTACT_DATE'] = targets['CONTACT_DATE'].astype('datetime64[ns]')
targets['CONTACT_YEAR'] = pd.DatetimeIndex(targets['CONTACT_DATE']).year
targets.head()

Unnamed: 0.1,Unnamed: 0,CONTACT_DATE,IS_HOSPITALIZATION,IS_ER_VISIT,PROJECT_PATIENT_ID,CONTACT_YEAR
0,0,2011-07-31,0,1,683be0ab-8f8c-44c5-8024-b5f660ca3e9b,2011
1,1,2011-02-18,0,1,b807eb20-ef60-4c6a-899c-bb033fa6339e,2011
2,2,2011-08-14,1,0,c1c79322-d705-43c5-b5d2-d17689bbb5a9,2011
3,3,2011-07-21,1,0,c1c79322-d705-43c5-b5d2-d17689bbb5a9,2011
4,4,2012-12-08,0,1,f8a58e23-0822-4dc8-bda3-82fedc214e5c,2012


In [31]:
table = targets.groupby(['PROJECT_PATIENT_ID', 'CONTACT_YEAR']).agg({'IS_HOSPITALIZATION' : 'sum', 'IS_ER_VISIT' : 'sum'})
table.reset_index(inplace=True)
table.head()

Unnamed: 0,PROJECT_PATIENT_ID,CONTACT_YEAR,IS_HOSPITALIZATION,IS_ER_VISIT
0,001ebb31-0d52-4814-8f19-39d88bee3b29,2015,0,5
1,001ebb31-0d52-4814-8f19-39d88bee3b29,2016,0,1
2,001ebb31-0d52-4814-8f19-39d88bee3b29,2017,0,2
3,001ebb31-0d52-4814-8f19-39d88bee3b29,2018,0,2
4,002feddb-9223-4c1e-b56c-9bcee72592a3,2019,0,1


In [32]:
# Join targets to master dataframe

df = pd.merge(df, table, how='inner', left_on=['PROJECT_PATIENT_ID', 'ENCOUNTER_YEAR'], right_on=['PROJECT_PATIENT_ID','CONTACT_YEAR'])
df.head()

Unnamed: 0,GENDER,PROJECT_PATIENT_ID,AGE,MARITAL_Relationship,MARITAL_Single,MARITAL_Unknown,RACE_Black,RACE_Other,RACE_Unknown,RACE_White,...,COUNT_ANTI INTEGRIN,COUNT_ANTI TNF,COUNT_IMMUNOMODULATORS,COUNT_SYSTEMIC STEROIDS,COUNT_VITAMIN D,ORDER_YEAR,PROC_COUNT,CONTACT_YEAR,IS_HOSPITALIZATION,IS_ER_VISIT
0,1,3fa42a37-41a5-4995-8085-cc9bf1b259e4,70,0,0,1,0,0,1,0,...,1.0,0.0,0.0,0.0,0.0,2016,2,2016,2,4
1,0,70c8c891-183c-4355-bde7-350434fb6e4a,60,0,0,1,0,0,1,0,...,1.0,0.0,0.0,0.0,0.0,2015,1,2015,0,1
2,0,a9c3cee8-9f41-47cb-8f0d-af62ef0f31f9,69,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,2009,3,2009,1,2
3,0,a9c3cee8-9f41-47cb-8f0d-af62ef0f31f9,69,1,0,0,0,0,0,1,...,0.0,0.0,0.0,1.0,0.0,2015,1,2015,0,1
4,0,a9c3cee8-9f41-47cb-8f0d-af62ef0f31f9,69,1,0,0,0,0,0,1,...,0.0,0.0,1.0,3.0,1.0,2016,3,2016,3,3


In [33]:
df.shape

(2083, 45)

In [34]:
df.columns = [str(col).lower().replace(' ', '_') for col in df.columns]
df.head()

Unnamed: 0,gender,project_patient_id,age,marital_relationship,marital_single,marital_unknown,race_black,race_other,race_unknown,race_white,...,count_anti_integrin,count_anti_tnf,count_immunomodulators,count_systemic_steroids,count_vitamin_d,order_year,proc_count,contact_year,is_hospitalization,is_er_visit
0,1,3fa42a37-41a5-4995-8085-cc9bf1b259e4,70,0,0,1,0,0,1,0,...,1.0,0.0,0.0,0.0,0.0,2016,2,2016,2,4
1,0,70c8c891-183c-4355-bde7-350434fb6e4a,60,0,0,1,0,0,1,0,...,1.0,0.0,0.0,0.0,0.0,2015,1,2015,0,1
2,0,a9c3cee8-9f41-47cb-8f0d-af62ef0f31f9,69,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,2009,3,2009,1,2
3,0,a9c3cee8-9f41-47cb-8f0d-af62ef0f31f9,69,1,0,0,0,0,0,1,...,0.0,0.0,0.0,1.0,0.0,2015,1,2015,0,1
4,0,a9c3cee8-9f41-47cb-8f0d-af62ef0f31f9,69,1,0,0,0,0,0,1,...,0.0,0.0,1.0,3.0,1.0,2016,3,2016,3,3


In [38]:
df = df[['age', 'gender', 'project_patient_id', 'marital_relationship',
       'marital_single', 'marital_unknown', 'race_black', 'race_other',
       'race_unknown', 'race_white', 'empl_employed', 'empl_student',
       'empl_unemployed', 'empl_unknown', 'encounter_year', 'office_visit',
       'telephone', 'max_albumin', 'max_crp', 'max_eos',
       'max_esr', 'max_hemoglobin', 'max_monocytes', 'max_vitamin_d',
       'min_albumin', 'min_crp', 'min_eos', 'min_esr', 'min_hemoglobin',
       'min_monocytes', 'min_vitamin_d', 'count_5_asa',
       'count_anti_il12', 'count_anti_integrin', 'count_anti_tnf',
       'count_immunomodulators', 'count_systemic_steroids', 'count_vitamin_d',
       'proc_count', 'is_hospitalization',
       'is_er_visit']]

In [42]:
df.to_csv('year_per_patient_num.csv')