In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm

# Scenario: Creating a Set of Machine Learning Friendly Features from EHR Data to Predict Diabetes Onset

First we will load in the necessary data files

In [2]:
def load_data_for_file(filename):
    print(f"Loading data for {filename}")
    df = pd.concat([ # use pd.concat to append/concatenate the data for all states together into a single frame
        pd.read_parquet(f"https://dicbworkshops.s3.amazonaws.com/{output_dir}/parquet/{filename}") # use read_csv to load the data from each output directory
        for output_dir in tqdm(['output_hi', 'output_ma', 'output_tx', 'output_wa']) # loop over each output directory
    ])
    return df

In [5]:
# load in the conditions
conditions = load_data_for_file('conditions.parquet')
observations = load_data_for_file('observations.parquet')
medications = load_data_for_file('medications.parquet')
procedures = load_data_for_file('procedures.parquet')

Loading data for conditions.parquet


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.34it/s]


Loading data for observations.parquet


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.21s/it]


Loading data for medications.parquet


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.19it/s]


Loading data for procedures.parquet


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.04it/s]


## Filtering Out Patients with Diagnoses of Type-2 Diabetes
For this exercise, we are interested in filtering out all patients with a diagnosis of Type-2 diabetes \
We can filter these out based on the SNOMED code `44054006`

In [20]:
type2_patients = conditions.query('CODE == 44054006').sort_values(by=['PATIENT', 'START']).drop_duplicates(subset=['PATIENT', 'START'], keep='first')

In [19]:
# now we create a dictionary/lookup table to map each patient's ID to the date of their earliest Type 2 diagnosis
patient_diagnosis_dates = {
    row['PATIENT']: row['START']
    for _, row in type2_patients.iterrows()
}

In [22]:
conditions

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,SYSTEM,CODE,DESCRIPTION
0,2001-07-18,,8f8229e6-00be-a033-bb16-42781f9d208a,249a2734-060e-3b6e-de43-c5c2c97e888b,SNOMED-CT,473461003,Educated to high school level (finding)
1,2001-07-18,,8f8229e6-00be-a033-bb16-42781f9d208a,249a2734-060e-3b6e-de43-c5c2c97e888b,SNOMED-CT,160903007,Full-time employment (finding)
2,2011-07-25,,8f8229e6-00be-a033-bb16-42781f9d208a,7570e7f5-b7a8-3848-0208-07e278d3754e,SNOMED-CT,161744009,Past pregnancy history of miscarriage (situation)
3,2014-08-13,2015-07-22,8f8229e6-00be-a033-bb16-42781f9d208a,366aac19-54b2-b4af-2b03-8d35c03ae2ba,SNOMED-CT,73595000,Stress (finding)
4,2014-12-03,2015-07-15,8f8229e6-00be-a033-bb16-42781f9d208a,a047e3c3-9e86-dd54-f026-e28929aa1e6b,SNOMED-CT,72892002,Normal pregnancy (finding)
...,...,...,...,...,...,...,...
47332,2024-12-31,2025-01-14,e4f7d897-4345-a297-0ae3-ff6fbe5a71aa,f9215af9-3736-0599-7ceb-4485b8af96e7,SNOMED-CT,80583007,Severe anxiety (panic) (finding)
47333,2025-01-07,2025-01-21,e4f7d897-4345-a297-0ae3-ff6fbe5a71aa,c9444afc-a873-a18c-1e49-eb3a92c4dc01,SNOMED-CT,314529007,Medication review due (situation)
47334,2025-01-14,2025-01-28,e4f7d897-4345-a297-0ae3-ff6fbe5a71aa,a68b03e2-9449-782b-47d4-da72c2cfecda,SNOMED-CT,423315002,Limited social contact (finding)
47335,2025-01-28,2025-02-04,e4f7d897-4345-a297-0ae3-ff6fbe5a71aa,8b0ae691-9e4c-b1ce-0cc2-da9f273db9f9,SNOMED-CT,314529007,Medication review due (situation)


## Filtering Out Post-diagnosis Conditions, Observations, Medications, and Procedures and Unifying Into a Shared Representation
First we need to filter out all EHR data from encounters that took place after the Type-2 diabetes diagnosis for the type 2 patients

In [26]:
conditions_filtered = []
for _, row in conditions.iterrows():
    patient = row['PATIENT']
    start_date = row['START']
    if patient in patient_diagnosis_dates and patient_diagnosis_dates[patient] > start_date:
        conditions_filtered.append(row)
conditions_filtered = pd.DataFrame(conditions_filtered)    

In [43]:
observations_simplified = observations[observations['PATIENT'].isin(patient_diagnosis_dates)].assign(
    DATE_SIMPLE=lambda x: pd.to_datetime(x['DATE']).dt.date.astype('str')
)
observations_filtered = []
for _, row in tqdm(observations_simplified.iterrows(), total=len(observations_simplified)):
    patient = row['PATIENT']
    date = row['DATE_SIMPLE']
    if patient_diagnosis_dates[patient] > date:
        observations_filtered.append(row)
observations_filtered = pd.DataFrame(observations_filtered)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 789669/789669 [00:10<00:00, 73941.30it/s]


In [47]:
medications_simplified = medications[medications['PATIENT'].isin(patient_diagnosis_dates)].assign(
    DATE_SIMPLE=lambda x: pd.to_datetime(x['START']).dt.date.astype('str')
)
medications_filtered = []
for _, row in tqdm(medications_simplified.iterrows(), total=len(medications_simplified)):
    patient = row['PATIENT']
    date = row['DATE_SIMPLE']
    if patient_diagnosis_dates[patient] > date:
        medications_filtered.append(row)
medications_filtered = pd.DataFrame(medications_filtered)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 55269/55269 [00:00<00:00, 75408.99it/s]


In [48]:
medications_filtered

Unnamed: 0,START,STOP,PATIENT,PAYER,ENCOUNTER,CODE,DESCRIPTION,BASE_COST,PAYER_COVERAGE,DISPENSES,TOTALCOST,REASONCODE,REASONDESCRIPTION,DATE_SIMPLE
1,2014-05-26T02:53:46Z,2015-05-21T02:53:46Z,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,b49abd41-5aea-59cc-cc55-39ec41444b99,748879,Levora 0.15/30 28 Day Pack,526.18,0.0,12,6314.16,,,2014-05-26
3,2015-05-21T02:53:46Z,2016-05-15T02:53:46Z,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,aa48b307-2418-b114-a3d9-58b7ebc2ac24,978950,Natazia 28 Day Pack,879.02,0.0,12,10548.24,,,2015-05-21
14,2016-05-15T02:53:46Z,2017-05-10T03:30:00Z,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,c24cf078-a8f2-291f-d7c7-ae41aa5218cb,389221,Etonogestrel 68 MG Drug Implant,27.79,0.0,12,333.48,,,2016-05-15
16,2017-05-10T03:30:00Z,2018-05-05T03:30:00Z,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,e1ce023e-9599-e731-219a-728bebd0ddc7,749762,Seasonique 91 Day Pack,460.21,0.0,12,5522.52,,,2017-05-10
659,2017-06-27T22:06:09Z,2017-06-27T22:06:09Z,6dc3a549-a3d4-8fee-1a15-b986dbe9b1d2,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,33edd580-9470-4a53-cb4a-ec08c73740b8,1535362,sodium fluoride 0.0272 MG/MG Oral Gel,260.78,0.0,1,260.78,66383009.0,Gingivitis (disorder),2017-06-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71341,1995-07-10T20:19:46Z,1995-07-10T20:19:46Z,66de5451-b446-367b-821d-265b4cfda04f,e03e23c9-4df1-3eb6-a62d-f70f02301496,0dab360f-f9de-9aaa-a760-bc16ce3e5e9e,1535362,sodium fluoride 0.0272 MG/MG Oral Gel,200.25,0.0,1,200.25,103697008.0,Patient referral for dental care (procedure),1995-07-10
71389,2000-01-24T17:24:45Z,2000-02-12T17:24:45Z,66de5451-b446-367b-821d-265b4cfda04f,e03e23c9-4df1-3eb6-a62d-f70f02301496,f927f2ea-ea37-2d97-9546-90ce953de341,313782,Acetaminophen 325 MG Oral Tablet,70.67,0.0,1,70.67,10509002.0,Acute bronchitis (disorder),2000-01-24
72467,1989-02-15T02:00:06Z,,b968c8bf-cf84-628a-bd6e-d9b312f32d3a,d18ef2e6-ef40-324c-be54-34a5ee865625,9a3fca6e-6924-2919-964d-649a4be0767c,197591,Diazepam 5 MG Oral Tablet,5.37,0.0,437,2346.69,128613002.0,Seizure disorder (disorder),1989-02-15
73276,1996-02-14T04:57:49Z,,09047a65-738d-2563-6b92-ca4e6f508232,e03e23c9-4df1-3eb6-a62d-f70f02301496,e77ee5cc-cc07-bf67-c013-7ecfce8cbab5,310325,ferrous sulfate 325 MG Oral Tablet,0.23,0.0,240,55.20,,,1996-02-14


In [51]:
procedures_simplified = procedures[procedures['PATIENT'].isin(patient_diagnosis_dates)].assign(
    DATE_SIMPLE=lambda x: pd.to_datetime(x['START']).dt.date.astype('str')
)
procedures_filtered = []
for _, row in tqdm(procedures_simplified.iterrows(), total=len(procedures_simplified)):
    patient = row['PATIENT']
    date = row['DATE_SIMPLE']
    if patient_diagnosis_dates[patient] > date:
        procedures_filtered.append(row)
procedures_filtered = pd.DataFrame(procedures_filtered)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108374/108374 [00:01<00:00, 67847.51it/s]


In [52]:
procedures_filtered

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,SYSTEM,CODE,DESCRIPTION,BASE_COST,REASONCODE,REASONDESCRIPTION,DATE_SIMPLE
39,2016-02-23T02:53:46Z,2016-02-23T03:46:34Z,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,799c717a-401a-e1e7-2b4f-ab040641b2b6,SNOMED-CT,710824005,Assessment of health and social care needs (pr...,600.50,,,2016-02-23
40,2016-02-23T03:46:34Z,2016-02-23T04:02:52Z,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,799c717a-401a-e1e7-2b4f-ab040641b2b6,SNOMED-CT,710841007,Assessment of anxiety (procedure),600.50,,,2016-02-23
41,2016-02-23T04:02:52Z,2016-02-23T04:28:16Z,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,799c717a-401a-e1e7-2b4f-ab040641b2b6,SNOMED-CT,866148006,Screening for domestic abuse (procedure),600.50,,,2016-02-23
42,2016-02-23T04:28:16Z,2016-02-23T04:41:42Z,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,799c717a-401a-e1e7-2b4f-ab040641b2b6,SNOMED-CT,171207006,Depression screening (procedure),600.50,,,2016-02-23
43,2016-02-23T04:41:42Z,2016-02-23T05:04:24Z,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,799c717a-401a-e1e7-2b4f-ab040641b2b6,SNOMED-CT,171207006,Depression screening (procedure),600.50,,,2016-02-23
...,...,...,...,...,...,...,...,...,...,...,...
188353,2000-01-17T17:24:45Z,2000-01-17T18:12:42Z,66de5451-b446-367b-821d-265b4cfda04f,f653d6cc-e7d0-1170-f770-49aff5d33dbe,SNOMED-CT,710824005,Assessment of health and social care needs (pr...,463.95,,,2000-01-17
188354,2000-01-17T18:12:42Z,2000-01-17T18:34:51Z,66de5451-b446-367b-821d-265b4cfda04f,f653d6cc-e7d0-1170-f770-49aff5d33dbe,SNOMED-CT,710841007,Assessment of anxiety (procedure),463.95,,,2000-01-17
188355,2000-01-17T18:34:51Z,2000-01-17T19:08:18Z,66de5451-b446-367b-821d-265b4cfda04f,f653d6cc-e7d0-1170-f770-49aff5d33dbe,SNOMED-CT,866148006,Screening for domestic abuse (procedure),463.95,,,2000-01-17
188356,2000-01-17T19:08:18Z,2000-01-17T19:20:13Z,66de5451-b446-367b-821d-265b4cfda04f,f653d6cc-e7d0-1170-f770-49aff5d33dbe,SNOMED-CT,171207006,Depression screening (procedure),463.95,,,2000-01-17
