In [2]:
import pandas as pd

## Read in data

In [3]:
allergies = pd.read_csv('allergies.csv')
careplans = pd.read_csv('careplans.csv')
conditions = pd.read_csv('conditions.csv')
encounters = pd.read_csv('encounters.csv') ##NOT USING RN, DO WE NEED?
immunizations = pd.read_csv('immunizations.csv')
medications = pd.read_csv('medications.csv')
observations = pd.read_csv('observations.csv')
patients = pd.read_csv('patients.csv')
procedures = pd.read_csv('procedures.csv')

## Clean up dataframes: have one row per patient

In [4]:
## ALLERGIES
allergies_pivot = pd.get_dummies(allergies['DESCRIPTION'])
allergies_pivot['PATIENT'] = allergies['PATIENT']
allergies_pivot = allergies_pivot.groupby('PATIENT').sum().reset_index()

In [5]:
## CAREPLANS
careplans_pivot = pd.get_dummies(careplans['DESCRIPTION'])
careplans_pivot['PATIENT'] = careplans['PATIENT']
careplans_pivot = careplans_pivot.groupby('PATIENT').sum().reset_index()

In [6]:
## CONDITIONS
conditions_pivot = pd.get_dummies(conditions['DESCRIPTION'])
conditions_pivot['PATIENT'] = conditions['PATIENT']
conditions_pivot = conditions_pivot.groupby('PATIENT').sum().reset_index()

In [7]:
## IMMUNIZATIONS
immunizations_pivot = pd.get_dummies(immunizations['DESCRIPTION'])
immunizations_pivot['PATIENT'] = immunizations['PATIENT']
immunizations_pivot = immunizations_pivot.groupby('PATIENT').sum().reset_index()

In [8]:
## MEDICATIONS
medications_pivot = pd.get_dummies(medications['DESCRIPTION'])
medications_pivot['PATIENT'] = medications['PATIENT']
medications_pivot = medications_pivot.groupby('PATIENT').sum().reset_index()

In [9]:
## OBSERVATIONS
observations['VALUE'] = pd.to_numeric(observations['VALUE'], errors='coerce')

# Pivot table with mean aggregation
observations_pivot= observations.pivot_table(index=observations.index, columns='DESCRIPTION', values='VALUE', fill_value=0, aggfunc='mean')
observations_pivot['PATIENT'] = observations['PATIENT']
observations_pivot = observations_pivot.groupby('PATIENT').sum().reset_index()

In [10]:
## PROCEDURES
procedures_pivot = pd.get_dummies(procedures['DESCRIPTION'])
procedures_pivot['PATIENT'] = procedures['PATIENT']
procedures_pivot = procedures_pivot.groupby('PATIENT').sum().reset_index()

## Add suffixes to columns

In [11]:
patients = patients.rename(columns={'patient': 'PATIENT'})

def add_suffix(df, suffix):
    renamed_columns = {}
    for col_name in df.columns:
        if col_name != 'PATIENT':
            renamed_columns[col_name] = col_name + '_' + suffix
        else:
            renamed_columns[col_name] = col_name
    return df.rename(columns=renamed_columns)


allergies_clean = add_suffix(allergies_pivot, 'ALLERGIES')
careplans_clean = add_suffix(careplans_pivot, 'CAREPLANS')
conditions_clean = add_suffix(conditions_pivot, 'CONDITIONS')
immunizations_clean = add_suffix(immunizations_pivot, 'IMMUNIZATIONS')
medications_clean = add_suffix(medications_pivot, 'MEDICATIONS')
observations_pivot = add_suffix(observations_pivot, 'OBSERVATIONS')
procedures_clean = add_suffix(procedures_pivot, 'PROCEDURES')


Split up encounters to add stuff

In [12]:
# total_rows = len(encounters_clean)
# rows_per_section = total_rows // 8

# section1 = encounters_clean.iloc[:rows_per_section]
# section2 = encounters_clean.iloc[rows_per_section:2*rows_per_section]
# section3 = encounters_clean.iloc[2*rows_per_section:3*rows_per_section]
# section4 = encounters_clean.iloc[3*rows_per_section:4*rows_per_section]
# section5 = encounters_clean.iloc[4*rows_per_section:5*rows_per_section]
# section6 = encounters_clean.iloc[5*rows_per_section:6*rows_per_section]
# section7 = encounters_clean.iloc[6*rows_per_section:7*rows_per_section]
# section8 = encounters_clean.iloc[7*rows_per_section:]

# section1 = add_suffix(section1, '1')
# section2 = add_suffix(section2, '2')
# section3 = add_suffix(section3, '3')
# section4 = add_suffix(section4, '4')
# section5 = add_suffix(section5, '5')
# section6 = add_suffix(section6, '6')
# section7 = add_suffix(section7, '7')
# section8 = add_suffix(section8, '8')

## Merge datasets

In [13]:
def merge_datasets(conditions_spec):
    
    merged_df = pd.merge(conditions_spec, patients, on='PATIENT', how='left')
    
    # # Check if there are any matched patient IDs
    # if not merged_df.empty:
    #     print("Matched patient IDs found.")
    #     # Optionally, print or inspect the matched patient IDs
    #     print("Matched patient IDs:", merged_df['PATIENT'].unique())
    # else:
    #     print("No matched patient IDs found.")
    
    merged_df = pd.merge(merged_df, allergies_clean, on='PATIENT', how='left')
    merged_df = pd.merge(merged_df, careplans_clean, on='PATIENT', how='left')
    merged_df = pd.merge(merged_df, procedures_clean, on='PATIENT', how='left')
    
    # # Check if there are any matched patient IDs
    # if not merged_df.empty:
    #     print("2 Matched patient IDs found.")
    #     # Optionally, print or inspect the matched patient IDs
    #     print("2 Matched patient IDs:", merged_df['PATIENT'].unique())
    # else:
    #     print("2 No matched patient IDs found.")
    
    # print("onto encounter merges")
    
    # merged_df = pd.merge(merged_df, section1, on='PATIENT', how='left')
    # merged_df = pd.merge(merged_df, section2, on='PATIENT', how='left')
    # merged_df = pd.merge(merged_df, section3, on='PATIENT', how='left')
    # merged_df = pd.merge(merged_df, section4, on='PATIENT', how='left')
    # merged_df = pd.merge(merged_df, section5, on='PATIENT', how='left')
    # merged_df = pd.merge(merged_df, section6, on='PATIENT', how='left')
    # merged_df = pd.merge(merged_df, section7, on='PATIENT', how='left')
    
    print("3 merges left to go")
    
    merged_df = pd.merge(merged_df, immunizations_clean, on='PATIENT', how='left')
    
    print("1")
    
    merged_df = pd.merge(merged_df, medications_clean, on='PATIENT', how='left')
    print("2")

    merged_df = pd.merge(merged_df, observations_pivot, on='PATIENT', how='left')
    print("3")
    
    return merged_df

Separate conditions dataframe based on disease (now just with diabetes but ultimately need to do with each disease)

In [14]:
## DIABETES
illness_descriptions = ['Diabetes','Prediabetes','Diabetic retinopathy associated with type II diabetes mellitus (disorder)', 
                        'Nonproliferative diabetic retinopathy due to type 2 diabetes mellitus (disorder)', 'Macular edema and retinopathy due to type 2 diabetes mellitus (disorder)', 
                        'Microalbuminuria due to type 2 diabetes mellitus (disorder)', 'Diabetic renal disease (disorder)', 'Neuropathy due to type 2 diabetes mellitus (disorder)']

conditions_spec = pd.DataFrame()

# Loop through each illness description and filter the DataFrame
for description in illness_descriptions:
    full_description = description + "_CONDITIONS"
    
    conditions_spec = pd.concat([conditions_spec, conditions_clean[conditions_clean[full_description] == 1]])

# Drop duplicate rows if needed
conditions_spec = conditions_spec.drop_duplicates()

diabetes_df = merge_datasets(conditions_spec)
diabetes_df.to_csv('diabetes.csv', index=False)

3 merges left to go
1
2
3


In [15]:
## PREGNANCY
illness_descriptions = ['Miscarriage in first trimester','Prediabetes','Miscarriage in second trimester',
                        'Complication occuring during pregnancy','Preeclampsia', 'Antepartum eclampsia',
                        'Tubal pregnancy', 'Congenital uterine anomaly', 'Blighted ovum']

conditions_spec = pd.DataFrame()

# Loop through each illness description and filter the DataFrame
for description in illness_descriptions:
    full_description = description + "_CONDITIONS"
    
    conditions_spec = pd.concat([conditions_spec, conditions_clean[conditions_clean[full_description] == 1]])

# Drop duplicate rows if needed
conditions_spec = conditions_spec.drop_duplicates()

pregnancy_df = merge_datasets(conditions_spec)
pregnancy_df.to_csv('pregnancy.csv', index=False)

3 merges left to go
1
2
3


In [16]:
## CANCER
illness_descriptions = ['Non-small cell lung cancer (disorder)', 'Non-small cell carcinoma of lung  TNM stage 4 (disorder)',
                        'Primary small cell malignant neoplasm of lung  TNM stage 4 (disorder)','Non-small cell carcinoma of lung  TNM stage 2 (disorder)',
                        'Non-small cell lung cancer (disorder)', 'Suspected lung cancer (situation)', 'Malignant tumor of colon',
                        'Overlapping malignant neoplasm of colon']

conditions_spec = pd.DataFrame()

# Loop through each illness description and filter the DataFrame
for description in illness_descriptions:
    full_description = description + "_CONDITIONS"
    
    conditions_spec = pd.concat([conditions_spec, conditions_clean[conditions_clean[full_description] == 1]])

# Drop duplicate rows if needed
conditions_spec = conditions_spec.drop_duplicates()

cancer_df = merge_datasets(conditions_spec)
cancer_df.to_csv('cancer.csv', index=False)

3 merges left to go
1
2
3


In [17]:
## HEART
illness_descriptions = ['Coronary Heart Disease', 'History of cardiac arrest (situation)', 'Cardiac Arrest',
                        'History of myocardial infarction (situation)', 'Myocardial Infarction']

conditions_spec = pd.DataFrame()

# Loop through each illness description and filter the DataFrame
for description in illness_descriptions:
    full_description = description + "_CONDITIONS"
    
    conditions_spec = pd.concat([conditions_spec, conditions_clean[conditions_clean[full_description] == 1]])

# Drop duplicate rows if needed
conditions_spec = conditions_spec.drop_duplicates()

heart_df = merge_datasets(conditions_spec)
heart_df.to_csv('heart.csv', index=False)

3 merges left to go
1
2
3


Should we separate these in ETC to unique dataframes?

In [18]:
## ETC
illness_descriptions = ['Hypertension', 'Stroke', 'Child attention deficit disorder', 'Drug overdose']

conditions_spec = pd.DataFrame()

# Loop through each illness description and filter the DataFrame
for description in illness_descriptions:
    full_description = description + "_CONDITIONS"
    
    conditions_spec = pd.concat([conditions_spec, conditions_clean[conditions_clean[full_description] == 1]])

# Drop duplicate rows if needed
conditions_spec = conditions_spec.drop_duplicates()

etc_df = merge_datasets(conditions_spec)

etc_df.to_csv('etc.csv', index=False)

3 merges left to go
1
2
3


In [26]:
## LUNGS
illness_descriptions = ['Asthma', 'Pulmonary emphysema (disorder)', 'Seasonal allergic rhinitis', 
                        'Acute bronchitis (disorder)', 'Chronic obstructive bronchitis (disorder)',
                        'Childhood asthma', 'Perennial allergic rhinitis with seasonal variation',
                        'Perennial allergic rhinitis', 'Acute bacterial sinusitis (disorder)', 'Chronic sinusitis (disorder)',
                        'Sinusitis (disorder)']

conditions_spec = pd.DataFrame()

# Loop through each illness description and filter the DataFrame
for description in illness_descriptions:
    full_description = description + "_CONDITIONS"
    conditions_spec = pd.concat([conditions_spec, conditions_clean[conditions_clean[full_description] == 1]])

# Drop duplicate rows if needed
conditions_spec = conditions_spec.drop_duplicates()

lungs_df = merge_datasets(conditions_spec)

lungs_df.to_csv('lungs.csv', index=False)

Asthma
RangeIndex(start=0, stop=0, step=1)
Index(['PATIENT', 'Acute allergic reaction_CONDITIONS',
       'Acute bacterial sinusitis (disorder)_CONDITIONS',
       'Acute bronchitis (disorder)_CONDITIONS',
       'Acute viral pharyngitis (disorder)_CONDITIONS',
       'Alzheimer's disease (disorder)_CONDITIONS',
       'Antepartum eclampsia_CONDITIONS', 'Appendicitis_CONDITIONS',
       'Asthma_CONDITIONS', 'Atopic dermatitis_CONDITIONS',
       ...
       'Sprain of ankle_CONDITIONS', 'Sprain of wrist_CONDITIONS',
       'Streptococcal sore throat (disorder)_CONDITIONS', 'Stroke_CONDITIONS',
       'Suspected lung cancer (situation)_CONDITIONS',
       'Tear of meniscus of knee_CONDITIONS', 'Third degree burn_CONDITIONS',
       'Tubal pregnancy_CONDITIONS', 'Viral sinusitis (disorder)_CONDITIONS',
       'Whiplash injury to neck_CONDITIONS'],
      dtype='object', length=119)
Pulmonary emphysema (disorder)
Index(['PATIENT', 'Acute allergic reaction_CONDITIONS',
       'Acute bacteria

In [22]:
lungs_df.head()

Unnamed: 0,PATIENT,Acute allergic reaction_CONDITIONS,Acute bacterial sinusitis (disorder)_CONDITIONS,Acute bronchitis (disorder)_CONDITIONS,Acute viral pharyngitis (disorder)_CONDITIONS,Alzheimer's disease (disorder)_CONDITIONS,Antepartum eclampsia_CONDITIONS,Appendicitis_CONDITIONS,Asthma_CONDITIONS,Atopic dermatitis_CONDITIONS,...,Sodium_OBSERVATIONS,Soybean IgE Ab in Serum_OBSERVATIONS,Systolic Blood Pressure_OBSERVATIONS,Total Cholesterol_OBSERVATIONS,Total score [MMSE]_OBSERVATIONS,Triglycerides_OBSERVATIONS,Urea Nitrogen_OBSERVATIONS,Walnut IgE Ab in Serum_OBSERVATIONS,Wheat IgE Ab in Serum_OBSERVATIONS,White oak IgE Ab in Serum_OBSERVATIONS
0,00341a88-1cc1-4b39-b0f9-05b0531991a0,0,1,0,0,0,0,0,1,1,...,0.0,0.0,466.0,574.0,0.0,430.0,0.0,0.0,0.0,0.0
1,0318fe27-553c-468b-99fd-3272b0407f33,0,0,0,0,0,1,0,1,0,...,0.0,0.0,643.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0419b689-ddaa-45aa-abdc-64e321cb5c03,0,0,0,0,0,0,0,1,0,...,1257.0,0.0,1479.0,552.0,0.0,403.0,127.0,0.0,0.0,0.0
3,0a356517-c3d5-4b8e-8429-1898e22e2291,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0b763e21-2a8f-4f04-86d2-f7e8f9cebb21,0,0,0,1,0,0,0,1,0,...,1394.0,0.0,1772.0,354.0,0.0,261.0,127.0,0.0,0.0,0.0


## Unnecessary notes at bottom

Calculate data usage

In [20]:
# # Get memory usage of each column
# memory_usage_per_column = merged_df.memory_usage(deep=True)

# # Sum up memory usage of all columns
# total_memory_usage = memory_usage_per_column.sum()

# print("Total memory usage of DataFrame: {} bytes".format(total_memory_usage))

NOTES FROM PHIL TO STOP KERNEL DEATH
Size of indiv. dataframes
- delete columns that we are not going to use before we merge

- read in datasets a chunk at a time
    - operate on sequential parts of the data

- if its still an issue, could get set up on cluster to compute there as well