In [51]:
import pandas as pd

## Read in data

In [52]:
# Load each source table from a CSV in the working directory into its own DataFrame.
allergies = pd.read_csv('allergies.csv')
careplans = pd.read_csv('careplans.csv')
conditions = pd.read_csv('conditions.csv')
encounters = pd.read_csv('encounters.csv') # TODO: not referenced by any later cell shown here; decide whether this load is needed
immunizations = pd.read_csv('immunizations.csv')
medications = pd.read_csv('medications.csv')
observations = pd.read_csv('observations.csv')
patients = pd.read_csv('patients.csv')
procedures = pd.read_csv('procedures.csv')

## Clean up dataframes: have one row per patient

In [53]:
## ALLERGIES
# One indicator column per distinct allergy description; summing the indicators
# per patient collapses the table to one row per patient holding occurrence counts.
allergies_pivot = (
    pd.get_dummies(allergies['DESCRIPTION'])
    .assign(PATIENT=allergies['PATIENT'])
    .groupby('PATIENT')
    .sum()
    .reset_index()
)

In [54]:
## CAREPLANS
# Indicator column per care-plan description, then summed per patient so each
# patient ends up with a single row of occurrence counts.
careplans_pivot = (
    pd.get_dummies(careplans['DESCRIPTION'])
    .assign(PATIENT=careplans['PATIENT'])
    .groupby('PATIENT')
    .sum()
    .reset_index()
)

In [55]:
## CONDITIONS
# Indicator column per condition description, then summed per patient: one row
# per patient, each cell counting how often that condition was recorded.
conditions_pivot = (
    pd.get_dummies(conditions['DESCRIPTION'])
    .assign(PATIENT=conditions['PATIENT'])
    .groupby('PATIENT')
    .sum()
    .reset_index()
)

In [56]:
## IMMUNIZATIONS
# Indicator column per immunization description, summed per patient to give
# one row per patient with per-description counts.
immunizations_pivot = (
    pd.get_dummies(immunizations['DESCRIPTION'])
    .assign(PATIENT=immunizations['PATIENT'])
    .groupby('PATIENT')
    .sum()
    .reset_index()
)

In [57]:
## MEDICATIONS
# Indicator column per medication description, summed per patient to give
# one row per patient with per-description counts.
medications_pivot = (
    pd.get_dummies(medications['DESCRIPTION'])
    .assign(PATIENT=medications['PATIENT'])
    .groupby('PATIENT')
    .sum()
    .reset_index()
)

In [58]:
## OBSERVATIONS
# Non-numeric observation values cannot be averaged, so coerce them to NaN first.
observations['VALUE'] = pd.to_numeric(observations['VALUE'], errors='coerce')

# One row per patient, one column per observation description, each cell holding
# that patient's mean numeric value. Grouping on PATIENT directly yields a true
# per-patient mean and avoids materialising a dense one-row-per-observation table.
# Patient/description pairs without any numeric value are filled with 0.
observations_pivot = observations.pivot_table(
    index='PATIENT',
    columns='DESCRIPTION',
    values='VALUE',
    aggfunc='mean',
    fill_value=0,
).reset_index()

observations_pivot.to_csv('observations_pivot.csv', index=False)

In [59]:
## PROCEDURES
# Indicator column per procedure description, summed per patient to give
# one row per patient with per-description counts.
procedures_pivot = (
    pd.get_dummies(procedures['DESCRIPTION'])
    .assign(PATIENT=procedures['PATIENT'])
    .groupby('PATIENT')
    .sum()
    .reset_index()
)

### Remove the trailing "MA US" from patient birthplaces (all patients are from Massachusetts)

In [60]:
# Split each birthplace from the right on single spaces (at most twice) and keep
# the leading part, which drops the last two space-separated tokens.
# NOTE: not idempotent. Re-running this cell on already-cleaned values strips
# further words from multi-word names.
places = patients['birthplace']
cleaned_places = list(map(lambda place: place.rsplit(' ', 2)[0], places))

patients['birthplace'] = cleaned_places



In [61]:
# Distinct cleaned birthplace values; used below as the list of known town names.
town_list = patients['birthplace'].unique()

In [62]:
# Persist the patients table as it stands here; columns added in later cells
# (e.g. curr_town) are not part of this file.
patients.to_csv('patient_clean.csv', index=False)

In [63]:
def extract_town_name(address, town_list):
    """Return the town from ``town_list`` that best matches ``address``.

    A town matches when its name occurs as a substring of the address. When
    several towns match, the one whose occurrence ends furthest to the right
    wins, and among those the longest name wins. This keeps a street name such
    as "Lincoln Street" from beating the town that follows it, and keeps
    "Reading" from beating "North Reading". Remaining ties go to the town that
    appears first in ``town_list``.

    Parameters
    ----------
    address : str
        Free-text address to search. Any non-string value (e.g. NaN) yields None.
    town_list : iterable
        Candidate town names. Non-string and empty entries are ignored.

    Returns
    -------
    str or None
        The best-matching town name, or None when nothing matches.
    """
    if not isinstance(address, str):
        return None

    best_town = None
    best_rank = None
    for town in town_list:
        # Skip NaN / empty candidates: NaN would raise in a substring test and
        # the empty string is a substring of every address.
        if not isinstance(town, str) or not town:
            continue
        start = address.rfind(town)
        if start < 0:
            continue
        rank = (start + len(town), len(town))
        # Strict comparison keeps the earliest list entry on exact ties.
        if best_rank is None or rank > best_rank:
            best_town, best_rank = town, rank
    return best_town

# Look up the current town for every address and assign the whole column at once.
# Assigning a list matches values to rows by position, so this works for any
# index on `patients`, not only the default 0..n-1 one. Addresses with no known
# town get None.
patients['curr_town'] = [
    extract_town_name(address, town_list) for address in patients['address']
]

## Add suffixes to columns

In [64]:
# Use the same upper-case key name as the pivoted tables so the merges below can join on 'PATIENT'.
patients = patients.rename(columns={'patient': 'PATIENT'})

def add_suffix(df, suffix):
    """Return a copy of ``df`` whose columns are tagged with ``_<suffix>``.

    Every column name except the join key ``'PATIENT'`` gets ``'_' + suffix``
    appended; ``'PATIENT'`` is left untouched. The input frame is not modified.
    """
    mapping = {
        name: name if name == 'PATIENT' else name + '_' + suffix
        for name in df.columns
    }
    return df.rename(columns=mapping)


# Tag every pivoted table's columns with its source so names stay unique after merging.
allergies_clean = add_suffix(allergies_pivot, 'ALLERGIES')
careplans_clean = add_suffix(careplans_pivot, 'CAREPLANS')
conditions_clean = add_suffix(conditions_pivot, 'CONDITIONS')
immunizations_clean = add_suffix(immunizations_pivot, 'IMMUNIZATIONS')
medications_clean = add_suffix(medications_pivot, 'MEDICATIONS')
# NOTE: unlike the other tables, this overwrites its own input instead of creating
# a *_clean name, so re-running this cell appends the suffix a second time.
observations_pivot = add_suffix(observations_pivot, 'OBSERVATIONS')
procedures_clean = add_suffix(procedures_pivot, 'PROCEDURES')


## Merge datasets

In [65]:
def merge_datasets(conditions_spec):
    """Left-join all per-patient tables onto a conditions subset.

    Starting from ``conditions_spec``, merges (in order) the notebook-level
    frames ``patients``, ``allergies_clean``, ``careplans_clean``,
    ``procedures_clean``, ``immunizations_clean``, ``medications_clean`` and
    ``observations_pivot`` on the ``'PATIENT'`` column with ``how='left'``, so
    every row of ``conditions_spec`` is kept. Progress markers are printed
    around the last three merges.

    Parameters
    ----------
    conditions_spec : pandas.DataFrame
        Frame containing a ``'PATIENT'`` column plus the condition columns of interest.

    Returns
    -------
    pandas.DataFrame
        ``conditions_spec`` with the columns of all other tables appended.
    """
    merged_df = conditions_spec
    for frame in (patients, allergies_clean, careplans_clean, procedures_clean):
        merged_df = pd.merge(merged_df, frame, on='PATIENT', how='left')

    print("3 merges left to go")

    # Each remaining merge is followed by its progress marker.
    remaining = (
        ("1", immunizations_clean),
        ("2", medications_clean),
        ("3", observations_pivot),
    )
    for marker, frame in remaining:
        merged_df = pd.merge(merged_df, frame, on='PATIENT', how='left')
        print(marker)

    return merged_df

In [66]:
# Preview the first rows of the suffixed conditions table.
conditions_clean.head()

Unnamed: 0,PATIENT,Acute allergic reaction_CONDITIONS,Acute bacterial sinusitis (disorder)_CONDITIONS,Acute bronchitis (disorder)_CONDITIONS,Acute viral pharyngitis (disorder)_CONDITIONS,Alzheimer's disease (disorder)_CONDITIONS,Antepartum eclampsia_CONDITIONS,Appendicitis_CONDITIONS,Asthma_CONDITIONS,Atopic dermatitis_CONDITIONS,...,Sprain of ankle_CONDITIONS,Sprain of wrist_CONDITIONS,Streptococcal sore throat (disorder)_CONDITIONS,Stroke_CONDITIONS,Suspected lung cancer (situation)_CONDITIONS,Tear of meniscus of knee_CONDITIONS,Third degree burn_CONDITIONS,Tubal pregnancy_CONDITIONS,Viral sinusitis (disorder)_CONDITIONS,Whiplash injury to neck_CONDITIONS
0,00269bb7-e3ab-43a9-9cdf-cdf9b6e3b2b3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,00341a88-1cc1-4b39-b0f9-05b0531991a0,0,1,0,0,0,0,0,1,1,...,1,0,1,0,0,0,0,0,1,0
2,004a5922-7c4d-40cc-a0f8-68f607044c99,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,00630ce3-e8eb-4ed4-889b-2c0ac257cbf4,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,007cbcc1-7333-4c79-b5e9-ffa93822fa11,0,0,0,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


Separate the conditions dataframe by disease group (started with diabetes; the cells below repeat the same steps for each group).

In [67]:
## DIABETES
# Condition columns that make up the diabetes group, plus the PATIENT join key.
illness_descriptions = [
    'PATIENT',
    'Diabetes_CONDITIONS',
    'Prediabetes_CONDITIONS',
    'Diabetic retinopathy associated with type II diabetes mellitus (disorder)_CONDITIONS',
    'Nonproliferative diabetic retinopathy due to type 2 diabetes mellitus (disorder)_CONDITIONS',
    'Macular edema and retinopathy due to type 2 diabetes mellitus (disorder)_CONDITIONS',
    'Microalbuminuria due to type 2 diabetes mellitus (disorder)_CONDITIONS',
    'Diabetic renal disease (disorder)_CONDITIONS',
    'Neuropathy due to type 2 diabetes mellitus (disorder)_CONDITIONS',
]

# Keep only those columns and save the condition-only subset.
subset_conditions = conditions_clean[illness_descriptions]
subset_conditions.to_csv('conditions_diabetes.csv', index=False)

# Join the remaining per-patient tables onto the subset and save the result.
diabetes_df = merge_datasets(subset_conditions)
diabetes_df.to_csv('diabetes.csv', index=False)

3 merges left to go
1
2
3


In [68]:
## PREGNANCY
# Condition columns that make up the pregnancy group, plus the PATIENT join key.
illness_descriptions = [
    'PATIENT',
    'Miscarriage in first trimester_CONDITIONS',
    'Miscarriage in second trimester_CONDITIONS',
    'Complication occuring during pregnancy_CONDITIONS',
    'Preeclampsia_CONDITIONS',
    'Antepartum eclampsia_CONDITIONS',
    'Tubal pregnancy_CONDITIONS',
    'Congenital uterine anomaly_CONDITIONS',
    'Blighted ovum_CONDITIONS',
]

# Keep only those columns, join the remaining per-patient tables, and save.
subset_conditions = conditions_clean[illness_descriptions]
pregnancy_df = merge_datasets(subset_conditions)
pregnancy_df.to_csv('pregnancy.csv', index=False)

3 merges left to go
1
2
3


In [69]:
## CANCER
# Condition columns that make up the cancer group, plus the PATIENT join key.
# Each label appears exactly once: selecting a label twice would duplicate that
# column in the subset, in both CSV files and in the merged frame.
illness_descriptions = [
    'PATIENT',
    'Non-small cell lung cancer (disorder)_CONDITIONS',
    'Non-small cell carcinoma of lung  TNM stage 4 (disorder)_CONDITIONS',
    'Primary small cell malignant neoplasm of lung  TNM stage 4 (disorder)_CONDITIONS',
    'Non-small cell carcinoma of lung  TNM stage 2 (disorder)_CONDITIONS',
    'Suspected lung cancer (situation)_CONDITIONS',
    'Malignant tumor of colon_CONDITIONS',
    'Overlapping malignant neoplasm of colon_CONDITIONS',
]

# Keep only those columns and save the condition-only subset.
subset_conditions = conditions_clean.loc[:, illness_descriptions]

subset_conditions.to_csv('conditions_cancer.csv', index=False)

# Join the remaining per-patient tables onto the subset and save the result.
cancer_df = merge_datasets(subset_conditions)
cancer_df.to_csv('cancer.csv', index=False)

3 merges left to go
1
2
3


In [70]:
## HEART
# Condition columns that make up the heart group, plus the PATIENT join key.
illness_descriptions = [
    'PATIENT',
    'Coronary Heart Disease_CONDITIONS',
    'History of cardiac arrest (situation)_CONDITIONS',
    'Cardiac Arrest_CONDITIONS',
    'History of myocardial infarction (situation)_CONDITIONS',
    'Myocardial Infarction_CONDITIONS',
]

# Keep only those columns, join the remaining per-patient tables, and save.
subset_conditions = conditions_clean[illness_descriptions]
heart_df = merge_datasets(subset_conditions)
heart_df.to_csv('heart.csv', index=False)

3 merges left to go
1
2
3


Should the conditions grouped under ETC be split into their own dataframes?

In [71]:
## ETC
# Remaining conditions of interest that do not belong to another group, plus the PATIENT join key.
illness_descriptions = [
    'PATIENT',
    'Hypertension_CONDITIONS',
    'Stroke_CONDITIONS',
    'Child attention deficit disorder_CONDITIONS',
    'Drug overdose_CONDITIONS',
]

# Keep only those columns, join the remaining per-patient tables, and save.
subset_conditions = conditions_clean[illness_descriptions]
etc_df = merge_datasets(subset_conditions)
etc_df.to_csv('etc.csv', index=False)

3 merges left to go
1
2
3


In [72]:
## LUNGS
# Condition columns that make up the lungs group, plus the PATIENT join key.
illness_descriptions = [
    'PATIENT',
    'Asthma_CONDITIONS',
    'Pulmonary emphysema (disorder)_CONDITIONS',
    'Seasonal allergic rhinitis_CONDITIONS',
    'Acute bronchitis (disorder)_CONDITIONS',
    'Chronic obstructive bronchitis (disorder)_CONDITIONS',
    'Childhood asthma_CONDITIONS',
    'Perennial allergic rhinitis with seasonal variation_CONDITIONS',
    'Perennial allergic rhinitis_CONDITIONS',
    'Acute bacterial sinusitis (disorder)_CONDITIONS',
    'Chronic sinusitis (disorder)_CONDITIONS',
    'Sinusitis (disorder)_CONDITIONS',
]

# Keep only those columns, join the remaining per-patient tables, and save.
subset_conditions = conditions_clean[illness_descriptions]
lungs_df = merge_datasets(subset_conditions)
lungs_df.to_csv('lungs.csv', index=False)

3 merges left to go
1
2
3


## Unnecessary notes at bottom

Calculate data usage

In [73]:
# # Get memory usage of each column
# memory_usage_per_column = merged_df.memory_usage(deep=True)

# # Sum up memory usage of all columns
# total_memory_usage = memory_usage_per_column.sum()

# print("Total memory usage of DataFrame: {} bytes".format(total_memory_usage))

Notes from Phil on preventing kernel death
Size of individual dataframes
- delete columns that we are not going to use before we merge

- read in datasets a chunk at a time
    - operate on sequential parts of the data

- if it's still an issue, we could get set up on a cluster to compute there as well