In [1]:
import pandas as pd

## Read in data

In [2]:
# Load each CSV export from the working directory into its own DataFrame.
# The target names and the file names below are listed in the same order.
(
    allergies,
    careplans,
    conditions,
    encounters,  # TODO: not used in the cells below -- decide whether it is needed
    immunizations,
    medications,
    observations,
    patients,
    procedures,
) = map(
    pd.read_csv,
    (
        'allergies.csv',
        'careplans.csv',
        'conditions.csv',
        'encounters.csv',
        'immunizations.csv',
        'medications.csv',
        'observations.csv',
        'patients.csv',
        'procedures.csv',
    ),
)

## Clean up dataframes: have one row per patient

In [3]:
## ALLERGIES
# One indicator column per allergy description, summed so each patient ends up
# on a single row holding the number of records per description.
allergies_pivot = (
    pd.get_dummies(allergies['DESCRIPTION'])
    .assign(PATIENT=allergies['PATIENT'])
    .groupby('PATIENT')
    .sum()
    .reset_index()
)

In [4]:
## CAREPLANS
# One indicator column per care-plan description, summed so each patient ends
# up on a single row holding the number of records per description.
careplans_pivot = (
    pd.get_dummies(careplans['DESCRIPTION'])
    .assign(PATIENT=careplans['PATIENT'])
    .groupby('PATIENT')
    .sum()
    .reset_index()
)

In [5]:
## CONDITIONS
# One indicator column per condition description, summed so each patient ends
# up on a single row holding the number of records per description.
conditions_pivot = (
    pd.get_dummies(conditions['DESCRIPTION'])
    .assign(PATIENT=conditions['PATIENT'])
    .groupby('PATIENT')
    .sum()
    .reset_index()
)

In [6]:
## IMMUNIZATIONS
# One indicator column per immunization description, summed so each patient
# ends up on a single row holding the number of records per description.
immunizations_pivot = (
    pd.get_dummies(immunizations['DESCRIPTION'])
    .assign(PATIENT=immunizations['PATIENT'])
    .groupby('PATIENT')
    .sum()
    .reset_index()
)

In [7]:
## MEDICATIONS
# One indicator column per medication description, summed so each patient ends
# up on a single row holding the number of records per description.
medications_pivot = (
    pd.get_dummies(medications['DESCRIPTION'])
    .assign(PATIENT=medications['PATIENT'])
    .groupby('PATIENT')
    .sum()
    .reset_index()
)

In [8]:
## OBSERVATIONS
# Readings that are not numeric cannot be averaged; coerce them to NaN so they
# drop out of the aggregation below.
observations['VALUE'] = pd.to_numeric(observations['VALUE'], errors='coerce')

# One row per patient, one column per observation description, holding that
# patient's mean reading. Pivoting straight on PATIENT is what makes this a
# mean: pivoting on the row index and summing per patient afterwards produced
# totals that grow with the number of readings, and built a very wide
# row-per-observation intermediate frame along the way.
# fill_value=0 marks description/patient pairs that have no numeric reading.
observations_pivot = observations.pivot_table(
    index='PATIENT',
    columns='DESCRIPTION',
    values='VALUE',
    aggfunc='mean',
    fill_value=0,
).reset_index()

observations_pivot.to_csv('observations_pivot.csv', index=False)

In [9]:
## PROCEDURES
# One indicator column per procedure description, summed so each patient ends
# up on a single row holding the number of records per description.
procedures_pivot = (
    pd.get_dummies(procedures['DESCRIPTION'])
    .assign(PATIENT=procedures['PATIENT'])
    .groupby('PATIENT')
    .sum()
    .reset_index()
)

### Remove the " MA US" suffix from patient birthplaces (every patient is from Massachusetts)

In [10]:
# Strip the redundant " MA US" text from every birthplace.
# The vectorised string method leaves missing birthplaces as NaN instead of
# raising TypeError on them; regex=False keeps this a literal text replacement.
patients['birthplace'] = patients['birthplace'].str.replace(" MA US", "", regex=False)

In [11]:
# Some towns never occur as a birthplace but do occur in current addresses.
# This list was produced by temporarily having the address-matching function
# print those towns; the function was then reverted to keep its output short.
towns_not_in_bp = ['Sandwich', 'Uxbridge', 'Holland', 'Monson', 'Wendell', 'Wayland', 'Rochester', 'Belchertown', 'Lanesborough', 'Hatfield',
                  'Georgetown', 'Lakeville', 'Princeton', 'Blackstone', 'Hinsdale', 'Harvard', 'Chesterfield', 'Wellfleet', 'Northfield', 'Hubbardston',
                  'Windsor', 'Wales', 'Sandisfield', 'Bolton', 'Truro', 'Southwick', 'Sheffield', 'Scituate', 'Halifax', 'Nahant', 'Stockbridge', 'Berlin']

# Candidate town names: every distinct birthplace first, then the address-only towns.
town_names = patients['birthplace'].unique().tolist()
town_names.extend(towns_not_in_bp)

In [12]:
def extract_town_name(address, town_list):
    """Return the town from ``town_list`` that appears in ``address``.

    Matching is a plain substring test. When several candidates match (for
    example both "Adams" and "North Adams"), the longest name wins so that a
    town is not mistaken for a shorter town contained in its name; candidates
    of equal length keep their ``town_list`` order.

    Parameters
    ----------
    address : str
        Free-text address to search. A non-string value (such as NaN) is
        treated as having no match.
    town_list : iterable
        Candidate town names. Entries that are not strings are skipped.

    Returns
    -------
    str or None
        The matched town, or None when nothing matches. Unmatched addresses
        are printed so the candidate list can be extended.
    """
    if isinstance(address, str):
        matches = [
            town for town in town_list
            if isinstance(town, str) and town in address
        ]
        if matches:
            # max() returns the first of several equally long names.
            return max(matches, key=len)
    print(address)
    return None

# Derive each patient's current town from the free-text address.
# The column is built in one go so values line up with rows by position,
# whatever the frame's index labels are. Writing through .at with an
# enumerate() counter is only correct while the index is the default 0..n-1.
patients['curr_town'] = [
    extract_town_name(address, town_names) for address in patients['address']
]

In [13]:
# Write the patient table, including the cleaned birthplace and the derived
# curr_town column, to patient_clean.csv without the index column.
patients.to_csv('patient_clean.csv', index=False)

## Add suffixes to columns

In [14]:
# Use the same upper-case PATIENT key as the other tables.
# This runs after patient_clean.csv has been written, so that file keeps the
# original lower-case 'patient' header.
patients = patients.rename({'patient': 'PATIENT'}, axis='columns')

def add_suffix(df, suffix):
    """Return a copy of ``df`` whose columns end in ``_<suffix>``.

    The ``PATIENT`` column is left untouched; ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns should be tagged.
    suffix : str
        Text appended, after an underscore, to every column name except
        ``PATIENT``.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns.
    """
    mapping = {
        label: label if label == 'PATIENT' else label + '_' + suffix
        for label in df.columns
    }
    return df.rename(columns=mapping)


# Tag every per-patient table's columns with the name of its source table.
# NOTE(review): the observations result is rebound to observations_pivot
# (there is no observations_clean), so re-running this cell on its own appends
# the suffix to those columns a second time.
(
    allergies_clean,
    careplans_clean,
    conditions_clean,
    immunizations_clean,
    medications_clean,
    observations_pivot,
    procedures_clean,
) = (
    add_suffix(table, source)
    for table, source in (
        (allergies_pivot, 'ALLERGIES'),
        (careplans_pivot, 'CAREPLANS'),
        (conditions_pivot, 'CONDITIONS'),
        (immunizations_pivot, 'IMMUNIZATIONS'),
        (medications_pivot, 'MEDICATIONS'),
        (observations_pivot, 'OBSERVATIONS'),
        (procedures_pivot, 'PROCEDURES'),
    )
)


## Merge datasets

Split the conditions dataframe into one subset per disease group (diabetes, pregnancy, cancer, heart, and lungs), saving each subset to its own CSV file.

In [15]:
## DIABETES
# Patient key plus every diabetes-related condition column.
illness_descriptions = [
    'PATIENT',
    'Diabetes_CONDITIONS',
    'Prediabetes_CONDITIONS',
    'Diabetic retinopathy associated with type II diabetes mellitus (disorder)_CONDITIONS',
    'Nonproliferative diabetic retinopathy due to type 2 diabetes mellitus (disorder)_CONDITIONS',
    'Macular edema and retinopathy due to type 2 diabetes mellitus (disorder)_CONDITIONS',
    'Microalbuminuria due to type 2 diabetes mellitus (disorder)_CONDITIONS',
    'Diabetic renal disease (disorder)_CONDITIONS',
    'Neuropathy due to type 2 diabetes mellitus (disorder)_CONDITIONS',
]

# Selecting by label raises KeyError if any listed column is absent.
subset_conditions = conditions_clean[illness_descriptions]

# Dataset to be used for analysis.
subset_conditions.to_csv('conditions_diabetes.csv', index=False)

In [16]:
## PREGNANCY
# Patient key plus every pregnancy-related condition column.
illness_descriptions = [
    'PATIENT',
    'Miscarriage in first trimester_CONDITIONS',
    'Miscarriage in second trimester_CONDITIONS',
    'Complication occuring during pregnancy_CONDITIONS',
    'Preeclampsia_CONDITIONS',
    'Antepartum eclampsia_CONDITIONS',
    'Tubal pregnancy_CONDITIONS',
    'Congenital uterine anomaly_CONDITIONS',
    'Blighted ovum_CONDITIONS',
]

# Selecting by label raises KeyError if any listed column is absent.
subset_conditions = conditions_clean[illness_descriptions]

# Dataset to be used for analysis.
subset_conditions.to_csv('conditions_pregnancy.csv', index=False)

In [17]:
## CANCER
# Patient key plus every lung- and colon-cancer condition column.
# Each column is listed exactly once: a repeated label would make the label
# selection below emit that column twice in the saved CSV.
illness_descriptions = [
    'PATIENT',
    'Non-small cell lung cancer (disorder)_CONDITIONS',
    'Non-small cell carcinoma of lung  TNM stage 4 (disorder)_CONDITIONS',
    'Primary small cell malignant neoplasm of lung  TNM stage 4 (disorder)_CONDITIONS',
    'Non-small cell carcinoma of lung  TNM stage 2 (disorder)_CONDITIONS',
    'Suspected lung cancer (situation)_CONDITIONS',
    'Malignant tumor of colon_CONDITIONS',
    'Overlapping malignant neoplasm of colon_CONDITIONS',
]

# Selecting by label raises KeyError if any listed column is absent.
subset_conditions = conditions_clean.loc[:, illness_descriptions]

# Dataset to be used for analysis.
subset_conditions.to_csv('conditions_cancer.csv', index=False)

In [18]:
## HEART
# Patient key plus every heart-related condition column.
illness_descriptions = [
    'PATIENT',
    'Coronary Heart Disease_CONDITIONS',
    'History of cardiac arrest (situation)_CONDITIONS',
    'Cardiac Arrest_CONDITIONS',
    'History of myocardial infarction (situation)_CONDITIONS',
    'Myocardial Infarction_CONDITIONS',
]

# Selecting by label raises KeyError if any listed column is absent.
subset_conditions = conditions_clean[illness_descriptions]
subset_conditions.to_csv('conditions_heart.csv', index=False)

In [19]:
## LUNGS
# Patient key plus every respiratory condition column.
illness_descriptions = [
    'PATIENT',
    'Asthma_CONDITIONS',
    'Pulmonary emphysema (disorder)_CONDITIONS',
    'Seasonal allergic rhinitis_CONDITIONS',
    'Acute bronchitis (disorder)_CONDITIONS',
    'Chronic obstructive bronchitis (disorder)_CONDITIONS',
    'Childhood asthma_CONDITIONS',
    'Perennial allergic rhinitis with seasonal variation_CONDITIONS',
    'Perennial allergic rhinitis_CONDITIONS',
    'Acute bacterial sinusitis (disorder)_CONDITIONS',
    'Chronic sinusitis (disorder)_CONDITIONS',
    'Sinusitis (disorder)_CONDITIONS',
]

# Selecting by label raises KeyError if any listed column is absent.
subset_conditions = conditions_clean[illness_descriptions]
subset_conditions.to_csv('conditions_lungs.csv', index=False)