# Main script to merge PDD, EDD and geometry

Modules: N/A <br>
Author: Cornelia Ilin <br>
Email: cilin@ischool.berkeley.edu <br>
Date created: March 28, 2022 <br>

Author: Cornelia Ilin

Email: cilin@wisc.edu

Date created: Oct 14, 2022

### Step 1: Import packages

In [None]:
import pandas as pd
import numpy as np
import os

import warnings
warnings.filterwarnings("ignore")

### Step 2: Define working directories

In [None]:
# Folder scanned by read_health_data() for the final PDD/EDD CSV files.
# The trailing slash matters: file paths are built by plain string concatenation.
in_dir_h = 'C:/Users/cilin/Research/CA_hospitals/Input/final_data/health/'
# Folder the combined analysis CSV is written to (trailing slash required for
# the same reason).
out_dir = 'C:/Users/cilin/Research/CA_hospitals/Input/final_data/all_combined/'

### Step 3: Define functions

``read``

In [None]:
def read_health_data():
    '''Load the final PDD and EDD CSV files from ``in_dir_h``.

    The directory listing of ``in_dir_h`` is scanned and only the files named
    ``PDD_final.csv`` and ``EDD_final.csv`` are read; everything else is
    skipped. A progress line is printed for each file that is read.

    Returns
    -------
    dict
        Maps the file-name prefix before the first underscore (``'PDD'`` /
        ``'EDD'``) to the DataFrame read from that file. A file that is not
        present in the directory simply has no entry.
    '''
    wanted = ['PDD_final.csv', 'EDD_final.csv']
    frames = {}

    for fname in os.listdir(in_dir_h):
        # skip anything that is not one of the two expected inputs
        if fname not in wanted:
            continue

        print('Reading:', fname)
        prefix, _, _ = fname.partition('_')
        frames[prefix] = pd.read_csv(in_dir_h + fname)

    return frames

``preprocess``


In [None]:
def preprocess_data(df):
    '''Parse dates, compute age, drop masked record numbers and add placeholders.

    Steps, in order:
      1. convert ``admtdate`` and ``bthdate`` to datetime;
      2. compute ``age`` (in years) as ``admtdate - bthdate``;
      3. drop rows whose ``rln`` equals ``'---------'``;
      4. add NaN-filled placeholder columns (``pm25I``, ``wfeI``, ``cntyresI``,
         ``sexI``, ``raceI``, ``visitsM_9mpp``, ``visitsM_1ypp``,
         ``visitsI_1yol``);
      5. rename ``bthdate`` -> ``bthdateI``, ``bthyear`` -> ``bthyearI`` and
         ``rln`` -> ``rlnI_updated``.

    No age filter is applied (the ``age <= 18`` restriction is intentionally
    left disabled, as in the original code).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``admtdate``, ``bthdate`` and ``rln``. ``bthyear`` is
        renamed when present. Note that the parsed dates and the ``age``
        column are written onto this object in place.

    Returns
    -------
    pandas.DataFrame
        A new frame (independent copy of the kept rows) with the placeholder
        columns added and the columns renamed.
    '''
    # transform date to datetime
    df['admtdate'] = pd.to_datetime(df['admtdate'])
    df['bthdate'] = pd.to_datetime(df['bthdate'])

    # compute age in years; a fixed mean Gregorian year (365.2425 days) is used
    # because the ambiguous numpy 'Y' timedelta unit is rejected by current
    # pandas, and this is the length that unit stood for
    one_year = pd.Timedelta(days=365.2425)
    df['age'] = (df['admtdate'] - df['bthdate']) / one_year

    # keep only if age <=18 (disabled on purpose)
    #df = df[df.age.le(18)]

    # drop if rln=='---------'; copy so the assignments below act on an
    # independent frame instead of a slice of the input
    df = df[df['rln'].ne('---------')].copy()

    # add placeholder columns, to be filled in by later processing
    placeholder_cols = [
        'pm25I',         # pm25 exposure at birth
        'wfeI',          # wildfire exposure at birth
        'cntyresI',
        'sexI',
        'raceI',
        'visitsM_9mpp',
        'visitsM_1ypp',
        'visitsI_1yol',
    ]
    for col in placeholder_cols:
        df[col] = np.nan

    # rename columns
    df = df.rename(
        columns={'bthdate': 'bthdateI', 'bthyear': 'bthyearI', 'rln': 'rlnI_updated'}
    )

    return df

### Step 4: Read data

In [None]:
# Load the PDD and EDD files; the result is a dict keyed by file-name prefix.
dict_h = read_health_data()
print('Keys in data:', dict_h.keys())

In [None]:
# Show the (rows, columns) shape of each dataset.
# A KeyError here means the corresponding *_final.csv was not found when reading.
for key in ['PDD', 'EDD']:
    print(key)
    display(dict_h[key].shape)
    print('---')

In [None]:
# Stack the PDD and EDD rows into a single frame (row-wise concatenation).
# NOTE(review): ignore_index is not passed, so each frame keeps its own row
# labels and the combined index may contain repeats — confirm this is intended.
df = pd.concat([dict_h['PDD'], dict_h['EDD']], axis=0)
df.head()

### Step 5: Preprocess data

In [None]:
# Run the cleaning step on the combined frame.
df = preprocess_data(df)
# NOTE(review): the message says "pediatric", but preprocess_data applies no
# age filter, so this counts every distinct rlnI_updated value that remains —
# confirm the wording.
print('Number of unique pediatric patients', df.rlnI_updated.nunique())

### Step 6: Export data

In [None]:
# Write the combined, preprocessed data to the output folder.
# NOTE(review): index=False is not passed, so the row index is written as the
# first (unnamed) column of the CSV.
df.to_csv(out_dir + 'analysis_data_pdd_edd.csv')

In [None]:
# Inspect the final column names of the exported frame.
df.columns