# Main script to merge Birth, PDD, EDD and geometry

Modules: N/A <br>
Author: Cornelia Ilin <br>
Email: cilin@ischool.berkeley.edu <br>
Date created: March 28, 2022 <br>

### Step 1: Import packages

In [None]:
import pandas as pd
import numpy as np
import os

### Step 2: Define working directories

In [None]:
# Folder with the final health files (Birth_final.csv, PDD_final.csv, EDD_final.csv);
# the merged BPE_final.csv is written to the same folder.
# The trailing slash matters: file names are appended with plain string concatenation.
in_dir = 'C:/Users/cilin/Research/CA_hospitals/Input/final_data/health/'
# Folder with data_selection.xlsx (sheet 'Diag_codes'), used to pick the diagnosis codes.
in_dir_data_selection = 'C:/Users/cilin/Research/CA_hospitals/Input/raw_data/health/'

### Step 3: Define functions

``read``

In [None]:
def read_data():
    ''' Load the Birth, PDD and EDD final csv files found in ``in_dir``.

    # return: dict keyed by the part of the file name before the first
              underscore ('Birth', 'PDD', 'EDD'), each value being the
              DataFrame read from that file. Files that are not present
              in the folder simply have no key.
    '''
    wanted = ('Birth_final.csv', 'PDD_final.csv', 'EDD_final.csv')
    frames = {}

    for fname in os.listdir(in_dir):
        # skip anything in the folder that is not one of the three inputs
        if fname not in wanted:
            continue

        print('Reading:', fname)
        prefix = fname.split('_')[0]
        frames[prefix] = pd.read_csv(in_dir + fname)

    return frames

``preprocess``

In [None]:
def preprocess_data(df_dict):
    ''' Filter the Birth table and build the combined patient table 'PE'.

    # param df_dict: dict with DataFrames under the keys 'Birth', 'PDD', 'EDD'
    # return: the same dict (modified in place) where
              - 'Birth' keeps only rows whose _input is 'B' or 'I'
              - 'PE' is PDD stacked on top of EDD, restricted to rows whose
                rln appears in rlnI_updated of the filtered Birth table
              - every table has a fresh 0..n-1 index
    '''
    # Birth: keep only the 'B' and 'I' records
    birth = df_dict['Birth']
    df_dict['Birth'] = birth[birth._input.isin(('B', 'I'))]

    # PDD + EDD: stack row-wise, then keep only rln values known to Birth
    stacked = pd.concat([df_dict['PDD'], df_dict['EDD']], axis=0)
    birth_rlns = df_dict["Birth"].rlnI_updated.unique()
    df_dict['PE'] = stacked[stacked.rln.isin(birth_rlns)]

    # renumber the rows of every table in the dictionary
    for frame in df_dict.values():
        frame.reset_index(drop=True, inplace=True)

    return df_dict

``merge``

In [None]:
def outer_merge(df_dict):
    ''' Outer-merge the Birth and PE tables and store the result under 'BPE'.

    Steps:
      1. outer merge Birth (rlnI_updated, admdateI) with PE (rln, admtdate),
         keeping the '_merge' indicator column;
      2. for rows that exist only in Birth, fill empty PDD/EDD columns from
         the matching Birth columns and set data_source to 'Birth';
      3. fill empty rlnI_updated from rln;
      4. propagate birth-record values within each rlnI_updated group
         (LHS_merge_helper);
      5. keep only rows where bthdateI equals bthdate.

    # param df_dict: dict with DataFrames under the keys 'Birth' and 'PE'
    # return: the same dict with the additional key 'BPE'
    '''
    # local import: only needed to time the birth fill-in step below
    import time

    ## merge Birth and PE ##
    ########################
    bpe = df_dict['Birth'].merge(
        df_dict['PE'],
        left_on=['rlnI_updated', 'admdateI'],
        right_on=['rln', 'admtdate'],
        how='outer',
        indicator=True
    )
    df_dict['BPE'] = bpe

    ## fill-in RHS information ##
    #############################
    # (missing data in PDD/EDD data)
    # explicit PDD/EDD column -> Birth column pairing, so the two sides
    # cannot drift out of step the way two parallel lists can
    rhs_to_lhs = {
        'rln': 'rlnI_updated',
        'patzip': 'zipI',
        'patcnty': 'cntyresI',
        'patcnty_name': 'cntyresI_name',
        'bthdate': 'bthdateI',
        'bthyear': 'bthyearI',
        'bthmonth': 'bthmonthI',
        'bthday': 'bthdayI',
        'admtdate': 'admdateI',
        'admtyear': 'admyearI',
        'admtmonth': 'admmonthI',
        'admtday': 'admdayI',
        'charge': '_chargesI',
        'diag00': 'diagI00',
        'diag01': 'diagI01',
        'diag02': 'diagI02',
        'diag03': 'diagI03',
        'diag04': 'diagI04',
        'proc00': 'procI00',
        'proc01': 'procI01',
        'proc02': 'procI02',
        'proc03': 'procI03',
        'proc04': 'procI04',
    }

    # rows that came from Birth only; the same mask is reused for every column
    birth_only = bpe['_merge'].eq('left_only')

    for rhs_col, lhs_col in rhs_to_lhs.items():
        to_fill = birth_only & bpe[rhs_col].isna()
        bpe[rhs_col] = np.where(to_fill, bpe[lhs_col], bpe[rhs_col])

    # data_source has no Birth counterpart: label those rows 'Birth'
    to_fill = birth_only & bpe['data_source'].isna()
    bpe['data_source'] = np.where(to_fill, 'Birth', bpe['data_source'])

    ## fill-in LHS information ##
    #############################
    # (missing data in Birth data)
    bpe['rlnI_updated'] = np.where(
        bpe.rlnI_updated.isna(),
        bpe.rln, bpe.rlnI_updated
    )

    ## fill-in information at Birth ##
    print('Filling in birth information...')
    # plain-Python timing instead of the IPython-only %time magic, so the
    # function also runs outside a notebook
    start = time.perf_counter()
    bpe = LHS_merge_helper(bpe)
    print('Wall time: %.2f s' % (time.perf_counter() - start))

    # drop if bthdate in patient records different than bthdateI
    # (rows where either date is missing do not compare equal and are dropped)
    df_dict['BPE'] = bpe[bpe.bthdateI.eq(bpe.bthdate)]

    return df_dict

In [None]:
def LHS_merge_helper(df):
    ''' Fill in the birth-related (LHS) columns **with information at Birth**.

    The rows are grouped by rlnI_updated. Within a group, the values of the
    first row with _input == 'B' are copied to every row of the group.
    Groups without such a row are returned untouched.

    # param df: DataFrame with rlnI_updated, _input and the birth columns
    # return: result of the groupby/apply over rlnI_updated
    '''
    # columns copied from the birth record to the whole group
    birth_columns = (
        '_brthid', '_brthIDHST', 'rlnI_status',
        'bthdateM', 'bthyearM', 'bthmonthM', 'bthdayM',
        'bthdateI', 'bthyearI', 'bthmonthI', 'bthdayI',
        'bthI_month_year',
        'fbthdate', 'fbthyear', 'fbthmonth', 'fbthday',
        'raceM', 'frace', 'meduc', 'feduc', 'sexI', 'raceI',
        'matresst', 'matresst_name', 'bthresmb', 'bthresmb_name',
        'bthresmb06', 'bthresmb06_name',
        'zipM', 'zipI', 'ZCTA10I_month_year',
        'cntyresM', 'cntyresM_name', 'cntyresI', 'cntyresI_name',
        'hplzipM', 'hplzipI', 'hplcntyM', 'hplcntyM_name',
        'hplcntyI', 'hplcntyI_name',
        'visitsM_9mpp', 'visitsM_1ypp', 'visitsI_1yol',
        'precare', 'precare_name', 'prevlbd', 'prevlbd_name',
        'prevlbl', 'prevlbl_name',
        'cebl', 'cebl_name', 'ceb', 'ceb_name',
        'prevsts', 'prevsts_name',
        'gest_days', 'gest_days_name', 'bthwghtI',
        'caesar', 'caesar_name',
        'term_a20wks', 'term_a20wks_name',
        'term_b20wks', 'term_b20wks_name',
    )
    # optional geometry columns; skipped when the data does not carry them
    centroid_columns = ('ZCTA10I_centroid', 'ZCTA10M_centroid')
    zcta_columns = ('ZCTA10I', 'ZCTA10M')

    def fill_group(grp):
        ''' Copy the birth-record values onto every row of one group.
        '''
        birth_rows = grp[grp._input.eq('B')].reset_index(drop=True)

        try:
            for name in birth_columns:
                grp[name] = birth_rows[name].unique()[0]
        except IndexError:
            # no 'B' row in this group: nothing to copy
            return grp

        try:
            # centroids are read from the first birth row by label 0
            for name in centroid_columns:
                grp[name] = birth_rows[name][0]

            for name in zcta_columns:
                grp[name] = birth_rows[name].unique()[0]
        except KeyError:
            # geometry column missing: keep whatever was filled so far
            pass

        return grp

    # groupby rlnI_updated to add birth values
    return df.groupby('rlnI_updated').apply(fill_group)

``add variables``

In [None]:
def add_outcome(df):
    """ A function that adds the column "outcome" to df.

    The first matching rule wins:
      - 'B'      if _input is 'B'
      - 1        if diag00 or diag01 starts with one of the codes returned
                 by add_outcome_helper()
      - 0        if diag00 starts with "8", "9", "S" or "T"
                 (presumably the ICD-9/ICD-10 injury and poisoning chapters
                 - verify against the code selection)
      - 'others' otherwise

    NOTE(review): np.where combines string and integer branches into one
    array, so 1 and 0 are expected to be stored as the strings '1' and '0'
    - confirm before comparing against integers downstream.

    # param df: DataFrame with the columns _input, diag00 and diag01
    # return: the same DataFrame with the extra column "outcome"
    """
    outcome_is_1 = add_outcome_helper()

    is_birth = df._input.eq('B')

    # na=False makes a missing diagnosis count as "no match". Without it
    # str.startswith yields NaN for missing values, which (a) turns
    # `match | NaN` into False, so a matching diag00 with an empty diag01 was
    # not flagged, and (b) is truthy inside np.where, so an empty diag00 was
    # labelled 0.
    is_selected = (
        df.diag00.str.startswith(outcome_is_1, na=False)
        | df.diag01.str.startswith(outcome_is_1, na=False)
    )
    is_injury = df.diag00.str.startswith(("8", "9", "S", "T"), na=False)

    df["outcome"] = np.where(
        is_birth, 'B',
        np.where(is_selected, 1, np.where(is_injury, 0, 'others'))
    )

    return df

In [None]:
def add_outcome_helper():
    """ A function that creates the ICD9/10 diagnosis code prefixes for which outcome variable == 1
        (i.e. diagnosis is related to respiratory or circulatory health condition)

    The codes come from sheet 'Diag_codes' of data_selection.xlsx; only rows
    with "Use to define outcome variable [behrt]" == 1 are used. Every code
    range in the sheet is expanded to its individual three-character codes.

    # return: a tuple of diagnosis codes (ICD-9 codes first, then ICD-10),
              suitable for str.startswith
    """
    # read icd codes selection
    icd = pd.read_excel(
        in_dir_data_selection + 'data_selection.xlsx',
        'Diag_codes', skiprows=2, header=0
    )

    # keep only if it's to be used in the analysis
    icd = icd[icd["Use to define outcome variable [behrt]"] == 1]

    ## Step1: extract ICD-9-CM codes for which outcome == 1:
    # cells are read as 'NNN-NNN' (characters 0-2 and 4-6); -1 marks a row
    # without an ICD-9 range
    icd9_values = []
    for val in icd["ICD-9-CM"].values.tolist():
        if val == -1:
            continue
        first, last = int(val[0:3]), int(val[4:7])
        # zero-pad to three digits so that codes below 100 do not collapse
        # into one- or two-character prefixes that match unrelated codes
        icd9_values.extend(str(code).zfill(3) for code in range(first, last + 1))

    ## Step2: extract ICD-10-CM codes for which outcome == 1:
    # a cell holds one range 'Xnn-Xnn' or several separated by commas
    # (e.g. "J00-J06, J20-J22"); every range is expanded
    icd10_values = []
    for val in icd["ICD-10-CM"].values.tolist():
        for part in val.split(','):
            part = part.strip()
            if not part:
                continue
            letter = part[0]
            first, last = int(part[1:3]), int(part[5:7])
            # add a 0 to numbers that have only one character
            icd10_values.extend(
                letter + str(code).zfill(2) for code in range(first, last + 1)
            )

    ## Step3: Combine Step1 and Step2
    return tuple(icd9_values) + tuple(icd10_values)

---
### Step 4: Read Birth, PDD, EDD data

In [None]:
# Load the input tables; read_data() returns a dict of DataFrames keyed by file-name prefix.
df_dict = read_data()
print('Dictionary keys:', df_dict.keys())

---
### Step 5: Preprocess Birth, PDD, EDD data

In [None]:
# Filter Birth to the _input 'B'/'I' records and add the combined 'PE' (PDD + EDD) table.
df_dict = preprocess_data(df_dict)
print('Dictionary keys:', df_dict.keys())

---
### Step 6: Merge Birth, PDD, EDD data

**Important:** Before running this merge, think of a better way to populate the birth information (the LHS columns), and make sure to add the ZCTA codes and centroids for both infant and mother.

In [None]:
# Add the outer-merged Birth/PE table under the key 'BPE' and report its size.
df_dict = outer_merge(df_dict)
print('Dictionary keys:', df_dict.keys())
print('Shape of merged BPE data:', df_dict['BPE'].shape)

---
### Step 7: Add variables

``outcome``

In [None]:
# Append the "outcome" column to the merged table.
df_dict['BPE'] = add_outcome(df_dict['BPE'])

### Step 8: Export to .csv

In [None]:
# Write the merged table next to the input files.
# NOTE(review): to_csv is called without index=False, so the DataFrame index is
# written as extra leading column(s) - confirm this is intended.
df_dict['BPE'].to_csv(in_dir + 'BPE_final.csv')

---
tests

In [None]:
# Spot check: show the ZCTA columns for the rows of a single rlnI_updated value.
df_dict['BPE'][df_dict['BPE'].rlnI_updated.eq('NC3UWUCMU')][['ZCTA10I', 'ZCTA10M', 'ZCTA10I_centroid']]

``add list of admission dates for each birth ID``

In [None]:
# subset data based on _input=['B', 'I']
# NOTE(review): mini_df is not defined anywhere in the visible notebook - this
# cell presumably depends on a variable from an earlier interactive session; confirm.
mini_mini_df = mini_df[mini_df._input.isin(('B', 'I'))]
mini_mini_df.reset_index(
    drop=True,
    inplace=True
)

# create list/tuple of admtdateI
def helper(grp):
    ''' Store, on every row of the group, the array of distinct admdateI
        values found in that group (new column admdatesI_brthdata).
    '''
    dates = grp.admdateI.unique()
    # repeat the same array once per row so the column has one entry per record
    grp['admdatesI_brthdata'] = [dates] * len(grp)
    return grp

# one group per birth ID
mini_mini_df = mini_mini_df.groupby('_brthid', as_index=False).apply(helper)

In [None]:
import pandas as pd
import numpy as np
import os

# Same folders as in Step 2, repeated so this cell can run on its own.
in_dir = 'C:/Users/cilin/Research/CA_hospitals/Input/final_data/health/'
in_dir_data_selection = 'C:/Users/cilin/Research/CA_hospitals/Input/raw_data/health/'

# Re-load the merged table exported in Step 8.
df = pd.read_csv(in_dir + 'BPE_final.csv')