# Main script to clean Birth data

Modules: N/A <br>
Author: Cornelia Ilin <br>
Email: cilin@ischool.berkeley.edu <br>
Date created: March 28, 2022 <br>

### Step 1: Import packages

In [None]:
import pandas as pd
import numpy as np
import os

### Step 2: Define working directories

In [None]:
in_dir = 'C:/Users/cilin/Research/CA_hospitals/Input/interm_data/health/'
in_dir_data_selection = 'C:/Users/cilin/Research/CA_hospitals/Input/raw_data/health/'
out_dir = 'C:/Users/cilin/Research/CA_hospitals/Input/final_data/health/'

### Step 3: Define functions

``read data``

In [None]:
def read_data():
    ''' Load the birth file (Birth.csv) located in in_dir

    return:
    -------
    DataFrame with the contents of Birth.csv, parsed with the pandas defaults
    '''
    birth_file = in_dir + 'Birth.csv'
    return pd.read_csv(birth_file)

---
preprocessing - add, recode, substitute

---

``dates``

In [None]:
def recode_dates(df):
    ''' Recode birth and admission dates (transform from SAS format to Pandas)

    Every listed column is read as a number of days and added to 1960-01-01,
    which turns it into a pandas timestamp column. The columns are
    overwritten in df, which is also returned.
    '''
    day_zero = pd.Timestamp('1960-1-1')
    date_cols = (
        'bthdate', 'bthdateI',
        'mbthdate', 'bthdateM',
        'fbthdate',
        'admdateI', 'admdateM',
    )

    for date_col in date_cols:
        elapsed = pd.to_timedelta(df[date_col], unit='D')
        df[date_col] = elapsed + day_zero

    return df

In [None]:
def sub_missing_dates(df):
    ''' Fill missing hospital birth dates with the vital stats birth dates

    bthdateI (infant, hospital data) is filled from bthdate (vital stats) and
    bthdateM (mother, hospital data) is filled from mbthdate (vital stats).
    The two vital stats columns are dropped afterwards.
    '''
    # hospital column -> vital stats column used as fallback
    fallback_for = {'bthdateI': 'bthdate', 'bthdateM': 'mbthdate'}

    for hosp_col, vs_col in fallback_for.items():
        hosp_dates = df[hosp_col]
        df[hosp_col] = np.where(hosp_dates.isna(), df[vs_col], hosp_dates)

    # the vital stats columns are no longer needed
    df.drop(columns=list(fallback_for.values()), inplace=True)

    return df

In [None]:
def add_dates(df):
    ''' Add year, month and day columns for the birth dates (infant, mother,
    father) and the admission dates (infant, mother)

    For every source date column three new columns are created, e.g.
    bthdateI -> bthyearI, bthmonthI, bthdayI.
    '''
    # source date column -> name pattern of the derived columns
    patterns = (
        ('bthdateI', 'bth{}I'),
        ('bthdateM', 'bth{}M'),
        ('fbthdate', 'fbth{}'),
        ('admdateI', 'adm{}I'),
        ('admdateM', 'adm{}M'),
    )

    for date_col, pattern in patterns:
        stamps = pd.DatetimeIndex(df[date_col])
        for part in ('year', 'month', 'day'):
            df[pattern.format(part)] = getattr(stamps, part)

    return df

``sex``

In [None]:
def sub_missing_sex(df):
    ''' Fill unusable infant sex codes in hospital data (sexI) with the vital
    stats code (nbsex)

    Both columns are compared as strings. sexI values 'nan', '0.0', '3.0' and
    '4.0' are replaced by nbsex; a resulting '9.0' becomes 'nan'.
    nbsex is dropped at the end.
    '''
    vs_sex = df['nbsex'].astype(str)
    hosp_sex = df['sexI'].astype(str)

    # take the vital stats value wherever the hospital value is not usable
    use_vs = hosp_sex.isin(['nan', '0.0', '3.0', '4.0'])
    combined = np.where(use_vs, vs_sex, hosp_sex)

    # 9 = Undetermined (label coming from nbsex), treat it as missing
    df['sexI'] = np.where(combined == '9.0', 'nan', combined)

    del df['nbsex']

    return df

In [None]:
def recode_sex(df):
    ''' Recode infant sex: '1.0' -> 'M', '2.0' -> 'F', anything else -> 'nan'
    '''
    codes = df['sexI']
    is_male = codes.eq('1.0').to_numpy()
    is_female = codes.eq('2.0').to_numpy()

    df['sexI'] = np.select([is_male, is_female], ['M', 'F'], default='nan')

    return df

``race``

In [None]:
def _relabel_codes(codes, labels):
    ''' Replace every value of codes that is a key of labels with its label

    Values without an entry in labels (e.g. 'nan') are returned unchanged.
    '''
    mapped = codes.map(labels)
    return mapped.where(mapped.notna(), codes)


def recode_race(df):
    ''' Recode race

    - all race columns are converted to string
    - hospital codes (raceh83*, raceh95*) are translated to labels and merged
      into raceI / raceM: the 83 value is used unless it is 'nan', in which
      case the 95 value is used
    - vital stats codes (nbrace, mrace, frace) are translated to the same labels
    - the four raceh83*/raceh95* columns are dropped

    Codes without a label are left as they are.
    '''
    white, black, hisp = 'white', 'black', 'hisp'
    native = 'native american/eskimo/aleut'
    asian_pi = 'asian/pacific islander'
    other, unknown = 'other', 'unknown'

    # hospital coding, 83 version
    labels_83 = {
        '1.0': white, '2.0': black, '3.0': hisp, '4.0': native,
        '5.0': asian_pi, '6.0': other, '0.0': unknown, '7.0': unknown,
    }
    # hospital coding, 95 version
    labels_95 = {
        '1.0': white, '2.0': black, '3.0': native, '4.0': asian_pi,
        '5.0': other, '0.0': unknown, '6.0': unknown, '7.0': unknown,
    }
    # vital stats coding
    labels_vs = {'10.0': white, '20.0': black}
    labels_vs.update(dict.fromkeys(('30.0', '57.0', '58.0'), native))
    labels_vs.update(dict.fromkeys(
        ('40.0', '41.0', '42.0', '43.0', '44.0', '45.0',
         '46.0', '47.0', '48.0', '52.0', '59.0'), asian_pi))
    labels_vs.update(dict.fromkeys(('51.0', '53.0', '54.0', '55.0', '56.0'), other))
    labels_vs.update(dict.fromkeys(('9.0', '19.0', '49.0', '98.0', '99.0'), unknown))

    race83 = ['raceh83I', 'raceh83M']
    race95 = ['raceh95I', 'raceh95M']
    races_vs = ['nbrace', 'mrace', 'frace']

    # transform race vars to string
    for var in race83 + race95 + races_vs:
        df[var] = df[var].astype(str)

    # translate the hospital codes once per column (the original recoded the
    # infant columns a second time by hand, which had no effect on the result)
    for var in race83:
        df[var] = _relabel_codes(df[var], labels_83)
    for var in race95:
        df[var] = _relabel_codes(df[var], labels_95)

    # keep a single hospital race variable per person: 83 first, 95 as fallback
    for new_var, var83, var95 in zip(['raceI', 'raceM'], race83, race95):
        df[new_var] = np.where(df[var83] == 'nan', df[var95], df[var83])

    # translate nbrace, mrace, frace (these are from vital stats)
    for var in races_vs:
        df[var] = _relabel_codes(df[var], labels_vs)

    # drop 83 and 95 race vars
    df.drop(columns=race83 + race95, inplace=True)

    return df

In [None]:
def sub_missing_race(df):
    ''' Fill missing hospital race values with the vital stats values

    raceI is filled from nbrace and raceM from mrace wherever the hospital
    value is 'nan' or 'unknown'. nbrace and mrace are dropped afterwards.
    '''
    not_informative = ['nan', 'unknown']
    pairs = (('raceI', 'nbrace'), ('raceM', 'mrace'))

    for hosp_col, vs_col in pairs:
        needs_fill = df[hosp_col].isin(not_informative)
        df[hosp_col] = np.where(needs_fill, df[vs_col], df[hosp_col])

    # the vital stats columns are no longer needed
    df.drop(columns=[vs_col for _, vs_col in pairs], inplace=True)

    return df

``education``

In [None]:
def recode_educ(df):
    ''' Recode education of mother and father

    - meduc, feduc, meduc06, feduc06 are converted to string
    - feduc values written as two characters ('08', '16', 'NI', ...) are first
      brought to the 'x.0' form used by meduc
    - the before-2006 codes (meduc, feduc) and the after-2006 codes (meduc06,
      feduc06) are translated to the same set of labels
    - meduc / feduc keep their own value unless it is 'nan', in which case the
      06 value is used; meduc06 and feduc06 are then dropped

    Codes without a label are left as they are.
    '''
    hs_or_less = 'high school or less'
    some_college = 'college (1-3 years)'
    college = 'college (4 years)'
    graduate = 'masters or phd'
    unknown = 'unknown or other'

    # two-character feduc codes -> 'x.0' form
    feduc_fix = {'%02d' % yrs: '%d.0' % yrs for yrs in range(18)}
    feduc_fix['99'] = '99.0'
    feduc_fix.update(dict.fromkeys(('61', '69', 'CO', 'NI'), '99.0'))

    # labels before 2006
    # NOTE(review): the original also listed '0.0' under 'unknown or other',
    # but 'high school or less' was tested first and therefore always won
    labels_b_06 = dict.fromkeys(['%d.0' % yrs for yrs in range(13)], hs_or_less)
    labels_b_06.update(dict.fromkeys(('13.0', '14.0', '15.0'), some_college))
    labels_b_06['16.0'] = college
    labels_b_06['17.0'] = graduate
    labels_b_06.update(dict.fromkeys(('18.0', '19.0', '24.0', '99.0'), unknown))

    # labels after 2006
    labels_06 = dict.fromkeys(('1.0', '2.0', '3.0'), hs_or_less)
    labels_06.update(dict.fromkeys(('4.0', '5.0'), some_college))
    labels_06['6.0'] = college
    labels_06.update(dict.fromkeys(('7.0', '8.0'), graduate))
    labels_06.update(dict.fromkeys(('0.0', '9.0'), unknown))

    def translate(col, table):
        # values that are not in the table stay untouched
        current = df[col]
        looked_up = current.map(table)
        df[col] = np.where(looked_up.isna(), current, looked_up)

    educ_b_06 = ('meduc', 'feduc')
    educ06 = ('meduc06', 'feduc06')

    for col in educ_b_06 + educ06:
        df[col] = df[col].astype(str)

    translate('feduc', feduc_fix)
    for col in educ_b_06:
        translate(col, labels_b_06)
    for col in educ06:
        translate(col, labels_06)

    # one variable per parent: before-2006 value first, 06 value as fallback
    for col, col06 in zip(educ_b_06, educ06):
        df[col] = np.where(df[col] == 'nan', df[col06], df[col])

    df.drop(columns=list(educ06), inplace=True)

    return df

``zip``

In [None]:
def recode_zip(df):
    ''' For infant and mother: Recode zip at birth

    For each of hplzipI, hplzipM, zipI, zipM, zipresm:
    - the column is converted to string
    - 'XXXXX' -> 'nan', 'YYYYY' -> 'outside of US', 'ZZZZZ' -> 'homeless'
    - values of length 1, 2, 4 or 6 -> 'nan'
    - values of length 7 or 11 (zip followed by '.0' or '0000.0') are cut to
      their first 5 characters
    '''
    placeholders = {
        'XXXXX': 'nan',
        'YYYYY': 'outside of US',
        'ZZZZZ': 'homeless',
    }

    for zip_col in ('hplzipI', 'hplzipM', 'zipI', 'zipM', 'zipresm'):
        zips = df[zip_col].astype(str).replace(placeholders)

        # lengths are measured once, after the placeholders were recoded
        n_chars = zips.str.len()

        zips = zips.mask(n_chars.isin([1, 2, 4, 6]), 'nan')

        # NOTE(review): no column is called 'zipres' (the list above has
        # 'zipresm'), so this branch never runs - confirm the intended column
        # before changing the name
        if zip_col == 'zipres':
            zips = zips.mask(n_chars.eq(5), 'nan')

        # remove .0 or 0000.0 from zip code if it has any
        zips = zips.mask(n_chars.isin([7, 11]), zips.str[:5])

        df[zip_col] = zips

    return df

In [None]:
def sub_missing_zip(df):
    ''' Fill missing zip codes ('nan') of mother and infant

    Mother: zipM is filled with zipresm (vital stats).
    Infant: zipI is filled with zipM, then hplzipI, then hplzipM; the first
    of these that is not 'nan' is used.
    zipresm is dropped at the end.
    '''
    mother_zip = df['zipM']
    df['zipM'] = mother_zip.mask(mother_zip.eq('nan'), df['zipresm'])

    infant_zip = df['zipI']
    for fallback in ('zipM', 'hplzipI', 'hplzipM'):
        infant_zip = infant_zip.mask(infant_zip.eq('nan'), df[fallback])
    df['zipI'] = infant_zip

    del df['zipresm']

    return df

``county``

In [None]:
def recode_county(df):
    ''' For infant and mother: Recode county at birth

    For each county code column (cntyresI, cntyresM, hplcntyI, hplcntyM) a
    <column>_name column is added with the county name found in the
    'County_names' sheet of data_selection.xlsx:
    - code '0.0' is labelled 'unknown/outside CA/homeless'
    - codes without a match in the sheet get the string 'nan'
    The code columns themselves are converted to string.
    '''
    # read county code and associated names from the data_selection.xlsx file
    # (the table is the same for every column, so it is read only once)
    cnty_values = pd.read_excel(
        in_dir_data_selection + 'data_selection.xlsx',
        'County_names', skiprows = 2, header = 0
    ).iloc[:,1:3] # keep the sheet columns at positions 1 and 2

    cnty_values = cnty_values.astype(str)
    cnty_values['county_code'] = cnty_values.county_code + '.0'

    # one row per code, so the left merge below returns exactly one row per
    # row of df and the names can be written back by position
    cnty_values = cnty_values.drop_duplicates(subset='county_code')

    cols = ['cntyresI', 'cntyresM', 'hplcntyI', 'hplcntyM']
    for col in cols:
        # transform to string
        df[col] = df[col].astype(str)

        # look up the county name of every row
        temp_df = df[[col]].merge(
            cnty_values,
            left_on=col,
            right_on='county_code',
            how='left'
        )

        # label code 0.0 and the codes that were not found in the sheet
        names = temp_df['county_name']
        names = np.where(temp_df[col].eq('0.0'), 'unknown/outside CA/homeless',
                         np.where(names.isna(), 'nan', names))

        # the merge result has a fresh 0..n-1 index; assigning the plain array
        # keeps the rows in order even if df has a different index
        df[col+'_name'] = names

    return df

``state``

In [None]:
def recode_state_m(df):
    ''' For mother only: Recode state at birth

    matresst is converted to string and matched against state_code in the
    'State_names' sheet of data_selection.xlsx; the matching state_name is
    stored in matresst_name. Code '98.0' is labelled 'Unknown Nativity'.
    '''
    df['matresst'] = df['matresst'].astype(str)

    # state codes and names, all compared as strings
    lookup_file = in_dir_data_selection + 'data_selection.xlsx'
    state_lookup = pd.read_excel(
        lookup_file, 'State_names', skiprows = 2, header = 0
    ).astype(str)

    matched = df[['matresst']].merge(
        state_lookup,
        left_on=['matresst'],
        right_on=['state_code'],
        how='left'
    )

    # code 98.0 doesn't exist in data_selection.xlsx
    is_code_98 = matched['matresst'].eq('98.0')
    matched['state_name'] = np.where(
        is_code_98, 'Unknown Nativity', matched['state_name']
    )

    df['matresst_name'] = matched['state_name']

    return df

In [None]:
def recode_state_mob(df):
    ''' For mother only: Recode state/country at mother's own birth

    Steps:
    1. bthresmb is matched against the state_code2/state_name2 columns of the
       'State_names' sheet (data_selection.xlsx) -> bthresmb_name
    2. bthresmb06 is matched against the state_code3/state_name3 columns of the
       same sheet plus the MARC/Country columns of the 'Country_names' sheet
       -> bthresmb06_name
    3. names that only occur in bthresmb06_name are collapsed to code 'RE'
    4. bthresmb / bthresmb_name are filled with the (collapsed) 06 values
       wherever they are 'nan'

    Codes without a match get the name 'nan' (string).
    Adds bthresmb_name and bthresmb06_name to df; bthresmb06 is kept.
    '''
    ##############
    ## bthresmb ##
    ##############
    df['bthresmb'] = df.bthresmb.astype(str)
    # read state abbreviation and name from data_selection.xlsx file
    state_values = pd.read_excel(
        in_dir_data_selection + 'data_selection.xlsx',
        'State_names', skiprows = 2, header = 0
    ).iloc[:,4:6] # read only cols 4 and 5
    state_values = state_values.astype(str)  


    # rename columns
    state_values.rename(
        columns={'state_code2': 'code',
                 'state_name2': 'name'}, inplace=True)

    # add country/state name to bthresmb
    temp_df = df[['bthresmb']].merge(
        state_values,
        left_on='bthresmb',
        right_on='code',
        how='left'
    )
    # codes that were not found get the string 'nan'
    temp_df['name'] = np.where(temp_df.name.isna(), 'nan', temp_df.name)

    temp_df.rename(
        columns={'name':'bthresmb_name'}, inplace=True
    )
    temp_df.reset_index(drop=True, inplace=True)

    # add bthresmb_name to df
    # NOTE(review): the assignment aligns on the index and temp_df has a 0..n-1
    # index - this assumes df has a default index as well; confirm with caller
    df['bthresmb_name'] = temp_df.bthresmb_name
    
    ################
    ## bthresmb06 ##
    ################
    df['bthresmb06'] = df.bthresmb06.astype(str)
    # read state abbreviation and name from data_selection.xlsx file
    state_values = pd.read_excel(
        in_dir_data_selection + 'data_selection.xlsx',
        'State_names', skiprows = 2, header = 0
    ).iloc[:,7:9] # read only cols 7 and 8
    state_values = state_values.astype(str)    
    state_values['state_code3'] = state_values.state_code3.str.split('.').str[0] # remove .0
    state_values['state_name3'] = state_values.state_name3.str.lstrip() # remove leading white spaces
    
    # read country/state marc codes
    country_values_marc = pd.read_excel(
        in_dir_data_selection + 'data_selection.xlsx',
        'Country_names', skiprows = 2, header = 0
    ).iloc[:,[8,9]]
    country_values_marc = country_values_marc.astype(str)


    # rename columns in state_/country_values
    state_values.rename(
        columns={'state_code3': 'code',
                 'state_name3': 'name'}, inplace=True)

    country_values_marc.rename(
        columns={'MARC': 'code',
            'Country': 'name'},inplace=True)

    # concatenate state_/country_values
    sc_values = pd.concat(
        [state_values, country_values_marc],
        axis=0
    )

    # clean sc_values: drop rows without a code and replace non-breaking spaces in the names
    sc_values = sc_values.astype(str)
    sc_values = sc_values[~sc_values.code.eq('nan')]
    sc_values['name'] = sc_values.name.str.replace('\xa0', ' ')
    sc_values.reset_index(drop=True, inplace=True)


    # add country/state name to bthresmb06
    temp_df = df[['bthresmb06']].merge(
        sc_values,
        left_on='bthresmb06',
        right_on='code',
        how='left'
    )
    # codes that were not found get the string 'nan'
    temp_df['name'] = np.where(temp_df.name.isna(), 'nan', temp_df.name)

    temp_df.rename(
        columns={'name':'bthresmb06_name'}, inplace=True
    )
    temp_df.reset_index(drop=True, inplace=True)

    # add bthresmb06_name to df
    df['bthresmb06_name'] = temp_df.bthresmb06_name

    
    # set difference (names after 2006 that didn't exist before)
    set_diff = np.setdiff1d(df.bthresmb06_name.unique(), df.bthresmb_name.unique())
    # collapse those names into a single code/name pair
    # NOTE(review): 'Reminder of the World' looks like a typo for 'Remainder';
    # it is written to the data, so check downstream use before changing it
    df['bthresmb06_limited'] = np.where(df.bthresmb06_name.isin(set_diff), 'RE', df.bthresmb06)
    df['bthresmb06_limited_name'] = np.where(df.bthresmb06_name.isin(set_diff), 'Reminder of the World', df.bthresmb06_name)
    
    # combine bthresmb, bthresmb_name, bthresmb06_limited, bthresmb06_limited_name
    # (the 06 value is used only where the before-2006 value is 'nan')
    df['bthresmb'] = np.where(df.bthresmb.eq('nan'), df.bthresmb06_limited, df.bthresmb)
    df['bthresmb_name'] = np.where(df.bthresmb_name.eq('nan'), df.bthresmb06_limited_name, df.bthresmb_name)

    # drop the helper columns
    df.drop(
        columns=['bthresmb06_limited', 'bthresmb06_limited_name'],
        inplace=True
    )

    return df

``caesar``

In [None]:
def recode_caesar(df):
    ''' Recode method of delivery

    - caesar and caesar05 are converted to string and the trailing .0 is removed
    - caesar_name and caesar05_name hold the label of each code; codes without
      a label (including missing values) get the string 'nan'
    - caesar / caesar_name are filled with caesar05 / caesar05_name wherever
      they are 'nan'

    caesar05 and caesar05_name are kept in df.
    '''
    # label text is kept exactly as in earlier outputs (incl. 'Spontenous')
    primary = 'C-section, Primary'
    repeat = 'C-section, Repeat'
    spontaneous = 'Vaginal, Spontaneous'
    spontaneous_prev = 'Vaginal, Spontenous after prev. C-section'
    forceps = 'Vaginal, Forceps'
    forceps_prev = 'Vaginal, Forceps after prev. C-section'
    vacuum = 'Vaginal, Vacuum'
    vacuum_prev = 'Vaginal, Vacuum after prev. C-section'
    others = 'Others'

    ## caesar ##
    ############
    labels = {'1': primary, '2': repeat, '3': spontaneous,
              '5': forceps, '6': vacuum}
    labels.update(dict.fromkeys(('34', '43'), spontaneous_prev))
    labels.update(dict.fromkeys(('54', '45'), forceps_prev))
    labels.update(dict.fromkeys(('64', '46'), vacuum_prev))
    labels.update(dict.fromkeys(
        ('56', '65', '456', '465', '546', '564', '654'), others))

    ## caesar05 ##
    ##############
    labels05 = {'3': spontaneous, '4': spontaneous_prev,
                '5': forceps, '15': forceps_prev,
                '6': vacuum, '16': vacuum_prev}
    labels05.update(dict.fromkeys(('1', '11', '21', '31'), primary))
    labels05.update(dict.fromkeys(('2', '12', '22', '32'), repeat))
    labels05.update(dict.fromkeys(('88', '99'), others))

    for col, table in (('caesar', labels), ('caesar05', labels05)):
        # transform col to string and remove .0 from string
        df[col] = df[col].astype(str).str.split('.').str[0]
        # codes that are not in the table are labelled 'nan'
        df[col+'_name'] = df[col].map(table).fillna('nan')

    # add caesar05 and caesar05_name to caesar and caesar_name
    # (missing values are the string 'nan' after the conversion above; the
    # previous comparison with 'na' never matched, so nothing was filled)
    df['caesar'] = np.where(df.caesar.eq('nan'), df.caesar05, df.caesar)
    df['caesar_name'] = np.where(df.caesar_name.eq('nan'), df.caesar05_name, df.caesar_name)

    return df

``complications``

In [None]:
def recode_probl(df):
    ''' Recode complications during and before pregnancy, during labor, and complications with the newborn

    probl_1: Complications pregnancy/concurrent illnesses
    probl_2: Complications labor/delivery
    probl_3: Complications newborn (Abnormal Conditions/Clinical Procedures)

    Each column is converted to string without the trailing .0 and padded with
    a leading 0 when its length is odd. Empty values and missing values are
    stored as 'nan'. A <column>_list column is added with the string cut into
    consecutive 2-character pieces.
    '''
    for col in ('probl_1', 'probl_2', 'probl_3'):
        # string form of the codes, without .0
        codes = df[col].astype(str).str.split('.').str[0]
        n_chars = codes.str.len()

        # odd number of characters: one leading 0 makes the length even
        codes = codes.mask(n_chars % 2 == 1, '0' + codes)

        # empty strings, and 'nan' that was just padded to '0nan'
        codes = codes.mask(n_chars == 0, 'nan')
        codes = codes.mask(codes == '0nan', 'nan')

        df[col] = codes

        # split each string into pieces of 2 characters
        df[col + '_list'] = codes.str.findall('..')

    return df

``pregnancy precare``

In [None]:
def recode_precare(df):
    ''' Recode prenatal care

    precare (month prenatal care began) and prevsts (number of prenatal care
    visits) are converted to string without the trailing .0, and a text label
    is added for each of them in precare_name / prevsts_name.
    '''
    for col in ('precare', 'prevsts'):
        df[col] = df[col].astype(str).str.split('.').str[0]

    ## month prenatal care began ##
    month = df['precare']
    month_label = 'began in ' + month + 'th month of pregnancy'
    month_label = month_label.mask(month.eq('nan'), 'nan')
    month_label = month_label.mask(month.eq('-'), 'unknown or not reported')
    df['precare_name'] = month_label.mask(month.eq('0'), 'no precare')

    ## number of prenatal care visits ##
    visits = df['prevsts']
    visits_label = visits + ' precare visits'
    visits_label = visits_label.mask(visits.eq('nan'), 'nan')
    visits_label = visits_label.mask(visits.eq('99'), 'unknown or not reported')
    df['prevsts_name'] = visits_label.mask(visits.eq('0'), 'no precare visits')

    return df

``previous births``

In [None]:
def _label_counts(counts, zero_label, suffix):
    ''' Build a text label for a column of counts stored as strings

    '0' -> zero_label, '98'/'99' -> 'unknown or not reported', 'nan' -> 'nan',
    any other value -> value + suffix
    '''
    labels = counts + suffix
    labels = labels.mask(counts.eq('nan'), 'nan')
    labels = labels.mask(counts.isin(('98', '99')), 'unknown or not reported')
    return labels.mask(counts.eq('0'), zero_label)


def recode_prev_births(df):
    ''' Recode previous births and pregnancy terminations

    prevlbl, prevlbd, llbmths, term_a20wks, term_b20wks, cebl and ceb are
    converted to string without the trailing .0. A <column>_name text label is
    added for all of them except llbmths (months since last live birth), which
    is only converted.
    '''
    variables = ['prevlbl', 'prevlbd', 'llbmths', 'term_a20wks', 'term_b20wks', 'cebl', 'ceb']
    for col in variables:
        # transform to string and remove .0 from string
        df[col] = df[col].astype(str).str.split('.').str[0]

    ## previous live births now alive ##
    ####################################
    # does not include current birth
    df['prevlbl_name'] = _label_counts(
        df.prevlbl, 'no previous births', ' previous live births now alive')

    # includes current birth
    df['cebl_name'] = _label_counts(
        df.cebl, 'no previous birth incl. current birth',
        ' children ever born alive, incl.current birth')

    ## previous live births and pregnancy terminations > 20 weeks gestation ##
    ##########################################################################
    df['ceb_name'] = _label_counts(
        df.ceb, 'no previous birth incl. current birth',
        ' children ever born alive + pregnancy term > 20 weeks')

    ## previous live births now dead ##
    ###################################
    # does not include current birth
    df['prevlbd_name'] = _label_counts(
        df.prevlbd, 'no previous births', ' previous live births now dead')

    ## Pregnancy terminations > 20wks or < 20wks gestation ##
    #########################################################
    # the label now starts with a space, like all the other labels (it used to
    # be glued to the number: '2previous ...'); the spelling of the label text
    # itself is kept as it was
    for col in ['term_a20wks', 'term_b20wks']:
        df[col+'_name'] = _label_counts(
            df[col], 'no previous pregnancy terminations',
            ' previous preganancy terminations')

    return df

``gestation length``

In [None]:
def recode_gest_days(df):
    ''' Recode gestation length (calculated by subtracting the date of last
    normal menses from the date of birth)

    gest_days is converted to string without the trailing .0 and a text label
    is added in gest_days_name ('999' -> 'unknown or not reported').
    '''
    days = df['gest_days'].astype(str).str.split('.').str[0]
    df['gest_days'] = days

    ## gestation length label ##
    label = days + ' days of gestation'
    label = label.mask(days.eq('nan'), 'nan')
    df['gest_days_name'] = label.mask(days.eq('999'), 'unknown or not reported')

    return df

``admission source``

In [None]:
def recode_admission_src(df):
    ''' Recode admission source of infant (I) and mother (M)

    Adds admsrcI / admsrcM with the value 'ER' when admsrc83 is '12', or when
    admsrc952 and admsrc953 are both '1'; every other row gets 'other'.
    All admission source columns used here are converted to string without
    the trailing .0.
    '''
    # admsrc952I/M are part of the ER condition below, so they need the same
    # conversion as the other columns (otherwise '1.0' never equals '1')
    variables = ['admsrc83I', 'admsrc83M',
                 'admsrc952I', 'admsrc952M',
                 'admsrc953I', 'admsrc953M']
    for col in variables:
        # transform to string and remove .0 from string
        df[col] = df[col].astype(str).str.split('.').str[0]

    ## admission source ##
    ######################
    for who in ('I', 'M'):
        from_er = (
            df['admsrc83' + who].eq('12')
            | (df['admsrc952' + who].eq('1') & df['admsrc953' + who].eq('1'))
        )
        df['admsrc' + who] = np.where(from_er, 'ER', 'other')

    return df

``diagnosis codes``

In [None]:
def recode_diagnosis_codes(df):
    ''' Convert the diagnosis and procedure code columns to string

    Covers diag, proc_edas and proc for mother (M) and infant (I), positions
    00 to 04 each (e.g. diagM00, proc_edasI03, procM04).
    '''
    for who in ('M', 'I'):
        for kind in ('diag', 'proc_edas', 'proc'):
            for position in range(5):
                code_col = '%s%s%02d' % (kind, who, position)
                df[code_col] = df[code_col].astype(str)

    return df

---
preprocessing - add, drop

---

``linked births only``

In [None]:
def linked_births_only(df):
    ''' Keep only the birth ids that have at least one row flagged as linked
    (_linkedB equal to 'Y'), i.e. vital stats birth, infant, and maternal
    discharge records are linked

    All rows of such a birth id are kept. The number of linked birth ids is
    printed and the subset is returned with a fresh 0..n-1 index.
    '''
    # birth ids that are linked
    flagged = df['_linkedB'].eq('Y')
    births_linked = df.loc[flagged, '_brthid'].unique()
    print('Number of unique linked birth IDs:', births_linked.shape[0])

    # every row that belongs to one of those birth ids
    belongs_to_linked = df['_brthid'].isin(births_linked)

    return df[belongs_to_linked].reset_index(drop=True)

``zip geometry``

In [None]:
def add_drop_zip_geometry(df):
    ''' Add zip code geometries

    For each of zipI, zipM, hplzipI, hplzipM the zip code is matched to a
    ZCTA10 (directly, or through the ZIP to ZCTA crosswalk when there is no
    direct match) and the columns of gdf_zcta are added to df with the suffix
    I, M, hI or hM (e.g. ZCTA10I, ZCTA10I_centroid).
    Missing infant values (ZCTA10I, ZCTA10I_centroid) are then filled with the
    values of the mother, the infant hospital and the mother hospital, in this
    order, and rows that still have no ZCTA10I_centroid are dropped.

    return:
    -------
    df without the dropped rows, with a fresh 0..n-1 index

    Note: this function changes the working directory and relies on the
    IPython %run magic, so it only works inside a notebook/IPython session.
    '''

    ## read/preprocess geometry ##
    ##############################
    # gdf_zcta is not defined in this function
    # NOTE(review): presumably it is created by the notebook that is run here - confirm
    os.chdir("C:/Users/cilin/Research/CA_hospitals/Script/ssn_selection/cleaning/")
    %run "4. geom_cleaning.ipynb"
    
    # drop geometry column
    gdf_zcta.drop(
        columns='ZCTA10_geometry',
        inplace=True
    )

    ## read/preprocess crosswalk ZIP to ZCTA ##
    ###########################################
    # read crosswalk
    cw= pd.read_csv(
        'C:/Users/cilin/Research/CA_hospitals/Input/raw_data/census_geo/ZiptoZcta_Crosswalk_2021.csv'
    )

    # keep if state is CA
    cw = cw[cw.STATE.eq('CA')]

    # transform to string
    # NOTE(review): ZCTA is not converted here, only ZIP_CODE - confirm that
    # its type matches ZCTA10 in gdf_zcta, it is used as a substitute below
    cw['ZIP_CODE'] = cw.ZIP_CODE.astype(str)


    ## add geometry to ZIP ##
    #########################
    # define zip columns
    columns = ['zipI', 'zipM', 'hplzipI', 'hplzipM'] 
    for idx, col in enumerate(columns):
        #print(col)

        ## preprocess df ##
        ###################
        # transform the zip column to string
        df[col] = df[col].astype(str)
        df[col] = df[col].str.split('.').str[0] # remove .0

        # grab I, M, hI, hM initials (suffix of the new columns)
        if idx==2:
            initial='hI'
        elif idx==3:
            initial='hM'
        else:
            # 'zipI' -> 'I', 'zipM' -> 'M'
            initial=columns[idx][3:]


        ## read unique ZIP in df 
        temp_df = pd.DataFrame(
            df[col].unique(),
            columns=[col]
        )


        # attach ZCTA10 from gdf_zcta file #
        ####################################
        # (direct match: the zip code is itself a ZCTA10)
        temp_df = temp_df.merge(
            gdf_zcta[['ZCTA10']], 
            left_on=col,
            right_on='ZCTA10',
            how='left'
        )

        # attach ZCTA from crosswalk file #
        ###################################
        temp_df = temp_df.merge(
            cw[['ZIP_CODE', 'ZCTA']], 
            left_on=col,
            right_on='ZIP_CODE',
            how='left'
        )


        # substitute with ZCTA if ZCTA10 is missing
        temp_df['ZCTA10'] = np.where(temp_df.ZCTA10.isna(), temp_df.ZCTA, temp_df.ZCTA10)

        # drop duplicates (keep one row per zip code, the merges above can add rows)
        temp_df.drop_duplicates(
            [col],
            inplace=True
        )


        # add in geometry #
        ###################
        # (all remaining columns of gdf_zcta, matched on ZCTA10)
        temp_df = temp_df.merge(
            gdf_zcta, 
            on='ZCTA10',
            how='left'
        )


        # drop cols that are not of interest
        temp_df.drop(
            columns=['ZIP_CODE', 'ZCTA'],
            inplace=True
        )


        # merge to original df (one row per row of df, in the order of df)
        temp_df = df[[col]].merge(
            temp_df,
            on=col,
            how='left'
        )


        # rename columns: add the initial after the first part of the name,
        # e.g. ZCTA10 -> ZCTA10I, ZCTA10_centroid -> ZCTA10I_centroid
        # (only the first two '_'-separated parts of a name are kept)
        new_cols = list(temp_df.columns[1:])
        for new_col in new_cols:
            if len(new_col.split('_'))==1:
                temp_name = new_col.split('_')[0]+initial
            else:
                temp_name = new_col.split('_')[0]+initial+'_'+new_col.split('_')[1]
            temp_df.rename(
                columns={new_col:temp_name},
                inplace=True
            )

        # drop col
        temp_df.drop(columns=[col], inplace=True)

        # add temp_df cols to original df
        # NOTE(review): the assignment aligns on the index and temp_df has a
        # 0..n-1 index - this assumes df has a default index as well; confirm
        for temp_col in temp_df.columns:
            df[temp_col] = temp_df[temp_col]
    
    ## if ZCTA geometry of ZCTA10I is missing substitute with that of mom or hospital
    colsI = ['ZCTA10I', 'ZCTA10I_centroid']
    colsM = ['ZCTA10M', 'ZCTA10M_centroid']
    colshI = ['ZCTA10hI', 'ZCTA10hI_centroid']
    colshM = ['ZCTA10hM', 'ZCTA10hM_centroid']
    
    # order of the substitutes: mother, infant hospital, mother hospital
    for idx, colI in enumerate(colsI):
        df[colI] = np.where(df[colI].isna(), df[colsM[idx]], df[colI])
        df[colI] = np.where(df[colI].isna(), df[colshI[idx]], df[colI])
        df[colI] = np.where(df[colI].isna(), df[colshM[idx]], df[colI])
    
    # drop if ZCTAI_centroid isna()
    df = df[~df.ZCTA10I_centroid.isna()]
    df.reset_index(drop=True, inplace=True)   
    
    return df

    # everything below is an unreachable string (it comes after the return):
    # an alternative that matches on the first 4 digits of zipI, kept for reference
    '''
    ## if ZCTA geometries are missing (for I, M, hI, hM), find ZCTA10I geometry for the 4 digits zipI 
    temp_df = df[df.ZCTA10I_centroid.isna()]
    temp_df.drop_duplicates(subset='zipI', inplace=True)
    
    # keep only 'zipI'
    temp_df = temp_df[['zipI']]
    
    # find 4d zipI, ZCTA10I, and ZIP_CODE
    temp_df['zipI_4d'] = temp_df.zipI.str[:4]
    gdf_zcta['ZCTA10_4d'] = gdf_zcta.ZCTA10.str[:4]
    cw['ZIP_CODE_4d'] = cw.ZIP_CODE.str[:4]
    
    # add geometries for 4d zipI
    temp_df = temp_df.merge(
        gdf_zcta['ZCTA10_4d'],
        left_on='zipI_4d',
        right_on='ZCTA10_4d',
        how='left'
    )
    
    temp_df.drop_duplicates(subset='zipI', inplace=True)
    
    # attach ZCTA_4d from crosswalk file
    temp_df = temp_df.merge(
        cw[['ZIP_CODE_4d', 'ZCTA']], 
        left_on='zipI_4d',
        right_on='ZIP_CODE_4d',
        how='left'
    )
    
    # substitute with ZCTA_4d if ZCTA10_4d is missing
    temp_df['ZCTA10_4d'] = np.where(temp_df.ZCTA10_4d.isna(), temp_df.ZCTA.str[:4], temp_df.ZCTA10_4d)
    
    temp_df.drop_duplicates(subset='zipI', inplace=True)
    temp_df = temp_df[['zipI', 'ZCTA10_4d']]
   
    # add geometry
    temp_df = temp_df.merge(gdf_zcta,
            on='ZCTA10_4d',
            how='left'  
    )
    
    temp_df.drop_duplicates(subset='zipI', inplace=True)
    
    temp_df.rename(columns={'ZCTA10_centroid': 'ZCTA10_4d_centroid'}, inplace=True)

    # merge to original df
    df = df.merge(
            temp_df,
            on='zipI',
            how='left'
        )
    
    # populate missing ZCTA1010_centroid
    df['ZCTA10I_centroid'] = np.where(df['ZCTA10I_centroid'].isna(), df['ZCTA10_4d_centroid'], df['ZCTA10I_centroid'])
    
    # drop cols
    df.drop(columns=['ZCTA10_4d_centroid'], inplace=True)
    
    # drop if ZCTA10_centroid is not available
    df = df[~df.ZCTA10I_centroid.isna()]
    
    df.reset_index(drop=True, inplace=True)   
    
    return df     
    '''

``rlnI status``

In [None]:
def rlnI_status(mini_df):
    ''' Flag, per birth id, whether and when a rlnI was recorded.

    Two columns are added to ``mini_df`` (in place) and the frame is returned:
      - rlnI_obs_missing: 1 if the row's rlnI is '---------' or NaN, else 0
      - rlnI_status: 'rlnI assigned at birth' if any row of the birth id with
        _input == "B" carries a rlnI; else 'rlnI assigned 1st year of life'
        if any other row of the birth id carries one; else 'rlnI not assigned'
    '''
    birth_ids = mini_df['_brthid']

    # observation-level flag: placeholder string or NaN means no rlnI on the row
    no_rln = mini_df.rlnI.isin(('---------', np.nan))
    mini_df['rlnI_obs_missing'] = np.where(no_rln, 1, 0)
    has_rln = ~no_rln

    # birth ids whose "B" record already carries a rlnI
    ids_at_birth = birth_ids[mini_df['_input'].eq("B") & has_rln].unique()

    # birth ids with a rlnI on some record, but not on the "B" record
    ids_first_year = birth_ids[
        has_rln & ~birth_ids.isin(ids_at_birth)
    ].unique()

    # status label; the first matching condition wins
    # (this will help remove birth ids w/o a rlnI)
    mini_df['rlnI_status'] = np.select(
        [birth_ids.isin(ids_at_birth), birth_ids.isin(ids_first_year)],
        ['rlnI assigned at birth', 'rlnI assigned 1st year of life'],
        default='rlnI not assigned',
    )

    return mini_df

``keys``

In [None]:
def add_keys(mini_df):
    ''' Add string keys built from infant ZCTA, birth month and birth year.

    'ZCTA10I', 'bthmonthI' and 'bthyearI' are overwritten with their string
    form, truncated at the first '.' (so '94704.0' becomes '94704'; a missing
    value becomes the string 'nan'). Two columns are then added in place:
      - ZCTA10I_month_year: '<ZCTA10I>_<bthmonthI>_<bthyearI>'
      - bthI_month_year: '<bthmonthI>_<bthyearI>'
    '''
    def _before_dot(name):
        # string form of the column, keeping only what precedes the first '.'
        return mini_df[name].astype(str).str.split('.').str[0]

    zcta = _before_dot('ZCTA10I')
    month = _before_dot('bthmonthI')
    year = _before_dot('bthyearI')

    mini_df['ZCTA10I'] = zcta
    mini_df['bthmonthI'] = month
    mini_df['bthyearI'] = year

    # month/year key is shared by both composite keys
    month_year = month + '_' + year
    mini_df['ZCTA10I_month_year'] = zcta + '_' + month_year
    mini_df['bthI_month_year'] = month_year

    return mini_df

In [None]:
def save_dfs_rlnI(mini_df):
    ''' Split records on rlnI_status, write both parts to csv, return one part.

    Rows with rlnI_status == 'rlnI not assigned' go to
    'Birth_pre_final_no_rlnI.csv'; all other rows go to
    'Birth_pre_final_rlnI.csv'. Both files are written under ``in_dir`` with
    a fresh 0..n-1 index. The frame holding the rows with a rlnI is returned.
    '''
    not_assigned = mini_df.rlnI_status.eq('rlnI not assigned')

    # births for which no rlnI was ever assigned
    without_rlnI = mini_df[not_assigned].reset_index(drop=True)
    without_rlnI.to_csv(in_dir + 'Birth_pre_final_no_rlnI.csv')

    # births with a rlnI
    # NOTE: if you want to do record linkage the following line of code should not be executed
    with_rlnI = mini_df[~not_assigned].reset_index(drop=True)
    with_rlnI.to_csv(in_dir + 'Birth_pre_final_rlnI.csv')

    return with_rlnI

``process rlnI births``

In [None]:
def rlnI_births_process(mini_df):
    ''' Preprocess if birthID has a rlnI assigned at birth or in first year of life

    Adds two columns to ``mini_df`` (in place):
      - rlnI_total: number of distinct non-NaN rlnI values recorded for the
        birth id, as a string ('1', '2', ...). The placeholder '---------'
        counts as a value; birth ids with no value at all get '1'.
      - rlnI_updated: the most frequent rlnI of the birth id, ignoring NaN
        and '---------'. Ties go to the smallest value. Birth ids with no
        usable rlnI get NaN.

    Returns the rows whose rlnI_updated maps to exactly one birth id (rows
    with a NaN rlnI_updated are therefore dropped). The original row index
    is kept; it is not reset.

    Everything is computed with vectorized group operations rather than a
    per-group ``apply``. That keeps the original index (pandas >= 2.0
    prepends the group key to the index of an ``apply`` result) and avoids
    one Python call per birth id.
    '''
    birth_ids = mini_df['_brthid']

    ############################
    ## number of unique rlnIs ##
    ############################
    # distinct rlnI values per birth id; no upper bound is assumed
    n_rln = mini_df.groupby('_brthid')['rlnI'].nunique()
    mini_df['rlnI_total'] = (
        birth_ids.map(n_rln.clip(lower=1).astype(str)).fillna('1')
    )

    ##########################################
    ## keep birth ID with at most XXXX rlnI ##
    ##########################################
    # add here if you want to remove anything

    ###############################
    ## assign most frequent rlnI ##
    ###############################
    # candidate values: drop NaN and the '---------' placeholder
    usable = mini_df['rlnI'].notna() & mini_df['rlnI'].ne('---------')
    freq = (
        mini_df.loc[usable, ['_brthid', 'rlnI']]
        .groupby(['_brthid', 'rlnI'])
        .size()
        .reset_index(name='n')
    )
    # freq is ordered by (_brthid, rlnI); a stable sort on the count keeps
    # that order among ties, so the first row per birth id is the smallest
    # of its most frequent values
    freq = freq.sort_values('n', ascending=False, kind='mergesort')
    top_rln = freq.drop_duplicates('_brthid').set_index('_brthid')['rlnI']
    mini_df['rlnI_updated'] = birth_ids.map(top_rln)

    ## keep only rlnI that have a single birthid ##
    ##############################################
    births_per_rln = mini_df.groupby('rlnI_updated')['_brthid'].nunique()
    rlnI_1brthid = births_per_rln.index[births_per_rln.eq(1)]
    mini_df = mini_df[mini_df['rlnI_updated'].isin(rlnI_1brthid)]

    return mini_df

``total hospital visits``

In [None]:
def total_hosp_vists(mini_df):
    ''' Compute number of hospital visits before and after birth (for mother and infant)

    Returns a new frame with three columns added; every row of a birth id
    ('_brthid') carries that birth id's totals:
      - visitsM_9mpp: rows with _input == 'M' and _diffM < 0
      - visitsM_1ypp: rows with _input == 'M' and _diffM > 0
      - visitsI_1yol: rows with _input == 'I'

    Counts are obtained by summing boolean flags within each birth id, so
    row order and index are kept as they are. A per-group ``apply`` would be
    much slower and, on pandas >= 2.0, would prepend '_brthid' to the index.
    NOTE(review): rows with a missing '_brthid' belong to no group and are
    expected to get NaN counts - confirm none exist at this stage.
    '''
    birth_ids = mini_df['_brthid']
    mother_rec = mini_df['_input'].eq('M')

    flags = [
        # mother records dated before the birth (negative _diffM)
        ('visitsM_9mpp', mother_rec & mini_df['_diffM'].lt(0)),
        # mother records dated after the birth (positive _diffM)
        ('visitsM_1ypp', mother_rec & mini_df['_diffM'].gt(0)),
        # infant records
        ('visitsI_1yol', mini_df['_input'].eq('I')),
    ]

    totals = {
        name: flag.astype(int).groupby(birth_ids).transform('sum')
        for name, flag in flags
    }

    return mini_df.assign(**totals)

``cols of interest``

In [None]:
def keep_cols_of_interest(mini_df):
    ''' Return ``mini_df`` restricted to the export columns, in export order.

    Every listed column must be present in ``mini_df``; selecting a missing
    label raises a KeyError.
    '''
    # identifiers, linkage flags and rlnI variables
    id_cols = [
        '_brthid', '_brthIDHST', '_linkedB', '_linkidB', '_linkidI',
        '_linkidM', '_twinB', '_twinI', '_twinM', '_twinwght', 'typebth',
        'bthorder', 'rlnI', 'rlnM', 'rlnI_status', 'rlnI_total',
        'rlnI_updated',
    ]
    # birth dates and their year/month/day parts
    date_cols = [
        'bthdateM', 'bthyearM', 'bthmonthM', 'bthdayM',
        'bthdateI', 'bthyearI', 'bthmonthI', 'bthdayI', 'bthI_month_year',
        'fbthdate', 'fbthyear', 'fbthmonth', 'fbthday',
    ]
    # zip / ZCTA variables
    zip_cols = [
        'zipM', 'ZCTA10M', 'ZCTA10M_centroid',
        'zipI', 'ZCTA10I', 'ZCTA10I_centroid',
        'hplzipM', 'ZCTA10hM', 'ZCTA10hM_centroid',
        'hplzipI', 'ZCTA10hI', 'ZCTA10hI_centroid',
        'ZCTA10I_month_year',
    ]
    # county / state variables, then race, education and sex
    place_person_cols = [
        'cntyresM', 'cntyresM_name', 'cntyresI', 'cntyresI_name',
        'hplcntyM', 'hplcntyM_name', 'hplcntyI', 'hplcntyI_name',
        'matresst', 'matresst_name', 'bthresmb', 'bthresmb_name',
        'bthresmb06', 'bthresmb06_name',
        'raceM', 'frace', 'meduc', 'feduc', 'sexI', 'raceI',
    ]
    # admission dates, record ids, visit counts, stay, charges and payment
    admission_cols = [
        'admdateM', 'admyearM', 'admmonthM', 'admdayM',
        'admdateI', 'admyearI', 'admmonthI', 'admdayI',
        'hplidI', 'hospidM', '_sortid', '_input',
        'visitsM_9mpp', 'visitsM_1ypp', 'visitsI_1yol',
        '_diffI', '_diffM', '_losI', '_losM', 'lenstayI', 'lenstayM',
        '_chargesI', '_chargesM', 'paycatI', 'paycatM',
        'payplanI', 'payplanM', 'paytypeI', 'paytypeM',
        'admsrcI', 'admsrcM',
    ]
    # precare, previous births, gestation, delivery and complications
    birth_cols = [
        'precare', 'precare_name', 'prevlbd', 'prevlbd_name',
        'prevlbl', 'prevlbl_name', 'cebl', 'cebl_name',
        'ceb', 'ceb_name', 'prevsts', 'prevsts_name',
        'gest_days', 'gest_days_name', 'bthhour', 'bthwghtI',
        'caesar', 'caesar_name',
        'probl_1', 'probl_1_list', 'probl_2', 'probl_2_list',
        'probl_3', 'probl_3_list',
        'llbmths', 'llbyr', 'ltamth', 'ltamths', 'ltayr',
        'term_a20wks', 'term_a20wks_name',
        'term_b20wks', 'term_b20wks_name',
    ]
    # diagnosis / procedure columns: each stem followed by 00..04
    code_stems = (
        'diagM', 'proc_edasM', 'procM',
        'diagI', 'proc_edasI', 'procI',
    )
    code_cols = [
        stem + '%02d' % pos for stem in code_stems for pos in range(5)
    ]

    cols = (
        id_cols + date_cols + zip_cols + place_person_cols
        + admission_cols + birth_cols + code_cols
    )

    return mini_df[cols]

---
### Step 4: Read data

In [None]:
df = read_data()
print('Shape of data:', df.shape)
df.head(2)

---
### Step 5: Data preprocessing - add, recode, substitute

``sort values``

In [None]:
# sort rows by '_brthid' then '_sortid', and renumber the index from 0
df = df.sort_values(by=['_brthid', '_sortid']).reset_index(drop=True)

``dates``

In [None]:
# recode, substitute and add dates
df = add_dates(sub_missing_dates(recode_dates(df)))

``sex``

In [None]:
# recode, substitute sex
df = recode_sex(sub_missing_sex(df))

``race``

In [None]:
# recode, substitute race
df = sub_missing_race(recode_race(df))

``education``

In [None]:
# recode educ
df = recode_educ(df)

``zip`` 

In [None]:
# recode, substitute zip at birth for infant and mother
df = sub_missing_zip(recode_zip(df))

``county``

In [None]:
# recode county at birth for infant and mother
df = recode_county(df)

``state/country``

In [None]:
# recode state/country at birth for mother
df = recode_state_m(df)

In [None]:
# recode state/country at mother's own birth
df = recode_state_mob(df)

``caesar``

In [None]:
# recode C-section at birth
df = recode_caesar(df)

``pregnancy, labor, newborn complications``

In [None]:
# recode complications with pregnancy, labor or newborn
df = recode_probl(df)

``pregnancy precare``

In [None]:
# recode precare variables (month prenatal care began and number of previous visits)
df = recode_precare(df)

``previous births``

In [None]:
# recode previous births now dead or alive
df = recode_prev_births(df)

``gestation length``

In [None]:
df = recode_gest_days(df)

``admission source``

In [None]:
df = recode_admission_src(df)

``diagnosis codes``

In [None]:
df = recode_diagnosis_codes(df)

In [None]:
print(
    'Shape of data after Step 5: "Data preprocessing - add, recode, substitute":',
    df.shape
)

In [None]:
df.to_csv(in_dir + 'Birth_after_step5.csv')

---
### Step 6: Data preprocessing - add, drop

``keep if _linkedB=='Y'``

In [None]:
# the file was written with its index as the first column; read that column
# back as the index so it does not reappear as an 'Unnamed: 0' data column
df = pd.read_csv(in_dir + 'Birth_after_step5.csv', index_col=0)

In [None]:
# Keep births only if  vital stats birth, infant, and maternal discharge records are linked
mini_df = linked_births_only(df)

In [None]:
# print shape after this cleaning
print('Df shape after _linkedB record cleaning:', mini_df.shape)

``add ZIP geography. drop if zipI geography is nan()``

In [None]:
mini_df = add_drop_zip_geometry(mini_df)

In [None]:
# print shape after this cleaning
print('Df shape after ZCTA10I_centroid cleaning:', mini_df.shape)

``add rlnI status``

In [None]:
# add rlnI status variable
mini_df = rlnI_status(mini_df)

``add keys for merging or FE``

In [None]:
mini_df = add_keys(mini_df)

In [None]:
mini_df.to_csv(in_dir + 'Birth_after_step6.csv')

``save dfs``

In [None]:
# save df based on rlnI status (no rlnI; rlnI present at birth or 1st year of life)
# return only df with rlnI present
mini_df_rlnI = save_dfs_rlnI(mini_df)

In [None]:
print('Df shape after rlnI cleaning:', mini_df_rlnI.shape)
print(
    'Share of birth IDs retained relative to original dataset:',
     mini_df_rlnI._brthid.nunique()/df._brthid.nunique()
)

<span style="color:pink">[RESTART KERNEL HERE, otherwise you run out of memory]</span>

``process if rlnI assigned``

<span style="color:white">[important section if you do record linkage later on]</span>

In [None]:
print('Read data:')
# the file was written with its index as the first column; read that column
# back as the index so it does not reappear as an 'Unnamed: 0' data column
mini_df_rlnI = pd.read_csv(in_dir + 'Birth_pre_final_rlnI.csv', index_col=0)

In [None]:
# if more than one rlnI, assign the most frequent one
# if more than one rlnI and all have the same frequency, assign mode()
mini_df_rlnI = rlnI_births_process(mini_df_rlnI)

``add total hospital visits`` 

In [None]:
# compute number of visits for mother and child
print('Compute total hosp visits:')
mini_df_rlnI = total_hosp_vists(mini_df_rlnI)

In [None]:
print(
    'Shape of data after Step 6: "Data preprocessing - drop, add":',
    mini_df_rlnI.shape
)

---
### Step 7: Export data

In [None]:
# keep only cols of interest and output to .csv
keep_cols_of_interest(mini_df_rlnI).to_csv(out_dir + 'Birth_final.csv') #birth, infant, mother records

In [None]:
# these are zip codes in CA but they don't have a county assigned
#df[(df.cntyresI_name.eq('nan')) & (~df.zipI.isin(('nan', 'outside of US', 'homeless'))) & df.zipI.str[:2].isin(('90', '91', '92', '93', '94', '95', '96'))]['zipI'].unique()