# SIPP Data Processing

In this notebook, I process data from the US Census Bureau's Survey of Income and Program Participation (SIPP) for survey years 2018–2021 and export the relevant columns to `SIPP_data.csv`.


In [1]:
import pandas as pd
import numpy as np

In [89]:
def prep_data(data_file, rd_schema, rw_schema, year, max_child_age=14):
    """
    Process one year of SIPP survey data.

    Read the schema (rd_schema) to get the column order and data type of each
    variable, then read the relevant columns from the pipe-delimited census
    data file (data_file). Derive MAX_TCBYR as the row-wise max of the
    TCBYR_1..TCBYR_6 columns, keep only the rows where
    ``year - MAX_TCBYR <= max_child_age`` (with the default of 14, a child
    under 15), and add a YEAR column holding the survey year.

    Parameters
    ----------
    data_file : str or path-like
        Pipe-delimited data file. Its first line is treated as a header and
        replaced by the column names listed in the schema.
    rd_schema : str or path-like
        JSON schema with a 'name' and a 'dtype' field per column; 'dtype'
        must be 'integer', 'string' or 'float' for every column loaded.
    rw_schema : str or path-like
        Not used. Accepted only so existing four-argument calls keep working.
    year : int
        Survey year; written to the YEAR column and used for the age filter.
    max_child_age : int, default 14
        Largest value of ``year - MAX_TCBYR`` for which a row is kept.

    Returns
    -------
    pandas.DataFrame
        The selected columns plus MAX_TCBYR and YEAR, restricted to the rows
        that pass the age filter. Rows with no TCBYR value are dropped.

    Raises
    ------
    ValueError
        If the schema declares an unsupported dtype for a loaded column.
    """
    # Schema dtype label -> pandas dtype. The nullable Int64/Float64 dtypes
    # let numeric columns hold missing values.
    dtype_map = {'integer': 'Int64', 'string': 'object', 'float': 'Float64'}
    keep_cols = ['SSUID', 'PNUM', 'MONTHCODE', 'SPANEL', 'TST_INTV', 'RREGION_INTV',
                 'TMETRO_INTV', 'TAGE', 'ESEX', 'ERACE', 'EORIGIN', 'EEDUC', 'TCBYR_1', 'TCBYR_2',
                 'TCBYR_3', 'TCBYR_4', 'TCBYR_5', 'TCBYR_6', 'EDAYCARE', 'EDAYHS', 'EFAM',
                 'EGRAN', 'EHEADST', 'EJB1_AWOP1', 'EJB1_AWOPRE1', 'EJB1_AWOPSM1',
                 'EJB1_PTRESN1', 'ELIST', 'ENJ_NOWRK6', 'ENREL', 'ENUR', 'ENURHS',
                 'EOTHR', 'EPAR', 'EPAY', 'EPAYHELP', 'EPROG', 'ESELF', 'ESIB15',
                 'ETIMELOST', 'ETIMELOST_TP', 'EWORKMORE', 'TJB1_ANNSAL1', 'TPAYWK']
    birth_year_cols = ['TCBYR_1', 'TCBYR_2', 'TCBYR_3', 'TCBYR_4', 'TCBYR_5', 'TCBYR_6']

    schema = pd.read_json(rd_schema)

    # Only the columns actually loaded need a dtype; fail early and clearly
    # if the schema uses a label we do not know how to translate.
    wanted = set(keep_cols)
    col_dtypes = {}
    for name, raw_dtype in zip(schema['name'], schema['dtype']):
        if name not in wanted:
            continue
        if raw_dtype not in dtype_map:
            raise ValueError(
                f"Unsupported dtype {raw_dtype!r} for column {name!r} "
                f"in schema {rd_schema!r}"
            )
        col_dtypes[name] = dtype_map[raw_dtype]

    data_df = pd.read_csv(
        data_file,
        names=schema['name'].tolist(),
        dtype=col_dtypes,
        sep='|',
        header=0,
        usecols=keep_cols,
    )

    # The largest birth year across the child slots belongs to the youngest child.
    data_df['MAX_TCBYR'] = data_df[birth_year_cols].max(axis=1)
    youngest_age = year - data_df['MAX_TCBYR']
    # A missing birth year yields NA in the comparison; treat it explicitly
    # as "does not qualify" instead of relying on implicit NA-mask handling.
    has_young_child = (youngest_age <= max_child_age).fillna(False)
    # .copy() so that adding YEAR below writes to an independent frame rather
    # than a slice of the unfiltered data (avoids SettingWithCopyWarning).
    data_df = data_df[has_young_child].copy()
    data_df['YEAR'] = year
    return data_df

In [90]:
# Load and filter each survey year separately; every year has its own data
# file and schema files.
SIPP_2018 = prep_data(
    data_file='pu2018.csv',
    rd_schema='pu2018_schema.json',
    rw_schema='rw2018_schema.json',
    year=2018,
)
SIPP_2019 = prep_data(
    data_file='pu2019.csv',
    rd_schema='pu2019_schema.json',
    rw_schema='rw2019_schema.json',
    year=2019,
)
SIPP_2020 = prep_data(
    data_file='pu2020.csv',
    rd_schema='pu2020_schema.json',
    rw_schema='rw2020_schema.json',
    year=2020,
)
SIPP_2021 = prep_data(
    data_file='pu2021.csv',
    rd_schema='pu2021_schema.json',
    rw_schema='rw2021_schema.json',
    year=2021,
)

In [94]:
# Per-year frames in chronological order, ready to be concatenated.
dfs = [SIPP_2018, SIPP_2019, SIPP_2020, SIPP_2021]

In [95]:
# Stack the yearly frames row-wise; ignore_index discards each frame's
# original row labels and assigns a fresh 0..n-1 index.
SIPP_data = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [96]:
# Export the combined data. With pandas' default index=True the row index is
# written as the first, unnamed column of the file.
SIPP_data.to_csv(path_or_buf='SIPP_data.csv')