# Process PSID dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import xmltodict

## Parse the .xml codebook

In [37]:
# Read the whole codebook XML into memory, decoding it as cp1252.
# The `with` block closes the handle on exit, so no explicit close() is needed.
with open("../data/psid/J322110_codebook.xml", "r", encoding="cp1252") as file:
    xml_text = file.read()

In [38]:
# Parse the XML text and drill straight down to the per-variable entries
# (CODEXML > LIST_JOBID > JOBID > LIST_VARIABLE > VARIABLE).
codebook_dict = xmltodict.parse(xml_text)["CODEXML"]["LIST_JOBID"]["JOBID"]["LIST_VARIABLE"]["VARIABLE"]

In [39]:
# Tabulate the parsed codebook entries so they can be inspected and joined on later.
codebook_df = pd.DataFrame(data=codebook_dict)
codebook_df

Unnamed: 0,YEAR,TYPE_ID,NAME,LABEL,QTEXT,ETEXT,LIST_CODE
0,2001,1,ER17001,RELEASE NUMBER,Release Number,"May 2008: Variables ER20395-ER20458, formerly...","{'CODE': [{'VALUE': '1', 'TEXT': 'Release numb..."
1,2001,1,ER17002,2001 FAMILY INTERVIEW (ID) NUMBER,2001 Interview Number,The values for this variable represent the 200...,"{'CODE': {'VALUE': '1 - 7,457', 'TEXT': 'Inter..."
2,2001,1,ER17004,PSID STATE OF RESIDENCE CODE,State of Residence,Please refer to PSID state codes here http://p...,"{'CODE': [{'VALUE': '1 - 51', 'TEXT': 'Actual ..."
3,2001,1,ER17013,AGE OF HEAD,Age of 2001 Head,This variable represents the actual age of the...,"{'CODE': [{'VALUE': '14 - 120', 'TEXT': 'Actua..."
4,2001,1,ER17014,SEX OF HEAD,Sex of 2001 Head,,"{'CODE': [{'VALUE': '1', 'TEXT': 'Male'}, {'VA..."
...,...,...,...,...,...,...,...
374,2021,1,ER81414,M12 WTR DONATED TO OTHER ORGANIZATIONS,M12. (Not counting any donations you just told...,,"{'CODE': [{'VALUE': '1', 'TEXT': 'Yes'}, {'VAL..."
375,2021,1,ER81416,M12B DOLLAR AMT OF OTHER DONATIONS,M12b. How much was that in 2020? (What was th...,,"{'CODE': [{'VALUE': '1 - 999,996', 'TEXT': 'Ac..."
376,2021,1,ER81775,TOTAL FAMILY INCOME-2020,Total 2020 Family Money Income,The income reported here was collected in 2021...,"{'CODE': [{'VALUE': '-999,997 - -1', 'TEXT': '..."
377,2021,1,ER81836,IMP WEALTH W/O EQUITY (WEALTH1) 2021,"Constructed Wealth Variable, Excluding Equity","Constructed wealth variable, excluding equity....","{'CODE': [{'VALUE': '-99,999,997 - -1', 'TEXT'..."


In [40]:
# standardize the column/label names
# Ordered (standardized_name, alternatives) rules used by label_reorg_func.
# `alternatives` is a list of substring groups: a rule matches when, for at
# least one group, ALL of its substrings occur in the label. The first matching
# rule wins, so the order below is significant (e.g. the health-donation rules
# must come before the generic HEALTH + STATUS rule).
_LABEL_RULES = [
    ("family_interview_id", [["FAMILY INTERVIEW (ID) NUMBER"]]),
    ("total_family_income", [["TOTAL FAMILY INCOME"]]),
    ("wtr_donated_to_health_org", [["WTR DONATED TO ORGANIZATN FOR HEALTH"]]),
    ("amt_of_health_donations", [["DOLLAR AMT OF HEALTH DONATIONS"]]),
    ("sequence_number", [["SEQUENCE NUMBER"]]),
    ("sex", [["SEX OF"]]),
    ("relation_to_rp", [["RELATION TO"]]),
    ("number_of_children_in_fu", [["CHILDREN"]]),
    ("health_status", [["HEALTH", "STATUS"]]),
    ("religion", [["RELIGIOUS PREF"]]),
    ("marital_status", [["MARITAL STATUS"]]),
    ("race", [["RACE OF"]]),
    ("age", [["AGE OF"]]),
    ("wealth_wo_equity", [["IMP WEALTH W/O EQUITY"]]),
    ("wealth_with_equity", [["IMP WEALTH W/ EQUITY"]]),
    # Accept both the HTML-escaped and the decoded spelling of ">" so the rule
    # does not depend on how the codebook text was (un)escaped.
    ("wtr_donated", [["WTR DONATION&gt;25"], ["WTR DONATION>25"]]),
    ("wtr_donated_to_religious_org", [["WTR DONATED TO RELIGIOUS ORGANIZATION"]]),
    ("amt_of_religious_donations", [["DOLLAR AMT OF RELIGIOUS DONATIONS"]]),
    ("wtr_donated_to_combo_purpose_org", [["WTR DONATD TO COMBO PURPOSE ORGANIZTN"]]),
    ("amt_of_combo_donations", [["DOLLAR AMT OF COMBO DONATIONS"]]),
    ("wtr_donated_to_org_for_needy", [["WTR DONATED TO ORGANIZATION FOR NEEDY"]]),
    ("amt_of_needy_donations", [["DOLLAR AMT OF NEEDY DONATIONS"]]),
    ("wtr_donated_to_edu_org", [["WTR DONATED TO ORGANZTION FOR EDUCATN"]]),
    ("amt_of_edu_donations", [["DOLLAR AMT OF EDUCATION DONATIONS"]]),
    ("wtr_donated_to_youth_org", [["WTR DONATED TO YOUTH ORGANIZATIONS"]]),
    ("amt_of_youth_donations", [["DOLLAR AMT OF YOUTH ORG DONATIONS"]]),
    ("wtr_donated_to_cultural_org", [["WTR DONATED TO CULTURAL ORGS"]]),
    ("amt_of_cultural_donations", [["DOLLAR AMT OF CULTURAL DONATIONS"]]),
    ("wtr_donated_to_community_org", [["WTR DONATED TO COMMUNITY ORGS"]]),
    ("amt_of_community_donations", [["DOLLAR AMT OF COMMUNITY DONATIONS"]]),
    ("wtr_donated_to_env_org", [["WTR DONATED TO ENVIRONMENT ORGS"]]),
    ("amt_of_env_donations", [["DOLLAR AMT OF ENVIRONMENT DONATION"]]),
    ("wtr_donated_to_other", [["WTR DONATED TO OTHER ORGANIZATIONS"]]),
    (
        "amt_of_other_donations",
        [["AMT DONATED TO ORGS IN T7A-F"], ["DOLLAR AMT OF OTHER DONATIONS"]],
    ),
    ("wtr_donated_intl/peace_org", [["WTR DONATED TO INTERNATL/PEACE ORG"]]),
    ("amt_of_intl/peace_donations", [["DOLLAR AMT OF INTERNATL/PEACE DNTN"]]),
    ("person_number", [["PERSON NUMBER"]]),
]


def label_reorg_func(label:str):
    """Map a raw codebook LABEL to a year-independent standardized name.

    The label is tested against ``_LABEL_RULES`` in order using case-sensitive
    substring checks, and the name of the first matching rule is returned.

    Parameters
    ----------
    label : str
        Raw variable label taken from the codebook.

    Returns
    -------
    str
        The standardized name of the first matching rule. When no rule
        matches, the label itself with spaces replaced by underscores and
        lower-cased.
    """
    for standardized, alternatives in _LABEL_RULES:
        if any(all(part in label for part in required) for required in alternatives):
            return standardized
    # No rule applies: fall back to a mechanical snake-case-like form.
    return label.replace(" ", "_").lower()

# Store the standardized name next to each variable's raw LABEL.
codebook_df["STANDARDIZED_LABEL"] = codebook_df["LABEL"].apply(label_reorg_func)
codebook_df

Unnamed: 0,YEAR,TYPE_ID,NAME,LABEL,QTEXT,ETEXT,LIST_CODE,STANDARDIZED_LABEL
0,2001,1,ER17001,RELEASE NUMBER,Release Number,"May 2008: Variables ER20395-ER20458, formerly...","{'CODE': [{'VALUE': '1', 'TEXT': 'Release numb...",release_number
1,2001,1,ER17002,2001 FAMILY INTERVIEW (ID) NUMBER,2001 Interview Number,The values for this variable represent the 200...,"{'CODE': {'VALUE': '1 - 7,457', 'TEXT': 'Inter...",family_interview_id
2,2001,1,ER17004,PSID STATE OF RESIDENCE CODE,State of Residence,Please refer to PSID state codes here http://p...,"{'CODE': [{'VALUE': '1 - 51', 'TEXT': 'Actual ...",psid_state_of_residence_code
3,2001,1,ER17013,AGE OF HEAD,Age of 2001 Head,This variable represents the actual age of the...,"{'CODE': [{'VALUE': '14 - 120', 'TEXT': 'Actua...",age
4,2001,1,ER17014,SEX OF HEAD,Sex of 2001 Head,,"{'CODE': [{'VALUE': '1', 'TEXT': 'Male'}, {'VA...",sex
...,...,...,...,...,...,...,...,...
374,2021,1,ER81414,M12 WTR DONATED TO OTHER ORGANIZATIONS,M12. (Not counting any donations you just told...,,"{'CODE': [{'VALUE': '1', 'TEXT': 'Yes'}, {'VAL...",wtr_donated_to_other
375,2021,1,ER81416,M12B DOLLAR AMT OF OTHER DONATIONS,M12b. How much was that in 2020? (What was th...,,"{'CODE': [{'VALUE': '1 - 999,996', 'TEXT': 'Ac...",amt_of_other_donations
376,2021,1,ER81775,TOTAL FAMILY INCOME-2020,Total 2020 Family Money Income,The income reported here was collected in 2021...,"{'CODE': [{'VALUE': '-999,997 - -1', 'TEXT': '...",total_family_income
377,2021,1,ER81836,IMP WEALTH W/O EQUITY (WEALTH1) 2021,"Constructed Wealth Variable, Excluding Equity","Constructed wealth variable, excluding equity....","{'CODE': [{'VALUE': '-99,999,997 - -1', 'TEXT'...",wealth_wo_equity


In [41]:
# Sanity check: list every distinct standardized label produced by the mapping.
codebook_df["STANDARDIZED_LABEL"].unique()

array(['release_number', 'family_interview_id',
       'psid_state_of_residence_code', 'age', 'sex',
       'number_of_children_in_fu', 'marital_status', 'health_status',
       'race', 'religion', 'wtr_donated', 'wtr_donated_to_religious_org',
       'amt_of_religious_donations', 'wtr_donated_to_combo_purpose_org',
       'amt_of_combo_donations', 'wtr_donated_to_org_for_needy',
       'amt_of_needy_donations', 'wtr_donated_to_health_org',
       'amt_of_health_donations', 'wtr_donated_to_edu_org',
       'amt_of_edu_donations', 'total_family_income', 'wealth_wo_equity',
       'wealth_with_equity', 'wtr_donated_to_youth_org',
       'amt_of_youth_donations', 'wtr_donated_to_cultural_org',
       'amt_of_cultural_donations', 'wtr_donated_to_community_org',
       'amt_of_community_donations', 'wtr_donated_to_env_org',
       'amt_of_env_donations', 'wtr_donated_intl/peace_org',
       'wtr_donated_to_other', 'amt_of_other_donations',
       'm12_type_organization_donated_to',
       '

In [42]:
# Lookup tables keyed by the variable NAME (e.g. "ER17001"):
# one gives the variable's YEAR, the other its standardized label.
codebook_by_name = codebook_df.set_index("NAME")
code_mapping_dict_year = codebook_by_name["YEAR"].to_dict()
code_mapping_dict_stdlab = codebook_by_name["STANDARDIZED_LABEL"].to_dict()

# Wrangle .csv for analysis

In [43]:
# Reading the workbook directly does NOT work: the .xlsx is encoded with
# "synchVertical" rather than "syncVertical", and openpyxl does not support synchVertical.
# Current workaround: export the data manually to a .csv with a local MS Excel, then read the .csv.

# psid_raw = pd.read_excel("../data/psid/J321540.xlsx", sheet_name="Data")

psid_raw = pd.read_csv(filepath_or_buffer="../data/psid/J322110.csv")
psid_raw

Unnamed: 0,ER17001,ER17002,ER17004,ER17013,ER17014,ER17016,ER17024,ER19612,ER19989,ER20038,...,ER81398,ER81404,ER81405,ER81406,ER81407,ER81414,ER81416,ER81775,ER81836,ER81838
0,7.0,96.0,41.0,49.0,1.0,1.0,1.0,1.0,1.0,0.0,...,,,,,,,,,,
1,7.0,5987.0,41.0,47.0,2.0,0.0,4.0,3.0,1.0,8.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,7.0,7091.0,41.0,41.0,2.0,1.0,4.0,2.0,1.0,8.0,...,0.0,5.0,0.0,0.0,0.0,5.0,0.0,11720.0,50000.0,230000.0
4,7.0,5964.0,41.0,36.0,1.0,3.0,1.0,1.0,1.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17802,,,,,,,,,,,...,,,,,,,,,,
17803,,,,,,,,,,,...,,,,,,,,,,
17804,,,,,,,,,,,...,,,,,,,,,,
17805,,,,,,,,,,,...,,,,,,,,,,


In [44]:
# Stack each year's income, contribution, etc. into the same columns.
# Strategy: look up every raw column's year in the codebook, bucket the columns
# by year, rename each bucket to the standardized labels and concatenate the
# per-year frames row-wise.
#
# The buckets are built by hand instead of `psid_raw.groupby(..., axis=1)`
# because column-wise groupby is deprecated in pandas >= 2.1.
columns_by_year = {}
for column in psid_raw.columns:
    column_year = code_mapping_dict_year.get(column)
    if pd.isna(column_year):
        # Columns without a codebook year are left out, as groupby would do.
        continue
    columns_by_year.setdefault(column_year, []).append(column)

# (year, sub-frame) pairs in ascending year order, matching groupby iteration.
psid_year_grouped = [
    (year, psid_raw[columns]) for year, columns in sorted(columns_by_year.items())
]
psid_processed = pd.concat(
    [
        psid_year.rename(columns=code_mapping_dict_stdlab).assign(year=year)
        for year, psid_year in psid_year_grouped
    ],
    axis=0,
)

# this cannot be used as is because families missing certain
# years of interviews are still kept in the original .csv with
# a value of nan in the year's variables
psid_processed

Unnamed: 0,release_number,family_interview_id,psid_state_of_residence_code,age,sex,number_of_children_in_fu,marital_status,health_status,race,religion,...,wtr_donated_to_other,amt_of_other_donations,m12_type_organization_donated_to,m12c_wtr_other_donations_ge_200,mgroup1_wtr_donate_combo/needy/med/intl,mgroup2_wtr_donate_educ/youth/cultr/envr,mcovid1_wtr_donate_to_help_re_pandemic,mcovid2_wtr_pandemic_donations_crowdfund,mcovid3_largest_crowdfund_donation_to,mcovid4_dollar_amt_pandemic_donations
0,7.0,96.0,41.0,49.0,1.0,1.0,1.0,1.0,1.0,0.0,...,,,,,,,,,,
1,7.0,5987.0,41.0,47.0,2.0,0.0,4.0,3.0,1.0,8.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,7.0,7091.0,41.0,41.0,2.0,1.0,4.0,2.0,1.0,8.0,...,,,,,,,,,,
4,7.0,5964.0,41.0,36.0,1.0,3.0,1.0,1.0,1.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17802,,,,,,,,,,,...,,,,,,,,,,
17803,,,,,,,,,,,...,,,,,,,,,,
17804,,,,,,,,,,,...,,,,,,,,,,
17805,,,,,,,,,,,...,,,,,,,,,,


In [45]:
# Preview only the rows that have an interview id for their year
# (display only; psid_processed itself is not modified here).
psid_processed.loc[psid_processed["family_interview_id"].notna()]

Unnamed: 0,release_number,family_interview_id,psid_state_of_residence_code,age,sex,number_of_children_in_fu,marital_status,health_status,race,religion,...,wtr_donated_to_other,amt_of_other_donations,m12_type_organization_donated_to,m12c_wtr_other_donations_ge_200,mgroup1_wtr_donate_combo/needy/med/intl,mgroup2_wtr_donate_educ/youth/cultr/envr,mcovid1_wtr_donate_to_help_re_pandemic,mcovid2_wtr_pandemic_donations_crowdfund,mcovid3_largest_crowdfund_donation_to,mcovid4_dollar_amt_pandemic_donations
0,7.0,96.0,41.0,49.0,1.0,1.0,1.0,1.0,1.0,0.0,...,,,,,,,,,,
1,7.0,5987.0,41.0,47.0,2.0,0.0,4.0,3.0,1.0,8.0,...,,,,,,,,,,
3,7.0,7091.0,41.0,41.0,2.0,1.0,4.0,2.0,1.0,8.0,...,,,,,,,,,,
4,7.0,5964.0,41.0,36.0,1.0,3.0,1.0,1.0,1.0,0.0,...,,,,,,,,,,
6,7.0,5479.0,41.0,26.0,1.0,0.0,1.0,3.0,1.0,8.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17795,1.0,7197.0,42.0,63.0,2.0,0.0,4.0,4.0,2.0,6.0,...,5.0,0.0,,,5.0,5.0,5.0,0.0,0.0,0.0
17797,1.0,4856.0,12.0,64.0,1.0,0.0,4.0,3.0,2.0,6.0,...,5.0,0.0,,,5.0,5.0,5.0,0.0,0.0,0.0
17799,1.0,8687.0,12.0,58.0,1.0,0.0,2.0,4.0,2.0,6.0,...,5.0,0.0,,,5.0,5.0,5.0,0.0,0.0,0.0
17800,1.0,9242.0,10.0,33.0,2.0,1.0,2.0,2.0,2.0,99.0,...,5.0,0.0,,,5.0,5.0,5.0,0.0,0.0,0.0


In [46]:
# Remove the two M12 follow-up columns from the stacked frame.
columns_to_drop = ['m12_type_organization_donated_to', 'm12c_wtr_other_donations_ge_200']
psid_processed = psid_processed.drop(columns=columns_to_drop)

In [47]:
# Number of non-missing values per column for each survey year.
psid_processed.groupby(by='year').count()

Unnamed: 0_level_0,release_number,family_interview_id,psid_state_of_residence_code,age,sex,number_of_children_in_fu,marital_status,health_status,race,religion,...,amt_of_env_donations,wtr_donated_intl/peace_org,wtr_donated_to_other,amt_of_other_donations,mgroup1_wtr_donate_combo/needy/med/intl,mgroup2_wtr_donate_educ/youth/cultr/envr,mcovid1_wtr_donate_to_help_re_pandemic,mcovid2_wtr_pandemic_donations_crowdfund,mcovid3_largest_crowdfund_donation_to,mcovid4_dollar_amt_pandemic_donations
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001,7406,7406,7406,7406,7406,7406,7406,7406,7406,7406,...,0,0,0,0,0,0,0,0,0,0
2003,7822,7822,7822,7822,7822,7822,7822,7822,7822,7822,...,7822,7822,7822,7822,0,0,0,0,0,0
2005,8002,8002,8002,8002,8002,8002,8002,8002,8002,8002,...,8002,8002,8002,8002,0,0,0,0,0,0
2007,8289,8289,8289,8289,8289,8289,8289,8289,8289,8289,...,8289,8289,8289,8289,0,0,0,0,0,0
2009,8690,8690,8690,8690,8690,8690,8690,8690,8690,8690,...,8690,8690,8690,8690,0,0,0,0,0,0
2011,8907,8907,8907,8907,8907,8907,8907,8907,8907,8907,...,8907,8907,8907,8907,0,0,0,0,0,0
2013,9063,9063,9063,9063,9063,9063,9063,9063,9063,9063,...,9063,9063,9063,9063,0,0,0,0,0,0
2015,9048,9048,9048,9048,9048,9048,9048,9048,9048,9048,...,9048,9048,9048,9048,0,0,0,0,0,0
2017,9607,9607,9607,9607,9607,9607,9607,9607,9607,9607,...,9607,9607,9607,9607,0,0,0,0,0,0
2019,9569,9569,9569,9569,9569,9569,9569,9569,9569,9569,...,9569,9569,9569,9569,0,0,0,0,0,0
