In [73]:
import pandas as pd
import numpy as np

# Basic Exploratory Data Analysis:
## Financial Well-Being Survey
Consumer Financial Protection Bureau (CFPB), 2017

#### Items to address
* What questions can we address with each of these data sets?
* Breakdown of Financial Literacy Education & Behaviors, Home Ownership, Debt-to-Income by Demographic Information (Race/Ethnicity, Age, Gender, etc.)
* Is there any correlation between financial literacy education and each of our financial success metrics? Does that relationship differ across demographic groups?
* Include data visualizations where appropriate


In [74]:
# Load the survey data CSV from the notebook's working directory into a DataFrame.
df = pd.read_csv("./NFWBS_PUF_2016_data.csv")

In [75]:
# Survey columns most relevant to our problem statement, grouped by theme.
# The order matters: it is the column order of the filtered frame.
columns_of_interest = (
    # respondent id and demographics
    ['PUF_ID', 'agecat', 'PPEDUC', 'PPETHM', 'PPGENDER',
     'PPINCIMP', 'PPHHSIZE', 'fpl', 'PPREG4', 'PPREG9']
    # self-assessed well-being, knowledge and follow-through
    + ['FWB1_1', 'FWB1_5', 'FS2_3', 'SUBKNOWL1', 'ACT1_2']
    # planning and money-management behaviours
    + ['PROPPLAN_3', 'PROPPLAN_1', 'MANAGE1_2', 'MANAGE1_1', 'MANAGE1_3']
    # savings
    + ['SAVEHABIT', 'AUTOMATED_1', 'AUTOMATED_2', 'SAVINGSRANGES']
    # housing
    + ['HOUSING', 'HOUSERANGES', 'MORTGAGE']
    # loans / credit products
    + ['PRODHAVE_8', 'PRODUSE_1', 'PRODUSE_2']
    # scale scores
    + ['FWBscore', 'FSscore']
)

In [76]:
# Column name -> human-readable variable label; might come in handy when
# labelling tables and charts during EDA.
col_map = dict(
    # demographics
    agecat="Age Group",
    PPEDUC="Education (Highest Degree)",
    PPETHM="Race/Ethnicity",
    PPGENDER="Gender",
    PPINCIMP="Household Income",
    PPHHSIZE="Household Size",
    fpl="Federal Poverty Level",
    PPREG4="Census Region",
    PPREG9="Census Division",
    # self-assessment items
    FWB1_1="I could handle a major unexpected expense.",
    FWB1_5="I am just getting by financially",
    FS2_3="I struggle to understand financial information",
    SUBKNOWL1="How would you assess your overall financial knowledge?",
    ACT1_2="I follow-through on financial goals I set for myself.",
    # planning and money management
    PROPPLAN_3="I set financial goals for what I want to achieve with my money",
    PROPPLAN_1="I consult my budget to see how much money I have left",
    MANAGE1_2="Stayed within your budget or spending plan",
    MANAGE1_1="Paid all your bills on time",
    MANAGE1_3="Paid off credit card balance in full",
    # savings
    SAVEHABIT="Putting money into savings is a habit for me",
    AUTOMATED_1="A Retirement Savings Account",
    AUTOMATED_2="A Non-Retirement Savings Account",
    SAVINGSRANGES="How much money do you have in savings today?",
    # housing
    HOUSING="Which one of the following best describes your housing situation?",
    HOUSERANGES="About how much do you pay for your home each month?",
    MORTGAGE="What do you owe on your home?",
    # loans / credit products
    PRODHAVE_8="Student/Education Loan (for yourself or someone else)",
    PRODUSE_1="Used payday loan or cash advance loan",
    PRODUSE_2="Used pawn loan or auto title loan",
    # scale scores
    FWBscore="Financial Well-Being Scale score",
    FSscore="Financial Skill Scale score",
)

In [77]:
# Narrow the raw frame down to the columns of interest and preview the first rows.
filtered_df = df[columns_of_interest]
filtered_df.head(n=5)

Unnamed: 0,PUF_ID,agecat,PPEDUC,PPETHM,PPGENDER,PPINCIMP,PPHHSIZE,fpl,PPREG4,PPREG9,...,AUTOMATED_2,SAVINGSRANGES,HOUSING,HOUSERANGES,MORTGAGE,PRODHAVE_8,PRODUSE_1,PRODUSE_2,FWBscore,FSscore
0,10350,8,4,1,1,7,1,3,4,8,...,0,6,1,4,2,0,0,0,55,44
1,7740,3,2,1,1,6,2,3,2,3,...,0,2,1,4,2,0,0,0,51,43
2,13699,3,3,2,1,6,3,3,4,9,...,1,4,1,3,2,0,0,0,49,42
3,7267,3,2,1,1,8,1,3,3,7,...,-1,-1,-1,99,-2,0,0,0,49,42
4,7375,2,2,3,1,7,5,3,2,4,...,1,98,2,2,-2,0,0,1,49,42


### What questions can we address with this data set?

We can address questions related to: 
* the current state of financial well-being and financial skill levels within the population
* the current state of financial well-being and financial skill levels within sub-populations and different communities
* trends between financial skills and financial well-being among the general population
* trends between financial skills and financial well-being among different sub-populations

## Current state of financial literacy and financial success metrics

#### Breakdown of Financial Literacy Education & Behaviors by Demographic Information

In [145]:
# Demographic columns (gender, race/ethnicity, age group) used to break down each metric.
demographic_cols = ['PPGENDER','PPETHM','agecat']

In [146]:
# Financial-literacy measures, preceded by the demographic columns they are
# broken down by.
fin_lit_cols = [
    *demographic_cols,
    'FSscore',
    'FS2_3',
    'SUBKNOWL1',
    'ACT1_2',
    'PROPPLAN_3',
    'PROPPLAN_1',
    'MANAGE1_2',
]

In [147]:
# Build the financial-literacy frame: select the columns, force integer codes,
# then swap each column's codes for its response labels.
# NOTE: value_map is defined in a cell at the very end of this notebook, so that
# cell must be run first; on a fresh kernel this cell raises NameError otherwise.
fin_lit_df = (
    filtered_df[fin_lit_cols]
    .astype('int')
    .replace(value_map)  # nested dict: {column: {code: label}}
)
fin_lit_df.head()

Unnamed: 0,PPGENDER,PPETHM,agecat,FSscore,FS2_3,SUBKNOWL1,ACT1_2,PROPPLAN_3,PROPPLAN_1,MANAGE1_2
0,Male,"White, Non-Hispanic",75+,44,Often,5,Somewhat,Agree,Strongly agree,Often
1,Male,"White, Non-Hispanic",35-44,43,Rarely,5,Somewhat,Disagree,Neither agree nor disagree,Often
2,Male,"Black, Non-Hispanic",35-44,42,Sometimes,5,Somewhat,Agree,Agree,Sometimes
3,Male,"White, Non-Hispanic",35-44,42,Sometimes,Refused,Refused,Neither agree nor disagree,Neither agree nor disagree,Often
4,Male,"Other, Non-Hispanic",25-34,42,Sometimes,4,Somewhat,Neither agree nor disagree,Neither agree nor disagree,Sometimes


--- GENDER ---

In [148]:
# Financial Skill score by gender: total score, respondent count and mean score.
#
# FSscore has no entry in value_map, so it still holds raw numeric codes.
# Negative codes mark non-responses in this survey (e.g. -1 "Refused"); if
# FSscore carries any, summing and counting them as real scores would drag the
# mean down. Drop them before aggregating.
gender_scores = fin_lit_df.loc[fin_lit_df['FSscore'] >= 0, ['PPGENDER', 'FSscore']]

# One pass with named aggregation; FSscore_per_unit is sum / count, i.e. the mean.
fs_score_gender = gender_scores.groupby('PPGENDER')['FSscore'].agg(
    FSscore='sum',
    Participant_Count='count',
    FSscore_per_unit='mean',
)
fs_score_gender

Unnamed: 0_level_0,FSscore,Participant_Count,FSscore_per_unit
PPGENDER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,151825,3042,49.909599
Male,172473,3352,51.453759


--- RACE/ETHNICITY ---

In [143]:
# Financial Skill score by race/ethnicity: total score, respondent count and mean score.
#
# FSscore has no entry in value_map, so it still holds raw numeric codes.
# Negative codes mark non-responses in this survey (e.g. -1 "Refused"); if
# FSscore carries any, summing and counting them as real scores would drag the
# mean down. Drop them before aggregating.
eth_scores = fin_lit_df.loc[fin_lit_df['FSscore'] >= 0, ['PPETHM', 'FSscore']]

# One pass with named aggregation; FSscore_per_unit is sum / count, i.e. the mean.
fs_score_eth = eth_scores.groupby('PPETHM')['FSscore'].agg(
    FSscore='sum',
    Participant_Count='count',
    FSscore_per_unit='mean',
)
fs_score_eth


Unnamed: 0_level_0,FSscore,Participant_Count,FSscore_per_unit
PPETHM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Black, Non-Hispanic",34979,685,51.064234
Hispanic,42463,875,48.529143
"Other, Non-Hispanic",16790,336,49.970238
"White, Non-Hispanic",230066,4498,51.14851


--- AGE ---

In [144]:
# Financial Skill score by age group: total score, respondent count and mean score.
#
# FSscore has no entry in value_map, so it still holds raw numeric codes.
# Negative codes mark non-responses in this survey (e.g. -1 "Refused"); if
# FSscore carries any, summing and counting them as real scores would drag the
# mean down. Drop them before aggregating.
age_scores = fin_lit_df.loc[fin_lit_df['FSscore'] >= 0, ['agecat', 'FSscore']]

# One pass with named aggregation; FSscore_per_unit is sum / count, i.e. the mean.
fs_score_age = age_scores.groupby('agecat')['FSscore'].agg(
    FSscore='sum',
    Participant_Count='count',
    FSscore_per_unit='mean',
)
fs_score_age

Unnamed: 0_level_0,FSscore,Participant_Count,FSscore_per_unit
agecat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
18-24,20419,414,49.321256
25-34,56148,1116,50.311828
35-44,41465,828,50.078502
45-54,53531,1075,49.796279
55-61,36127,708,51.026836
62-69,52989,1021,51.899119
70-74,25465,496,51.340726
75+,38154,736,51.839674


#### Breakdown of Home Ownership by Demographic Information

--- GENDER ---

--- RACE/ETHNICITY ---

--- AGE ---

#### Breakdown of Debt-to-Income Ratio by Demographic Information

--- GENDER ---

--- RACE/ETHNICITY ---

--- AGE ---

## Patterns between financial literacy and financial success metrics

#### Any correlation between financial literacy education and each of our financial success metrics?

#### Any differences among different demographics?

--- GENDER ---

--- RACE/ETHNICITY ---

--- AGE ---

In [120]:
# Value mapping from numeric survey codes to the actual response labels, one
# inner dict per column; might come in handy when doing EDA.
# NOTE: the fin_lit_df cell calls .replace(value_map), so this cell has to be
# run before it even though it sits at the end of the notebook.


def _coded(*labels, first=1):
    """Return a new dict numbering `labels` consecutively, starting at `first`."""
    return dict(enumerate(labels, start=first))


# Special (non-substantive) response codes shared by several questions.
_REFUSED = {-1: "Refused"}
_NOT_WRITTEN = {-4: "Response not written to database"}
_NOT_ASKED = {-2: "Question not asked because respondent not in item base"}
_UNKNOWN_OR_WITHHELD = {98: "I dont know", 99: "Prefer not to say"}

# Response scales reused by more than one question.
_ABILITY_SCALE = ("Not at all", "Very little", "Somewhat", "Very well", "Completely")
_FREQUENCY_SCALE = ("Not applicable or never", "Seldom", "Sometimes", "Often", "Always")
_AGREEMENT_SCALE = (
    "Strongly disagree",
    "Disagree",
    "Neither agree nor disagree",
    "Agree",
    "Strongly agree",
)
_NO_YES = ("No", "Yes")  # coded from 0, not 1

# Every column gets its own freshly built dict, so no two entries share an object.
value_map = {
    # --- demographics ---
    "agecat": _coded("18-24", "25-34", "35-44", "45-54", "55-61", "62-69", "70-74", "75+"),
    "PPEDUC": _coded(
        "Less than high school",
        "High school degree/GED",
        "Some college/Associate",
        "Bachelors degree",
        "Graduate/professional degree",
    ),
    "PPETHM": _coded(
        "White, Non-Hispanic",
        "Black, Non-Hispanic",
        "Other, Non-Hispanic",
        "Hispanic",
    ),
    "PPGENDER": _coded("Male", "Female"),
    "PPINCIMP": _coded(
        "Less than $20,000",
        "$20,000 to $29,999",
        "$30,000 to $39,999",
        "$40,000 to $49,999",
        "$50,000 to $59,999",
        "$60,000 to $74,999",
        "$75,000 to $99,999",
        "$100,000 to $149,999",
        "$150,000 or more",
    ),
    "PPHHSIZE": _coded("1", "2", "3", "4", "5+"),
    "fpl": _coded("<100% FPL", "100%-199% FPL", "200%+ FPL"),
    "PPREG4": _coded("Northeast", "Midwest", "South", "West"),
    "PPREG9": _coded(
        "New England",
        "Mid-Atlantic",
        "East-North Central",
        "West-North Central",
        "South Atlantic",
        "East-South Central",
        "West-South Central",
        "Mountain",
        "Pacific",
    ),
    # --- self-assessment items ---
    "FWB1_1": {**_NOT_WRITTEN, **_REFUSED, **_coded(*_ABILITY_SCALE)},
    "FWB1_5": {**_NOT_WRITTEN, **_REFUSED, **_coded(*_ABILITY_SCALE)},
    "FS2_3": {**_REFUSED, **_coded("Never", "Rarely", "Sometimes", "Often", "Always")},
    "SUBKNOWL1": {
        **_REFUSED,
        **_coded("1 - Very low", "2", "3", "4", "5", "6", "7 - Very high"),
    },
    "ACT1_2": {**_REFUSED, **_coded(*_ABILITY_SCALE)},
    # --- planning and money management ---
    "PROPPLAN_1": {**_REFUSED, **_coded(*_AGREEMENT_SCALE)},
    "PROPPLAN_3": {**_REFUSED, **_coded(*_AGREEMENT_SCALE)},
    "MANAGE1_1": {**_REFUSED, **_coded(*_FREQUENCY_SCALE)},
    "MANAGE1_2": {**_REFUSED, **_coded(*_FREQUENCY_SCALE)},
    "MANAGE1_3": {**_REFUSED, **_coded(*_FREQUENCY_SCALE)},
    # --- savings ---
    "SAVEHABIT": {
        **_REFUSED,
        **_coded(
            "Strongly disagree",
            "Disagree",
            "Disagree slightly",
            "Agree slightly",
            "Agree",
            "Strongly agree",
        ),
    },
    "AUTOMATED_1": {
        **_REFUSED,
        **_coded(*_NO_YES, first=0),
        7: "I do not have this type of account",
    },
    "AUTOMATED_2": {
        **_REFUSED,
        **_coded(*_NO_YES, first=0),
        7: "I do not have this type of account",
    },
    "SAVINGSRANGES": {
        **_REFUSED,
        **_coded(
            "0",
            "$1-99",
            "$100-999",
            "$1,000-4,999",
            "$5,000-19,999",
            "$20,000-74,999",
            "$75,000 or more",
        ),
        **_UNKNOWN_OR_WITHHELD,
    },
    # --- housing ---
    "HOUSING": {
        **_REFUSED,
        **_coded("I own my home", "I rent", "I do not currently own or rent"),
    },
    "HOUSERANGES": {
        **_REFUSED,
        **_coded(
            "Less than $300",
            "$300-499",
            "$500-749",
            "$750-999",
            "$1,000-1,499",
            "$1,500-1,999",
            "$2,000 or more",
        ),
        **_UNKNOWN_OR_WITHHELD,
    },
    "MORTGAGE": {
        **_NOT_ASKED,
        **_REFUSED,
        **_coded("Less than $50,000", "$50,000-199,999", "$200,000 or more"),
        **_UNKNOWN_OR_WITHHELD,
    },
    # --- loans / credit products ---
    "PRODHAVE_8": _coded(*_NO_YES, first=0),
    "PRODUSE_1": _coded(*_NO_YES, first=0),
    "PRODUSE_2": _coded(*_NO_YES, first=0),
    # --- scale scores ---
    # NOTE(review): mapping these special codes to text would leave FWBscore with
    # mixed number/text values after .replace -- confirm before aggregating it.
    "FWBscore": {**_NOT_WRITTEN, **_REFUSED},
    # "FSscore" (-1: "Refused") is left unmapped -- presumably so the column
    # stays numeric for the sum / count aggregations; verify.
}