In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# Read 'familyxx.csv' into the DataFrame `data`. The path is relative, so the
# file is looked up in the current working directory.
data = pd.read_csv('familyxx.csv')

In [14]:
# Show the column labels of `data` (long Index reprs are elided in the middle).
data.columns

Index(['FINT_Y_P', 'FINT_M_P', 'FMX', 'RECTYPE', 'SRVY_YR', 'HHX', 'FM_SIZE',
       'FM_STRCP', 'FM_TYPE', 'FM_STRP',
       ...
       'COVCONF', 'FHICOST', 'FMEDBILL', 'FMEDBPAY', 'FMEDBNOP', 'FSAF',
       'FHICOVCT', 'FHICOVYN', 'FPRCOOH', 'FHIEBCCT'],
      dtype='object', length=127)

In [29]:
# Raw column names to keep from `data`, in the order they should appear.
# Written as whitespace-separated text and split into a list of strings.
features = """
    FM_SIZE FM_TYPE FLNGINTV FM_KIDS FM_ELDR FM_EDUC1
    F10DVCT FDMEDCT FHOSP2CT FNMEDCT FSRUNOUT FSBALANC
    FHSTATEX FHSTATVG FHSTATG FHSTATFR FHSTATPR FSNAPMYR
    INCGRP5 FHIPRVCT FHICARCT FHICADCT FHICOST FMEDBILL
    FHICOVCT FHIEBCCT
""".split()

In [81]:
def _coded(known, unknown=()):
    """Return a code -> label dict: `known` plus each code in `unknown` mapped to 'unknown'."""
    table = dict(known)
    table.update(dict.fromkeys(unknown, 'unknown'))
    return table


# Raw column name -> readable column name.
new_col_names = dict(
    FM_SIZE='fam_size',
    FM_TYPE='fam_type',
    FLNGINTV='language',
    FM_KIDS='num_child',
    FM_ELDR='num_elder',
    FM_EDUC1='max_educ',
    F10DVCT='care_10',              # 12 mo
    FDMEDCT='care_delayed',         # 12 mo
    FHOSP2CT='hosp_ovrngt',         # 12 mo
    FNMEDCT='no_care_when_need',    # 12 mo
    FSRUNOUT='food_runout',         # 30 days
    FSBALANC='food_balance',        # 30 days
    FHSTATEX='health_ex',
    FHSTATVG='health_vg',
    FHSTATG='health_g',
    FHSTATFR='health_fr',
    FHSTATPR='health_pr',
    FSNAPMYR='mos_snap_ben',        # last cal yr
    INCGRP5='fam_income',
    FHIPRVCT='priv_health_ins',
    FHICARCT='num_medicare',
    FHICADCT='num_medicaid',
    FHICOST='fam_med_den_cost',
    FMEDBILL='difficult_pay_bills',
    FHICOVCT='health_cov',
    FHIEBCCT='employer_cov',
)

# Lookup tables that turn numeric codes into human-readable labels.
# Each one is applied to the renamed column named in its comment.

# 'fam_type'
family_types = _coded(
    {1: 'one_adult_no_child', 2: 'mult_adult_no_child',
     3: 'one_adult_child', 4: 'mult_adult_child'},
    unknown=(9,),
)

# 'language'
languages = _coded(
    {1: 'english', 2: 'spanish', 3: 'english_spanish', 4: 'other',
     8: 'not_ascertained'},
)

# 'max_educ' (codes 1 and 2 share a label, as do codes 6 and 7)
education = _coded(
    {1: 'no_hs_diploma', 2: 'no_hs_diploma',
     3: 'GED',
     4: 'hs_grad',
     5: 'some_college',
     6: 'associates', 7: 'associates',
     8: 'bachelors',
     9: 'masters_doctoral'},
    unknown=(97, 98, 99),
)

# 'food_runout' and 'food_balance'
food = _coded(
    {1: 'often_true', 2: 'sometimes_true', 3: 'never_true'},
    unknown=(7, 8, 9),
)

# 'fam_income'
income = _coded(
    {1: '0-34_999', 2: '35_000-74_999', 3: '75_000-99_999', 4: '100_000_more'},
    unknown=(96, 99),
)

# 'fam_med_den_cost'
costs = _coded(
    {0: 'zero',
     1: 'less_than_500',
     2: '500-1_999',
     3: '2_000-2_999',
     4: '3_000-4_999',
     5: '5_000_more'},
    unknown=(7, 8, 9),
)

# 'difficult_pay_bills'
pay_bills = _coded({1: 'yes', 2: 'no'}, unknown=(7, 8, 9))

In [77]:
# Build the analysis frame `health` from the selected raw columns.
# Uses names defined in earlier cells: `data`, `features`, `new_col_names`
# and the code -> label dictionaries referenced below.
# Selecting with a list and renaming both return new frames, so `data` is untouched;
# rename() also leaves any column without an entry in `new_col_names` under its
# original name instead of giving it a NaN label.
health = data[features].rename(columns=new_col_names)

# Column -> lookup table used to replace numeric codes with labels.
# Series.map produces NaN for any code that is absent from its table.
code_labels = {
    'fam_type': family_types,
    'language': languages,
    'max_educ': education,
    'food_runout': food,
    'food_balance': food,
    'fam_income': income,
    'fam_med_den_cost': costs,
    'difficult_pay_bills': pay_bills,
}
for column, labels in code_labels.items():
    health[column] = health[column].map(labels)

# Missing values become 0 in both numeric columns below.
# For 'mos_snap_ben', any value above 12 is reset to 0 as well.
# NOTE(review): values above 12 are presumably non-answer codes - confirm against the codebook.
max_months = 12
snap_months = health['mos_snap_ben'].fillna(0)
health['mos_snap_ben'] = snap_months.mask(snap_months > max_months, 0)
health['employer_cov'] = health['employer_cov'].fillna(0)
health.head()

Unnamed: 0,fam_size,fam_type,language,num_child,num_elder,max_educ,care_10,care_delayed,hosp_ovrngt,no_care_when_need,...,health_pr,mos_snap_ben,fam_income,priv_health_ins,num_medicare,num_medicaid,fam_med_den_cost,difficult_pay_bills,health_cov,employer_cov
0,1,one_adult_no_child,english,0,1,no_hs_diploma,0,0,0,0,...,0,0.0,0-34_999,0,1,0,less_than_500,no,1,0.0
1,3,mult_adult_child,english,1,0,some_college,0,2,0,1,...,0,0.0,35_000-74_999,3,0,0,500-1_999,yes,3,3.0
2,4,mult_adult_child,english,2,0,masters_doctoral,1,0,0,0,...,0,0.0,100_000_more,4,0,0,500-1_999,no,4,4.0
3,3,mult_adult_child,english,1,0,associates,0,0,0,0,...,0,0.0,75_000-99_999,3,0,0,2_000-2_999,no,3,3.0
4,1,one_adult_no_child,english,0,1,no_hs_diploma,1,1,0,1,...,1,12.0,0-34_999,1,1,0,500-1_999,yes,1,0.0


In [84]:
# Count missing values per column, then add those counts together:
# the total number of null cells in `health`.
health.isnull().sum().sum()

0