In [1]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Read 'data.csv' (path relative to the working directory) into the frame
# that every recode below modifies.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [3]:
# recode HIMCAIDE and HIMCAREE: 1 -> 0, 2 or 3 -> 1, every other value -> missing
himca_binary_map = {1: 0, 2: 1, 3: 1}
for himca_col in ('HIMCAIDE', 'HIMCAREE'):
    # Build the validity mask from the raw codes, before they are rewritten.
    himca_keep = df[himca_col].isin([1, 2, 3])
    himca_recoded = df[himca_col].replace(himca_binary_map)
    # Nullable Int64 keeps the 0/1 flags integral alongside missing entries.
    df[himca_col] = himca_recoded.where(himca_keep, np.nan).astype('Int64')

# recode HIP1RELPOLICY: 1 -> 'Child', 2 -> 'Spouse', 3 and 4 -> 'Other',
# every other value -> missing
hip1relpolicy_labels = {
    1: 'Child',
    2: 'Spouse',
    3: 'Other',
    4: 'Other',
}
# The keep-mask is derived from the mapping keys so that every labelled code
# (including 4) survives; a hand-written list can silently drift from the dict.
hip1relpolicy_keep = df['HIP1RELPOLICY'].isin(list(hip1relpolicy_labels))
df['HIP1RELPOLICY'] = (
    df['HIP1RELPOLICY']
    .replace(hip1relpolicy_labels)
    .where(hip1relpolicy_keep, np.nan)
)

In [4]:
# recode REGION: swap codes 1-4 for region names; other values are left as-is
region_names = {
    1: 'Northeast',
    2: 'North Central/Midwest',
    3: 'South',
    4: 'West',
}
df['REGION'] = df['REGION'].replace(region_names)

# recode SEX: codes 7, 8, 9 -> missing, 1 -> 'Male', 2 -> 'Female'
# Cast to object first so text labels can sit in the same column as the codes.
df['SEX'] = df['SEX'].astype(object)
df.loc[df['SEX'].isin([7, 8, 9]), 'SEX'] = np.nan
for raw_code, recoded in ((1, 'Male'), (2, 'Female')):
    df.loc[df['SEX'] == raw_code, 'SEX'] = recoded

# recode MARSTCUR: codes 0 and 9 -> missing, codes 1-8 -> text labels
marstcur_labels = {
    1: 'Married, spouse present',
    2: 'Married, spouse absent',
    3: 'Married, spouse in household unknown',
    4: 'Separated',
    5: 'Divorced',
    6: 'Widowed',
    7: 'Living with partner',
    8: 'Never Married',
}
# Cast to object first so text labels can sit in the same column as the codes.
df['MARSTCUR'] = df['MARSTCUR'].astype(object)
df.loc[df['MARSTCUR'].isin([0, 9]), 'MARSTCUR'] = np.nan
for raw_code, recoded in marstcur_labels.items():
    df.loc[df['MARSTCUR'] == raw_code, 'MARSTCUR'] = recoded

# recode RACENEW: labelled codes -> text, codes 997/998/999 -> missing;
# any other value is left untouched
racenew_labels = {
    100: 'White',
    510: 'Other Race and Multiple Race',
    200: 'Black/African American',
    542: 'American Indian/Alaska Native and Any Other Race',
    400: 'Asian only',
    300: 'American Indian/Alaska Native only',
}
# Cast to object first so text labels can sit in the same column as the codes.
df['RACENEW'] = df['RACENEW'].astype(object)
for raw_code, recoded in racenew_labels.items():
    df.loc[df['RACENEW'] == raw_code, 'RACENEW'] = recoded
df.loc[df['RACENEW'].isin([997, 998, 999]), 'RACENEW'] = np.nan

# recode CITIZEN to 0/1: 1 -> 0, 2 -> 1, codes 7, 8, 9 -> missing
df['CITIZEN'] = df['CITIZEN'].astype(object)
# Order matters: 1 -> 0 must run before 2 -> 1, otherwise the freshly written
# 1s would be rewritten to 0 as well.
for raw_code, recoded in ((1, 0), (2, 1)):
    df.loc[df['CITIZEN'] == raw_code, 'CITIZEN'] = recoded
df.loc[df['CITIZEN'].isin([7, 8, 9]), 'CITIZEN'] = np.nan

# recode ARMFEV to 0/1: 11 and 12 -> 0, 20 -> 1, codes 0, 97, 98, 99 -> missing;
# any other value is left untouched
df['ARMFEV'] = df['ARMFEV'].astype(object)
# Raw 0 has to be cleared first: the later steps write 0 as a real value.
df.loc[df['ARMFEV'] == 0, 'ARMFEV'] = np.nan
df.loc[df['ARMFEV'].isin([98, 99, 97]), 'ARMFEV'] = np.nan
for raw_code, recoded in ((11, 0), (12, 0), (20, 1)):
    df.loc[df['ARMFEV'] == raw_code, 'ARMFEV'] = recoded

In [5]:
# recode EDUC: labelled codes -> text, codes 0, 997, 999 -> missing;
# any other value is left untouched
educ_labels = {
    103: 'Grades 1-11 (no further detail)',
    116: '12th grade, no diploma',
    201: 'High School Graduate',
    202: 'GED or equivalent',
    301: 'Some college, no degree',
    302: 'AA degree: technical/vocational/occupational',
    303: 'AA degree: academic program',
    400: "Bachelor's degree (BA,AB,BS,BBA)",
    501: "Master's degree (MA,MS,Med,MBA)",
    505: 'Professional School or Doctoral degree, topcoded (MD, DDS, DVM, JD, PhD, EdD)',
}
# Cast to object first so text labels can sit in the same column as the codes.
df['EDUC'] = df['EDUC'].astype(object)
for raw_code, recoded in educ_labels.items():
    df.loc[df['EDUC'] == raw_code, 'EDUC'] = recoded
for raw_code in (0, 999, 997):
    df.loc[df['EDUC'] == raw_code, 'EDUC'] = np.nan

In [6]:
# recode EMPSTAT to a 0/1 indicator: 100 -> 1, 200 -> 0, codes 0 and 999 -> missing
df['EMPSTAT'] = df['EMPSTAT'].astype(object)
# Raw 0 is cleared before 200 is rewritten to 0, so the new zeros survive.
for raw_code in (0, 999):
    df.loc[df['EMPSTAT'] == raw_code, 'EMPSTAT'] = np.nan
for raw_code, recoded in ((200, 0), (100, 1)):
    df.loc[df['EMPSTAT'] == raw_code, 'EMPSTAT'] = recoded

In [7]:
# recode HOURSWRK: codes 97, 98, 99 -> missing; all other values are kept.
# Series.mask is used instead of `df.loc[...] = np.nan` because writing NaN
# into an integer column through .loc relies on silent upcasting, which pandas
# 2.1+ deprecates (FutureWarning). mask() upcasts to float explicitly and only
# when at least one row is actually replaced.
hourswrk_sentinel = df['HOURSWRK'].isin([97, 98, 99])
df['HOURSWRK'] = df['HOURSWRK'].mask(hourswrk_sentinel, np.nan)

In [8]:
# recode GOTRET: 1 -> 'SSI', 2 -> 'SSDI', codes 0, 7, 8, 9 -> missing
# NOTE(review): these labels are the same as the ones used for WCHSSISSDI in
# this notebook -- confirm against the codebook that they are intended here.
df['GOTRET'] = df['GOTRET'].astype(object)
for raw_code, recoded in ((2, 'SSDI'), (1, 'SSI')):
    df.loc[df['GOTRET'] == raw_code, 'GOTRET'] = recoded
df.loc[df['GOTRET'].isin([9, 0, 7, 8]), 'GOTRET'] = np.nan

In [9]:
# recode WCHSSISSDI: 1 -> 'SSI', 2 -> 'SSDI', 3 -> 'Both SSI and SSDI',
# codes 0 and 9 -> missing
wchssissdi_labels = {
    1: 'SSI',
    2: 'SSDI',
    3: 'Both SSI and SSDI',
}
# Cast to object first so text labels can sit in the same column as the codes.
df['WCHSSISSDI'] = df['WCHSSISSDI'].astype(object)
for raw_code in (0, 9):
    df.loc[df['WCHSSISSDI'] == raw_code, 'WCHSSISSDI'] = np.nan
for raw_code, recoded in wchssissdi_labels.items():
    df.loc[df['WCHSSISSDI'] == raw_code, 'WCHSSISSDI'] = recoded

In [10]:
# recode HEALTH: codes 1-5 -> text labels, codes 7, 8, 9 -> missing
health_labels = {
    1: 'Excellent',
    2: 'Very Good',
    3: 'Good',
    4: 'Fair',
    5: 'Poor',
}
# Cast to object first so text labels can sit in the same column as the codes.
df['HEALTH'] = df['HEALTH'].astype(object)
for raw_code, recoded in health_labels.items():
    df.loc[df['HEALTH'] == raw_code, 'HEALTH'] = recoded
# 8 is cleared together with 7 and 9, matching the 7/8/9 handling used for the
# other columns in this notebook; otherwise a raw 8 would remain as a bare
# number among the text labels.
# NOTE(review): confirm against the codebook that 8 is an unknown code for HEALTH.
df.loc[df['HEALTH'].isin([7, 8, 9]), 'HEALTH'] = np.nan

In [11]:
# recode DVINT: labelled codes -> text, values outside 100-402 -> missing.
# Values inside 100-402 that have no label below pass through unchanged.
dvint_labels = {
    100: 'Never',
    200: 'Less than 1 year',
    201: 'Visits in the past 2 weeks',
    202: '2 weeks to less than 6 months',
    203: 'Under 6 months',
    204: '6 months to less than 12 months',
    300: '1 year or more',
    301: '1 year',
    302: '1 year to less than 2 years',
    303: '1 year to less than 3 years',
    304: '2 years to less than 3 years',
    305: '2 years to less than 5 years',
    306: '2 years',
    307: '2 to 4 years',
    308: '3 to 4 years',
    309: '3 years to less than 5 years',
    310: 'More than 3 years',
    400: '5 years or more',
    401: '5 to 9 years',
    402: 'More than 10 years',
}
# Build the range mask from the raw codes, before they are rewritten.
dvint_in_range = df['DVINT'].isin(range(100, 403))
df['DVINT'] = df['DVINT'].replace(dvint_labels).where(dvint_in_range, np.nan)

# recode URGCAREYRNO: codes 0-5 -> text labels, every other value -> missing
urgcareyrno_labels = {
    0: '0 times',
    1: '1 time',
    2: '2 times',
    3: '3 times',
    4: '4 times',
    5: '5+ times',
}
# Build the range mask from the raw codes, before they are rewritten.
urgcareyrno_in_range = df['URGCAREYRNO'].isin(range(0, 6))
df['URGCAREYRNO'] = (
    df['URGCAREYRNO']
    .replace(urgcareyrno_labels)
    .where(urgcareyrno_in_range, np.nan)
)

# recode ERYRNO: labelled codes -> text, values outside 10-48 -> missing.
# Values inside 10-48 that have no label below pass through unchanged.
eryrno_labels = {
    10: 'No visits',
    20: '1 visit',
    30: '2 to 3 visits',
    31: '2 visits',
    32: '3 visits',
    40: '4 or more visits',
    41: '4 to 9 visits',
    42: '4 to 5 visits',
    43: '6 to 7 visits',
    44: '8 to 9 visits',
    45: '10 to 12 visits',
    46: '13 or more visits',
    47: '13 to 15 visits',
    48: '16 or more visits',
}
# Build the range mask from the raw codes, before they are rewritten.
eryrno_in_range = df['ERYRNO'].isin(range(10, 49))
df['ERYRNO'] = df['ERYRNO'].replace(eryrno_labels).where(eryrno_in_range, np.nan)

# recode DVINTWELL: codes 0-6 -> text labels, every other value -> missing
dvintwell_labels = {
    0: 'Never',
    1: 'Within the past year',
    2: 'More than 1 year ago but less than 2 years ago',
    3: 'More than 2 years ago but less than 3 years ago',
    4: 'More than 3 years ago but less than 5 years ago',
    5: 'More than 5 years ago but less than 10 years ago',
    6: '10+ years ago',
}
# Build the range mask from the raw codes, before they are rewritten.
dvintwell_in_range = df['DVINTWELL'].isin(range(0, 7))
df['DVINTWELL'] = (
    df['DVINTWELL']
    .replace(dvintwell_labels)
    .where(dvintwell_in_range, np.nan)
)

# recode WORMEDBILL: codes 1-3 -> text labels, every other value -> missing
wormedbill_labels = {
    1: 'Very worried',
    2: 'Somewhat worried',
    3: 'Not at all worried',
}
# Build the range mask from the raw codes, before they are rewritten.
wormedbill_in_range = df['WORMEDBILL'].isin(range(1, 4))
df['WORMEDBILL'] = (
    df['WORMEDBILL']
    .replace(wormedbill_labels)
    .where(wormedbill_in_range, np.nan)
)

# recode HICHIPE, HIMILITE, HIOTHGOVE to 0/1: code 10 -> 0, the per-column
# range of codes starting at 20 -> 1, every other value -> NaN
coverage_yes_codes = {
    'HICHIPE': range(20, 23),
    'HIMILITE': range(20, 26),
    'HIOTHGOVE': range(20, 23),
}
for coverage_col, yes_codes in coverage_yes_codes.items():
    is_no = df[coverage_col] == 10
    is_yes = df[coverage_col].isin(yes_codes)
    # np.select returns a float array here because the default is NaN.
    df[coverage_col] = np.select([is_no, is_yes], [0, 1], default=np.nan)

In [12]:
# change vars values from 1,2 to 0,1
def replace_yes_no(column):
    """Recode a 1/2 coded Series to 0/1, blanking every other value.

    Parameters
    ----------
    column : pandas.Series
        Series holding the raw codes.

    Returns
    -------
    pandas.Series
        Nullable ``Int64`` Series where 1 became 0, 2 became 1, and any value
        other than 1 or 2 (including missing values) is ``<NA>``.
    """
    # Build the validity mask from the raw codes, before they are rewritten.
    is_valid = column.isin([1, 2])
    shifted = column.replace({1: 0, 2: 1})
    return shifted.where(is_valid, np.nan).astype('Int64')

# Columns run through replace_yes_no (1 -> 0, 2 -> 1, anything else -> <NA>).
cols_to_replace = [
    'HIPRIVATE', 'HISTATE', 'HICHIP', 'HIMILANY', 'HINOTYR', 'HIP1CVROTR',
    'HIP1CAID', 'HIP1CARE', 'HIP2CAID', 'HIP2CARE', 'HIP2OGOV',
    'MDEXUPADV', 'ARTHGLUPEV', 'ASTHMAEV', 'AUTISMEV', 'HOSPNGHT', 'THERAPYR', 'SAWMENT',
    'DELAYCOST', 'YDELAYMENTAL', 'DELAYINS', 'YDELAYMEDYR', 'YSKIMPMEDYR',
    'YSKIPMEDYR', 'HIPROBPAYR', 'HEALTHPROV', 'HINOTCOVE', 'EMPHI', 'GOTWELF',
    'CHEARTDIEV', 'CHRFATIGEV', 'CHOLHIGHEV', 'DEMENTIAEV', 'DEPRESSEV', 'DIABETICEV',
    'HEARTATTEV', 'HYPERTENEV', 'LEARNDEV', 'ODDEV', 'RETEV', 'STROKEV', 'COPDEV',
    'IMSPCHC', 'HRAUSES', 'GLASSLENS',
]
# apply() hands each selected column to replace_yes_no as a Series.
yes_no_frame = df[cols_to_replace]
df[cols_to_replace] = yes_no_frame.apply(replace_yes_no)

In [13]:
# recode ALCANYTP: codes 1-5 -> text labels, every other value -> missing
alcanytp_labels = {
    1: 'Every day',
    2: 'Week',
    3: 'Month',
    4: 'Year',
    5: 'Never/None',
}
# Build the range mask from the raw codes, before they are rewritten.
alcanytp_in_range = df['ALCANYTP'].isin(range(1, 6))
df['ALCANYTP'] = df['ALCANYTP'].replace(alcanytp_labels).where(alcanytp_in_range, np.nan)

# recode SMOKFREQNOW: codes 1-3 -> text labels, every other value -> missing
smokfreqnow_labels = {
    1: 'Not at all',
    2: 'Some days',
    3: 'Every day',
}
# Build the range mask from the raw codes, before they are rewritten.
smokfreqnow_in_range = df['SMOKFREQNOW'].isin(range(1, 4))
df['SMOKFREQNOW'] = (
    df['SMOKFREQNOW']
    .replace(smokfreqnow_labels)
    .where(smokfreqnow_in_range, np.nan)
)

# recode LAMTWRK: codes 1-3 -> text labels, every other value -> missing
lamtwrk_labels = {
    1: 'Not limited in work',
    2: 'Limited in kind/amt of work',
    3: 'Unable to work',
}
# Build the range mask from the raw codes, before they are rewritten.
lamtwrk_in_range = df['LAMTWRK'].isin(range(1, 4))
df['LAMTWRK'] = df['LAMTWRK'].replace(lamtwrk_labels).where(lamtwrk_in_range, np.nan)
