In [1]:
#all imports for this workbook

import os
import time

import numpy as np
import pandas as pd
import xport

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline 
%config InlineBackend.figure_format='retina'

import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('dark')

#for choropleth maps
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.io as pio
plotly.offline.init_notebook_mode(connected=True)

In [2]:
from cycler import cycler

# Nicer matplotlib defaults; the colour cycle is the tableau 20 colormap.
# (Two overrides are kept disabled for reference: 'axes.labelcolor' and
#  'text.color', both '#677385'.)
mpl_update = {
    # text sizes
    'font.size': 16,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'axes.labelsize': 20,
    'axes.titlesize': 20,
    # figure and line defaults
    'figure.figsize': [16, 9],
    'lines.color': '#0055A7',
    'lines.linewidth': 3,
    # colour cycle used for successive plot elements
    'axes.prop_cycle': cycler('color', [
        '#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78',
        '#2ca02c', '#98df8a', '#d62728', '#ff9896',
        '#9467bd', '#c5b0d5', '#8c564b', '#c49c94',
        '#e377c2', '#f7b6d2', '#7f7f7f', '#c7c7c7',
        '#bcbd22', '#dbdb8d', '#17becf', '#9edae5',
    ]),
}
mpl.rcParams.update(mpl_update)

In [3]:
#change these paths as necessary
# The defaults are machine-specific absolute paths. To run the notebook elsewhere
# without editing this cell, set the BRFSS_DATA_PATH / BRFSS_HDF_PATH environment
# variables; when they are unset the original locations are used unchanged.
data_path = os.environ.get(
    'BRFSS_DATA_PATH',
    '/Users/dhawan/Documents/K2/exploratory_analysis/health_project/data/raw/')
hdf_path = os.environ.get(
    'BRFSS_HDF_PATH',
    '/Users/dhawan/Documents/K2/exploratory_analysis/health_project/data/interim/brfss.h5')

# Cleaning Data for All Years

Now we'll apply the same methodology as in notebook 2.0 to clean the data and build the dataset features, this time on the full five-year (2013–2017) dataset.

In [4]:
# If data not yet loaded: read the table stored under key 'trim_13_17' from the
# HDF file at hdf_path into df. The %time magic prints how long the read took.
%time df = pd.read_hdf(hdf_path, 'trim_13_17')

CPU times: user 364 ms, sys: 234 ms, total: 598 ms
Wall time: 951 ms


In [5]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,_state,fmonth,dispcode,_psu,ctelenm1,pvtresd1,colghous,statere1,cellfon4,ladult,...,sdhbills,sdhmove,howsafe1,sdhfood,sdhmeals,sdhmoney,sdhstres,_llcpwt,_age_g,year
0,1,1,1100,2013000580,-1,1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,238,5,2013
1,1,1,1100,2013000593,-1,1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,737,4,2013
2,1,1,1100,2013000600,-1,1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,568,5,2013
3,1,1,1100,2013000606,-1,1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,606,5,2013
4,1,2,1100,2013000608,-1,1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,629,6,2013


In [6]:
# Give the raw codebook columns readable names, grouped by the role they play.
demo_name_map = {
    'educa': 'education',
    'employ1': 'employment',
    'income2': 'income',
    '_state': 'state',
    '_age_g': 'age',
}
env_name_map = {
    'emtsuprt': 'emotional_support',
    'lsatisfy': 'satisfied_life',
    'sdhbills': 'cannot_pay_bills',
    'sdhmove': 'times_moved',
    'howsafe1': 'safe_living',
    'sdhmeals': 'cannot_afford_meals',
    'sdhmoney': 'finances',
    'sdhstres': 'stress',
    'addepev2': 'depression',
    'menthlth': 'mental_health',
}
target_name_map = {
    'smokday2': 'smoke_now',
    'ecignow': 'ecig_now',
    'avedrnk2': 'avg_drinks',
    'drnk3ge5': 'many_drinks',
    'maxdrnks': 'max_drinks',
    'marijana': 'marijuana_days',
    'rsnmrjna': 'marijuana_purpose',
}

# Apply the three groups one after another; df is renamed in place.
for name_map in (demo_name_map, env_name_map, target_name_map):
    df.rename(columns=name_map, inplace=True)

In [7]:
# Translate the coded demographic columns into category labels.
# Each entry is {column name: {raw code: label}}.
demo_maps = {
    'sex': {
        1: 'male',
        2: 'female',
        9: 'unknown',
        -1: 'unknown',
    },
    'age': {
        1: '18-24',
        2: '25-34',
        3: '35-44',
        4: '45-54',
        5: '55-64',
        6: '>65',
    },
    'marital': {
        1: 'married',
        2: 'divorced',
        3: 'widowed',
        4: 'separated',
        5: 'never_married',
        6: 'unknown',
        9: 'unknown',
        -1: 'unknown',
    },
    'education': {
        1: 'no_school',
        2: 'elementary',
        3: 'high_school',
        4: 'high_school',
        5: 'college',
        6: 'college',
        9: 'unknown',
        -1: 'unknown',
    },
    'employment': {
        1: 'employed',
        2: 'employed',
        3: 'unemployed',
        4: 'unemployed',
        5: 'homemaker',
        6: 'student',
        7: 'retired',
        8: 'unable',
        9: 'unknown',
        -1: 'unknown',
    },
    'income': {
        1: '<10k',
        2: '<15k',
        3: '<20k',
        4: '<25k',
        5: '<35k',
        6: '<50k',
        7: '<75k',
        8: '>75k',
        77: 'unknown',
        99: 'unknown',
        -1: 'unknown',
    },
    'state': {
        1: 'Alabama',
        2: 'Alaska',
        4: 'Arizona',
        5: 'Arkansas',
        6: 'California',
        8: 'Colorado',
        9: 'Connecticut',
        10: 'Delaware',
        11: 'District of Columbia',
        12: 'Florida',
        13: 'Georgia',
        15: 'Hawaii',
        16: 'Idaho',
        17: 'Illinois',
        18: 'Indiana',
        19: 'Iowa',
        20: 'Kansas',
        21: 'Kentucky',
        22: 'Louisiana',
        23: 'Maine',
        24: 'Maryland',
        25: 'Massachusetts',
        26: 'Michigan',
        27: 'Minnesota',
        28: 'Mississippi',
        29: 'Missouri',
        30: 'Montana',
        31: 'Nebraska',
        32: 'Nevada',
        33: 'New Hampshire',
        34: 'New Jersey',
        35: 'New Mexico',
        36: 'New York',
        37: 'North Carolina',
        38: 'North Dakota',
        39: 'Ohio',
        40: 'Oklahoma',
        41: 'Oregon',
        42: 'Pennsylvania',
        44: 'Rhode Island',
        45: 'South Carolina',
        46: 'South Dakota',
        47: 'Tennessee',
        48: 'Texas',
        49: 'Utah',
        50: 'Vermont',
        51: 'Virginia',
        53: 'Washington',
        54: 'West Virginia',
        55: 'Wisconsin',
        56: 'Wyoming',
        66: 'Guam',
        72: 'Puerto Rico',
    },
}

# Replace the codes column by column. A code that has no entry in its map
# comes out of Series.map as NaN rather than as a label.
for column, code_labels in demo_maps.items():
    df[column] = df[column].map(code_labels)

In [8]:
# Human-readable labels for the coded environmental attributes.
# Each entry is {column name: {raw code: label}}.
env_maps = {
    'emotional_support': {
        1: 'always',
        2: 'usually',
        3: 'sometimes',
        4: 'rarely',
        5: 'never',
        7: 'unknown',
        9: 'unknown',
        -1: 'unknown',
    },
    'satisfied_life': {
        1: 'very_satisfied',
        2: 'satisfied',
        3: 'dissatisfied',
        4: 'very_dissatisfied',
        7: 'unknown',
        9: 'unknown',
        -1: 'unknown',
    },
    'cannot_pay_bills': {
        1: 'yes',
        2: 'no',
        7: 'unknown',
        9: 'unknown',
        -1: 'unknown',
    },
    'safe_living': {
        1: 'extremely_safe',
        2: 'safe',
        3: 'unsafe',
        4: 'extremely_unsafe',
        7: 'unknown',
        9: 'unknown',
        -1: 'unknown',
    },
    'cannot_afford_meals': {
        1: 'often_true',
        2: 'sometimes_true',
        3: 'never_true',
        7: 'unknown',
        9: 'unknown',
        -1: 'unknown',
    },
    'finances': {
        1: 'some_money_left',
        2: 'just_enough_money',
        3: 'not_enough_money',
        7: 'unknown',
        9: 'unknown',
        -1: 'unknown',
    },
    'stress': {
        1: 'none',
        2: 'a_little',
        3: 'some_of_time',
        4: 'most_of_time',
        5: 'all_of_time',
        7: 'unknown',
        9: 'unknown',
        -1: 'unknown',
    },
    'depression': {
        1: 'yes',
        2: 'no',
        7: 'unknown',
        9: 'unknown',
        -1: 'unknown',
    },
}

# Replace the codes column by column.
for column, code_labels in env_maps.items():
    df[column] = df[column].map(code_labels)

In [10]:
#bin the times moved and mental health variables
def bin_numerical(x, bins):
    """
    Return the binned category for the single value x according to dictionary bins.

    Parameters
    ----------
    x : scalar
        A single coded value.
    bins : dict
        Keys are either a single integer code (e.g., 88) or a tuple giving an
        inclusive range (e.g., (5,10)); values are the category labels. This
        structure reflects the coding of the CDC dataset.

    Returns
    -------
    The label of the matching bin, or 'unknown' when no key matches.

    Notes
    -----
    Exact integer codes are always checked before ranges, so a special code
    such as 88 or 99 is never absorbed by a range that happens to cover it,
    whatever the order of the keys in ``bins``. Among ranges, the first match
    in dict order wins.
    """
    # Pass 1: exact codes. NumPy integers (e.g. a value obtained from
    # Series.max()) are accepted as keys alongside built-in ints.
    for code, label in bins.items():
        if isinstance(code, (int, np.integer)) and x == code:
            return label

    # Pass 2: inclusive (low, high) ranges.
    for bounds, label in bins.items():
        if isinstance(bounds, tuple) and bounds[0] <= x <= bounds[1]:
            return label

    return 'unknown'
        

# Bin definitions in the layout bin_numerical expects: single codes map to one
# label, (low, high) tuples are inclusive ranges. The special codes are listed
# ahead of the ranges, and each open-ended range is capped at the largest value
# currently present in its column.
special_codes = {88: '0', 77: 'unknown', 99: 'unknown', -1: 'unknown'}

move_bins = {
    **special_codes,
    1: '1',
    2: '2',
    (3, df['times_moved'].max()): '3 or more',
}

mh_bins = {
    **special_codes,
    (1, 5): '<5',
    (6, 10): '<10',
    (11, 20): '<20',
    (21, df['mental_health'].max()): '>20',
}

# Derive the binned versions of the two numerical environmental variables.
df['times_moved_bins'] = df['times_moved'].apply(lambda code: bin_numerical(code, move_bins))
df['poor_mental_health_days'] = df['mental_health'].apply(lambda code: bin_numerical(code, mh_bins))

In [11]:
# The raw numeric columns are no longer needed once their binned versions exist.
binned_source_columns = ['times_moved', 'mental_health']
df.drop(columns=binned_source_columns, inplace=True)

In [12]:
# Human-readable labels for the smoking / marijuana-purpose targets.
# (The alcohol targets stay numeric rather than categorical.)
# Code 0 does not come from the raw columns: it is filled in below from helper columns.
target_maps = {
    'smoke_now': {
        0: 'not_at_all',
        1: 'every_day',
        2: 'some_days',
        3: 'not_at_all',
        7: 'unknown',
        9: 'unknown',
        -1: 'unknown',
    },
    'ecig_now': {
        0: 'not_at_all',
        1: 'every_day',
        2: 'some_days',
        3: 'not_at_all',
        7: 'unknown',
        9: 'unknown',
        -1: 'unknown',
    },
    'marijuana_purpose': {
        0: 'do_not_smoke',
        1: 'medical',
        2: 'pleasure',
        3: 'both',
        7: 'unknown',
        9: 'unknown',
        -1: 'unknown',
    },
}

# Fill in the zeros first (smoke100 and ecigaret from the codebook, plus the
# 88 code of marijuana_days, tell us who does not use at all) ...
df.loc[df['smoke100'] == 2, 'smoke_now'] = 0
df.loc[df['ecigaret'] == 2, 'ecig_now'] = 0
df.loc[df['marijuana_days'] == 88, 'marijuana_purpose'] = 0

# ... then turn the codes into labels.
for column, code_labels in target_maps.items():
    df[column] = df[column].map(code_labels)

# alcday5 == 888 is used to code 0 into the three drinking variables.
df.loc[df['alcday5'] == 888, ['avg_drinks', 'many_drinks', 'max_drinks']] = 0

# Recode the numeric targets so that -1 means unknown, in line with the other
# variables. Where the codebook has an 88 it corresponds to 0.
sentinel_recodes = {
    'avg_drinks': {77: -1, 99: -1},
    'many_drinks': {77: -1, 88: 0, 99: -1},
    'max_drinks': {77: -1, 99: -1},
    'marijuana_days': {77: -1, 88: 0, 99: -1},
}
for column, recodes in sentinel_recodes.items():
    for raw_code, new_value in recodes.items():
        df.loc[df[column] == raw_code, column] = new_value

# New feature: number of days marijuana was used for pleasure. Start from the
# total days, zero it for medical-only use and halve it (floor) for mixed use.
df['marijuana_pleasure_days'] = df['marijuana_days']
medical_only = df['marijuana_purpose'] == 'medical'
mixed_use = df['marijuana_purpose'] == 'both'
df.loc[medical_only, 'marijuana_pleasure_days'] = 0
df.loc[mixed_use, 'marijuana_pleasure_days'] = df.loc[mixed_use, 'marijuana_pleasure_days'] // 2

# The helper columns that were only needed to fill in the zeros can go now.
df.drop(columns=['smoke100', 'ecigaret', 'alcday5', 'marijuana_days'], inplace=True)

In [13]:
#drop rows with unknowns for all target variables
# A boolean mask is used rather than df.drop(<index labels>): dropping by label
# removes every row that shares a label with a matching row, so it would delete
# extra rows if the index ever contained duplicate labels. With a unique index
# the result is the same.
all_targets_unknown = (
    (df.smoke_now == 'unknown')
    & (df.ecig_now == 'unknown')
    & (df.avg_drinks == -1)
    & (df.many_drinks == -1)
    & (df.max_drinks == -1)
    & (df.marijuana_purpose == 'unknown')
    & (df.marijuana_pleasure_days == -1)
)
df = df.loc[~all_targets_unknown].reset_index(drop=True)

In [15]:
# Declare explicit, ordered categories so that bar charts are ordered correctly.
# Each list gives the category order for one column, starting with 'unknown'.
ordered_levels = {
    'education': ['unknown', 'no_school', 'elementary', 'high_school', 'college'],
    'marital': ['unknown', 'never_married', 'married', 'separated', 'divorced', 'widowed'],
    'employment': ['unknown', 'student', 'homemaker', 'employed', 'unemployed', 'unable', 'retired'],
    'emotional_support': ['unknown', 'never', 'rarely', 'sometimes', 'usually', 'always'],
    'satisfied_life': ['unknown', 'very_dissatisfied', 'dissatisfied', 'satisfied', 'very_satisfied'],
    'safe_living': ['unknown', 'extremely_unsafe', 'unsafe', 'safe', 'extremely_safe'],
    'cannot_afford_meals': ['unknown', 'never_true', 'sometimes_true', 'often_true'],
    'finances': ['unknown', 'not_enough_money', 'just_enough_money', 'some_money_left'],
    'stress': ['unknown', 'none', 'a_little', 'some_of_time', 'most_of_time', 'all_of_time'],
    'poor_mental_health_days': ['unknown', '0', '<5', '<10', '<20', '>20'],
}

for column, levels in ordered_levels.items():
    df[column] = pd.Categorical(df[column], categories=levels, ordered=True)

In [16]:
# Full state / territory name -> two-letter postal abbreviation.
us_state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
    'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
    'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
    'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
    'Wisconsin': 'WI', 'Wyoming': 'WY',
    'District of Columbia': 'DC', 'Puerto Rico': 'PR', 'Guam': 'GU',
}

# Add a column holding the abbreviation that corresponds to each row's state name.
df['state_code'] = df['state'].map(us_state_abbrev)

In [18]:
# store the clean dataset in an HDF store
# `key` is passed by keyword: newer pandas releases deprecate passing it
# positionally (and the notebook silences warnings, so the deprecation notice
# would not be seen).
df.to_hdf(hdf_path, key='clean_13_17', format='table')