In [9]:
#all imports for this workbook

import os
import time

import numpy as np
import pandas as pd
import xport

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline 
%config InlineBackend.figure_format='retina'

import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('dark')

#for choropleth maps
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.io as pio
plotly.offline.init_notebook_mode(connected=True)

In [2]:
from cycler import cycler

# The 20 Tableau colours, in the order matplotlib should cycle through them.
_tableau_20 = [
    '#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c',
    '#98df8a', '#d62728', '#ff9896', '#9467bd', '#c5b0d5',
    '#8c564b', '#c49c94', '#e377c2', '#f7b6d2', '#7f7f7f',
    '#c7c7c7', '#bcbd22', '#dbdb8d', '#17becf', '#9edae5',
]

# Overrides for matplotlib's defaults: larger fonts, a 16x9 figure, thicker
# lines and the Tableau 20 palette as the colour cycle.
mpl_update = {
    'font.size': 16,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'figure.figsize': [16, 9],
    'axes.labelsize': 20,
    # 'axes.labelcolor': '#677385',   # optional override, currently disabled
    'axes.titlesize': 20,
    'lines.color': '#0055A7',
    'lines.linewidth': 3,
    # 'text.color': '#677385',        # optional override, currently disabled
    'axes.prop_cycle': cycler('color', _tableau_20),
}

# Apply the overrides to the global rcParams for the rest of the session.
mpl.rcParams.update(mpl_update)

In [6]:
# Locations of the raw BRFSS .XPT files and of the HDF5 store used by this
# notebook. The defaults are the original author's machine; to run elsewhere,
# set the BRFSS_DATA_PATH / BRFSS_HDF_PATH environment variables before
# starting the kernel (or edit the defaults below).
# os.path.join(..., '') guarantees a trailing separator, which the cell that
# builds file names with `data_path + 'LLCP...'` depends on.
data_path = os.path.join(
    os.environ.get(
        'BRFSS_DATA_PATH',
        '/Users/dhawan/Documents/K2/exploratory_analysis/health_project/data/raw/',
    ),
    '',
)
hdf_path = os.environ.get(
    'BRFSS_HDF_PATH',
    '/Users/dhawan/Documents/K2/exploratory_analysis/health_project/data/interim/brfss.h5',
)

# Read in 5 Years of Data

In this notebook, we'll take the analysis one step further by downloading, cleaning, and storing BRFSS data for the past five years (2013–2017) rather than just 2017. As we build the dataset, we'll get a sense of which variables are available across all five years, which will determine how much analysis we can do.

To start, just as you did for the 2017 dataset, download the 2013–2016 datasets from the BRFSS website in .xpt format. Remember to unzip the files, remove the trailing space from each file name, and update the `data_path` variable as necessary.

In [7]:
# Build the full path of each yearly BRFSS export (LLCP2013.XPT through
# LLCP2017.XPT) under data_path. The bare name on the last line echoes the
# list as the cell's output.
file_names = ['{}LLCP{}.XPT'.format(data_path, survey_year)
              for survey_year in range(2013, 2018)]
file_names

['/Users/dhawan/Documents/K2/exploratory_analysis/health_project/data/raw/LLCP2013.XPT',
 '/Users/dhawan/Documents/K2/exploratory_analysis/health_project/data/raw/LLCP2014.XPT',
 '/Users/dhawan/Documents/K2/exploratory_analysis/health_project/data/raw/LLCP2015.XPT',
 '/Users/dhawan/Documents/K2/exploratory_analysis/health_project/data/raw/LLCP2016.XPT',
 '/Users/dhawan/Documents/K2/exploratory_analysis/health_project/data/raw/LLCP2017.XPT']

In [10]:
tic = time.time()

# Create one giant DataFrame for all 5 years.
# Each XPT file is read into its own frame and the frames are concatenated
# once at the end; concatenating inside the loop would re-copy every row
# loaded so far on each iteration.
yearly_frames = []
for file_name in file_names:
    with open(file_name, 'rb') as f:
        yearly_frames.append(xport.to_dataframe(f))
    print(file_name)  # progress marker: one line per file read

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(yearly_frames, ignore_index=True)
del yearly_frames  # release the extra references to the per-year frames

toc = time.time()
print("Sucessfully read all DataFrames in {} seconds".format(toc-tic))

/Users/dhawan/Documents/K2/exploratory_analysis/health_project/data/raw/LLCP2013.XPT
/Users/dhawan/Documents/K2/exploratory_analysis/health_project/data/raw/LLCP2014.XPT
/Users/dhawan/Documents/K2/exploratory_analysis/health_project/data/raw/LLCP2015.XPT
/Users/dhawan/Documents/K2/exploratory_analysis/health_project/data/raw/LLCP2016.XPT
/Users/dhawan/Documents/K2/exploratory_analysis/health_project/data/raw/LLCP2017.XPT
Sucessfully read all DataFrames in 2322.9990870952606 seconds


In [11]:
# Summarize the combined frame: index range, column count, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2334212 entries, 0 to 2334211
Columns: 522 entries, ACTIN11_ to _WT2RAKE
dtypes: float64(507), object(15)
memory usage: 9.1+ GB


In [12]:
# Lower-case every column label so the names follow Python conventions.
df.columns = list(map(str.lower, df.columns))

In [13]:
# Store the entire (untrimmed) 2013-2017 dataset in the HDF store under the
# key 'raw_13_17'. The key is passed by keyword because newer pandas releases
# deprecate passing it positionally.
df.to_hdf(hdf_path, key='raw_13_17')

### Remove Columns and Optimize Dataframe

Now we follow a methodology similar to the one in notebook 1.0 to clean and optimize the dataset.

In [15]:
# Peek at the first rows of the raw idate column (MMDDYYYY strings).
df['idate'].head()

0    01092013
1    01192013
2    01192013
3    01112013
4    02062013
Name: idate, dtype: object

In [18]:
#convert idate column to datetime object and extract year
def convert_date(x):
    """Parse an ``MMDDYYYY`` string into a pandas ``Timestamp``.

    Some dates in the 2014 dataset are illegal (e.g., ``09312014``). When the
    string cannot be parsed as given, the day is replaced with ``01`` and the
    first day of the same month and year is returned instead.

    Parameters
    ----------
    x : str
        Date laid out as month (2 characters), day (2 characters) and
        year (4 characters).

    Returns
    -------
    pandas.Timestamp
        The parsed date, or the first of the month when the day is illegal.

    Raises
    ------
    ValueError
        If the string still cannot be parsed after the day is replaced.
    """
    try:
        return pd.to_datetime(x, format='%m%d%Y')
    except ValueError:
        # Only parse failures trigger the fallback. Anything else (including
        # KeyboardInterrupt, which a bare ``except`` would also swallow)
        # propagates instead of being silently masked.
        return pd.to_datetime(x[:2] + '01' + x[-4:], format='%m%d%Y')

# Parse every idate string with convert_date (passed directly; no lambda
# wrapper needed), then extract the calendar year with the vectorized .dt
# accessor instead of calling a Python lambda once per row.
df['date'] = df['idate'].apply(convert_date)
df['year'] = df['date'].dt.year

In [19]:
# Load the 2017 frame stored under the key 'trim_17' and keep its column index
# (per the original note, these are the columns prior to renaming).
# %time reports how long the read takes.
%time df_17 = pd.read_hdf(hdf_path, 'trim_17')
cols_17 = df_17.columns

CPU times: user 81.9 ms, sys: 68.4 ms, total: 150 ms
Wall time: 274 ms


In [20]:
# Trim the all-years frame to the same columns as the 2017 dataset, in the
# order given by cols_17.
# NOTE(review): assumes every label in cols_17 already exists in df — confirm,
# since .loc handling of missing labels differs across pandas versions.
df = df.loc[:, cols_17]

In [21]:
# Replace missing values with the sentinel -1, keep only the int/float columns,
# cast them to integers, then let pandas choose the smallest integer dtype
# that fits each column.
df = (
    df.fillna(-1)
      .select_dtypes(include=['int', 'float'])
      .astype('int')
      .apply(pd.to_numeric, downcast='integer')
)

In [22]:
# Re-inspect the frame after trimming and downcasting: verbose=False omits the
# per-column listing and memory_usage='deep' requests the full memory footprint.
df.info(verbose=False, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2334212 entries, 0 to 2334211
Columns: 90 entries, _state to year
dtypes: int16(5), int32(2), int8(83)
memory usage: 224.8 MB


In [23]:
# Store the trimmed and datatype-optimized 2013-2017 dataset in the HDF store
# under the key 'trim_13_17'. The key is passed by keyword because newer pandas
# releases deprecate passing it positionally.
df.to_hdf(hdf_path, key='trim_13_17')