### BRFSS Physical vs. Mental Health Data

In [74]:
import pandas as pd
import numpy as np
import random
from bokeh.plotting import figure, show
from bokeh.palettes import Spectral4, Set2
from bokeh.io import output_notebook, gridplot
from bokeh.models import Range1d

In [2]:
# Read the whole BRFSS 2015 workbook into one pandas DataFrame.
# The path is relative, so the file must sit next to this notebook.
workbook_path = 'brfss2015.xlsx'
alldata = pd.read_excel(workbook_path)

In [3]:
# Keep only the health, sex, age and weight columns used in this notebook.
cols = ['PHYSHLTH','MENTHLTH','SEX','_AGEG5YR','_AGE65YR','_AGE80','_AGE_G','_LLCPWT']
selected = alldata[cols]

# Discard every subject whose PHYSHLTH or MENTHLTH holds one of these flag codes.
# NOTE(review): the BRFSS codebook appears to list 88 as "None" (zero days) rather
# than a missing-data flag - confirm that dropping those subjects is intended.
drop = [77,88,99]
flagged = selected.PHYSHLTH.isin(drop) | selected.MENTHLTH.isin(drop)
data = selected[~flagged]
data.head()

Unnamed: 0,PHYSHLTH,MENTHLTH,SEX,_AGEG5YR,_AGE65YR,_AGE80,_AGE_G,_LLCPWT
17,2,10,1,6,1,49,4,752.974927
22,3,5,1,8,1,59,5,395.776516
27,15,30,2,10,2,69,6,133.928089
30,15,10,1,13,2,80,6,104.432632
49,10,30,2,11,2,72,6,180.540539


In [4]:
# Number of subjects (rows) that survive the flag-code filter.
data.shape[0]

16714

In [5]:
# That's a lot of data! Randomly subselect a thousand data points.
# The draw is seeded so that Restart & Run All reproduces the same subsample
# (and therefore the same plots); change SAMPLE_SEED to look at a different draw.
SAMPLE_SEED = 2015
SAMPLE_SIZE = 1000
# Sampling is without replacement; min() keeps this working if fewer than
# SAMPLE_SIZE rows remain. DataFrame.sample picks rows by position, so it is
# also safe if the index contains repeated labels, and it returns a new frame.
subdata = data.sample(n=min(SAMPLE_SIZE, len(data)), random_state=SAMPLE_SEED)

In [33]:
# Add new columns with some jitter for plotting: each value is shifted by a
# uniform offset in [0, 1) so points sharing the same whole-number value do not
# sit exactly on top of each other.
# The generator is seeded so re-running the notebook gives the same picture.
JITTER_SEED = 2015
jitter_rng = np.random.RandomState(JITTER_SEED)

# Work on an explicit copy so the new columns are never written into a
# view of `data`.
subdata = subdata.copy()
subdata['jitPhys'] = subdata.PHYSHLTH + jitter_rng.uniform(size=len(subdata))
subdata['jitMent'] = subdata.MENTHLTH + jitter_rng.uniform(size=len(subdata))

In [35]:
# Preview the first five subsampled rows, including the new jitter columns.
subdata.head(n=5)

Unnamed: 0,PHYSHLTH,MENTHLTH,SEX,_AGEG5YR,_AGE65YR,_AGE80,_AGE_G,_LLCPWT,jitPhys,jitMent
97641,14,7,2,10,2,68,6,21.130479,14.622298,7.475411
93667,2,5,1,1,1,18,1,564.939971,2.732558,5.66151
98641,3,7,2,8,1,57,5,51.794606,3.318106,7.762044
10878,12,4,2,1,1,18,1,199.309475,12.481722,4.785025
47411,20,25,2,10,2,65,6,320.97437,20.092056,25.836516


In [36]:
# Split the subsample by the SEX code: rows coded 1 go to `men`, rows coded 2 to `women`.
is_sex_1 = subdata['SEX'] == 1
is_sex_2 = subdata['SEX'] == 2
men = subdata.loc[is_sex_1]
women = subdata.loc[is_sex_2]

In [7]:
# Load Bokeh for plotting: direct Bokeh output to this notebook so that the
# show() calls in the cells below render their figures inline.
output_notebook()

In [70]:
# Combined scatter of jittered physical vs. mental health values, one colour per
# sex/age group.
#
# The age split uses _AGE80 (age in years) rather than `_AGEG5YR < 10`: in the
# rows displayed earlier in this notebook, _AGEG5YR 8 goes with ages 57 and 59
# and _AGEG5YR 10 with ages 65-69, so `_AGEG5YR < 10` actually separated
# under-65 from 65-and-over while the legend claimed "<50yo".
AGE_CUTOFF = 50  # years

plot = figure(title ='BRFSS 2015 Survey Data')

plot.xaxis.axis_label = 'Physical Health Metric'
plot.yaxis.axis_label ='Mental Health Metric'

# (legend text, subjects in the group, marker colour), in drawing order.
groups = [
    ('Men <50yo', men[men._AGE80 < AGE_CUTOFF], 'green'),
    ('Women <50yo', women[women._AGE80 < AGE_CUTOFF], 'orange'),
    ('Men >=50yo', men[men._AGE80 >= AGE_CUTOFF], 'navy'),
    ('Women >=50yo', women[women._AGE80 >= AGE_CUTOFF], 'red'),
]

# NOTE(review): `legend=` matches the older Bokeh API this notebook imports from;
# newer Bokeh releases spell this keyword `legend_label=`.
for label, subset, colour in groups:
    plot.circle(subset['jitPhys'], subset['jitMent'],
                legend = label, color = colour, alpha=0.5, size = 8)
show(plot)

This graph shows all of the subsampled data colored by gender and age category. There does not appear to be a clear linear trend between mental and physical health from this graph. Instead, the data appear to cluster in the lower left quadrant (low values of both the mental and physical health metrics), or at the extremes of mental and physical health. Still, the graph is pretty busy, so I will plot again, separating out each gender/age group.

In [81]:
# Create a gridplot of each gender/age group: one 250x250 scatter per group,
# arranged as [[men <50, women <50], [men >=50, women >=50]].
#
# The age split uses _AGE80 (age in years) rather than `_AGEG5YR < 10`: in the
# rows displayed earlier in this notebook, _AGEG5YR 8 goes with ages 57 and 59
# and _AGEG5YR 10 with ages 65-69, so `_AGEG5YR < 10` actually separated
# under-65 from 65-and-over while the panel titles claimed "<50yo".
AGE_CUTOFF = 50  # years

# (panel title, subjects in the group, marker colour), in grid reading order.
panels = [
    ('Men <50yo', men[men._AGE80 < AGE_CUTOFF], 'green'),
    ('Women <50yo', women[women._AGE80 < AGE_CUTOFF], 'orange'),
    ('Men >=50yo', men[men._AGE80 >= AGE_CUTOFF], 'navy'),
    ('Women >=50yo', women[women._AGE80 >= AGE_CUTOFF], 'red'),
]

figures = []
for title, subset, colour in panels:
    panel = figure(width=250, height=250, title=title)
    panel.circle(subset['jitPhys'], subset['jitMent'],
                 color = colour, alpha=0.5, size = 8)
    figures.append(panel)
plot1, plot2, plot3, plot4 = figures

# Label only the outer edges of the grid: x labels on the bottom row,
# y labels on the left column.
plot3.xaxis.axis_label = 'Physical Health Metric'
plot4.xaxis.axis_label = 'Physical Health Metric'
plot1.yaxis.axis_label ='Mental Health Metric'
plot3.yaxis.axis_label ='Mental Health Metric'

p = gridplot([[plot1, plot2], [plot3, plot4]])
show(p)

When we separate the four gender/age groups out, we can observe the same clustering behavior at low mental and physical health metric values, as well as at the extremes. This means that a person could have a high mental health metric but a low physical health metric, and vice versa. This suggests that the two are likely correlated, but there are a number of other influencing factors that should be taken into account.