Source: [National Combined YRBS data set, ACCESS format](https://www.cdc.gov/healthyyouth/data/yrbs/data.htm)

In ACCESS:
- deleted all data from before 2007 
- deleted all sample data except "year"
- deleted all demographic data except "bmi"


In [1]:
# dependencies
import pandas as pd
from statsmodels.stats.weightstats import DescrStatsW
import numpy as np

In [2]:
# read in National data
national = pd.read_csv("data/SADCQ_07_17.csv")
national.head()

Unnamed: 0,year,weight,age,sex,race7,bmi,sexid,q8,q9,q10,...,qwenthungry,qmusclestrength,qsunscreenuse,qindoortanning,qsunburn,qconcentrating,qcurrentasthma,qwheresleep,qspeakenglish,qtransgender
0,2017,1.3041,5.0,2.0,6.0,18.048443,1.0,5.0,1.0,2.0,...,,4.0,,1.0,6.0,1.0,,,1.0,
1,2017,0.2672,5.0,2.0,3.0,30.483565,1.0,5.0,2.0,2.0,...,,4.0,,1.0,1.0,2.0,,,1.0,
2,2017,0.7263,5.0,2.0,2.0,14.66449,,5.0,1.0,1.0,...,,4.0,,1.0,1.0,1.0,,,2.0,
3,2017,0.3991,5.0,2.0,7.0,20.893566,1.0,4.0,2.0,1.0,...,,4.0,,1.0,2.0,2.0,,,2.0,
4,2017,0.3695,5.0,2.0,4.0,,1.0,5.0,1.0,,...,,8.0,,1.0,1.0,2.0,,,2.0,


In [3]:
# recoding values for clarity

# screen use
# code "none" as 0, "less than 1 hour" as 0.5, ">=5" as 6, all others as actual hour values
# same scheme as in Twenge et al (2017)
screen_map= {0:0, 1:0.5, 2:1, 3:2, 4:3, 5:4, 6:6}

national['q80'] = national['q80'].map(screen_map)
national['q81'] = national['q81'].map(screen_map)


In [4]:
#testing DescrStatsW
t = national.loc[:, ['q80', 'bmi', 'weight']].dropna()
ds = DescrStatsW(t.loc[:, ['bmi', 'q80']], weights=t.weight)
ds.corrcoef

array([[1.        , 0.05942734],
       [0.05942734, 1.        ]])

In [8]:
# generating a bunch of weighted correlation coefficients
# looking for variables to highlight in the notebook
qs = national.drop(['year', 'age', 'race7',"sexid",'sex'], axis=1)
corrs = pd.DataFrame(index=qs.columns.drop('weight'), columns = ['q80','q81'])

for c1 in qs.columns.drop('weight'):
    for c2 in ['q80','q81']:
        df = qs.loc[:, [c1, c2, 'weight']].dropna()
        stats = DescrStatsW(df.drop('weight', axis=1), weights=df.weight)
        r = stats.corrcoef[0][1]
        corrs.loc[c1, c2] = r
        
    
corrs.head()

Unnamed: 0,q80,q81
bmi,0.0594273,0.00901578
q8,-0.0571774,0.022803
q9,0.0365949,-0.0255863
q10,0.0175759,-0.0845708
q11,0.005412,-0.0832871


In [12]:
# p-hacking, but for pedagogical purposes
corrs.sort_values(by="q80", axis=0, ascending=True)
corrs.sort_values(by="q80", axis=0, ascending=False)
corrs.sort_values(by="q81", axis=0, ascending=True)
corrs.sort_values(by="q81", axis=0, ascending=Falase)

Unnamed: 0,q80,q81
q10,0.0175759,-0.0845708
q39,0.0403654,-0.0842108
q11,0.005412,-0.0832871
qmusclestrength,0.0111305,-0.0723976
q37,0.0122256,-0.0720967
q31,0.0293375,-0.0677471
qdrivemarijuana,0.0252937,-0.0669691
q60,0.0137699,-0.0603034
q64,0.0283465,-0.0597546
q61,0.0363464,-0.059108


In [13]:
# renaming columns for clarity
# and subsetting those to use in the notebook

col_name_dic = {"q80": "tv use", "q81": "computer/video game use", "q76": "soda drinking",
               "q89": "grades", "q8":'seat belt use', 'qsunscreenuse':'sunscreen use',
               'q25': 'depression', 'q61':'sexual partners (lifetime)', 'qmusclestrength':'strength training',
               'q88':'hours of sleep', 'q87': "asthma", 'q17': 'fights', 'q37':'current tobacco',
               'q11':'text/email while driving', 'weight':'weight'}



In [14]:
# subset only relevant columns

screen_health = national.loc[:, col_name_dic.keys()]
screen_health.rename(mapper=col_name_dic, axis=1, inplace=True)
screen_health.head()

Unnamed: 0,tv use,computer/video game use,soda drinking,grades,seat belt use,sunscreen use,depression,sexual partners (lifetime),strength training,hours of sleep,asthma,fights,current tobacco,text/email while driving,weight
0,1.0,3.0,3.0,2.0,5.0,,2.0,1.0,4.0,4.0,2.0,1.0,1.0,2.0,1.3041
1,3.0,,2.0,3.0,5.0,,1.0,3.0,4.0,3.0,2.0,2.0,1.0,2.0,0.2672
2,4.0,,,7.0,5.0,,1.0,1.0,4.0,5.0,2.0,1.0,1.0,1.0,0.7263
3,0.5,,2.0,2.0,4.0,,2.0,1.0,4.0,4.0,3.0,3.0,1.0,1.0,0.3991
4,4.0,3.0,5.0,3.0,5.0,,2.0,7.0,8.0,4.0,2.0,1.0,1.0,,0.3695


In [15]:
# recoding values for clarity

# grades: dropping "not sure" and "none of these" answers
# and flipping the scale (e.g. "Fs":0 and "As" : 4)
screen_health['grades'] = screen_health['grades'].map({i:-(i - 4)  for i in range(5)})

# sexual partners- changing to actual number of partners
# coding >=6 as 6
screen_health['sexual partners (lifetime)'] -= 1

# hrs of sleep- changing to actual number of hours
# coding <=4 as 4, >=10 as 10
screen_health['hours of sleep'] += 3

# depression: coding "yes" as 1, "no" as 0
screen_health['depression'] = screen_health['depression'] * (-1) + 2

screen_health.head()

Unnamed: 0,tv use,computer/video game use,soda drinking,grades,seat belt use,sunscreen use,depression,sexual partners (lifetime),strength training,hours of sleep,asthma,fights,current tobacco,text/email while driving,weight
0,1.0,3.0,3.0,2.0,5.0,,0.0,0.0,4.0,7.0,2.0,1.0,1.0,2.0,1.3041
1,3.0,,2.0,1.0,5.0,,1.0,2.0,4.0,6.0,2.0,2.0,1.0,2.0,0.2672
2,4.0,,,,5.0,,1.0,0.0,4.0,8.0,2.0,1.0,1.0,1.0,0.7263
3,0.5,,2.0,2.0,4.0,,0.0,0.0,4.0,7.0,3.0,3.0,1.0,1.0,0.3991
4,4.0,3.0,5.0,1.0,5.0,,0.0,6.0,8.0,7.0,2.0,1.0,1.0,,0.3695


In [16]:
# writing cleaned data to csv
screen_health.to_csv('data/yrbs_clean.csv')

Experiments with vis

In [None]:
# playing with vis ideas around correlation
# all variables are ordinal, most have limited (<10) values
# spacing not always even (e.g. "0", "0-1", "1", "2", >2")
import seaborn as sns
clean = qs.loc[:, ['q80', 'q81']].dropna(axis=0)
clean.shape

In [None]:
clean = qs.loc[:, ['q80', 'q76']].dropna()
sns.kdeplot(clean['q80'], clean['q76'], shade=True)

In [None]:
# yes I know I should have used groupby
sns.heatmap(pd.pivot_table(data=qs, values='q8', index='q80', columns='q81', aggfunc=len), cmap='Blues')

In [None]:
clean = qs.loc[:, ['q80', 'bmi']].dropna()
sns.kdeplot(clean['q80'], clean['bmi'])

In [None]:
sns.violinplot(data=qs, x='q80', y='bmi')

In [None]:
sns.regplot(data=qs, x='q80', y='q76')