In [124]:
import pandas as pd
import numpy as np

## Linear Regression Analysis Final Project Data Scrape
#### Data Source: [The CDC's 2017 NHANES survey](https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?BeginYear=2017)
- Use 'xport' package to convert from .XPT to csv: $ python -m xport <b>.XPT_file</b> > <b>.csv_file</b>



In [126]:
demographics = pd.read_csv('./data/demographics.csv')
demographics = demographics.rename(columns={'RIAGENDR': 'gender', 'RIDAGEYR': 'age', 'RIDRETH1': 'race', 
                                            'DMDHHSIZ': 'household_person_count', 'DMDHHSZA': 'children_u5_household_count'})
demographics = demographics[['SEQN', 'gender', 'age', 'race', 'household_person_count', 'children_u5_household_count']]
demographics['gender'] = demographics['gender'].replace({1.0: 'male', 2.0: 'female'})
demographics['race'] = demographics['race'].replace({1.0: 'mexican_american', 2.0: 'other_hispanic', 
                                                     3.0: 'white', 4.0: 'black', 5.0: 'multi_racial'})


#### for nutrients reported usual_intake: 2 = usual filter for only these to account for potential bias

In [134]:
nutrients = pd.read_csv('./data/nutrients_day1.csv')
nutrients = nutrients.rename(columns={'DBQ095Z': 'salt_type', 
                                        'DRQSDIET': 'diet', 'DRQSDT1': 'low_cal_diet', 'DRQSDT3': 'low_sodium_diet',
                                        'DR1TPROT': 'protien', 'DR1TCARB': 'carbs', 'DR1TSUGR': 'sugar', 'DR1TFIBE': 'fiber',
                                        'DR1TTFAT': 'fats', 'DR1TCHOL': 'cholesterol', 'DR1TSODI': 'sodium', 'DR1TCAFF': 'caffeine',
                                        'DR1TALCO': 'alchohol', 'DR1_320Z': 'water', 'DR1_300': 'usual_intake'})
nutrients = nutrients[['SEQN', 'salt_type', 'diet', 'low_cal_diet', 'low_sodium_diet',
                         'protien', 'carbs', 'sugar', 'fiber', 'fats', 'cholesterol', 'sodium', 'caffeine', 'alchohol',
                         'water', 'usual_intake']]
nutrients = nutrients[nutrients['usual_intake']==2]
nutrients = nutrients.drop(columns='usual_intake')
nutrients['salt_type'] = nutrients['salt_type'].replace({1.0: 'ordinary', 2.0: 'lite', 3.0: 'salt_substitute', 4.0: 'no_extra_salt', 99.0: 'dont_know'})
nutrients['diet'] = nutrients['diet'].replace({9.0: 2.0, 1.0: 'yes', 2.0: 'no'})

In [128]:
body = pd.read_csv('./data/body.csv')
body = body.rename(columns={'BMXWT': 'weight', 'BMXHT': 'height', 'BMXBMI': 'bmi'})
body = body[['SEQN', 'weight', 'height', 'bmi']]

In [129]:
blood = pd.read_csv('./data/blood.csv')
blood = blood.rename(columns={'BPXSY1': 'blood_pressure'})
blood = blood[['SEQN', 'blood_pressure']].dropna()

In [130]:
df = blood.merge(body, on='SEQN', how='left')
df = df.merge(nutrients, on='SEQN', how='left')
df = df.merge(demographics, on='SEQN', how='left')
df.isna().sum()

SEQN                              0
blood_pressure                    0
weight                           62
height                           64
bmi                              72
salt_type                      2134
diet                           2134
low_cal_diet                   6032
low_sodium_diet                6234
protien                        2137
carbs                          2137
sugar                          2137
fiber                          2137
fats                           2137
cholesterol                    2137
sodium                         2137
caffeine                       2137
alchohol                       2137
water                          2137
gender                            0
age                               0
race                              0
household_person_count            0
children_u5_household_count       0
dtype: int64

In [131]:
df['salt_type'] = df['salt_type'].fillna('dont_know')
df = df.fillna(0)
df = df.dropna(subset=['protien', 'bmi'])
print(len(df))
df.isna().sum()

6302


SEQN                           0
blood_pressure                 0
weight                         0
height                         0
bmi                            0
salt_type                      0
diet                           0
low_cal_diet                   0
low_sodium_diet                0
protien                        0
carbs                          0
sugar                          0
fiber                          0
fats                           0
cholesterol                    0
sodium                         0
caffeine                       0
alchohol                       0
water                          0
gender                         0
age                            0
race                           0
household_person_count         0
children_u5_household_count    0
dtype: int64

In [133]:
df.to_csv('blood_pressure.csv', index=False)
df[:10].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
SEQN,93706.0,93707.0,93709.0,93711.0,93712.0,93713.0,93715.0,93716.0,93717.0,93718.0
blood_pressure,112.0,128.0,120.0,108.0,112.0,104.0,112.0,120.0,116.0,128.0
weight,66.3,45.4,88.8,62.1,58.9,74.9,65.6,77.7,74.4,54.4
height,175.7,158.4,151.1,170.6,172.8,178.6,170.6,159.2,174.1,157.3
bmi,21.5,18.1,38.9,21.3,19.7,23.5,22.5,30.7,24.5,22.0
salt_type,dont_know,ordinary,dont_know,ordinary,dont_know,dont_know,dont_know,ordinary,ordinary,dont_know
diet,0,no,0,yes,0,0,0,no,no,0
low_cal_diet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
low_sodium_diet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
protien,0.0,59.48,0.0,101.33,0.0,0.0,0.0,103.91,97.12,0.0
