In [17]:
import pandas as pd
import numpy as np

## Linear Regression Analysis Final Project Data Scrape
#### Data Source: [The CDC's 2017 NHANES survey](https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?BeginYear=2017)
- Use the `xport` package to convert each `.XPT` file to CSV: `python -m xport <file>.XPT > <file>.csv`



In [18]:
# Demographics: read the raw table, give the coded columns readable names,
# keep only the SEQN merge key plus the renamed fields, and turn the numeric
# gender / race codes into string labels.
demographics = (
    pd.read_csv('./data/initial/demographics.csv')
    .rename(columns={
        'RIAGENDR': 'gender',
        'RIDAGEYR': 'age',
        'RIDRETH1': 'race',
        'DMDHHSIZ': 'household_person_count',
        'DMDHHSZA': 'children_u5_household_count',
    })
    .loc[:, ['SEQN', 'gender', 'age', 'race',
             'household_person_count', 'children_u5_household_count']]
    .assign(
        # Codes not listed in a mapping are left unchanged by replace().
        gender=lambda frame: frame['gender'].replace({1.0: 'male', 2.0: 'female'}),
        race=lambda frame: frame['race'].replace({
            1.0: 'mexican_american',
            2.0: 'other_hispanic',
            3.0: 'white',
            4.0: 'black',
            5.0: 'multi_racial',
        }),
    )
)


#### Nutrients: `usual_intake` == 2 means the reported intake was usual; keep only these rows to account for potential bias

In [19]:
# Nutrients (day 1): read the raw table, rename the coded columns, and keep
# the SEQN merge key plus the fields used later.
# NOTE: the misspelled names 'protien' and 'alchohol' are kept on purpose,
# because later cells and the exported CSV refer to them by these names.
nutrients = (
    pd.read_csv('./data/initial/nutrients_day1.csv')
    .rename(columns={
        'DBQ095Z': 'salt_type',
        'DRQSDIET': 'diet',
        'DR1TPROT': 'protien',
        'DR1TCARB': 'carbs',
        'DR1TSUGR': 'sugar',
        'DR1TFIBE': 'fiber',
        'DR1TTFAT': 'fats',
        'DR1TCHOL': 'cholesterol',
        'DR1TSODI': 'sodium',
        'DR1TCAFF': 'caffeine',
        'DR1TALCO': 'alchohol',
        'DR1_320Z': 'water',
        'DR1_300': 'usual_intake',
    })
    .loc[:, ['SEQN', 'salt_type', 'diet',
             'protien', 'carbs', 'sugar', 'fiber', 'fats', 'cholesterol',
             'sodium', 'caffeine', 'alchohol', 'water', 'usual_intake']]
)

# Keep only respondents whose reported intake was "usual" (code 2); the
# filter column is not needed afterwards.
nutrients = nutrients[nutrients['usual_intake'] == 2].drop(columns='usual_intake')

nutrients = nutrients.assign(
    salt_type=nutrients['salt_type'].replace({
        1.0: 'ordinary',
        2.0: 'lite',
        3.0: 'salt_substitute',
        4.0: 'no_extra_salt',
        99.0: 'dont_know',
    }),
    # A dict passed to replace() is matched against the ORIGINAL values, so
    # the replacements do not chain: the earlier {9.0: 2.0, ..., 2.0: 'no'}
    # left code 9.0 as the number 2.0 instead of 'no'. Map 9.0 straight to
    # 'no' so the column holds only 'yes' / 'no' labels for these codes.
    diet=nutrients['diet'].replace({1.0: 'yes', 2.0: 'no', 9.0: 'no'}),
)

In [20]:
# Body measures: read, rename the three measurement columns, and keep them
# together with the SEQN merge key.
body = (
    pd.read_csv('./data/initial/body.csv')
    .rename(columns={'BMXWT': 'weight', 'BMXHT': 'height', 'BMXBMI': 'bmi'})
    .loc[:, ['SEQN', 'weight', 'height', 'bmi']]
)

In [21]:
# Drug use: read, rename the two usage columns, keep them with the SEQN
# merge key, and treat missing answers as 0.
drugs = (
    pd.read_csv('./data/initial/drugs.csv')
    .rename(columns={'DUQ217': 'marijuana_use', 'DUQ272': 'cocaine_uses'})
    .loc[:, ['SEQN', 'marijuana_use', 'cocaine_uses']]
    .fillna(0)
)

In [22]:
# Physical activity: read, rename the two vigorous-activity minute columns,
# and keep them with the SEQN merge key.
physical = (
    pd.read_csv('./data/initial/physical.csv')
    .rename(columns={'PAD615': 'min_vig_work', 'PAD660': 'min_vig_rec'})
    .loc[:, ['SEQN', 'min_vig_work', 'min_vig_rec']]
)

# Drop rows where either column holds the 9999 sentinel. Missing values
# compare as "not equal" and are therefore kept.
is_real_answer = (physical['min_vig_work'] != 9999) & (physical['min_vig_rec'] != 9999)
physical = physical[is_real_answer]

In [23]:
# Diabetes questionnaire: read, rename the two fields, and keep them with
# the SEQN merge key.
diabetes = (
    pd.read_csv('./data/initial/diabetes.csv')
    .rename(columns={'DIQ010': 'doc_diabetes', 'DIQ170': 'risk_diabetes'})
    .loc[:, ['SEQN', 'doc_diabetes', 'risk_diabetes']]
)

# Exclude rows coded 9 in doc_diabetes; missing values are kept.
diabetes = diabetes[diabetes['doc_diabetes'].ne(9)]

In [24]:
# Blood pressure readings: read, rename the two reading columns, keep them
# with the SEQN merge key, and drop any row missing either reading.
blood = (
    pd.read_csv('./data/initial/blood.csv')
    .rename(columns={'BPXDI1': 'blood_pressure', 'BPXSY1': 'systolic_blood_pressure'})
    .loc[:, ['SEQN', 'blood_pressure', 'systolic_blood_pressure']]
    .dropna()
)

In [25]:
# Inner-join every table onto the blood pressure rows by SEQN, in the same
# order as before so the resulting column order is unchanged.
df = blood
for table in (body, nutrients, demographics, drugs, physical, diabetes):
    df = df.merge(table, on='SEQN', how='inner')

# Report the number of missing values left in each column.
print(df.isna().sum())

SEQN                              0
blood_pressure                    0
systolic_blood_pressure           0
weight                           15
height                           16
bmi                              17
salt_type                         0
diet                              0
protien                           0
carbs                             0
sugar                             0
fiber                             0
fats                              0
cholesterol                       0
sodium                            0
caffeine                          0
alchohol                          0
water                             0
gender                            0
age                               0
race                              0
household_person_count            0
children_u5_household_count       0
marijuana_use                     0
cocaine_uses                      0
min_vig_work                   1955
min_vig_rec                    1892
doc_diabetes                

In [26]:
# Final cleaning:
#   1. drop rows missing protein intake or BMI,
#   2. label missing salt_type answers as 'dont_know',
#   3. fill every remaining missing value with 0.
df = (
    df.dropna(subset=['protien', 'bmi'])
    .assign(salt_type=lambda frame: frame['salt_type'].fillna('dont_know'))
    .fillna(0)
)

# Confirm that no missing values remain.
df.isna().sum()

SEQN                           0
blood_pressure                 0
systolic_blood_pressure        0
weight                         0
height                         0
bmi                            0
salt_type                      0
diet                           0
protien                        0
carbs                          0
sugar                          0
fiber                          0
fats                           0
cholesterol                    0
sodium                         0
caffeine                       0
alchohol                       0
water                          0
gender                         0
age                            0
race                           0
household_person_count         0
children_u5_household_count    0
marijuana_use                  0
cocaine_uses                   0
min_vig_work                   0
min_vig_rec                    0
doc_diabetes                   0
risk_diabetes                  0
dtype: int64

In [27]:
# Write the merged dataset to disk without the index column, then preview
# the first ten rows transposed so the wide frame is readable.
df.to_csv('blood_pressure.csv', index=False)
df.iloc[:10].transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
SEQN,93711.0,93716.0,93717.0,93721.0,93723.0,93726.0,93728.0,93738.0,93743.0,93746.0
blood_pressure,68.0,72.0,62.0,68.0,72.0,52.0,74.0,82.0,86.0,64.0
systolic_blood_pressure,108.0,120.0,116.0,132.0,124.0,140.0,122.0,120.0,152.0,106.0
weight,62.1,77.7,74.4,85.1,64.9,74.3,118.1,97.7,79.3,62.1
height,170.6,159.2,174.1,154.0,170.1,154.5,188.1,170.2,187.8,158.4
bmi,21.3,30.7,24.5,35.9,22.4,31.1,33.4,33.7,22.5,24.8
salt_type,ordinary,ordinary,ordinary,no_extra_salt,ordinary,ordinary,ordinary,salt_substitute,ordinary,ordinary
diet,yes,no,no,no,no,no,no,yes,no,no
protien,101.33,103.91,97.12,79.98,41.62,81.89,80.0,116.36,175.55,80.67
carbs,339.6,442.98,345.52,226.61,269.6,184.99,213.19,50.95,684.76,222.26
