In [1]:
%reload_ext autoreload
%autoreload 2

import sys
import os

# Resolve the parent of the current working directory and place it on the
# module search path (position 1) so project packages can be imported.
path_to_project = os.path.abspath(
    os.path.join(os.getcwd(), '../')
)
sys.path.insert(1, path_to_project)

In [2]:
import re
import pandas as pd
import warnings
from src.directory import data_dir, NHANES_dir, NHANES_preprocessed_filename, NHANES_vars_lookup_filename
from src.utils import preprocess_NHANES, download_nhanes_xpt

# Download data

In [3]:
# Install a global filter that ignores every FutureWarning.
warnings.simplefilter('ignore', FutureWarning)

In [4]:
# read in variable lookup df
vars_lookup_df = pd.read_csv(os.path.join(data_dir, NHANES_vars_lookup_filename))

# get questionnaire names: the text inside the first "(...)" of each data file name.
# The pattern is a raw string so that "\(" reaches the regex engine unchanged
# instead of being parsed as an (invalid) string escape sequence.
questionnaires = vars_lookup_df['Data File Name'].apply(
    lambda x: re.findall(r'\(([^)]+)', x)[0]
).unique()

# one .xpt URL per questionnaire name
url_list = [
    f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/{questionnaire}.xpt"
    for questionnaire in questionnaires
]

# download datasets (if necessary)
download_nhanes_xpt(url_list)

ALQ_H.xpt already exists. Skipping.
PAXMIN_H.xpt already exists. Skipping.
SLQ_H.xpt already exists. Skipping.
BPX_H.xpt already exists. Skipping.
BPQ_H.xpt already exists. Skipping.
DIQ_H.xpt already exists. Skipping.
BMX_H.xpt already exists. Skipping.
SMQ_H.xpt already exists. Skipping.
SMQRTU_H.xpt already exists. Skipping.
DEMO_H.xpt already exists. Skipping.
DPQ_H.xpt already exists. Skipping.
MCQ_H.xpt already exists. Skipping.
HIQ_H.xpt already exists. Skipping.
RXQ_DRUG.xpt already exists. Skipping.
RXQ_RX_H.xpt already exists. Skipping.
PAQ_H.xpt already exists. Skipping.
PAXDAY_H.xpt already exists. Skipping.


# Read in data to dataframe

In [5]:
# Full path of the preprocessed NHANES file: data_dir joined with its file name.
NHANES_preprocessed_filepath = os.path.join(
    data_dir,
    NHANES_preprocessed_filename,
)

In [None]:
# Run the preprocessing with the two .xpt files below passed as `exclude`,
# then write the resulting frame to CSV.
files_to_exclude = ['RXQ_DRUG.xpt', 'PAXMIN_H.xpt']
df = preprocess_NHANES(exclude=files_to_exclude)
df.to_csv(NHANES_preprocessed_filepath)

Preprocessing HIQ_H.xpt...
Preprocessing PAQ_H.xpt...
Preprocessing ALQ_H.xpt...
Preprocessing BPQ_H.xpt...
Preprocessing DPQ_H.xpt...
Preprocessing PAXDAY_H.xpt...
Preprocessing RXQ_RX_H.xpt...


In [None]:
# read in lux values, building and caching them on first use
lux_filepath = os.path.join(data_dir, 'lux_df.csv')
if not os.path.exists(lux_filepath):
    # no cached CSV yet: pass every entry of NHANES_dir whose name does not
    # contain 'PAXMIN_H.xpt' as `exclude`, then cache the result
    non_paxmin_files = [
        entry for entry in os.listdir(NHANES_dir) if 'PAXMIN_H.xpt' not in entry
    ]
    lux_df = preprocess_NHANES(exclude=non_paxmin_files)
    lux_df.to_csv(lux_filepath)
else:
    # cached CSV present: load it with SEQN as the index
    lux_df = pd.read_csv(lux_filepath, index_col='SEQN')

In [None]:
# show the lux dataframe as the cell output
lux_df

Unnamed: 0_level_0,total_sleep_minutes,summed_lux,ambient_light
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
73664.0,6635,0.00,0.000000
73665.0,11529,6699.11,0.581066
73666.0,11529,39518.93,3.427785
73667.0,11529,0.00,0.000000
73668.0,11529,519.69,0.045077
...,...,...,...
83725.0,11529,0.00,0.000000
83727.0,11529,407178.24,35.317741
83729.0,11529,9644.65,0.836556
83730.0,11529,0.00,0.000000


In [None]:
# merge with df
# Drop any `ambient_light` column left over from a previous run of this cell first,
# so that re-running the cell does not append a duplicate column to df (and to the CSV).
df = pd.concat(
    [df.drop(columns=['ambient_light'], errors='ignore'), lux_df[['ambient_light']]],
    axis=1,
)
df.to_csv(NHANES_preprocessed_filepath)

# Get descriptive statistics (over missing data)

In [None]:
# boolean mask of rows where BOTH HTN and sleep deprivation are present
# (combine the two masks with `|` instead of `&` to keep rows where EITHER is present)
has_HTN = df['HTN'].notna()
has_sleep_deprivation = df['sleep_deprivation'].notna()
valid_HTN_and_deprivation = has_HTN & has_sleep_deprivation

total_valid_subjects = int(valid_HTN_and_deprivation.sum())
print('Total subjects with valid HTN & sleep deprivation values:', total_valid_subjects)

# per-column non-missing counts (the 'count' row of describe()) among the valid rows,
# expressed as a fraction of the number of valid rows
non_missing_counts = df.loc[valid_HTN_and_deprivation].describe().loc['count']
ratio_of_valid_responses = non_missing_counts / total_valid_subjects
ratio_of_valid_responses.sort_values(ascending=False)

Total subjects with valid HTN & sleep deprivation values: 6187


diabetes                       1.000000
physical_activity              1.000000
HTN                            1.000000
age                            1.000000
gender                         1.000000
race_ethnicity                 1.000000
ANTIDEPRESSANTS_ANXIOLYTICS    1.000000
GLUCOCORTICOIDS                1.000000
sleep_deprivation              1.000000
sleep_troubles                 0.999838
health_insurance               0.998869
daily_sedentary                0.996121
BMI                            0.978503
SBP                            0.975594
DBP                            0.975594
smoker                         0.951511
smoker_hx                      0.946339
poverty_ratio                  0.922903
CVD                            0.894133
martial_status                 0.893648
accelerometer                  0.864716
depression                     0.858251
ambient_light                  0.855019
yearly_alcohol                 0.711815
Name: count, dtype: float64

In [None]:
# keep only the rows that have no missing value in any column
rows_without_missing = df.notna().all(axis=1)
subjects_with_complete_data = df.loc[rows_without_missing]

print('Total subjects with complete data:', subjects_with_complete_data.shape[0])

Total subjects with complete data: 3438


# Impute missing data (optional)

In [None]:
%reload_ext autoreload
%autoreload 2

import sys
import os

# Resolve the parent of the current working directory and place it on the
# module search path (position 1) so project packages can be imported.
path_to_project = os.path.abspath(
    os.path.join(os.getcwd(), '../')
)
sys.path.insert(1, path_to_project)

In [None]:
import pandas as pd
from src.directory import data_dir, NHANES_preprocessed_filename

In [None]:
# Rebuild the path of the preprocessed CSV and load it, using the SEQN column as index.
NHANES_preprocessed_filepath = os.path.join(
    data_dir,
    NHANES_preprocessed_filename,
)
df = pd.read_csv(
    NHANES_preprocessed_filepath,
    index_col='SEQN',
)

In [None]:
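# impute numerical values with the column mean (currently disabled; `numerical_cols` is not defined above and must be set before enabling this loop)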
# for col in numerical_cols:
#     fill_value = df[col].mean().item() # mean 
#     df[col] = df[col].fillna(fill_value)

# TODO: (optional) MICE imputation / categorical imputation