In [1]:
%reload_ext autoreload
%autoreload 2

import sys
import os

# Make the project root (the parent of the current working directory)
# importable, so that the `src` package can be found by the imports below.
parent_of_cwd = os.path.join(os.getcwd(), '../')
path_to_project = os.path.abspath(parent_of_cwd)
sys.path.insert(1, path_to_project)

In [2]:
import re
import pandas as pd
import warnings
from src.data_dict import NHANES_nan_fill
from src.directory import data_dir, NHANES_dir, NHANES_preprocessed_filename, NHANES_vars_lookup_filename
from src.utils import preprocess_NHANES, download_nhanes_xpt

# Download data

In [3]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# read in variable lookup df
vars_lookup_df = pd.read_csv(os.path.join(data_dir, NHANES_vars_lookup_filename))

# get questionnaire names: the text inside the first "(...)" of each
# 'Data File Name' entry. The pattern is a raw string because '\(' in a plain
# string literal is an invalid escape sequence.
questionnaire_codes = vars_lookup_df['Data File Name'].str.extract(r'\(([^)]+)', expand=False)

# fail with the offending entries rather than an opaque IndexError
unparsed_names = vars_lookup_df.loc[questionnaire_codes.isna(), 'Data File Name']
if not unparsed_names.empty:
    raise ValueError(
        "No parenthesised questionnaire code found in 'Data File Name' entries: "
        f"{unparsed_names.unique().tolist()}"
    )

questionnaires = questionnaire_codes.unique()

# one .xpt URL per questionnaire, all under the 2013 DataFiles directory
url_list = [
    f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/{questionnaire}.xpt" for questionnaire in questionnaires
]

# download datasets (if necessary)
download_nhanes_xpt(url_list)

PAXMIN_H.xpt already exists. Skipping.
SLQ_H.xpt already exists. Skipping.
BPX_H.xpt already exists. Skipping.
BPQ_H.xpt already exists. Skipping.
DIQ_H.xpt already exists. Skipping.
BMX_H.xpt already exists. Skipping.
SMQ_H.xpt already exists. Skipping.
SMQRTU_H.xpt already exists. Skipping.
DEMO_H.xpt already exists. Skipping.
DPQ_H.xpt already exists. Skipping.
RXQ_DRUG.xpt already exists. Skipping.
RXQ_RX_H.xpt already exists. Skipping.
PAQ_H.xpt already exists. Skipping.
PAXDAY_H.xpt already exists. Skipping.


# Read the data into a DataFrame

In [5]:
# CSV path for the preprocessed NHANES table, inside the data directory
NHANES_preprocessed_filepath = os.path.join(
    data_dir,
    NHANES_preprocessed_filename,
)

In [9]:
# Preprocess the NHANES files, leaving out the two named below, and save the
# result as CSV. (PAXMIN_H.xpt is the one file the lux cell processes.)
files_to_skip = ['RXQ_DRUG.xpt', 'PAXMIN_H.xpt']
df = preprocess_NHANES(exclude=files_to_skip)
df.to_csv(NHANES_preprocessed_filepath)

Preprocessing PAQ_H.xpt...
Preprocessing BPQ_H.xpt...
Preprocessing DPQ_H.xpt...
Preprocessing PAXDAY_H.xpt...
Preprocessing RXQ_RX_H.xpt...
Preprocessing SLQ_H.xpt...
Preprocessing BPX_H.xpt...
Preprocessing SMQRTU_H.xpt...
Preprocessing DIQ_H.xpt...
Preprocessing BMX_H.xpt...
Preprocessing SMQ_H.xpt...
Preprocessing DEMO_H.xpt...


In [7]:
# Load the lux table from its CSV cache; build and cache it only when the
# file does not exist yet.
lux_filepath = os.path.join(data_dir, 'lux_df.csv')
if not os.path.exists(lux_filepath):
    # exclude every file in NHANES_dir whose name does not contain
    # 'PAXMIN_H.xpt' (os.listdir already yields bare file names)
    non_paxmin_files = [
        filename for filename in os.listdir(NHANES_dir) if 'PAXMIN_H.xpt' not in filename
    ]
    lux_df = preprocess_NHANES(exclude=non_paxmin_files)
    lux_df.to_csv(lux_filepath)
else:
    lux_df = pd.read_csv(lux_filepath, index_col='SEQN')

Preprocessing PAXMIN_H.xpt...
Processing chunk 1000000.0 - 2000000.0
Processing chunk 2000000.0 - 3000000.0
Processing chunk 3000000.0 - 4000000.0
Processing chunk 4000000.0 - 5000000.0
Processing chunk 5000000.0 - 6000000.0
Processing chunk 6000000.0 - 7000000.0
Processing chunk 7000000.0 - 8000000.0
Processing chunk 8000000.0 - 9000000.0
Processing chunk 9000000.0 - 10000000.0
Processing chunk 10000000.0 - 11000000.0
Processing chunk 11000000.0 - 12000000.0
Processing chunk 12000000.0 - 13000000.0
Processing chunk 13000000.0 - 14000000.0
Processing chunk 14000000.0 - 15000000.0
Processing chunk 15000000.0 - 16000000.0
Processing chunk 16000000.0 - 17000000.0
Processing chunk 17000000.0 - 18000000.0
Processing chunk 18000000.0 - 19000000.0
Processing chunk 19000000.0 - 20000000.0
Processing chunk 20000000.0 - 21000000.0
Processing chunk 21000000.0 - 22000000.0
Processing chunk 22000000.0 - 23000000.0
Processing chunk 23000000.0 - 24000000.0
Processing chunk 24000000.0 - 25000000.0
Pro

In [8]:
# show the lux table as this cell's output
lux_df

Unnamed: 0_level_0,total_sleep_minutes,summed_lux,ambient_light
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
73664.0,6635,0.00,0.000000
73665.0,11529,6699.11,0.581066
73666.0,11529,39518.93,3.427785
73667.0,11529,0.00,0.000000
73668.0,11529,519.69,0.045077
...,...,...,...
83725.0,11529,0.00,0.000000
83727.0,11529,407178.24,35.317741
83729.0,11529,9644.65,0.836556
83730.0,11529,0.00,0.000000


In [10]:
# merge with df: attach the `ambient_light` column of lux_df, aligned on the index.
# Any `ambient_light` column already present in df is dropped first, so re-running
# this cell replaces the column instead of appending a duplicate of it.
df = pd.concat(
    [df.drop(columns='ambient_light', errors='ignore'), lux_df[['ambient_light']]],
    axis=1,
)
df.to_csv(NHANES_preprocessed_filepath)

# Get descriptive statistics (over missing data)

In [35]:
# boolean mask: rows where both HTN and sleep deprivation are present
valid_HTN_and_deprivation = df['HTN'].notna() & df['sleep_deprivation'].notna()

total_valid_subjects = int(valid_HTN_and_deprivation.sum())
print('Total subjects with valid HTN & sleep deprivation values:', total_valid_subjects)

# per-column share of non-missing values among those rows, highest first
valid_counts = df.loc[valid_HTN_and_deprivation].describe().loc['count']
ratio_of_valid_responses = valid_counts / total_valid_subjects
ratio_of_valid_responses.sort_values(ascending=False)

Total subjects with valid HTN & sleep deprivation values: 6187


physical_activity              1.000000
sleep_deprivation              1.000000
HTN                            1.000000
age                            1.000000
gender                         1.000000
race_ethnicity                 1.000000
diabetes                       1.000000
GLUCOCORTICOIDS                1.000000
ANTIDEPRESSANTS_ANXIOLYTICS    1.000000
sleep_troubles                 0.999838
daily_sedentary                0.996121
BMI                            0.978503
smoker                         0.951511
poverty_ratio                  0.922903
accelerometer                  0.864716
depression                     0.856473
ambient_light                  0.855019
Name: count, dtype: float64

In [36]:
# keep only the rows that have no missing value in any column
has_all_values = df.notna().all(axis=1)
subjects_with_complete_data = df[has_all_values]

print('Total subjects with complete data:', subjects_with_complete_data.shape[0])

Total subjects with complete data: 4319


# Impute missing data (optional)

In [1]:
%reload_ext autoreload
%autoreload 2

import sys
import os

# Make the project root (the parent of the current working directory)
# importable, so that the `src` package can be found by the imports below.
parent_of_cwd = os.path.join(os.getcwd(), '../')
path_to_project = os.path.abspath(parent_of_cwd)
sys.path.insert(1, path_to_project)

In [None]:
import pandas as pd
from src.directory import data_dir, NHANES_preprocessed_filename

In [3]:
# reload the preprocessed table from CSV, using the SEQN column as the index
NHANES_preprocessed_filepath = os.path.join(
    data_dir,
    NHANES_preprocessed_filename,
)
df = pd.read_csv(
    NHANES_preprocessed_filepath,
    index_col='SEQN',
)

In [29]:
# columns handled as categorical variables
categorical_cols = [
    'physical_activity',
    'depression',
    'ANTIDEPRESSANTS_ANXIOLYTICS',
    'GLUCOCORTICOIDS',
    'sleep_troubles',
    'sleep_deprivation',
    'diabetes',
    'smoker',
    'race_ethnicity',
    'gender',
    'HTN',
]

# columns handled as numerical variables (mean-imputed in the next cell)
numerical_cols = [
    'daily_sedentary',
    'accelerometer',
    'BMI',
    'age',
    'poverty_ratio',
    'ambient_light',
]

In [None]:
# replace missing entries in each numerical column with that column's mean
for column_name, column_values in df[numerical_cols].items():
    column_mean = column_values.mean().item()
    df[column_name] = column_values.fillna(column_mean)

# TODO: (optional) MICE imputation / categorical imputation