# Check Availability of all features in other years of NHANES dataset 

In [3]:
biomarker2fileprefix = {
    "LBDSALSI": "BIOPRO",  # g/L
    "LBDSCRSI": "BIOPRO",  # umol/L
    "LBDSGLSI": "BIOPRO",  # mmol/L
    "LBXHSCRP": "HSCRP",  # mg/L
    "LBXLYPCT": "CBC",  # %
    "LBXMCVSI": "CBC",  # fL
    "LBXRDW": "CBC",  # %
    "LBXSAPSI": "BIOPRO",  # U/L
    "LBXWBCSI": "CBC",  # 1000 cells/uL
    "RIDAGEYR": "DEMO",  # Years
}
biomarker2description = {
    "LBDSALSI":         "Albumin, refrigerated serum(g/L)",
    "LBDSCRSI":         "Creatinine, refrigerated serum (umol/L)",
    "LBDSGLSI":         "Glucose, refrigerated serum (mmol/L)",  # mmol/L
    "LBXHSCRP":         "High-Sensitivity C-Reactive Protein (hs-CRP) (mg/L)",  # TODO: mg/L -> log(mg/dL) (for phenoage calculation)
    "LBXLYPCT":         "Lymphocyte percent (%)",  # %
    "LBXMCVSI":         "Mean cell volume (fL)",  # fL
    "LBXRDW":           "Red cell distribution width (%)",  # %
    "LBXSAPSI":         "Alkaline Phosphatase (ALP) (IU/L)",  # IU/L == U/L
    "LBXWBCSI":         "White blood cell count (1000 cells/uL)",  # 1000 cells/uL
    "RIDAGEYR":         "Age in years of the participant at the time of screening. Individuals 80 and over are topcoded at 80 years of age.",  # Years
    "LBXHSCRP_mg_dL":   "[self-calculated] High-Sensitivity C-Reactive Protein (hs-CRP) (mg/L)"   # mg/dL
}
feature2description = {
    "RIDAGEYR": "Age in years of the participant at the time of screening. Individuals 80 and over are topcoded at 80 years of age.",
    "RIAGENDR": "Gender of the participant.",
    "BMXHT": "Standing Height (cm)",
    "BMXWAIST": "Waist Circumference (cm)",
    "BMXWT": "Weight (kg)",
    "PAD680": "(Minutes) The following question is about sitting at school, at home, getting to and from places, or with friends including time spent sitting at a desk, traveling in a car or bus, reading, playing cards, watching television, or using a computer. Do not include time spent sleeping. How much time {do you/does SP} usually spend sitting on a typical day?",
    "SMQ020": "These next questions are about cigarette smoking and other tobacco use. {Have you/Has SP} smoked at least 100 cigarettes in {your/his/her} entire life?",
    "ALQ121": "ALQ121 - Past 12 mo how often have alcohol drink",
    "SLD012": "Number of hours usually sleep on weekdays or workdays.",
    "SLD013": "Number of hours usually sleep on weekends or non-workdays.",
}
feature2fileprefix = {
    "RIDAGEYR": "DEMO",
    "RIAGENDR": "DEMO",
    "BMXHT": "BMX",
    "BMXWAIST": "BMX",
    "BMXWT": "BMX",
    "PAD680": "PAQ",
    "SMQ020": "SMQ",
    "ALQ121": "ALQ",
    "SLD012": "SLQ",
    "SLD013": "SLQ",
}

In [4]:
import numpy as np
unique_file_prefixes = np.unique(list((biomarker2fileprefix | feature2fileprefix).values()))
print(unique_file_prefixes)

['ALQ' 'BIOPRO' 'BMX' 'CBC' 'DEMO' 'HSCRP' 'PAQ' 'SLQ' 'SMQ']


In [45]:
# download all files to data directory
import requests
import os

def download_file(url, filename):
    response = requests.get(url)
    if not response.ok:
        raise ValueError(f"Requested file not available at url {url}")
    with open(filename, mode="wb") as file:
        file.write(response.content)

def get_nhanes_suffix(year):
    """get NHANES suffix for a specific start year"""
    suffixes = {
        1999: "A",
        2001: "B",
        2003: "C",
        2005: "D",
        2007: "E",
        2009: "F",
        2011: "G",
        2013: "H",
        2015: "I",
        2017: "J",
        2019: "K",
        2021: "L"
    }
    if year in suffixes:
        return suffixes[year]
    else:
        raise ValueError(f"no NHANES suffix known for year {year}")

def get_nhanes_url(year, file_prefix):
    return f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{str(year)}/DataFiles/{get_nhanes_filename(year, file_prefix)}"

def get_nhanes_filename(year, file_prefix):
    return f"{file_prefix}_{get_nhanes_suffix(year)}.xpt"

In [59]:
year = 2015
data_path = os.path.join("..", "data", str(year))
for file_prefix in unique_file_prefixes:
    os.makedirs(data_path, exist_ok=True)
    file_url = get_nhanes_url(year, file_prefix)
    download_file(file_url, os.path.join(data_path, get_nhanes_filename(year, file_prefix)))
    print(file_url, os.path.join(data_path, get_nhanes_filename(year, file_prefix)))

https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/ALQ_I.xpt ../data/2015/ALQ_I.xpt
https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/BIOPRO_I.xpt ../data/2015/BIOPRO_I.xpt
https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/BMX_I.xpt ../data/2015/BMX_I.xpt
https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/CBC_I.xpt ../data/2015/CBC_I.xpt
https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/DEMO_I.xpt ../data/2015/DEMO_I.xpt
https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/HSCRP_I.xpt ../data/2015/HSCRP_I.xpt
https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/PAQ_I.xpt ../data/2015/PAQ_I.xpt
https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/SLQ_I.xpt ../data/2015/SLQ_I.xpt
https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/SMQ_I.xpt ../data/2015/SMQ_I.xpt


In [60]:
print(unique_file_prefixes, data_path)

['ALQ' 'BIOPRO' 'BMX' 'CBC' 'DEMO' 'HSCRP' 'PAQ' 'SLQ' 'SMQ'] ../data/2015


In [None]:
import pandas as pd
# try to merge based on SEQN/patient
files_to_read = [ os.path.join(data_path, get_nhanes_filename(year, file_prefix)) for file_prefix in unique_file_prefixes ]
print(files_to_read)
raw_dfs = [ pd.read_sas(f) for f in files_to_read ]
result_df = raw_dfs[0]
for raw_df in raw_dfs[1:]:
    result_df = result_df.merge(raw_df, on="SEQN")
# result_df.describe()
result_df = result_df.filter((biomarker2fileprefix|feature2fileprefix).keys())

['../data/2015/ALQ_I.xpt', '../data/2015/BIOPRO_I.xpt', '../data/2015/BMX_I.xpt', '../data/2015/CBC_I.xpt', '../data/2015/DEMO_I.xpt', '../data/2015/HSCRP_I.xpt', '../data/2015/PAQ_I.xpt', '../data/2015/SLQ_I.xpt', '../data/2015/SMQ_I.xpt']


In [72]:
result_df
result_df.dropna(axis=0).reset_index()

Unnamed: 0,level_0,index,LBDSALSI,LBDSCRSI,LBDSGLSI,LBXHSCRP,LBXLYPCT,LBXMCVSI,LBXRDW,LBXSAPSI,LBXWBCSI,RIDAGEYR,RIAGENDR,BMXHT,BMXWAIST,BMXWT,PAD680,SMQ020,SLD012
0,0,0,46.0,79.56,5.22,0.6,23.9,90.8,13.9,52.0,9.8,62.0,1.0,184.5,101.1,94.8,480.0,1.0,5.5
1,1,1,45.0,92.82,5.22,1.4,31.3,101.8,13.4,47.0,7.3,53.0,1.0,171.4,107.9,90.4,300.0,1.0,8.0
2,2,2,45.0,99.01,5.72,0.6,29.9,90.8,14.7,46.0,4.4,78.0,1.0,170.1,116.5,83.4,480.0,1.0,7.0
3,3,3,38.0,84.86,3.50,9.0,17.1,88.3,13.1,65.0,6.1,56.0,2.0,160.9,110.1,109.8,480.0,2.0,6.5
4,5,5,41.0,101.66,5.38,2.5,31.7,92.6,14.1,83.0,6.1,72.0,2.0,150.0,92.9,64.4,10.0,2.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4998,5728,5728,48.0,142.32,4.94,1.4,15.6,91.1,12.5,68.0,11.8,32.0,1.0,164.9,101.0,89.5,480.0,2.0,7.0
4999,5729,5729,48.0,56.58,4.94,2.8,24.2,80.5,13.3,67.0,7.1,25.0,1.0,136.5,75.4,39.2,240.0,2.0,8.0
5000,5730,5730,41.0,97.24,5.44,2.0,21.0,88.0,13.6,50.0,6.4,76.0,2.0,165.8,95.0,59.1,360.0,1.0,9.5
5001,5733,5733,46.0,82.21,4.61,16.4,26.4,92.7,13.0,140.0,7.6,35.0,1.0,173.3,98.9,78.2,600.0,1.0,6.0


# => only 2015 and 2017 data are usable with these features as of now