# Check Availability of all features in other years of NHANES dataset 

In [24]:
biomarker2fileprefix = {
    "LBDSALSI": "BIOPRO",  # g/L
    "LBDSCRSI": "BIOPRO",  # umol/L
    "LBDSGLSI": "BIOPRO",  # mmol/L
    "LBXHSCRP": "HSCRP",  # mg/L
    "LBXLYPCT": "CBC",  # %
    "LBXMCVSI": "CBC",  # fL
    "LBXRDW": "CBC",  # %
    "LBXSAPSI": "BIOPRO",  # U/L
    "LBXWBCSI": "CBC",  # 1000 cells/uL
    "RIDAGEYR": "DEMO",  # Years
}
biomarker2description = {
    "LBDSALSI":         "Albumin, refrigerated serum(g/L)",
    "LBDSCRSI":         "Creatinine, refrigerated serum (umol/L)",
    "LBDSGLSI":         "Glucose, refrigerated serum (mmol/L)",  # mmol/L
    "LBXHSCRP":         "High-Sensitivity C-Reactive Protein (hs-CRP) (mg/L)",  # TODO: mg/L -> log(mg/dL) (for phenoage calculation)
    "LBXLYPCT":         "Lymphocyte percent (%)",  # %
    "LBXMCVSI":         "Mean cell volume (fL)",  # fL
    "LBXRDW":           "Red cell distribution width (%)",  # %
    "LBXSAPSI":         "Alkaline Phosphatase (ALP) (IU/L)",  # IU/L == U/L
    "LBXWBCSI":         "White blood cell count (1000 cells/uL)",  # 1000 cells/uL
    "RIDAGEYR":         "Age in years of the participant at the time of screening. Individuals 80 and over are topcoded at 80 years of age.",  # Years
    "LBXHSCRP_mg_dL":   "[self-calculated] High-Sensitivity C-Reactive Protein (hs-CRP) (mg/L)"   # mg/dL
}
feature2description = {
    "RIDAGEYR": "Age in years of the participant at the time of screening. Individuals 80 and over are topcoded at 80 years of age.",
    "RIAGENDR": "Gender of the participant.",
    "BMXHT": "Standing Height (cm)",
    "BMXWAIST": "Waist Circumference (cm)",
    "BMXWT": "Weight (kg)",
    "PAD680": "(Minutes) The following question is about sitting at school, at home, getting to and from places, or with friends including time spent sitting at a desk, traveling in a car or bus, reading, playing cards, watching television, or using a computer. Do not include time spent sleeping. How much time {do you/does SP} usually spend sitting on a typical day?",
    "SMQ020": "These next questions are about cigarette smoking and other tobacco use. {Have you/Has SP} smoked at least 100 cigarettes in {your/his/her} entire life?",
    "ALQ121": "ALQ121 - Past 12 mo how often have alcohol drink",
    "SLD012": "Number of hours usually sleep on weekdays or workdays.",
    "SLD013": "Number of hours usually sleep on weekends or non-workdays.",
}
feature2fileprefix = {
    "RIDAGEYR": "DEMO",
    "RIAGENDR": "DEMO",
    "BMXHT": "BMX",
    "BMXWAIST": "BMX",
    "BMXWT": "BMX",
    "PAD680": "PAQ",
    "SMQ020": "SMQ",
    "ALQ121": "ALQ",
    "SLD012": "SLQ",
    "SLD013": "SLQ",
}

In [27]:
import numpy as np
unique_file_prefixes = np.unique(list((biomarker2fileprefix | feature2fileprefix).values()))
print(unique_file_prefixes)

['ALQ' 'BIOPRO' 'BMX' 'CBC' 'DEMO' 'HSCRP' 'PAQ' 'SLQ' 'SMQ']


In [28]:
year = 2015
base_url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{str(year)}/DataFiles/"
base_url

'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/'

In [29]:
# download all files to data directory
import requests
import os

def download_file(url, filename):
    response = requests.get(url)
    with open(filename, mode="wb") as file:
        file.write(response.content)

def get_nhanes_suffix(year):
    """get NHANES suffix for a specific start year"""
    suffixes = {
        1999: "A",
        2001: "B",
        2003: "C",
        2005: "D",
        2007: "E",
        2009: "F",
        2011: "G",
        2013: "H",
        2015: "I",
        2017: "J",
        2019: "K",
        2021: "L"
    }
    if year in suffixes:
        return f"_{suffixes[year]}"
    else:
        raise ValueError(f"no NHANES suffix known for year {year}")

def get_nhanes_url(year, file_prefix):
    return f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{str(year)}/DataFiles/{file_prefix}_{get_nhanes_suffix(year)}.xpt"

data_path = os.path.join("..", "data", str(year))
for file_prefix in unique_file_prefixes:
    os.makedirs(data_path, exist_ok=True)
    file_url = get_nhanes_url(year, file_prefix)
    download_file(file_url, os.path.join(data_path, file))

In [None]:
import pandas as pd
df = pd.read_sas(os.path.join(data_path, unique_files[3]))
counts = df["SEQN"].value_counts()
print(f"counts>1 ={counts[counts > 1]}")
print(f"number of counts>1 is: {len(counts[counts > 1])}")

ValueError: Header record is not an XPORT file.

In [None]:
# try to merge based on SEQN/patient
raw_dfs = [ pd.read_sas(os.path.join(data_path, file)) for file in unique_files ]
result_df = raw_dfs[0]
for raw_df in raw_dfs[1:]:
    result_df = result_df.merge(raw_df, on="SEQN")
result_df.describe()

ValueError: Header record is not an XPORT file.