# Blood Lab Data Cleaning

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
!pip install pyreadstat
import pyreadstat #since the data files are .xpt files, this library is needed to import the table
import re
import requests
from bs4 import BeautifulSoup
from nhanes_utils import to_snake_case, get_common_nan_ids, standardize_id_column, drop_rows_with_common_nan_ids

Defaulting to user installation because normal site-packages is not writeable


### Alpha-1-acid glycoprotein (AGP)

Alpha-1-acid glycoprotein (AGP), also known as orosomucoid (ORM), is an acute-phase serum protein present in humans and many animal species. It is produced in response to inflammation, although its precise biological role remains under investigation and somewhat ambiguous [2]. According to Ceciliani et al. (2019), AGP may play a role in immunometabolism, a function potentially relevant to understanding the obesity epidemic in the U.S.

In the NHANES dataset, AGP levels were measured in children aged 3–5 years and females aged 12–49 years. This data offers an opportunity to explore potential correlations between AGP serum concentrations and obesity prevalence among the female participants in the study.

In [4]:
file_path = '2017-2020/blood/1.P_SSAGP.xpt'

df_b1, meta = pyreadstat.read_xport(file_path)
df_b1 = standardize_id_column(df_b1)

In [5]:
df_b1.head(10)

Unnamed: 0,participant_id,WTSSAGPP,SSAGP
0,109264.0,0.0,
1,109266.0,10003.783188,0.796
2,109277.0,23329.384783,0.746
3,109279.0,14416.168293,0.94
4,109284.0,17705.030492,1.08
5,109286.0,21951.734438,0.358
6,109288.0,0.0,
7,109291.0,25981.10596,0.766
8,109297.0,0.0,
9,109309.0,0.0,


In [6]:
df_b1.shape

(3823, 3)

In [7]:
df_b1 = df_b1.rename(columns={
    'SSAGP': 'alpha_1_agp_g_l'
})

In [8]:
df_b1.head()

Unnamed: 0,participant_id,WTSSAGPP,alpha_1_agp_g_l
0,109264.0,0.0,
1,109266.0,10003.783188,0.796
2,109277.0,23329.384783,0.746
3,109279.0,14416.168293,0.94
4,109284.0,17705.030492,1.08


In [9]:
df_b1 = df_b1.drop('WTSSAGPP', axis=1)

In [10]:
df_b1 = df_b1.dropna(subset = ['alpha_1_agp_g_l'])

In [11]:
df_b1.head(5)

Unnamed: 0,participant_id,alpha_1_agp_g_l
1,109266.0,0.796
2,109277.0,0.746
3,109279.0,0.94
4,109284.0,1.08
5,109286.0,0.358


### Lipid Panel

Lipids are essential molecules that support a range of physiological functions, including hormone production and cellular structure. However, excessive lipid levels—particularly certain types—are associated with increased risk of cardiovascular disease.

To assess lipid status, a fasting lipid panel is commonly used. This test typically includes measurements of:
- LDL (low-density lipoprotein, or “bad” cholesterol),
- HDL (high-density lipoprotein, or “good” cholesterol),
- Total cholesterol, and
- Triglycerides

The NHANES dataset includes all of these values, enabling analysis of lipid profiles across a large representative population. This section focuses on cleaning and preparing these variables for analysis.

In [13]:
file_path = '2017-2020/blood/2.P_HDL.xpt'

df_b2, meta = pyreadstat.read_xport(file_path)
df_b2 = standardize_id_column(df_b2)

In [14]:
df_b2.head()

Unnamed: 0,participant_id,LBDHDD,LBDHDDSI
0,109264.0,72.0,1.86
1,109266.0,56.0,1.45
2,109270.0,47.0,1.22
3,109271.0,33.0,0.85
4,109273.0,42.0,1.09


In [15]:
df_b2 = df_b2.rename(columns ={
    'LBDHDD':'direct_hdl_mg_dl',
    'LBDHDDSI':'direct_hdl_mmol_l'
})

In [16]:
df_b2.isnull().sum()

participant_id          0
direct_hdl_mg_dl     1370
direct_hdl_mmol_l    1370
dtype: int64

In [17]:
common_nan = get_common_nan_ids(df_b2, 'direct_hdl_mg_dl', 'direct_hdl_mmol_l', id_col='participant_id')

Number of NaNs in direct_hdl_mg_dl: 1370
Number of NaNs in direct_hdl_mmol_l: 1370
Number of IDs with NaNs in both columns: 1370


In [18]:
df_b2 = drop_rows_with_common_nan_ids(df_b2, 'direct_hdl_mg_dl', 'direct_hdl_mmol_l', id_col='participant_id')

Rows dropped where both direct_hdl_mg_dl and direct_hdl_mmol_l were NaN: 1370


In [19]:
file_path = '2017-2020/blood/3.P_TRIGLY.xpt'

df_b3, meta = pyreadstat.read_xport(file_path)
df_b3 = standardize_id_column(df_b3)

In [20]:
df_b3.columns.to_list()

['participant_id',
 'WTSAFPRP',
 'LBXTR',
 'LBDTRSI',
 'LBDLDL',
 'LBDLDLSI',
 'LBDLDLM',
 'LBDLDMSI',
 'LBDLDLN',
 'LBDLDNSI']

In [21]:
df_b3 = df_b3.rename(columns={
    'LBXTR':'triglyceride_mg_dl',
    'LBDTRSI':'triglyceride_mmol_l',
    'LBDLDL':'ldl_friedewald_mg_dl',
    'LBDLDLSI':'ldl_friedwalkd_mmol_l',
    'LBDLDLM': 'ldl_martin_hopkins_mg_dl',
    'LBDLDMSI': 'ldl_martin_hopkins_mmol_l',
    'LBDLDLN':'ldl_nih_mg_dl',
    'LBDLDNSI':'ldl_nih_mmol_l'
})

In [22]:
df_b3 = df_b3.drop('WTSAFPRP', axis=1)

In [23]:
df_b3.isnull().sum()

participant_id                 0
triglyceride_mg_dl           440
triglyceride_mmol_l          440
ldl_friedewald_mg_dl         473
ldl_friedwalkd_mmol_l        473
ldl_martin_hopkins_mg_dl     473
ldl_martin_hopkins_mmol_l    473
ldl_nih_mg_dl                448
ldl_nih_mmol_l               448
dtype: int64

In [24]:
value_cols = [col for col in df_b3.columns if col != 'participant_id']
rows_all_nan = df_b3[value_cols].isna().all(axis=1)
print(f"Number of rows missing all cholesterol values: {rows_all_nan.sum()}")

Number of rows missing all cholesterol values: 440


In [25]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b3_cleaned = df_b3[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 440


In [26]:
file_path = '2017-2020/blood/4.P_TCHOL.xpt'

df_b4, meta = pyreadstat.read_xport(file_path)
df_b4 = standardize_id_column(df_b4)

In [27]:
df_b4.head()

Unnamed: 0,participant_id,LBXTC,LBDTCSI
0,109264.0,166.0,4.29
1,109266.0,195.0,5.04
2,109270.0,103.0,2.66
3,109271.0,147.0,3.8
4,109273.0,164.0,4.24


In [28]:
df_b4 = df_b4.rename(columns={
    'LBXTC': 'total_cholesterol_mg_dl',
    'LBDTCSI':'total_cholesterol_mmol_l'
})

In [29]:
df_b4.isnull().sum()

participant_id                 0
total_cholesterol_mg_dl     1370
total_cholesterol_mmol_l    1370
dtype: int64

In [30]:
common_nan = get_common_nan_ids(df_b4, 'total_cholesterol_mg_dl', 'total_cholesterol_mmol_l', id_col='participant_id')

Number of NaNs in total_cholesterol_mg_dl: 1370
Number of NaNs in total_cholesterol_mmol_l: 1370
Number of IDs with NaNs in both columns: 1370


In [31]:
df_b4 = drop_rows_with_common_nan_ids(df_b4, 'total_cholesterol_mg_dl', 'total_cholesterol_mmol_l', id_col='participant_id')

Rows dropped where both total_cholesterol_mg_dl and total_cholesterol_mmol_l were NaN: 1370


### Chromium and Cobalt (Blood)

NHANES data on chromium and cobalt levels were collected on patients aged 40-150 years old. 

In [33]:
file_path = '2017-2020/blood/5.P_CRCO.xpt'

df_b5, meta = pyreadstat.read_xport(file_path)
df_b5 = standardize_id_column(df_b5)

In [34]:
df_b5.columns.to_list()

['participant_id',
 'LBXBCR',
 'LBDBCRSI',
 'LBDBCRLC',
 'LBXBCO',
 'LBDBCOSI',
 'LBDBCOLC']

In [35]:
df_b5 = df_b5.rename(columns={
    'LBXBCR':'chromium_blood_ug_l', 
    'LBDBCRSI': 'chromium_blood_nmol_l',
    'LBDBCRLC':'chromium_blood_comment', 
    'LBXBCO':'cobalt_blood_ug_l', 
    'LBDBCOSI':'cobalt_blood_nmol_l', 
    'LBDBCOLC' :'cobalt_blood_comment'
})

In [36]:
df_b5.isnull().sum()

participant_id              0
chromium_blood_ug_l       302
chromium_blood_nmol_l     302
chromium_blood_comment    302
cobalt_blood_ug_l         299
cobalt_blood_nmol_l       299
cobalt_blood_comment      299
dtype: int64

In [37]:
common_nan = get_common_nan_ids(df_b5, 'chromium_blood_ug_l', 'cobalt_blood_ug_l', id_col='participant_id')

Number of NaNs in chromium_blood_ug_l: 302
Number of NaNs in cobalt_blood_ug_l: 299
Number of IDs with NaNs in both columns: 297


In [38]:
df_b5 = drop_rows_with_common_nan_ids(df_b5, 'chromium_blood_ug_l', 'cobalt_blood_ug_l', id_col='participant_id')

Rows dropped where both chromium_blood_ug_l and cobalt_blood_ug_l were NaN: 297


### Complete Blood Count with Differential

CBC with diff is the most common blood work that is ordered for a baseline lab. CBC can be useful to assess the patients for acute inflammation in the body and anemia. 

There are many values that are extracted and assessed through the CBC panel. To more efficiently extract the information, the decision was made to utilize webscraping rather than individually typing out each lab value. The units for each of the columns is included in the README file for this project.

In [40]:
file_path = '2017-2020/blood/6.P_CBC.xpt'

df_b6, meta = pyreadstat.read_xport(file_path)
df_b6 = standardize_id_column(df_b6)

In [41]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_CBC.htm"

df_info_raw = pd.read_html(url)[0]

# Use the first row as the header
df_info_raw.columns = df_info_raw.iloc[0]
df_info = df_info_raw.drop(index=0).reset_index(drop=True)

In [42]:
df_info.head()

Unnamed: 0,Variable Name,Analyte Description,LLOD,ULOD,Units
0,LBXWBCSI,White blood cell count,0.02,363.0,x 103 cells/uL
1,LBXLYPCT,Lymphocyte percent,0.0,100.0,%
2,LBXMOPCT,Monocyte percent,0.0,100.0,%
3,LBXNEPCT,Segmented neutrophils percent,0.0,100.0,%
4,LBXEOPCT,Eosinophils percent,0.0,100.0,%


In [43]:
df_info.columns.to_list()

['Variable  Name', 'Analyte  Description', 'LLOD', 'ULOD', 'Units']

In [44]:
rename_dict = {
    row["Variable  Name"]: to_snake_case(row["Analyte  Description"])
    for _, row in df_info.iterrows()
    if row["Variable  Name"] in df_b6.columns
}

In [45]:
df_b6 = df_b6.rename(columns=rename_dict)

In [46]:
df_b6.head()

Unnamed: 0,participant_id,white_blood_cell_count,lymphocyte_percent,monocyte_percent,segmented_neutrophils_percent,eosinophils_percent,basophils_percent,LBDLYMNO,LBDMONO,LBDNENO,...,red_blood_cell_count,hemoglobin,LBXHCT,mean_cell_volume,LBXMC,LBXMCHSI,red_cell_distribution_width,platelet_count,mean_platelet_volume,LBXNRBC
0,109263.0,,,,,,,,,,...,,,,,,,,,,
1,109264.0,4.5,45.6,6.2,46.4,1.4,0.5,2.1,0.3,2.1,...,4.8,13.7,40.5,84.3,33.7,28.4,13.1,263.0,8.2,0.1
2,109265.0,9.5,46.4,10.9,39.2,2.9,0.7,4.4,1.0,3.7,...,4.5,12.6,36.6,81.2,34.4,27.9,13.1,286.0,6.6,0.1
3,109266.0,7.8,34.5,6.0,58.3,0.8,0.5,2.7,0.5,4.5,...,4.35,12.3,36.5,83.7,33.6,28.1,14.0,314.0,6.9,0.1
4,109269.0,9.1,38.3,7.8,48.8,4.1,1.1,3.5,0.7,4.4,...,4.21,11.7,33.5,79.6,34.9,27.8,13.4,287.0,6.9,0.1


In [47]:
df_b6.columns.to_list()

#Some of the names were renamed based on what was available on the first table in the URL. For the other ones that were not, they were manually renamed

['participant_id',
 'white_blood_cell_count',
 'lymphocyte_percent',
 'monocyte_percent',
 'segmented_neutrophils_percent',
 'eosinophils_percent',
 'basophils_percent',
 'LBDLYMNO',
 'LBDMONO',
 'LBDNENO',
 'LBDEONO',
 'LBDBANO',
 'red_blood_cell_count',
 'hemoglobin',
 'LBXHCT',
 'mean_cell_volume',
 'LBXMC',
 'LBXMCHSI',
 'red_cell_distribution_width',
 'platelet_count',
 'mean_platelet_volume',
 'LBXNRBC']

In [48]:
df_b6 = df_b6.rename(columns={
 'LBDLYMNO': 'lymphocyte_number',
 'LBDMONO': 'monocyte_number',
 'LBDNENO':'segmented_neutrophils_number',
 'LBDEONO':'eosinophils_number',
 'LBDBANO':'basophils_number',
 'LBXHCT':'hematocrit_percent',
 'LBXMC':'mean_cell_hgb_concentration',
 'LBXMCHSI':'mean_cell_hemoglobin',
 'LBXNRBC':'nucelated_red_blood_cells'
})

In [49]:
df_b6.columns

Index(['participant_id', 'white_blood_cell_count', 'lymphocyte_percent',
       'monocyte_percent', 'segmented_neutrophils_percent',
       'eosinophils_percent', 'basophils_percent', 'lymphocyte_number',
       'monocyte_number', 'segmented_neutrophils_number', 'eosinophils_number',
       'basophils_number', 'red_blood_cell_count', 'hemoglobin',
       'hematocrit_percent', 'mean_cell_volume', 'mean_cell_hgb_concentration',
       'mean_cell_hemoglobin', 'red_cell_distribution_width', 'platelet_count',
       'mean_platelet_volume', 'nucelated_red_blood_cells'],
      dtype='object')

In [50]:
df_b6.isnull().sum()

participant_id                      0
white_blood_cell_count           1616
lymphocyte_percent               1621
monocyte_percent                 1621
segmented_neutrophils_percent    1621
eosinophils_percent              1621
basophils_percent                1621
lymphocyte_number                1621
monocyte_number                  1621
segmented_neutrophils_number     1621
eosinophils_number               1621
basophils_number                 1621
red_blood_cell_count             1616
hemoglobin                       1616
hematocrit_percent               1616
mean_cell_volume                 1616
mean_cell_hgb_concentration      1616
mean_cell_hemoglobin             1616
red_cell_distribution_width      1616
platelet_count                   1616
mean_platelet_volume             1616
nucelated_red_blood_cells        1621
dtype: int64

In [51]:
value_cols = [col for col in df_b6.columns if col != 'participant_id']
rows_all_nan = df_b6[value_cols].isna().all(axis=1)
print(f"Number of rows missing all CBC values: {rows_all_nan.sum()}")

Number of rows missing all CBC values: 1616


In [52]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b6_cleaned = df_b6[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 1616


### Cotinine

Cotinine is a metabolite that is produced when nicotine is processed. Its long half-life makes it a good marker for assessing tobacco exposure or usage. 

In [54]:
file_path = '2017-2020/blood/7.P_COT.xpt'

df_b7, meta = pyreadstat.read_xport(file_path)
df_b7 = standardize_id_column(df_b7)

In [55]:
df_b7.columns.to_list()

['participant_id', 'LBXCOT', 'LBDCOTLC', 'LBXHCOT', 'LBDHCOLC']

In [56]:
df_b7 = df_b7.rename(columns={
    'LBXCOT':'serum_cotinine_ng_ml',
    'LBDCOTLC':'serum_cotinine_comment',
    'LBXHCOT':'serum_hydroxycotinine_ng_ml',
    'LBDHCOLC':'serum_hydroxycotinine_comment'
})

In [57]:
df_b7.isnull().sum()

participant_id                      0
serum_cotinine_ng_ml             1632
serum_cotinine_comment           1632
serum_hydroxycotinine_ng_ml      1632
serum_hydroxycotinine_comment    1632
dtype: int64

In [58]:
common_nan = get_common_nan_ids(df_b7, 'serum_cotinine_ng_ml', 'serum_hydroxycotinine_ng_ml', id_col='participant_id')

Number of NaNs in serum_cotinine_ng_ml: 1632
Number of NaNs in serum_hydroxycotinine_ng_ml: 1632
Number of IDs with NaNs in both columns: 1632


In [59]:
df_b7 = drop_rows_with_common_nan_ids(df_b7, 'serum_cotinine_ng_ml', 'serum_hydroxycotinine_ng_ml', id_col='participant_id')

Rows dropped where both serum_cotinine_ng_ml and serum_hydroxycotinine_ng_ml were NaN: 1632


### Cytomegalovirus

Cytomegalovirus (CMV) is a double-stranded DNA virus that causes flu-like symptoms in immunocompetant population but can cause organ damage in immunocompromised (i.e. HIV/AIDS) population. CMV virus is transmitted via bodily fluids including sexual contact [12]. 

Avidity tests for whether the CMV infection was recent or in the past. Low avidity shows recent infection and high avidity shows past infeciton.

In [61]:
file_path = '2017-2020/blood/8.P_CMV.xpt'

df_b8, meta = pyreadstat.read_xport(file_path)
df_b8 = standardize_id_column(df_b8)

In [62]:
df_b8.columns.to_list()

['participant_id', 'LBXIGG', 'LBXIGM', 'LBXIGGA']

In [63]:
df_b8 = df_b8.rename(columns={
    'LBXIGG':'cmv_igg',
    'LBXIGM':'cmv_igm', 
    'LBXIGGA':'cmv_igg_avidity'
})

In [64]:
df_b8.isnull().sum()

#there are missing avidity value which would indicate that the person was never infected with CMV. The null values for the IgG and IgM would indicate missing data so rows without these two values will be dropped

participant_id        0
cmv_igg             617
cmv_igm             617
cmv_igg_avidity    1307
dtype: int64

In [65]:
common_nan = get_common_nan_ids(df_b8, 'cmv_igg', 'cmv_igm', id_col='participant_id')

Number of NaNs in cmv_igg: 617
Number of NaNs in cmv_igm: 617
Number of IDs with NaNs in both columns: 617


In [66]:
df_b8 = drop_rows_with_common_nan_ids(df_b8, 'cmv_igg', 'cmv_igm', id_col='participant_id')

Rows dropped where both cmv_igg and cmv_igm were NaN: 617


### Ethylene Oxide

Ethylene Oxide (EtO) is a colorless gas that is used to produce various materials as well as sterilize medical equipments. Exposure to EtO most often is due to aerosolization. EtO is a well-known carcinogen and long term exposure to this substance could lead to blood cancers such as non-Hodgkin lymphoma, myeloma and lymphocytic leukemia [13].

The unit for EtO measurement in the blood is picomoles per gram of hemoglobin (pmol/g Hb).

In [68]:
file_path = '2017-2020/blood/9.P_ETHOX.xpt'

df_b9, meta = pyreadstat.read_xport(file_path)
df_b9 = standardize_id_column(df_b9)

In [69]:
df_b9.columns.to_list()

['participant_id', 'WTSAPRP', 'LBXEOA', 'LBDEOALC']

In [70]:
df_b9 = df_b9.drop('WTSAPRP',axis=1)

In [71]:
df_b9 = df_b9.rename(columns={
    'LBXEOA':'eto_pmol_g_hb',
    'LBDEOALC':'eto_comment'
})

In [72]:
df_b9.isnull().sum()

participant_id      0
eto_pmol_g_hb     424
eto_comment       424
dtype: int64

In [73]:
df_b9 = df_b9.dropna(subset=['eto_pmol_g_hb'])

In [74]:
df_b9.head()

Unnamed: 0,participant_id,eto_pmol_g_hb,eto_comment
0,109266.0,18.7,0.0
1,109270.0,37.7,0.0
2,109273.0,359.0,0.0
3,109274.0,61.2,0.0
5,109290.0,25.5,0.0


### Ferritin and iron panel

Ferritin and iron panel are used to assess someone's iron status. Low values in the iron panel and ferritin along with clinical symptoms are corroborated to diagnose iron deficiency anemia. 

In [76]:
file_path = '2017-2020/blood/10.P_FERTIN.xpt'

df_b10, meta = pyreadstat.read_xport(file_path)
df_b10 = standardize_id_column(df_b10)

In [77]:
df_b10.head()

Unnamed: 0,participant_id,LBXFER,LBDFERSI
0,109263.0,,
1,109264.0,15.7,15.7
2,109265.0,42.1,42.1
3,109266.0,11.6,11.6
4,109269.0,41.7,41.7


In [78]:
df_b10 = df_b10.rename(columns={
    'LBXFER':'ferritin_ng_ml',
    'LBDFERSI':'ferritin_ug_l'
})

In [79]:
df_b10.isnull().sum()

participant_id       0
ferritin_ng_ml    1426
ferritin_ug_l     1426
dtype: int64

In [80]:
common_nan = get_common_nan_ids(df_b10, 'ferritin_ng_ml', 'ferritin_ug_l', id_col='participant_id')

Number of NaNs in ferritin_ng_ml: 1426
Number of NaNs in ferritin_ug_l: 1426
Number of IDs with NaNs in both columns: 1426


In [81]:
df_b10 = drop_rows_with_common_nan_ids(df_b10, 'ferritin_ng_ml', 'ferritin_ug_l', id_col='participant_id')

Rows dropped where both ferritin_ng_ml and ferritin_ug_l were NaN: 1426


In [82]:
file_path = '2017-2020/blood/11.P_FETIB.xpt'

df_b11, meta = pyreadstat.read_xport(file_path)
df_b11 = standardize_id_column(df_b11)

In [83]:
df_b11.columns.to_list()

['participant_id',
 'LBXIRN',
 'LBDIRNSI',
 'LBXUIB',
 'LBDUIBLC',
 'LBDUIBSI',
 'LBDTIB',
 'LBDTIBSI',
 'LBDPCT']

In [84]:
df_b11 = df_b11.rename(columns={
 'LBXIRN':'iron_frozen_ug_dl',
 'LBDIRNSI':'iron_frozen_umol_l',
 'LBXUIB':'uibc_ug_dl',
 'LBDUIBLC':'uibc_comment',
 'LBDUIBSI':'uibc_umol_l',
 'LBDTIB':'tibc_ug_dl',
 'LBDTIBSI':'tibc_umol_l',
 'LBDPCT':'transferrin_saturation'
})

In [85]:
df_b11.isnull().sum()

participant_id              0
iron_frozen_ug_dl         904
iron_frozen_umol_l        904
uibc_ug_dl                949
uibc_comment              949
uibc_umol_l               949
tibc_ug_dl                956
tibc_umol_l               956
transferrin_saturation    956
dtype: int64

In [86]:
value_cols = [col for col in df_b11.columns if col != 'participant_id']
rows_all_nan = df_b11[value_cols].isna().all(axis=1)
print(f"Number of rows missing all iron panel values: {rows_all_nan.sum()}")

Number of rows missing all iron panel values: 904


In [87]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b11_cleaned = df_b11[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 904


In [378]:
file_path = '2017-2020/blood/23.P_TFR.xpt'

df_b23, meta = pyreadstat.read_xport(file_path)
df_b23 = standardize_id_column(df_b23)

In [382]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_TFR.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

pattern = re.compile(r'^([A-Z0-9_]+)\s*-\s*(.+)$')

rename_dict = {}

# Loop through all h3 tags and filter those that match our pattern
for tag in soup.find_all('h3'):
    text = tag.get_text(strip=True)
    match = pattern.match(text)
    if match:
        var_name = match.group(1)
        description = match.group(2)
                
        clean_name = to_snake_case(description)
        rename_dict[var_name] = clean_name

In [384]:
filtered_rename_dict = {
    k: v for k, v in rename_dict.items() if k in df_b23.columns
}

df_b23 = df_b23.rename(columns=filtered_rename_dict)

In [386]:
df_b23.head()

Unnamed: 0,participant_id,transferrin_receptor_mg_l,transferrin_receptor_nmol_l
0,109263.0,,
1,109264.0,2.82,33.3
2,109265.0,2.91,34.3
3,109266.0,3.7,43.7
4,109269.0,2.59,30.6


In [388]:
df_b23.isnull().sum()

participant_id                   0
transferrin_receptor_mg_l      946
transferrin_receptor_nmol_l    946
dtype: int64

In [390]:
df_b23 = df_b23.dropna()

In [392]:
df_b23.head()

Unnamed: 0,participant_id,transferrin_receptor_mg_l,transferrin_receptor_nmol_l
1,109264.0,2.82,33.3
2,109265.0,2.91,34.3
3,109266.0,3.7,43.7
4,109269.0,2.59,30.6
5,109277.0,2.77,32.7


### Folate


In [89]:
file_path = '2017-2020/blood/12.P_FOLATE.xpt'

df_b12, meta = pyreadstat.read_xport(file_path)
df_b12 = standardize_id_column(df_b12)

In [90]:
df_b12.columns

Index(['participant_id', 'WTFOLPRP', 'LBDRFO', 'LBDRFOSI'], dtype='object')

In [91]:
df_b12 = df_b12.drop('WTFOLPRP', axis=1)

In [92]:
df_b12 = df_b12.rename(columns={
    'LBDRFO':'rbc_folate_ng_ml',
    'LBDRFOSI':'rbc_folate_nmol_l'
})

In [93]:
df_b12.isnull().sum()

participant_id         0
rbc_folate_ng_ml     966
rbc_folate_nmol_l    966
dtype: int64

In [94]:
common_nan = get_common_nan_ids(df_b12, 'rbc_folate_ng_ml', 'rbc_folate_nmol_l', id_col='participant_id')

Number of NaNs in rbc_folate_ng_ml: 966
Number of NaNs in rbc_folate_nmol_l: 966
Number of IDs with NaNs in both columns: 966


In [95]:
df_b12 = drop_rows_with_common_nan_ids(df_b12, 'rbc_folate_ng_ml', 'rbc_folate_nmol_l', id_col='participant_id')

Rows dropped where both rbc_folate_ng_ml and rbc_folate_nmol_l were NaN: 966


In [96]:
file_path = '2017-2020/blood/13.P_FOLFMS.xpt'

df_b13, meta = pyreadstat.read_xport(file_path)
df_b13 = standardize_id_column(df_b13)

#There are a lot of technical names for these values so webscraping will be done instead of manual renaming of the columns

In [97]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_FOLFMS.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# Pattern: match things like "LBDRFOSI - RBC folate (nmol/L)"
pattern = re.compile(r'^([A-Z0-9_]+)\s*-\s*(.+)$')

rename_dict = {}

# Loop through all h3 tags and filter those that match our pattern
for tag in soup.find_all('h3'):
    text = tag.get_text(strip=True)
    match = pattern.match(text)
    if match:
        var_name = match.group(1)
        description = match.group(2)
                
        clean_name = to_snake_case(description)
        rename_dict[var_name] = clean_name

In [98]:
# Only keep entries in rename_dict where the variable name is in df_b13
filtered_rename_dict = {
    k: v for k, v in rename_dict.items() if k in df_b13.columns
}

# Rename columns in df_b13
df_b13 = df_b13.rename(columns=filtered_rename_dict)

In [99]:
df_b13.head()

Unnamed: 0,participant_id,folate_folate_form_weight_pre_pandemic,serum_total_folate_nmol_l,serum_total_folate_ng_ml,5_methyl_tetrahydrofolate_nmol_l,5_methyl_tetrahydrofolate_cmt,folic_acid_nmol_l,folic_acid_cmt,5_formyl_tetrahydrofolate_nmol_l,5_formyl_tetrahydrofolate_cmt,tetrahydrofolate_nmol_l,tetrahydrofolate_cmt,510_methenyl_tetrahydrofolate_nmol_l,510_methenyl_tetrahydrofolate_cmt,mefox_oxidation_product_nmol_l,mefox_oxidation_product_cmt
0,109264.0,13078.84474,75.7,33.4,74.2,0.0,0.347,0.0,0.141,1.0,0.909,0.0,0.141,1.0,0.692,0.0
1,109265.0,51567.643136,28.6,12.6,27.6,0.0,0.344,0.0,0.141,1.0,0.418,0.0,0.141,1.0,0.511,0.0
2,109266.0,8998.028716,42.4,18.7,41.4,0.0,0.402,0.0,0.141,1.0,0.34,0.0,0.141,1.0,0.661,0.0
3,109269.0,17148.10026,50.6,22.3,48.5,0.0,0.444,0.0,0.141,1.0,1.41,0.0,0.141,1.0,0.761,0.0
4,109270.0,13466.605962,31.1,13.7,28.6,0.0,1.46,0.0,0.141,1.0,0.719,0.0,0.141,1.0,2.18,0.0


In [100]:
df_b13.columns.to_list()

['participant_id',
 'folate_folate_form_weight_pre_pandemic',
 'serum_total_folate_nmol_l',
 'serum_total_folate_ng_ml',
 '5_methyl_tetrahydrofolate_nmol_l',
 '5_methyl_tetrahydrofolate_cmt',
 'folic_acid_nmol_l',
 'folic_acid_cmt',
 '5_formyl_tetrahydrofolate_nmol_l',
 '5_formyl_tetrahydrofolate_cmt',
 'tetrahydrofolate_nmol_l',
 'tetrahydrofolate_cmt',
 '510_methenyl_tetrahydrofolate_nmol_l',
 '510_methenyl_tetrahydrofolate_cmt',
 'mefox_oxidation_product_nmol_l',
 'mefox_oxidation_product_cmt']

In [101]:
df_b13 = df_b13.drop('folate_folate_form_weight_pre_pandemic',axis=1)

In [102]:
df_b13.isnull().sum()

participant_id                             0
serum_total_folate_nmol_l               1052
serum_total_folate_ng_ml                1052
5_methyl_tetrahydrofolate_nmol_l        1052
5_methyl_tetrahydrofolate_cmt           1052
folic_acid_nmol_l                       1052
folic_acid_cmt                          1052
5_formyl_tetrahydrofolate_nmol_l        1052
5_formyl_tetrahydrofolate_cmt           1052
tetrahydrofolate_nmol_l                 1052
tetrahydrofolate_cmt                    1052
510_methenyl_tetrahydrofolate_nmol_l    1052
510_methenyl_tetrahydrofolate_cmt       1052
mefox_oxidation_product_nmol_l          1052
mefox_oxidation_product_cmt             1052
dtype: int64

In [103]:
value_cols = [col for col in df_b13.columns if col != 'participant_id']
rows_all_nan = df_b13[value_cols].isna().all(axis=1)
print(f"Number of rows missing all folate values: {rows_all_nan.sum()}")

Number of rows missing all folate values: 1052


In [104]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b13_cleaned = df_b13[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 1052


### Glycohemoglobin (%)

In [106]:
file_path = '2017-2020/blood/14.P_GHB.xpt'

df_b14, meta = pyreadstat.read_xport(file_path)
df_b14 = standardize_id_column(df_b14)

In [107]:
df_b14.head()

Unnamed: 0,participant_id,LBXGH
0,109264.0,5.3
1,109266.0,5.2
2,109271.0,5.6
3,109273.0,5.1
4,109274.0,5.7


In [108]:
df_b14 = df_b14.rename(columns = {'LBXGH':'glycohemoglobin_percent'})

In [109]:
df_b14.isnull().sum()

participant_id               0
glycohemoglobin_percent    672
dtype: int64

In [110]:
df_b14 = df_b14.dropna(subset=['glycohemoglobin_percent'])

### High-Sensitivity C-Reactive Protein

In [112]:
file_path = '2017-2020/blood/15.P_HSCRP.xpt'

df_b15, meta = pyreadstat.read_xport(file_path)
df_b15 = standardize_id_column(df_b15)

In [113]:
df_b15.head()

Unnamed: 0,participant_id,LBXHSCRP,LBDHRPLC
0,109263.0,,
1,109264.0,0.11,1.0
2,109265.0,0.31,0.0
3,109266.0,0.72,0.0
4,109269.0,0.73,0.0


In [114]:
df_b15 = df_b15.rename(columns = {
    'LBXHSCRP':'hs_crp_mg_l',
    'LBDHRPLC':'hs_crp_cmt'
})

In [115]:
df_b15.isnull().sum()

participant_id       0
hs_crp_mg_l       2158
hs_crp_cmt        2158
dtype: int64

In [116]:
df_b15 = df_b15.dropna(subset=['hs_crp_mg_l'])

### Inorganic metyl and ethyl mercury

In [118]:
file_path = '2017-2020/blood/16.P_IHGEM.xpt'

df_b16, meta = pyreadstat.read_xport(file_path)
df_b16 = standardize_id_column(df_b16)

In [233]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_IHGEM.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

pattern = re.compile(r'^([A-Z0-9_]+)\s*-\s*(.+)$')

rename_dict = {}

# Loop through all h3 tags and filter those that match our pattern
for tag in soup.find_all('h3'):
    text = tag.get_text(strip=True)
    match = pattern.match(text)
    if match:
        var_name = match.group(1)
        description = match.group(2)
                
        clean_name = to_snake_case(description)
        rename_dict[var_name] = clean_name

In [242]:
# Only keep entries in rename_dict where the variable name is in df_b13
filtered_rename_dict = {
    k: v for k, v in rename_dict.items() if k in df_b16.columns
}

# Rename columns in df_b13
df_b16 = df_b16.rename(columns=filtered_rename_dict)

In [246]:
df_b16.head()

Unnamed: 0,participant_id,mercury_inorganic_ug_l,mercury_inorganic_nmol_l,mercury_inorganic_comment_code,mercury_ethyl_ug_l,mercury_ethyl_nmol_l,mercury_ethyl_comment_code,mercury_methyl_ug_l,mercury_methyl_nmol_l,mercury_methyl_comment_code
0,109263.0,,,,,,,,,
1,109264.0,0.15,0.75,1.0,0.05,0.25,1.0,0.18,0.9,1.0
2,109265.0,0.15,0.75,1.0,0.05,0.25,1.0,0.18,0.9,1.0
3,109266.0,0.24,1.2,0.0,0.05,0.25,1.0,0.18,0.9,1.0
4,109269.0,0.15,0.75,1.0,0.05,0.25,1.0,0.18,0.9,1.0


In [248]:
df_b16.isnull().sum()

participant_id                       0
mercury_inorganic_ug_l            1743
mercury_inorganic_nmol_l          1743
mercury_inorganic_comment_code    1743
mercury_ethyl_ug_l                1743
mercury_ethyl_nmol_l              1743
mercury_ethyl_comment_code        1743
mercury_methyl_ug_l               1743
mercury_methyl_nmol_l             1743
mercury_methyl_comment_code       1743
dtype: int64

In [250]:
value_cols = [col for col in df_b16.columns if col != 'participant_id']
rows_all_nan = df_b16[value_cols].isna().all(axis=1)
print(f"Number of rows missing all inorganic mercury values: {rows_all_nan.sum()}")

Number of rows missing all inorganic mercury values: 1743


In [252]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b16_cleaned = df_b16[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 1743


### Insulin

In [261]:
file_path = '2017-2020/blood/17.P_INS.xpt'

df_b17, meta = pyreadstat.read_xport(file_path)
df_b17 = standardize_id_column(df_b17)

# this file may be corrupted?

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb5 in position 9: invalid start byte

In [265]:
with open(file_path, "rb") as f:
    content = f.read()
    if b'\xb5' in content:
        print("Contains micro symbol (µ)")

Contains micro symbol (µ)


In [271]:
# Step 1: Read the raw .xpt file as bytes
with open(file_path, "rb") as f:
    content = f.read()

# Step 2: Replace µ (byte 0xB5) with 'u' or another safe character
cleaned = content.replace(b'\xb5', b'u')  # or b'mu' if you prefer

# Step 3: Save it to a new temporary file
with open(file_path, "wb") as f:
    f.write(cleaned)

# Step 4: Now read it with pyreadstat
df_b17, meta = pyreadstat.read_xport(file_path)
df_b17 = standardize_id_column(df_b17)

In [273]:
df_b17.head()

Unnamed: 0,participant_id,WTSAFPRP,LBXIN,LBDINSI,LBDINLC
0,109264.0,27533.174559,6.05,36.3,0.0
1,109271.0,18100.965319,16.96,101.76,0.0
2,109274.0,16281.758327,13.52,81.12,0.0
3,109277.0,32230.046209,6.44,38.64,0.0
4,109282.0,79007.100787,7.49,44.94,0.0


In [275]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_INS.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

pattern = re.compile(r'^([A-Z0-9_]+)\s*-\s*(.+)$')

rename_dict = {}

# Loop through all h3 tags and filter those that match our pattern
for tag in soup.find_all('h3'):
    text = tag.get_text(strip=True)
    match = pattern.match(text)
    if match:
        var_name = match.group(1)
        description = match.group(2)
                
        clean_name = to_snake_case(description)
        rename_dict[var_name] = clean_name

In [277]:
# Only keep entries in rename_dict where the variable name is in df_b13
filtered_rename_dict = {
    k: v for k, v in rename_dict.items() if k in df_b17.columns
}

# Rename columns in df_b13
df_b17 = df_b17.rename(columns=filtered_rename_dict)

In [279]:
df_b17.head()

Unnamed: 0,participant_id,fasting_subsample_weight,insulin_μu_ml,insulin_pmol_l,insulin_comment_code
0,109264.0,27533.174559,6.05,36.3,0.0
1,109271.0,18100.965319,16.96,101.76,0.0
2,109274.0,16281.758327,13.52,81.12,0.0
3,109277.0,32230.046209,6.44,38.64,0.0
4,109282.0,79007.100787,7.49,44.94,0.0


In [281]:
df_b17.isnull().sum()

participant_id                0
fasting_subsample_weight      0
insulin_μu_ml               465
insulin_pmol_l              465
insulin_comment_code        465
dtype: int64

In [283]:
df_b17 = df_b17.drop(['fasting_subsample_weight'],axis=1)

In [287]:
value_cols = [col for col in df_b17.columns if col != 'participant_id']
rows_all_nan = df_b17[value_cols].isna().all(axis=1)
print(f"Number of rows missing all insulin values: {rows_all_nan.sum()}")

Number of rows missing all insulin values: 465


In [289]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b17_cleaned = df_b17[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 465


### Lead, Cadmium, Total Mercury, Selenium, & Manganese

In [292]:
file_path = '2017-2020/blood/18.P_PBCD.xpt'

df_b18, meta = pyreadstat.read_xport(file_path)
df_b18 = standardize_id_column(df_b18)

In [294]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_PBCD.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

pattern = re.compile(r'^([A-Z0-9_]+)\s*-\s*(.+)$')

rename_dict = {}

# Loop through all h3 tags and filter those that match our pattern
for tag in soup.find_all('h3'):
    text = tag.get_text(strip=True)
    match = pattern.match(text)
    if match:
        var_name = match.group(1)
        description = match.group(2)
                
        clean_name = to_snake_case(description)
        rename_dict[var_name] = clean_name

In [296]:
filtered_rename_dict = {
    k: v for k, v in rename_dict.items() if k in df_b18.columns
}

df_b18 = df_b18.rename(columns=filtered_rename_dict)

In [298]:
df_b18.head()

Unnamed: 0,participant_id,blood_lead_ug_dl,blood_lead_umol_l,blood_lead_comment_code,blood_cadmium_ug_l,blood_cadmium_nmol_l,blood_cadmium_comment_code,blood_mercury_total_ug_l,blood_mercury_total_nmol_l,blood_mercury_total_comment_code,blood_selenium_ug_l,blood_selenium_umol_l,blood_selenium_comment_code,blood_manganese_ug_l,blood_manganese_nmol_l,blood_manganese_comment_code
0,109263.0,,,,,,,,,,,,,,,
1,109264.0,0.36,0.017,0.0,0.1,0.89,0.0,0.2,1.0,1.0,167.69,2.13,0.0,15.21,276.85,0.0
2,109265.0,,,,0.071,0.632,1.0,0.2,1.0,1.0,168.27,2.14,0.0,10.62,193.31,0.0
3,109266.0,1.699,0.082,0.0,0.223,1.984,0.0,0.36,1.8,0.0,167.51,2.13,0.0,8.85,161.09,0.0
4,109269.0,,,,0.071,0.632,1.0,0.2,1.0,1.0,157.94,2.01,0.0,9.61,174.92,0.0


In [300]:
df_b18.isnull().sum()

participant_id                         0
blood_lead_ug_dl                    2665
blood_lead_umol_l                   2665
blood_lead_comment_code             2665
blood_cadmium_ug_l                  1670
blood_cadmium_nmol_l                1670
blood_cadmium_comment_code          1670
blood_mercury_total_ug_l            1670
blood_mercury_total_nmol_l          1670
blood_mercury_total_comment_code    1670
blood_selenium_ug_l                 1670
blood_selenium_umol_l               1670
blood_selenium_comment_code         1670
blood_manganese_ug_l                1670
blood_manganese_nmol_l              1670
blood_manganese_comment_code        1670
dtype: int64

In [304]:
value_cols = [col for col in df_b18.columns if col != 'participant_id']
rows_all_nan = df_b18[value_cols].isna().all(axis=1)
print(f"Number of rows missing metal values: {rows_all_nan.sum()}")

Number of rows missing metal values: 1670


In [306]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b18_cleaned = df_b18[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 1670


### Perfluoroalkyl and Polyfluoroalkyl Substances

In [309]:
file_path = '2017-2020/blood/19.P_PFAS.xpt'

df_b19, meta = pyreadstat.read_xport(file_path)
df_b19 = standardize_id_column(df_b19)

In [311]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_PFAS.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

pattern = re.compile(r'^([A-Z0-9_]+)\s*-\s*(.+)$')

rename_dict = {}

# Loop through all h3 tags and filter those that match our pattern
for tag in soup.find_all('h3'):
    text = tag.get_text(strip=True)
    match = pattern.match(text)
    if match:
        var_name = match.group(1)
        description = match.group(2)
                
        clean_name = to_snake_case(description)
        rename_dict[var_name] = clean_name

In [313]:
filtered_rename_dict = {
    k: v for k, v in rename_dict.items() if k in df_b19.columns
}

# Rename columns in df_b13
df_b19 = df_b19.rename(columns=filtered_rename_dict)

In [315]:
df_b19.head()

Unnamed: 0,participant_id,subsample_ba_weights_pre_pandemic,perfluorodecanoic_acid_ng_ml,perfluorodecanoic_acid_comment_code,perfluorohexane_sulfonic_acid_ng_ml,perfluorohexane_sulfonic_acid_comt_code,2_n_methyl_pfosaacetic_acid_ng_ml,2_n_methyl_pfosa_acetic_acid_comt_code,perfluorononanoic_acid_ng_ml,perfluorononanoic_acid_comment_code,perfluoroundecanoic_acid_ng_ml,perfluoroundecanoic_acid_comment_code,n_perfluorooctanoic_acid_ng_ml,n_perfluorooctanoic_acid_comment_code,br_perfluorooctanoic_acid_iso_ng_ml,br_perfluorooctanoic_acid_iso_comt_code,n_perfluorooctane_sulfonic_acid_ng_ml,n_perfluorooctane_sulfonic_comt_code,sm_pfos_ng_ml,sm_pfos_comment_code
0,109266.0,30126.837044,0.07,1.0,0.7,0.0,0.07,1.0,0.8,0.0,0.07,1.0,0.9,0.0,0.07,1.0,1.1,0.0,0.5,0.0
1,109271.0,23343.201,0.5,0.0,2.3,0.0,0.07,1.0,0.6,0.0,0.3,0.0,4.1,0.0,0.07,1.0,7.9,0.0,2.0,0.0
2,109277.0,54853.289667,0.07,1.0,0.4,0.0,0.1,0.0,0.2,0.0,0.07,1.0,0.7,0.0,0.1,0.0,0.8,0.0,0.4,0.0
3,109282.0,103069.396443,0.1,0.0,0.8,0.0,0.2,0.0,0.5,0.0,0.07,1.0,1.0,0.0,0.07,1.0,2.5,0.0,1.2,0.0
4,109285.0,0.0,,,,,,,,,,,,,,,,,,


In [317]:
df_b19 = df_b19.drop(['subsample_ba_weights_pre_pandemic'],axis=1)

In [319]:
df_b19.isnull().sum()

participant_id                               0
perfluorodecanoic_acid_ng_ml               388
perfluorodecanoic_acid_comment_code        388
perfluorohexane_sulfonic_acid_ng_ml        388
perfluorohexane_sulfonic_acid_comt_code    388
2_n_methyl_pfosaacetic_acid_ng_ml          388
2_n_methyl_pfosa_acetic_acid_comt_code     388
perfluorononanoic_acid_ng_ml               388
perfluorononanoic_acid_comment_code        388
perfluoroundecanoic_acid_ng_ml             388
perfluoroundecanoic_acid_comment_code      388
n_perfluorooctanoic_acid_ng_ml             388
n_perfluorooctanoic_acid_comment_code      388
br_perfluorooctanoic_acid_iso_ng_ml        388
br_perfluorooctanoic_acid_iso_comt_code    388
n_perfluorooctane_sulfonic_acid_ng_ml      388
n_perfluorooctane_sulfonic_comt_code       388
sm_pfos_ng_ml                              388
sm_pfos_comment_code                       388
dtype: int64

In [323]:
value_cols = [col for col in df_b19.columns if col != 'participant_id']
rows_all_nan = df_b19[value_cols].isna().all(axis=1)
print(f"Number of rows missing PFAS values: {rows_all_nan.sum()}")

Number of rows missing PFAS values: 388


In [325]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b19_cleaned = df_b19[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 388


### Fasting Glucose

In [328]:
file_path = '2017-2020/blood/20.P_GLU.xpt'

df_b20, meta = pyreadstat.read_xport(file_path)
df_b20 = standardize_id_column(df_b20)

In [330]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_GLU.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

pattern = re.compile(r'^([A-Z0-9_]+)\s*-\s*(.+)$')

rename_dict = {}

# Loop through all h3 tags and filter those that match our pattern
for tag in soup.find_all('h3'):
    text = tag.get_text(strip=True)
    match = pattern.match(text)
    if match:
        var_name = match.group(1)
        description = match.group(2)
                
        clean_name = to_snake_case(description)
        rename_dict[var_name] = clean_name

In [332]:
filtered_rename_dict = {
    k: v for k, v in rename_dict.items() if k in df_b20.columns
}

df_b20 = df_b20.rename(columns=filtered_rename_dict)

In [334]:
df_b20.head()

Unnamed: 0,participant_id,fasting_subsample_weight,fasting_glucose_mg_dl,fasting_glucose_mmol_l
0,109264.0,27533.174559,97.0,5.38
1,109271.0,18100.965319,103.0,5.72
2,109274.0,16281.758327,154.0,8.55
3,109277.0,32230.046209,92.0,5.11
4,109282.0,79007.100787,95.0,5.27


In [336]:
df_b20 = df_b20.drop(['fasting_subsample_weight'],axis=1)

In [338]:
df_b20.isnull().sum()

participant_id              0
fasting_glucose_mg_dl     346
fasting_glucose_mmol_l    346
dtype: int64

In [342]:
value_cols = [col for col in df_b20.columns if col != 'participant_id']
rows_all_nan = df_b20[value_cols].isna().all(axis=1)
print(f"Number of rows missing fasting glucose values: {rows_all_nan.sum()}")

Number of rows missing fasting glucose values: 346


In [344]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b20_cleaned = df_b20[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 346


### Sex Steroid Hormone Panel

In [347]:
file_path = '2017-2020/blood/21.P_TST.xpt'

df_b21, meta = pyreadstat.read_xport(file_path)
df_b21 = standardize_id_column(df_b21)

In [349]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_TST.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

pattern = re.compile(r'^([A-Z0-9_]+)\s*-\s*(.+)$')

rename_dict = {}

# Loop through all h3 tags and filter those that match our pattern
for tag in soup.find_all('h3'):
    text = tag.get_text(strip=True)
    match = pattern.match(text)
    if match:
        var_name = match.group(1)
        description = match.group(2)
                
        clean_name = to_snake_case(description)
        rename_dict[var_name] = clean_name

In [351]:
filtered_rename_dict = {
    k: v for k, v in rename_dict.items() if k in df_b21.columns
}

df_b21 = df_b21.rename(columns=filtered_rename_dict)

In [353]:
df_b21.head()

Unnamed: 0,participant_id,tst_subsample_weights_pre_pandemic,17α_hydroxyprogesterone_ng_dl,17α_hydroxyprogesterone_nmol_l,17α_hydroxyprogesterone_comment_code,androstenedione_ng_dl,androstenedione_nmol_l,androstenedione_comment_code,anti_mullerian_hormone_ng_ml,anti_mullerian_hormone_pmol_l,...,estrone_sulfate_comment_code,follicle_stimulating_hormone_miu_ml,fsh_comment_code,luteinizing_hormone_miu_ml,luteinizing_hormone_comment_code,progesterone_ng_dl,progesterone_nmol_l,progesterone_comment_code,shbg_nmol_l,shbg_comment_code
0,109264.0,0.0,,,,,,,,,...,,,,,,,,,,
1,109266.0,8338.955705,27.1,0.821,0.0,93.1,3.25,0.0,1.65,11.78,...,,7.15,0.0,4.55,0.0,1.5,0.0477,0.0,22.54,0.0
2,109270.0,6313.19381,,,,52.3,1.83,0.0,1.93,13.78,...,0.0,4.87,0.0,3.64,0.0,0.608,0.0193,1.0,23.68,0.0
3,109271.0,10483.726109,114.0,3.45,0.0,166.0,5.79,0.0,,,...,0.0,8.3,0.0,18.86,0.0,8.77,0.279,0.0,42.91,0.0
4,109273.0,27691.474737,164.0,4.97,0.0,91.7,3.2,0.0,,,...,0.0,3.33,0.0,4.22,0.0,16.0,0.509,0.0,59.72,0.0


In [355]:
df_b21 = df_b21.drop(['tst_subsample_weights_pre_pandemic'],axis=1)

In [357]:
df_b21.isnull().sum()

participant_id                             0
17α_hydroxyprogesterone_ng_dl           3954
17α_hydroxyprogesterone_nmol_l          3954
17α_hydroxyprogesterone_comment_code    3954
androstenedione_ng_dl                   2667
androstenedione_nmol_l                  2667
androstenedione_comment_code            2667
anti_mullerian_hormone_ng_ml            7516
anti_mullerian_hormone_pmol_l           7516
anti_mullerian_hormone_comment_code     7516
estradiol_pg_ml                         3003
estradiol_pmol_l                        3003
estradiol_comment_code                  3003
estrone_ng_dl                           3019
estrone_pmol_l                          3019
estrone_comment_code                    3019
estrone_sulfate_pg_ml                   3182
estrone_sulfate_pmol_l                  3182
estrone_sulfate_comment_code            3182
follicle_stimulating_hormone_miu_ml     3074
fsh_comment_code                        3074
luteinizing_hormone_miu_ml              2977
luteinizin

In [359]:
value_cols = [col for col in df_b21.columns if col != 'participant_id']
rows_all_nan = df_b21[value_cols].isna().all(axis=1)
print(f"Number of rows missing sex hormone values: {rows_all_nan.sum()}")

Number of rows missing sex hormone values: 2657


In [361]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b21_cleaned = df_b21[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 2657


### Complete Metabolic Panel

In [364]:
file_path = '2017-2020/blood/22.P_BIOPRO.xpt'

df_b22, meta = pyreadstat.read_xport(file_path)
df_b22 = standardize_id_column(df_b22)

In [366]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_BIOPRO.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

pattern = re.compile(r'^([A-Z0-9_]+)\s*-\s*(.+)$')

rename_dict = {}

# Loop through all h3 tags and filter those that match our pattern
for tag in soup.find_all('h3'):
    text = tag.get_text(strip=True)
    match = pattern.match(text)
    if match:
        var_name = match.group(1)
        description = match.group(2)
                
        clean_name = to_snake_case(description)
        rename_dict[var_name] = clean_name

In [368]:
filtered_rename_dict = {
    k: v for k, v in rename_dict.items() if k in df_b22.columns
}

df_b22 = df_b22.rename(columns=filtered_rename_dict)

In [370]:
df_b22.head()

Unnamed: 0,participant_id,alanine_aminotransferase_alt_u_l,alt_comment_code,albumin_refrigerated_serum_g_dl,albumin_refrigerated_serum_g_l,alkaline_phosphatase_alp_iu_l,aspartate_aminotransferase_ast_u_l,bicarbonate_mmol_l,blood_urea_nitrogen_mg_dl,blood_urea_nitrogen_mmol_l,...,total_calcium_mg_dl,total_calcium_mmol_l,cholesterol_refrigerated_serum_mg_dl,cholesterol_refrigerated_serum_mmol_l,total_protein_g_dl,total_protein_g_l,triglycerides_refrig_serum_mg_dl,triglycerides_refrig_serum_mmol_l,uric_acid_mg_dl,uric_acid_umol_l
0,109264.0,8.0,0.0,4.3,43.0,172.0,18.0,26.0,9.0,3.21,...,9.4,2.35,170.0,4.396,6.9,69.0,54.0,0.61,3.5,208.2
1,109266.0,15.0,0.0,3.8,38.0,74.0,14.0,25.0,8.0,2.86,...,8.8,2.2,199.0,5.146,6.8,68.0,86.0,0.971,4.9,291.5
2,109271.0,8.0,0.0,3.8,38.0,88.0,14.0,30.0,8.0,2.86,...,8.9,2.225,148.0,3.827,7.3,73.0,101.0,1.14,5.0,297.4
3,109273.0,35.0,0.0,4.6,46.0,70.0,27.0,27.0,11.0,3.93,...,9.7,2.425,168.0,4.344,7.4,74.0,178.0,2.01,5.2,309.3
4,109274.0,19.0,0.0,4.0,40.0,49.0,20.0,28.0,11.0,3.93,...,9.1,2.275,105.0,2.715,6.8,68.0,151.0,1.705,5.1,303.3


In [372]:
df_b22.isnull().sum()

participant_id                              0
alanine_aminotransferase_alt_u_l          936
alt_comment_code                          936
albumin_refrigerated_serum_g_dl           932
albumin_refrigerated_serum_g_l            932
alkaline_phosphatase_alp_iu_l             935
aspartate_aminotransferase_ast_u_l        974
bicarbonate_mmol_l                        936
blood_urea_nitrogen_mg_dl                 936
blood_urea_nitrogen_mmol_l                936
chloride_mmol_l                           933
creatine_phosphokinase_cpk_iu_l           940
creatinine_refrigerated_serum_mg_dl       934
creatinine_refrigerated_serum_umol_l      934
globulin_g_dl                             936
globulin_g_l                              936
glucose_refrigerated_serum_mg_dl          936
glucose_refrigerated_serum_mmol_l         936
gamma_glutamyl_transferase_ggt_iu_l       936
ggt_comment_code                          936
iron_refrigerated_serum_ug_dl             936
iron_refrigerated_serum_umol_l    

In [374]:
value_cols = [col for col in df_b22.columns if col != 'participant_id']
rows_all_nan = df_b22[value_cols].isna().all(axis=1)
print(f"Number of rows missing CMP values: {rows_all_nan.sum()}")

Number of rows missing CMP values: 932


In [376]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b22_cleaned = df_b22[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 932


### Volatile Organic Compounds and Trihalomethanes/MTBE

In [395]:
file_path = '2017-2020/blood/24.P_VOCWB.xpt'

df_b24, meta = pyreadstat.read_xport(file_path)
df_b24 = standardize_id_column(df_b24)

In [397]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_VOCWB.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

pattern = re.compile(r'^([A-Z0-9_]+)\s*-\s*(.+)$')

rename_dict = {}

# Loop through all h3 tags and filter those that match our pattern
for tag in soup.find_all('h3'):
    text = tag.get_text(strip=True)
    match = pattern.match(text)
    if match:
        var_name = match.group(1)
        description = match.group(2)
                
        clean_name = to_snake_case(description)
        rename_dict[var_name] = clean_name

In [399]:
filtered_rename_dict = {
    k: v for k, v in rename_dict.items() if k in df_b24.columns
}

df_b24 = df_b24.rename(columns=filtered_rename_dict)

In [401]:
df_b24.head()

Unnamed: 0,participant_id,voc_subsample_weight_pre_pandemic,blood_25_dimethylfuran_ng_ml,blood_25_dimethylfuran_comment_code,blood_1112_tetrachloroethane_ng_ml,blood_1112_tetrachloroethane_cmt_code,blood_hexane_ng_ml,blood_hexane_comment_code,blood_heptane_ng_ml,blood_heptane_comment_code,...,blood_aaa_trifluorotoluene_ng_ml,blood_aaa_trifluorotoluene_comment_code,blood_tetrahydrofuran_ng_ml,blood_tetrahydrofuran_comment_code,blood_123_trichloropropane_ng_ml,blood_123_trichloropropane_comt_code,blood_vinyl_bromide_ng_ml,blood_vinyl_bromide_comment_code,blood_m__p_xylene_ng_ml,blood_m__p_xylene_comment_code
0,109266.0,20355.358741,0.008,1.0,0.028,1.0,0.086,1.0,0.071,1.0,...,0.028,1.0,0.088,1.0,0.028,1.0,0.032,1.0,0.035,0.0
1,109273.0,56422.890469,0.151,0.0,0.028,1.0,0.086,1.0,0.071,1.0,...,0.028,1.0,0.088,1.0,0.028,1.0,0.032,1.0,0.241,0.0
2,109274.0,16607.44531,0.008,1.0,0.028,1.0,0.086,1.0,0.071,1.0,...,0.028,1.0,0.088,1.0,0.028,1.0,0.032,1.0,0.024,1.0
3,109277.0,0.0,,,,,,,,,...,,,,,,,,,,
4,109279.0,0.0,,,,,,,,,...,,,,,,,,,,


In [403]:
df_b24 = df_b24.drop('voc_subsample_weight_pre_pandemic', axis=1)

In [405]:
df_b24.isnull().sum()

participant_id                             0
blood_25_dimethylfuran_ng_ml             598
blood_25_dimethylfuran_comment_code      598
blood_1112_tetrachloroethane_ng_ml       598
blood_1112_tetrachloroethane_cmt_code    598
                                        ... 
blood_123_trichloropropane_comt_code     620
blood_vinyl_bromide_ng_ml                619
blood_vinyl_bromide_comment_code         619
blood_m__p_xylene_ng_ml                  614
blood_m__p_xylene_comment_code           614
Length: 81, dtype: int64

In [407]:
value_cols = [col for col in df_b24.columns if col != 'participant_id']
rows_all_nan = df_b24[value_cols].isna().all(axis=1)
print(f"Number of rows missing all values: {rows_all_nan.sum()}")

Number of rows missing all values: 598


In [409]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b24_cleaned = df_b24[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 598


In [412]:
df_names = [var for var in globals() if isinstance(globals()[var], pd.DataFrame)]
print(df_names)

['df_b1', '_5', '_8', '_11', 'df_b2', '_14', 'df_b3', 'df_b3_cleaned', 'df_b4', '_27', 'df_b5', 'df_b6', 'df_info_raw', 'df_info', '_42', '_46', 'df_b6_cleaned', 'df_b7', 'df_b8', 'df_b9', '_74', 'df_b10', '_77', 'df_b11', 'df_b11_cleaned', 'df_b12', 'df_b13', '_99', 'df_b13_cleaned', 'df_b14', '_107', 'df_b15', '_113', 'df_b16', '_231', '_237', '_246', 'df_b16_cleaned', 'df_b17', '_269', '_273', '_279', 'df_b17_cleaned', 'df_b18', '_298', 'df_b18_cleaned', 'df_b19', '_315', 'df_b19_cleaned', 'df_b20', '_334', 'df_b20_cleaned', 'df_b21', '_353', 'df_b21_cleaned', 'df_b22', '_370', 'df_b22_cleaned', 'df_b23', '_386', '_392', 'df_b24', '_401', 'df_b24_cleaned']


In [416]:
blood_dfs = [
    df_b1,
    df_b2,
    df_b3_cleaned,
    df_b4,
    df_b5,
    df_b6_cleaned,
    df_b7,
    df_b8,
    df_b9,
    df_b10,
    df_b12,
    df_b13_cleaned,
    df_b14,
    df_b15,
    df_b16_cleaned,
    df_b17_cleaned,
    df_b18_cleaned,
    df_b19_cleaned,
    df_b20_cleaned,
    df_b21_cleaned,
    df_b22_cleaned,
    df_b23,
    df_b24_cleaned
]

from functools import reduce

df_blood_combined = reduce(
    lambda left, right: pd.merge(left, right, on="participant_id", how="outer"),
    blood_dfs
)

In [419]:
df_blood_combined.to_csv("cleaned_blood_labs_combined.csv", index=False)

In [423]:
blood_df = pd.read_csv('cleaned_blood_labs_combined.csv')

In [425]:
blood_df.head()

Unnamed: 0,participant_id,alpha_1_agp_g_l,direct_hdl_mg_dl,direct_hdl_mmol_l,triglyceride_mg_dl,triglyceride_mmol_l,ldl_friedewald_mg_dl,ldl_friedwalkd_mmol_l,ldl_martin_hopkins_mg_dl,ldl_martin_hopkins_mmol_l,...,blood_aaa_trifluorotoluene_ng_ml,blood_aaa_trifluorotoluene_comment_code,blood_tetrahydrofuran_ng_ml,blood_tetrahydrofuran_comment_code,blood_123_trichloropropane_ng_ml,blood_123_trichloropropane_comt_code,blood_vinyl_bromide_ng_ml,blood_vinyl_bromide_comment_code,blood_m__p_xylene_ng_ml,blood_m__p_xylene_comment_code
0,108368.0,,,,,,,,,,...,,,,,,,,,,
1,108370.0,,,,,,,,,,...,,,,,,,,,,
2,108374.0,,,,,,,,,,...,,,,,,,,,,
3,108375.0,,,,,,,,,,...,,,,,,,,,,
4,108379.0,,,,,,,,,,...,,,,,,,,,,
