# Blood Lab Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
!pip install pyreadstat
import pyreadstat #since the data files are .xpt files, this library is needed to import the table
import re
import requests
from bs4 import BeautifulSoup
from nhanes_utils import to_snake_case, get_common_nan_ids, standardize_id_column, drop_rows_with_common_nan_ids



### Alpha-1-acid glycoprotein (AGP)

Alpha-1-acid glycoprotein (AGP), also known as orosomucoid (ORM), is an acute-phase serum protein present in humans and many animal species. It is produced in response to inflammation, although its precise biological role remains under investigation and somewhat ambiguous [2]. According to Ceciliani et al. (2019), AGP may play a role in immunometabolism, a function potentially relevant to understanding the obesity epidemic in the U.S.

In the NHANES dataset, AGP levels were measured in children aged 3–5 years and females aged 12–49 years. This data offers an opportunity to explore potential correlations between AGP serum concentrations and obesity prevalence among the female participants in the study.

In [2]:
file_path = '2017-2020/blood/1.P_SSAGP.xpt'

df_b1, meta = pyreadstat.read_xport(file_path)
df_b1 = standardize_id_column(df_b1)

In [3]:
df_b1.head(10)

Unnamed: 0,participant_id,WTSSAGPP,SSAGP
0,109264.0,0.0,
1,109266.0,10003.783188,0.796
2,109277.0,23329.384783,0.746
3,109279.0,14416.168293,0.94
4,109284.0,17705.030492,1.08
5,109286.0,21951.734438,0.358
6,109288.0,0.0,
7,109291.0,25981.10596,0.766
8,109297.0,0.0,
9,109309.0,0.0,


In [4]:
df_b1.shape

(3823, 3)

In [5]:
df_b1 = df_b1.rename(columns={
    'SSAGP': 'alpha_1_agp_g_l'
})

In [6]:
df_b1.head()

Unnamed: 0,participant_id,WTSSAGPP,alpha_1_agp_g_l
0,109264.0,0.0,
1,109266.0,10003.783188,0.796
2,109277.0,23329.384783,0.746
3,109279.0,14416.168293,0.94
4,109284.0,17705.030492,1.08


In [7]:
df_b1 = df_b1.drop('WTSSAGPP', axis=1)

In [8]:
df_b1 = df_b1.dropna(subset = ['alpha_1_agp_g_l'])

In [9]:
df_b1.head(5)

Unnamed: 0,participant_id,alpha_1_agp_g_l
1,109266.0,0.796
2,109277.0,0.746
3,109279.0,0.94
4,109284.0,1.08
5,109286.0,0.358


### Lipid Panel

Lipids are essential molecules that support a range of physiological functions, including hormone production and cellular structure. However, excessive lipid levels—particularly certain types—are associated with increased risk of cardiovascular disease.

To assess lipid status, a fasting lipid panel is commonly used. This test typically includes measurements of:
- LDL (low-density lipoprotein, or “bad” cholesterol),
- HDL (high-density lipoprotein, or “good” cholesterol),
- Total cholesterol, and
- Triglycerides

The NHANES dataset includes all of these values, enabling analysis of lipid profiles across a large representative population. This section focuses on cleaning and preparing these variables for analysis.

In [10]:
file_path = '2017-2020/blood/2.P_HDL.xpt'

df_b2, meta = pyreadstat.read_xport(file_path)
df_b2 = standardize_id_column(df_b2)

In [11]:
df_b2.head()

Unnamed: 0,participant_id,LBDHDD,LBDHDDSI
0,109264.0,72.0,1.86
1,109266.0,56.0,1.45
2,109270.0,47.0,1.22
3,109271.0,33.0,0.85
4,109273.0,42.0,1.09


In [12]:
df_b2 = df_b2.rename(columns ={
    'LBDHDD':'direct_hdl_mg_dl',
    'LBDHDDSI':'direct_hdl_mmol_l'
})

In [13]:
df_b2.isnull().sum()

participant_id          0
direct_hdl_mg_dl     1370
direct_hdl_mmol_l    1370
dtype: int64

In [14]:
common_nan = get_common_nan_ids(df_b2, 'direct_hdl_mg_dl', 'direct_hdl_mmol_l', id_col='participant_id')

Number of NaNs in direct_hdl_mg_dl: 1370
Number of NaNs in direct_hdl_mmol_l: 1370
Number of IDs with NaNs in both columns: 1370


In [15]:
df_b2 = drop_rows_with_common_nan_ids(df_b2, 'direct_hdl_mg_dl', 'direct_hdl_mmol_l', id_col='participant_id')

Rows dropped where both direct_hdl_mg_dl and direct_hdl_mmol_l were NaN: 1370


In [16]:
file_path = '2017-2020/blood/3.P_TRIGLY.xpt'

df_b3, meta = pyreadstat.read_xport(file_path)
df_b3 = standardize_id_column(df_b3)

In [17]:
df_b3.columns.to_list()

['participant_id',
 'WTSAFPRP',
 'LBXTR',
 'LBDTRSI',
 'LBDLDL',
 'LBDLDLSI',
 'LBDLDLM',
 'LBDLDMSI',
 'LBDLDLN',
 'LBDLDNSI']

In [18]:
df_b3 = df_b3.rename(columns={
    'LBXTR':'triglyceride_mg_dl',
    'LBDTRSI':'triglyceride_mmol_l',
    'LBDLDL':'ldl_friedewald_mg_dl',
    'LBDLDLSI':'ldl_friedwalkd_mmol_l',
    'LBDLDLM': 'ldl_martin_hopkins_mg_dl',
    'LBDLDMSI': 'ldl_martin_hopkins_mmol_l',
    'LBDLDLN':'ldl_nih_mg_dl',
    'LBDLDNSI':'ldl_nih_mmol_l'
})

In [19]:
df_b3 = df_b3.drop('WTSAFPRP', axis=1)

In [20]:
df_b3.isnull().sum()

participant_id                 0
triglyceride_mg_dl           440
triglyceride_mmol_l          440
ldl_friedewald_mg_dl         473
ldl_friedwalkd_mmol_l        473
ldl_martin_hopkins_mg_dl     473
ldl_martin_hopkins_mmol_l    473
ldl_nih_mg_dl                448
ldl_nih_mmol_l               448
dtype: int64

In [21]:
value_cols = [col for col in df_b3.columns if col != 'participant_id']
rows_all_nan = df_b3[value_cols].isna().all(axis=1)
print(f"Number of rows missing all cholesterol values: {rows_all_nan.sum()}")

Number of rows missing all cholesterol values: 440


In [22]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b3_cleaned = df_b3[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 440


In [23]:
file_path = '2017-2020/blood/4.P_TCHOL.xpt'

df_b4, meta = pyreadstat.read_xport(file_path)
df_b4 = standardize_id_column(df_b4)

In [24]:
df_b4.head()

Unnamed: 0,participant_id,LBXTC,LBDTCSI
0,109264.0,166.0,4.29
1,109266.0,195.0,5.04
2,109270.0,103.0,2.66
3,109271.0,147.0,3.8
4,109273.0,164.0,4.24


In [25]:
df_b4 = df_b4.rename(columns={
    'LBXTC': 'total_cholesterol_mg_dl',
    'LBDTCSI':'total_cholesterol_mmol_l'
})

In [26]:
df_b4.isnull().sum()

participant_id                 0
total_cholesterol_mg_dl     1370
total_cholesterol_mmol_l    1370
dtype: int64

In [27]:
common_nan = get_common_nan_ids(df_b4, 'total_cholesterol_mg_dl', 'total_cholesterol_mmol_l', id_col='participant_id')

Number of NaNs in total_cholesterol_mg_dl: 1370
Number of NaNs in total_cholesterol_mmol_l: 1370
Number of IDs with NaNs in both columns: 1370


In [28]:
df_b4 = drop_rows_with_common_nan_ids(df_b4, 'total_cholesterol_mg_dl', 'total_cholesterol_mmol_l', id_col='participant_id')

Rows dropped where both total_cholesterol_mg_dl and total_cholesterol_mmol_l were NaN: 1370


**Chromium and Cobalt (Blood)**

NHANES data on chromium and cobalt levels were collected on patients aged 40-150 years old. 

In [29]:
file_path = '2017-2020/blood/5.P_CRCO.xpt'

df_b5, meta = pyreadstat.read_xport(file_path)
df_b5 = standardize_id_column(df_b5)

In [30]:
df_b5.columns.to_list()

['participant_id',
 'LBXBCR',
 'LBDBCRSI',
 'LBDBCRLC',
 'LBXBCO',
 'LBDBCOSI',
 'LBDBCOLC']

In [31]:
df_b5 = df_b5.rename(columns={
    'LBXBCR':'chromium_blood_ug_l', 
    'LBDBCRSI': 'chromium_blood_nmol_l',
    'LBDBCRLC':'chromium_blood_comment', 
    'LBXBCO':'cobalt_blood_ug_l', 
    'LBDBCOSI':'cobalt_blood_nmol_l', 
    'LBDBCOLC' :'cobalt_blood_comment'
})

In [32]:
df_b5.isnull().sum()

participant_id              0
chromium_blood_ug_l       302
chromium_blood_nmol_l     302
chromium_blood_comment    302
cobalt_blood_ug_l         299
cobalt_blood_nmol_l       299
cobalt_blood_comment      299
dtype: int64

In [33]:
common_nan = get_common_nan_ids(df_b5, 'chromium_blood_ug_l', 'cobalt_blood_ug_l', id_col='participant_id')

Number of NaNs in chromium_blood_ug_l: 302
Number of NaNs in cobalt_blood_ug_l: 299
Number of IDs with NaNs in both columns: 297


In [34]:
df_b5 = drop_rows_with_common_nan_ids(df_b5, 'chromium_blood_ug_l', 'cobalt_blood_ug_l', id_col='participant_id')

Rows dropped where both chromium_blood_ug_l and cobalt_blood_ug_l were NaN: 297


### Complete Blood Count with Differential

CBC with diff is the most common blood work that is ordered for a baseline lab. CBC can be useful to assess the patients for acute inflammation in the body and anemia. 

There are many values that are extracted and assessed through the CBC panel. To more efficiently extract the information, the decision was made to utilize webscraping rather than individually typing out each lab value. The units for each of the columns is included in the README file for this project.

In [35]:
file_path = '2017-2020/blood/6.P_CBC.xpt'

df_b6, meta = pyreadstat.read_xport(file_path)
df_b6 = standardize_id_column(df_b6)

In [36]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_CBC.htm"

df_info_raw = pd.read_html(url)[0]

# Use the first row as the header
df_info_raw.columns = df_info_raw.iloc[0]
df_info = df_info_raw.drop(index=0).reset_index(drop=True)

In [37]:
df_info.head()

Unnamed: 0,Variable Name,Analyte Description,LLOD,ULOD,Units
0,LBXWBCSI,White blood cell count,0.02,363.0,x 103 cells/uL
1,LBXLYPCT,Lymphocyte percent,0.0,100.0,%
2,LBXMOPCT,Monocyte percent,0.0,100.0,%
3,LBXNEPCT,Segmented neutrophils percent,0.0,100.0,%
4,LBXEOPCT,Eosinophils percent,0.0,100.0,%


In [38]:
df_info.columns.to_list()

['Variable  Name', 'Analyte  Description', 'LLOD', 'ULOD', 'Units']

In [39]:
rename_dict = {
    row["Variable  Name"]: to_snake_case(row["Analyte  Description"])
    for _, row in df_info.iterrows()
    if row["Variable  Name"] in df_b6.columns
}

In [40]:
df_b6 = df_b6.rename(columns=rename_dict)

In [41]:
df_b6.head()

Unnamed: 0,participant_id,white_blood_cell_count,lymphocyte_percent,monocyte_percent,segmented_neutrophils_percent,eosinophils_percent,basophils_percent,LBDLYMNO,LBDMONO,LBDNENO,...,red_blood_cell_count,hemoglobin,LBXHCT,mean_cell_volume,LBXMC,LBXMCHSI,red_cell_distribution_width,platelet_count,mean_platelet_volume,LBXNRBC
0,109263.0,,,,,,,,,,...,,,,,,,,,,
1,109264.0,4.5,45.6,6.2,46.4,1.4,0.5,2.1,0.3,2.1,...,4.8,13.7,40.5,84.3,33.7,28.4,13.1,263.0,8.2,0.1
2,109265.0,9.5,46.4,10.9,39.2,2.9,0.7,4.4,1.0,3.7,...,4.5,12.6,36.6,81.2,34.4,27.9,13.1,286.0,6.6,0.1
3,109266.0,7.8,34.5,6.0,58.3,0.8,0.5,2.7,0.5,4.5,...,4.35,12.3,36.5,83.7,33.6,28.1,14.0,314.0,6.9,0.1
4,109269.0,9.1,38.3,7.8,48.8,4.1,1.1,3.5,0.7,4.4,...,4.21,11.7,33.5,79.6,34.9,27.8,13.4,287.0,6.9,0.1


In [42]:
df_b6.columns.to_list()

#Some of the names were renamed based on what was available on the first table in the URL. For the other ones that were not, they were manually renamed

['participant_id',
 'white_blood_cell_count',
 'lymphocyte_percent',
 'monocyte_percent',
 'segmented_neutrophils_percent',
 'eosinophils_percent',
 'basophils_percent',
 'LBDLYMNO',
 'LBDMONO',
 'LBDNENO',
 'LBDEONO',
 'LBDBANO',
 'red_blood_cell_count',
 'hemoglobin',
 'LBXHCT',
 'mean_cell_volume',
 'LBXMC',
 'LBXMCHSI',
 'red_cell_distribution_width',
 'platelet_count',
 'mean_platelet_volume',
 'LBXNRBC']

In [43]:
df_b6 = df_b6.rename(columns={
 'LBDLYMNO': 'lymphocyte_number',
 'LBDMONO': 'monocyte_number',
 'LBDNENO':'segmented_neutrophils_number',
 'LBDEONO':'eosinophils_number',
 'LBDBANO':'basophils_number',
 'LBXHCT':'hematocrit_percent',
 'LBXMC':'mean_cell_hgb_concentration',
 'LBXMCHSI':'mean_cell_hemoglobin',
 'LBXNRBC':'nucelated_red_blood_cells'
})

In [44]:
df_b6.columns

Index(['participant_id', 'white_blood_cell_count', 'lymphocyte_percent',
       'monocyte_percent', 'segmented_neutrophils_percent',
       'eosinophils_percent', 'basophils_percent', 'lymphocyte_number',
       'monocyte_number', 'segmented_neutrophils_number', 'eosinophils_number',
       'basophils_number', 'red_blood_cell_count', 'hemoglobin',
       'hematocrit_percent', 'mean_cell_volume', 'mean_cell_hgb_concentration',
       'mean_cell_hemoglobin', 'red_cell_distribution_width', 'platelet_count',
       'mean_platelet_volume', 'nucelated_red_blood_cells'],
      dtype='object')

In [45]:
df_b6.isnull().sum()

participant_id                      0
white_blood_cell_count           1616
lymphocyte_percent               1621
monocyte_percent                 1621
segmented_neutrophils_percent    1621
eosinophils_percent              1621
basophils_percent                1621
lymphocyte_number                1621
monocyte_number                  1621
segmented_neutrophils_number     1621
eosinophils_number               1621
basophils_number                 1621
red_blood_cell_count             1616
hemoglobin                       1616
hematocrit_percent               1616
mean_cell_volume                 1616
mean_cell_hgb_concentration      1616
mean_cell_hemoglobin             1616
red_cell_distribution_width      1616
platelet_count                   1616
mean_platelet_volume             1616
nucelated_red_blood_cells        1621
dtype: int64

In [46]:
value_cols = [col for col in df_b6.columns if col != 'participant_id']
rows_all_nan = df_b6[value_cols].isna().all(axis=1)
print(f"Number of rows missing all CBC values: {rows_all_nan.sum()}")

Number of rows missing all CBC values: 1616


In [47]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b6_cleaned = df_b6[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 1616


### Cotinine

Cotinine is a metabolite that is produced when nicotine is processed. Its long half-life makes it a good marker for assessing tobacco exposure or usage. 

In [48]:
file_path = '2017-2020/blood/7.P_COT.xpt'

df_b7, meta = pyreadstat.read_xport(file_path)
df_b7 = standardize_id_column(df_b7)

In [49]:
df_b7.columns.to_list()

['participant_id', 'LBXCOT', 'LBDCOTLC', 'LBXHCOT', 'LBDHCOLC']

In [50]:
df_b7 = df_b7.rename(columns={
    'LBXCOT':'serum_cotinine_ng_ml',
    'LBDCOTLC':'serum_cotinine_comment',
    'LBXHCOT':'serum_hydroxycotinine_ng_ml',
    'LBDHCOLC':'serum_hydroxycotinine_comment'
})

In [51]:
df_b7.isnull().sum()

participant_id                      0
serum_cotinine_ng_ml             1632
serum_cotinine_comment           1632
serum_hydroxycotinine_ng_ml      1632
serum_hydroxycotinine_comment    1632
dtype: int64

In [52]:
common_nan = get_common_nan_ids(df_b7, 'serum_cotinine_ng_ml', 'serum_hydroxycotinine_ng_ml', id_col='participant_id')

Number of NaNs in serum_cotinine_ng_ml: 1632
Number of NaNs in serum_hydroxycotinine_ng_ml: 1632
Number of IDs with NaNs in both columns: 1632


In [53]:
df_b7 = drop_rows_with_common_nan_ids(df_b7, 'serum_cotinine_ng_ml', 'serum_hydroxycotinine_ng_ml', id_col='participant_id')

Rows dropped where both serum_cotinine_ng_ml and serum_hydroxycotinine_ng_ml were NaN: 1632


### Cytomegalovirus

Cytomegalovirus (CMV) is a double-stranded DNA virus that causes flu-like symptoms in immunocompetant population but can cause organ damage in immunocompromised (i.e. HIV/AIDS) population. CMV virus is transmitted via bodily fluids including sexual contact [12]. 

Avidity tests for whether the CMV infection was recent or in the past. Low avidity shows recent infection and high avidity shows past infeciton.

In [54]:
file_path = '2017-2020/blood/8.P_CMV.xpt'

df_b8, meta = pyreadstat.read_xport(file_path)
df_b8 = standardize_id_column(df_b8)

In [55]:
df_b8.columns.to_list()

['participant_id', 'LBXIGG', 'LBXIGM', 'LBXIGGA']

In [56]:
df_b8 = df_b8.rename(columns={
    'LBXIGG':'cmv_igg',
    'LBXIGM':'cmv_igm', 
    'LBXIGGA':'cmv_igg_avidity'
})

In [57]:
df_b8.isnull().sum()

#there are missing avidity value which would indicate that the person was never infected with CMV. The null values for the IgG and IgM would indicate missing data so rows without these two values will be dropped

participant_id        0
cmv_igg             617
cmv_igm             617
cmv_igg_avidity    1307
dtype: int64

In [58]:
common_nan = get_common_nan_ids(df_b8, 'cmv_igg', 'cmv_igm', id_col='participant_id')

Number of NaNs in cmv_igg: 617
Number of NaNs in cmv_igm: 617
Number of IDs with NaNs in both columns: 617


In [59]:
df_b8 = drop_rows_with_common_nan_ids(df_b8, 'cmv_igg', 'cmv_igm', id_col='participant_id')

Rows dropped where both cmv_igg and cmv_igm were NaN: 617


### Ethylene Oxide

Ethylene Oxide (EtO) is a colorless gas that is used to produce various materials as well as sterilize medical equipments. Exposure to EtO most often is due to aerosolization. EtO is a well-known carcinogen and long term exposure to this substance could lead to blood cancers such as non-Hodgkin lymphoma, myeloma and lymphocytic leukemia [13].

The unit for EtO measurement in the blood is picomoles per gram of hemoglobin (pmol/g Hb).

In [60]:
file_path = '2017-2020/blood/9.P_ETHOX.xpt'

df_b9, meta = pyreadstat.read_xport(file_path)
df_b9 = standardize_id_column(df_b9)

In [61]:
df_b9.columns.to_list()

['participant_id', 'WTSAPRP', 'LBXEOA', 'LBDEOALC']

In [62]:
df_b9 = df_b9.drop('WTSAPRP',axis=1)

In [63]:
df_b9 = df_b9.rename(columns={
    'LBXEOA':'eto_pmol_g_hb',
    'LBDEOALC':'eto_comment'
})

In [64]:
df_b9.isnull().sum()

participant_id      0
eto_pmol_g_hb     424
eto_comment       424
dtype: int64

In [65]:
df_b9 = df_b9.dropna(subset=['eto_pmol_g_hb'])

In [66]:
df_b9.head()

Unnamed: 0,participant_id,eto_pmol_g_hb,eto_comment
0,109266.0,18.7,0.0
1,109270.0,37.7,0.0
2,109273.0,359.0,0.0
3,109274.0,61.2,0.0
5,109290.0,25.5,0.0


### Ferritin and iron panel

Ferritin and iron panel are used to assess someone's iron status. Low values in the iron panel and ferritin along with clinical symptoms are corroborated to diagnose iron deficiency anemia. 

In [67]:
file_path = '2017-2020/blood/10.P_FERTIN.xpt'

df_b10, meta = pyreadstat.read_xport(file_path)
df_b10 = standardize_id_column(df_b10)

In [68]:
df_b10.head()

Unnamed: 0,participant_id,LBXFER,LBDFERSI
0,109263.0,,
1,109264.0,15.7,15.7
2,109265.0,42.1,42.1
3,109266.0,11.6,11.6
4,109269.0,41.7,41.7


In [69]:
df_b10 = df_b10.rename(columns={
    'LBXFER':'ferritin_ng_ml',
    'LBDFERSI':'ferritin_ug_l'
})

In [70]:
df_b10.isnull().sum()

participant_id       0
ferritin_ng_ml    1426
ferritin_ug_l     1426
dtype: int64

In [71]:
common_nan = get_common_nan_ids(df_b10, 'ferritin_ng_ml', 'ferritin_ug_l', id_col='participant_id')

Number of NaNs in ferritin_ng_ml: 1426
Number of NaNs in ferritin_ug_l: 1426
Number of IDs with NaNs in both columns: 1426


In [72]:
df_b10 = drop_rows_with_common_nan_ids(df_b10, 'ferritin_ng_ml', 'ferritin_ug_l', id_col='participant_id')

Rows dropped where both ferritin_ng_ml and ferritin_ug_l were NaN: 1426


In [73]:
file_path = '2017-2020/blood/11.P_FETIB.xpt'

df_b11, meta = pyreadstat.read_xport(file_path)
df_b11 = standardize_id_column(df_b11)

In [74]:
df_b11.columns.to_list()

['participant_id',
 'LBXIRN',
 'LBDIRNSI',
 'LBXUIB',
 'LBDUIBLC',
 'LBDUIBSI',
 'LBDTIB',
 'LBDTIBSI',
 'LBDPCT']

In [75]:
df_b11 = df_b11.rename(columns={
 'LBXIRN':'iron_frozen_ug_dl',
 'LBDIRNSI':'iron_frozen_umol_l',
 'LBXUIB':'uibc_ug_dl',
 'LBDUIBLC':'uibc_comment',
 'LBDUIBSI':'uibc_umol_l',
 'LBDTIB':'tibc_ug_dl',
 'LBDTIBSI':'tibc_umol_l',
 'LBDPCT':'transferrin_saturation'
})

In [76]:
df_b11.isnull().sum()

participant_id              0
iron_frozen_ug_dl         904
iron_frozen_umol_l        904
uibc_ug_dl                949
uibc_comment              949
uibc_umol_l               949
tibc_ug_dl                956
tibc_umol_l               956
transferrin_saturation    956
dtype: int64

In [77]:
value_cols = [col for col in df_b11.columns if col != 'participant_id']
rows_all_nan = df_b11[value_cols].isna().all(axis=1)
print(f"Number of rows missing all iron panel values: {rows_all_nan.sum()}")

Number of rows missing all iron panel values: 904


In [78]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b11_cleaned = df_b11[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 904


### Folate


In [79]:
file_path = '2017-2020/blood/12.P_FOLATE.xpt'

df_b12, meta = pyreadstat.read_xport(file_path)
df_b12 = standardize_id_column(df_b12)

In [80]:
df_b12.columns

Index(['participant_id', 'WTFOLPRP', 'LBDRFO', 'LBDRFOSI'], dtype='object')

In [81]:
df_b12 = df_b12.drop('WTFOLPRP', axis=1)

In [82]:
df_b12 = df_b12.rename(columns={
    'LBDRFO':'rbc_folate_ng_ml',
    'LBDRFOSI':'rbc_folate_nmol_l'
})

In [83]:
df_b12.isnull().sum()

participant_id         0
rbc_folate_ng_ml     966
rbc_folate_nmol_l    966
dtype: int64

In [84]:
common_nan = get_common_nan_ids(df_b12, 'rbc_folate_ng_ml', 'rbc_folate_nmol_l', id_col='participant_id')

Number of NaNs in rbc_folate_ng_ml: 966
Number of NaNs in rbc_folate_nmol_l: 966
Number of IDs with NaNs in both columns: 966


In [85]:
df_b12 = drop_rows_with_common_nan_ids(df_b12, 'rbc_folate_ng_ml', 'rbc_folate_nmol_l', id_col='participant_id')

Rows dropped where both rbc_folate_ng_ml and rbc_folate_nmol_l were NaN: 966


In [86]:
file_path = '2017-2020/blood/13.P_FOLFMS.xpt'

df_b13, meta = pyreadstat.read_xport(file_path)
df_b13 = standardize_id_column(df_b13)

#There are a lot of technical names for these values so webscraping will be done instead of manual renaming of the columns

In [87]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_FOLFMS.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# Pattern: match things like "LBDRFOSI - RBC folate (nmol/L)"
pattern = re.compile(r'^([A-Z0-9_]+)\s*-\s*(.+)$')

rename_dict = {}

# Loop through all h3 tags and filter those that match our pattern
for tag in soup.find_all('h3'):
    text = tag.get_text(strip=True)
    match = pattern.match(text)
    if match:
        var_name = match.group(1)
        description = match.group(2)
                
        clean_name = to_snake_case(description)
        rename_dict[var_name] = clean_name

In [88]:
# Only keep entries in rename_dict where the variable name is in df_b13
filtered_rename_dict = {
    k: v for k, v in rename_dict.items() if k in df_b13.columns
}

# Rename columns in df_b13
df_b13 = df_b13.rename(columns=filtered_rename_dict)

In [89]:
df_b13.head()

Unnamed: 0,participant_id,folate_folate_form_weight_pre_pandemic,serum_total_folate_nmol_l,serum_total_folate_ng_ml,5_methyl_tetrahydrofolate_nmol_l,5_methyl_tetrahydrofolate_cmt,folic_acid_nmol_l,folic_acid_cmt,5_formyl_tetrahydrofolate_nmol_l,5_formyl_tetrahydrofolate_cmt,tetrahydrofolate_nmol_l,tetrahydrofolate_cmt,510_methenyl_tetrahydrofolate_nmol_l,510_methenyl_tetrahydrofolate_cmt,mefox_oxidation_product_nmol_l,mefox_oxidation_product_cmt
0,109264.0,13078.84474,75.7,33.4,74.2,0.0,0.347,0.0,0.141,1.0,0.909,0.0,0.141,1.0,0.692,0.0
1,109265.0,51567.643136,28.6,12.6,27.6,0.0,0.344,0.0,0.141,1.0,0.418,0.0,0.141,1.0,0.511,0.0
2,109266.0,8998.028716,42.4,18.7,41.4,0.0,0.402,0.0,0.141,1.0,0.34,0.0,0.141,1.0,0.661,0.0
3,109269.0,17148.10026,50.6,22.3,48.5,0.0,0.444,0.0,0.141,1.0,1.41,0.0,0.141,1.0,0.761,0.0
4,109270.0,13466.605962,31.1,13.7,28.6,0.0,1.46,0.0,0.141,1.0,0.719,0.0,0.141,1.0,2.18,0.0


In [90]:
df_b13.columns.to_list()

['participant_id',
 'folate_folate_form_weight_pre_pandemic',
 'serum_total_folate_nmol_l',
 'serum_total_folate_ng_ml',
 '5_methyl_tetrahydrofolate_nmol_l',
 '5_methyl_tetrahydrofolate_cmt',
 'folic_acid_nmol_l',
 'folic_acid_cmt',
 '5_formyl_tetrahydrofolate_nmol_l',
 '5_formyl_tetrahydrofolate_cmt',
 'tetrahydrofolate_nmol_l',
 'tetrahydrofolate_cmt',
 '510_methenyl_tetrahydrofolate_nmol_l',
 '510_methenyl_tetrahydrofolate_cmt',
 'mefox_oxidation_product_nmol_l',
 'mefox_oxidation_product_cmt']

In [91]:
df_b13 = df_b13.drop('folate_folate_form_weight_pre_pandemic',axis=1)

In [92]:
df_b13.isnull().sum()

participant_id                             0
serum_total_folate_nmol_l               1052
serum_total_folate_ng_ml                1052
5_methyl_tetrahydrofolate_nmol_l        1052
5_methyl_tetrahydrofolate_cmt           1052
folic_acid_nmol_l                       1052
folic_acid_cmt                          1052
5_formyl_tetrahydrofolate_nmol_l        1052
5_formyl_tetrahydrofolate_cmt           1052
tetrahydrofolate_nmol_l                 1052
tetrahydrofolate_cmt                    1052
510_methenyl_tetrahydrofolate_nmol_l    1052
510_methenyl_tetrahydrofolate_cmt       1052
mefox_oxidation_product_nmol_l          1052
mefox_oxidation_product_cmt             1052
dtype: int64

In [93]:
value_cols = [col for col in df_b13.columns if col != 'participant_id']
rows_all_nan = df_b13[value_cols].isna().all(axis=1)
print(f"Number of rows missing all folate values: {rows_all_nan.sum()}")

Number of rows missing all folate values: 1052


In [94]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_b13_cleaned = df_b13[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 1052


### Glycohemoglobin (%)

In [95]:
file_path = '2017-2020/blood/14.P_GHB.xpt'

df_b14, meta = pyreadstat.read_xport(file_path)
df_b14 = standardize_id_column(df_b14)

In [96]:
df_b14.head()

Unnamed: 0,participant_id,LBXGH
0,109264.0,5.3
1,109266.0,5.2
2,109271.0,5.6
3,109273.0,5.1
4,109274.0,5.7


In [97]:
df_b14 = df_b14.rename(columns = {'LBXGH':'glycohemoglobin_percent'})

In [98]:
df_b14.isnull().sum()

participant_id               0
glycohemoglobin_percent    672
dtype: int64

In [99]:
df_b14 = df_b14.dropna(subset=['glycohemoglobin_percent'])

### High-Sensitivity C-Reactive Protein

In [102]:
file_path = '2017-2020/blood/15.P_HSCRP.xpt'

df_b15, meta = pyreadstat.read_xport(file_path)
df_b15 = standardize_id_column(df_b15)

In [103]:
df_b15.head()

Unnamed: 0,participant_id,LBXHSCRP,LBDHRPLC
0,109263.0,,
1,109264.0,0.11,1.0
2,109265.0,0.31,0.0
3,109266.0,0.72,0.0
4,109269.0,0.73,0.0


In [104]:
df_b15 = df_b15.rename(columns = {
    'LBXHSCRP':'hs_crp_mg_l',
    'LBDHRPLC':'hs_crp_cmt'
})

In [105]:
df_b15.isnull().sum()

participant_id       0
hs_crp_mg_l       2158
hs_crp_cmt        2158
dtype: int64

In [106]:
df_b15 = df_b15.dropna(subset=['hs_crp_mg_l'])

### Inorganic metyl and ethyl mercury

In [107]:
file_path = '2017-2020/blood/16.P_IHGEM.xpt'

df_b16, meta = pyreadstat.read_xport(file_path)
df_b16 = standardize_id_column(df_b16)