# Hepatitis Panel

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
!pip install pyreadstat
import pyreadstat #since the data files are .xpt files, this library is needed to import the table
import re
import requests
from bs4 import BeautifulSoup
from nhanes_utils import to_snake_case, get_common_nan_ids, standardize_id_column, drop_rows_with_common_nan_ids



## Hep A

In [2]:
file_path = '2017-2020/hepatitis/P_HEPA.xpt'

df_h1, meta = pyreadstat.read_xport(file_path)
df_h1 = standardize_id_column(df_h1)

In [3]:
df_h1 = df_h1.rename(columns = {'LBXHA' : 'hepatitis_a_antibody'})

In [4]:
df_h1.head()

Unnamed: 0,participant_id,hepatitis_a_antibody
0,109263.0,
1,109264.0,1.0
2,109265.0,1.0
3,109266.0,1.0
4,109269.0,1.0


In [5]:
df_h1.isnull().sum()

participant_id             0
hepatitis_a_antibody    1894
dtype: int64

In [6]:
df_h1 = df_h1.dropna()

### Hep B

In [8]:
file_path = '2017-2020/hepatitis/P_HEPB_S.xpt'

df_h2, meta = pyreadstat.read_xport(file_path)
df_h2 = standardize_id_column(df_h2)

In [12]:
df_h2 = df_h2.rename(columns = {'LBXHBS' : 'hepatitis_b_surface_antibody'})

In [10]:
df_h2.isnull().sum()

participant_id             0
hepatitis_b_antibody    1805
dtype: int64

In [11]:
df_h2 = df_h2.dropna()

In [13]:
file_path = '2017-2020/hepatitis/P_HEPBD.xpt'

df_h3, meta = pyreadstat.read_xport(file_path)
df_h3 = standardize_id_column(df_h3)

In [14]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_HEPBD.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

pattern = re.compile(r'^([A-Z0-9_]+)\s*-\s*(.+)$')

rename_dict = {}

# Loop through all h3 tags and filter those that match our pattern
for tag in soup.find_all('h3'):
    text = tag.get_text(strip=True)
    match = pattern.match(text)
    if match:
        var_name = match.group(1)
        description = match.group(2)
                
        clean_name = to_snake_case(description)
        rename_dict[var_name] = clean_name

In [15]:
filtered_rename_dict = {
    k: v for k, v in rename_dict.items() if k in df_h3.columns
}

df_h3 = df_h3.rename(columns=filtered_rename_dict)

In [16]:
df_h3.isnull().sum()

participant_id                       0
hepatitis_b_core_antibody         1281
hepatitis_b_surface_antigen      11558
hepatitis_d_antibody_anti_hdv    11559
dtype: int64

In [19]:
value_cols = [col for col in df_h3.columns if col != 'participant_id']
rows_all_nan = df_h3[value_cols].isna().all(axis=1)
print(f"Number of rows missing Hep B values: {rows_all_nan.sum()}")

Number of rows missing Hep B values: 1281


In [20]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_h3_cleaned = df_h3[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 1281


### Hep C

In [21]:
file_path = '2017-2020/hepatitis/P_HEPC.xpt'

df_h4, meta = pyreadstat.read_xport(file_path)
df_h4 = standardize_id_column(df_h4)

In [22]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_HEPC.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

pattern = re.compile(r'^([A-Z0-9_]+)\s*-\s*(.+)$')

rename_dict = {}

# Loop through all h3 tags and filter those that match our pattern
for tag in soup.find_all('h3'):
    text = tag.get_text(strip=True)
    match = pattern.match(text)
    if match:
        var_name = match.group(1)
        description = match.group(2)
                
        clean_name = to_snake_case(description)
        rename_dict[var_name] = clean_name

In [23]:
filtered_rename_dict = {
    k: v for k, v in rename_dict.items() if k in df_h4.columns
}

df_h4 = df_h4.rename(columns=filtered_rename_dict)

In [24]:
value_cols = [col for col in df_h4.columns if col != 'participant_id']
rows_all_nan = df_h4[value_cols].isna().all(axis=1)
print(f"Number of rows missing Hep C values: {rows_all_nan.sum()}")

Number of rows missing Hep C values: 1391


In [25]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_h4_cleaned = df_h4[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 1391


### Hep E

In [26]:
file_path = '2017-2020/hepatitis/P_HEPE.xpt'

df_h5, meta = pyreadstat.read_xport(file_path)
df_h5 = standardize_id_column(df_h5)

In [27]:
url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_HEPE.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

pattern = re.compile(r'^([A-Z0-9_]+)\s*-\s*(.+)$')

rename_dict = {}

# Loop through all h3 tags and filter those that match our pattern
for tag in soup.find_all('h3'):
    text = tag.get_text(strip=True)
    match = pattern.match(text)
    if match:
        var_name = match.group(1)
        description = match.group(2)
                
        clean_name = to_snake_case(description)
        rename_dict[var_name] = clean_name

In [28]:
filtered_rename_dict = {
    k: v for k, v in rename_dict.items() if k in df_h5.columns
}

df_h5 = df_h5.rename(columns=filtered_rename_dict)

In [29]:
value_cols = [col for col in df_h5.columns if col != 'participant_id']
rows_all_nan = df_h5[value_cols].isna().all(axis=1)
print(f"Number of rows missing Hep E values: {rows_all_nan.sum()}")

Number of rows missing Hep E values: 1282


In [30]:
# Drop rows where all value columns are NaN (excluding participant_id)
df_h5_cleaned = df_h5[~rows_all_nan].copy()

print(f"Number of rows dropped: {rows_all_nan.sum()}")

Number of rows dropped: 1282


In [31]:
df_names = [var for var in globals() if isinstance(globals()[var], pd.DataFrame)]
print(df_names)

['df_h1', '_4', 'df_h2', 'df_h3', '_17', 'df_h3_cleaned', 'df_h4', 'df_h4_cleaned', 'df_h5', 'df_h5_cleaned']


In [32]:
hep_dfs = [
    df_h1,
    df_h2,
    df_h3_cleaned,
    df_h4_cleaned,
    df_h5_cleaned
]

from functools import reduce

df_hep_combined = reduce(
    lambda left, right: pd.merge(left, right, on="participant_id", how="outer"),
    hep_dfs
)

In [33]:
df_hep_combined.to_csv("cleaned_hep_labs_combined.csv", index=False)

In [34]:
df_hep = pd.read_csv('cleaned_hep_labs_combined.csv')

In [35]:
df_hep.head()

Unnamed: 0,participant_id,hepatitis_a_antibody,hepatitis_b_antibody,hepatitis_b_core_antibody,hepatitis_b_surface_antigen,hepatitis_d_antibody_anti_hdv,hepatitis_c_rna,hepatitis_c_antibody_confirmed,hepatitis_c_genotype,hepatitis_e_igg_anti_hev,hepatitis_e_igm_anti_hev
0,109264.0,1.0,2.0,2.0,,,3.0,3.0,,2.0,2.0
1,109265.0,1.0,,,,,,,,,
2,109266.0,1.0,1.0,2.0,,,3.0,3.0,,2.0,2.0
3,109269.0,1.0,1.0,,,,,,,,
4,109270.0,1.0,1.0,2.0,,,3.0,3.0,,2.0,2.0
