In [1]:
import altair as alt
import pandas as pd
import streamlit as st
import xport
import types    # Needed for @st.cache(hash_funcs={types.FunctionType: lambda _: None})

In [66]:
# @st.cache_data(hash_funcs={types.FunctionType: lambda _: None})
def load_data_xpt(filename, columns_to_keep=None):
    """Read an XPT file into a pandas DataFrame via ``xport.to_dataframe``.

    Args:
        filename: Path of the file to read; it is opened in binary mode.
        columns_to_keep: Optional iterable of column names. When given, the
            result keeps only the listed names that are present in the file,
            in the order they are listed; names the file does not contain are
            silently ignored. ``None`` (the default) keeps every column.

    Returns:
        A ``pandas.DataFrame`` with the file's contents, narrowed to the
        requested columns when ``columns_to_keep`` is provided.
    """
    with open(filename, 'rb') as xpt_file:
        frame = pd.DataFrame(xport.to_dataframe(xpt_file))

    # No filter requested: hand back the full table.
    if columns_to_keep is None:
        return frame

    # Keep only the requested columns that the file actually provides.
    available = frame.columns
    selected = [name for name in columns_to_keep if name in available]
    return frame[selected]

# Load NHANES data
# The three file lists are matched by position: entry k of each list is read
# together and merged on the respondent sequence number (SEQN).
f_body_measures = ['BMX_B.XPT','BMX_C.XPT','BMX_D.XPT','BMX_E.XPT','BMX_F.XPT','BMX_G.XPT','BMX_H.XPT','BMX_I.XPT','BMX_J.XPT','BMX.XPT','P_BMX.XPT']
f_blood_pressures = ['BPX_B.XPT','BPX_C.XPT','BPX_D.XPT','BPX_E.XPT','BPX_F.XPT','BPX_G.XPT','BPX_H.XPT','BPX_I.XPT','BPX_J.XPT','BPX.XPT','P_BPXO.XPT','BPXO_J.XPT']
f_demographics = ['DEMO_B.XPT','DEMO_C.XPT','DEMO_D.XPT','DEMO_E.XPT','DEMO_F.XPT','DEMO_G.XPT','DEMO_H.XPT','DEMO_I.XPT','DEMO_J.XPT','DEMO.XPT','P_DEMO.XPT']

# Columns retained from each table.
body_measure_columns = ['SEQN', 'BMXWT', 'BMXHT']    # Not including 'BMDSTATS' due to incomplete data
blood_pressure_columns = ['SEQN', 'BPXSY1', 'BPXDI1']    # Not including 'BPXSY2', 'BPXDI2', 'BPXSY3', 'BPXDI3', 'BPXOSY1', 'BPXODI1', 'BPXOSY2', 'BPXODI2', 'BPXOSY3', 'BPXODI3' for simplicity
demographic_columns = ['SEQN', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN']

# Collect one merged frame per usable file triple and concatenate once at the
# end, instead of re-copying an ever-growing DataFrame on every iteration.
cycle_frames = []

# zip() stops at the shortest list, so this visits the same 11 positions the
# original index loop did. NOTE: f_blood_pressures has a 12th entry
# ('BPXO_J.XPT') with no matching body-measures/demographics file; it is
# never read.
for body_file, pressure_file, demo_file in zip(f_body_measures, f_blood_pressures, f_demographics):
    body_measures = load_data_xpt('nhanes/' + body_file, body_measure_columns)
    blood_pressures = load_data_xpt('nhanes/' + pressure_file, blood_pressure_columns)
    demographics = load_data_xpt('nhanes/' + demo_file, demographic_columns)

    # load_data_xpt silently omits requested columns that a file lacks, so a
    # blood-pressure file without both BPXSY1 and BPXDI1 cannot be used:
    # skip the whole triple.
    if not set(blood_pressure_columns).issubset(blood_pressures.columns):
        continue

    # Drop incomplete rows. Body measures and demographics must be fully
    # populated; a blood-pressure row is dropped only when BOTH readings are
    # missing. Non-inplace dropna avoids mutating a column slice.
    body_measures = body_measures.dropna()
    demographics = demographics.dropna()
    blood_pressures = blood_pressures.dropna(subset=['BPXSY1', 'BPXDI1'], how='all')

    # Merge dataframes (inner joins: keep respondents present in all three).
    # Named `merged` rather than `all` so the builtin all() is not shadowed.
    merged = pd.merge(body_measures, demographics, on='SEQN', how='inner')
    merged = pd.merge(merged, blood_pressures, on='SEQN', how='inner')
    cycle_frames.append(merged)

# Fall back to an empty frame when no triple qualified (pd.concat rejects []).
df = pd.concat(cycle_frames, ignore_index=True) if cycle_frames else pd.DataFrame()

# Replace the age in years with the finer-grained value derived from the age
# in months (RIDAGEMN / 12). Rows with no month value (for example when a file
# had no RIDAGEMN column at all) keep their original RIDAGEYR instead of being
# overwritten with NaN; the guards also avoid a KeyError on an empty result.
if 'RIDAGEMN' in df.columns:
    age_in_years = df['RIDAGEMN'] / 12
    if 'RIDAGEYR' in df.columns:
        age_in_years = age_in_years.fillna(df['RIDAGEYR'])
    df['RIDAGEYR'] = age_in_years

In [71]:
# Plain-text preview of the merged NHANES frame; for a long frame pandas
# prints only the first and last rows plus a shape summary.
print(df)

         SEQN  BMXWT  BMXHT  RIAGENDR   RIDAGEYR  RIDAGEMN  BPXSY1  BPXDI1
0      9966.0   91.7  174.2       1.0  39.333333     472.0   128.0    80.0
1      9967.0   84.0  167.4       1.0  23.583333     283.0   106.0    60.0
2      9968.0   51.7  144.9       2.0  84.250000    1011.0   120.0     0.0
3      9969.0   58.0  161.4       2.0  51.000000     612.0   120.0    72.0
4      9970.0  139.1  188.3       1.0  16.666667     200.0   120.0    88.0
...       ...    ...    ...       ...        ...       ...     ...     ...
39478  9955.0   91.2  177.7       1.0  37.416667     449.0   118.0    72.0
39479  9958.0   58.4  158.2       2.0  43.916667     527.0   110.0    74.0
39480  9961.0   83.1  180.3       1.0  36.083333     433.0   126.0    76.0
39481  9964.0   32.8  141.2       1.0  11.166667     134.0   102.0    50.0
39482  9965.0   57.9  167.0       2.0  84.500000    1014.0   242.0    78.0

[39483 rows x 8 columns]


In [None]:
# Load blood pressure tables
# One CSV per sex (female/male) and measure (SBP/DBP), read in the order the
# target names are listed on the left-hand side.
_bp_table_paths = (
    'bp-tables/FemaleSBP.csv',
    'bp-tables/FemaleDBP.csv',
    'bp-tables/MaleSBP.csv',
    'bp-tables/MaleDBP.csv',
)
fSBP, fDBP, mSBP, mDBP = (pd.read_csv(table_path) for table_path in _bp_table_paths)