# CS 439 Final Project - Disease Prediction ML Model


In [1]:
!pip install pandas requests pyreadstat

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\paral\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd
import pyreadstat
import requests
import re

## Data Processing

### Fetching the data

We are using data from NHANES (National Health and Nutrition Examination Survey). We are going to gather data that encapsulates a wider range than traditional models, specifically data that isn't necessarily medically related.

Links can be found at [https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?Cycle=2021-2023](https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?Cycle=2021-2023)

In [3]:
from typing import ByteString
from io import BytesIO

def load_xpt(url, timeout=60):
  """Download a SAS transport (XPT) file and parse it into a DataFrame.

  Args:
    url: Address of the .XPT file to download.
    timeout: Seconds to wait for the server before giving up; passed
      straight to ``requests.get``.

  Returns:
    The DataFrame produced by ``pd.read_sas`` with ``format='xport'``.

  Raises:
    Exception: If the server answers with any status code other than 200.
      Errors raised by ``requests`` itself (e.g. a timeout) propagate as-is.
  """
  # Without an explicit timeout a stalled connection would block the cell forever
  response = requests.get(url, timeout=timeout)
  if response.status_code != 200:
    raise Exception(f'failed to download {url} (HTTP {response.status_code})')
  # read_sas needs a file-like object, so wrap the downloaded bytes in memory
  return pd.read_sas(BytesIO(response.content), format='xport')

# Alternative base URL, currently disabled:
#base_url = 'https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/'
base_url = 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/'

# Dataset name -> XPT file stem. Insertion order is the order in which the
# files are later downloaded and merged.
xpt_stems = {
    'Demographics': 'DEMO_L',
    'Medical Conditions': 'MCQ_L',
    'Diabetes': 'DIQ_L',
    'Hypertension': 'BPQ_L',
    'Prescription Meds': 'RXQ_RX_L',
    'Plasma glucose': 'GLU_L',
    'CBC': 'CBC_L',
    'Body Measures': 'BMX_L',
    'Immunization': 'IMQ_L',
    'Physical Activity': 'PAQ_L',
    'Alchohol Use': 'ALQ_L',
    'Smoking': 'SMQ_L',
    'Diet': 'DBQ_L',
    'Health Insurance': 'HIQ_L',
    'Housing': 'HOQ_L',
    'Income and Food Security': 'INQ_L',
    'Pesticide Use': 'PUQMEC_L',
    'Occupational': 'OCQ_L',
    'Glycohemoglobin': 'GHB_L',
    'Oscillometric BP': 'BPXO_L',
}

# Full download address for every dataset
urls = {
    dataset: f'{base_url}{stem}.XPT'
    for dataset, stem in xpt_stems.items()
}

# Download every dataset; a failed download is reported and skipped so the
# remaining files still load.
data = {}
for name, url in urls.items():
  try:
    data[name] = load_xpt(url)
    print(f'loaded {name}')
  except Exception as e:
    print(f'Failed to load {name} : {e}')

# Demographics is the base table of the merge, so nothing can be built without it
if 'Demographics' not in data:
  raise RuntimeError('Demographics failed to load; it is required as the base table for the merge')

df = data['Demographics'] #starting with demographics
for key, table in data.items():
  if key == 'Demographics':
    continue
  # A table with several rows per SEQN makes the left join repeat that
  # participant's row once per match; surface it instead of letting the
  # merged frame grow silently.
  if not table['SEQN'].is_unique:
    print(f'warning: {key} has multiple rows per SEQN; merging it repeats participant rows')
  df = df.merge(table, on='SEQN', how='left')



loaded Demographics
loaded Medical Conditions
loaded Diabetes
loaded Hypertension
loaded Prescription Meds
loaded Plasma glucose
loaded CBC
loaded Body Measures
loaded Immunization
loaded Physical Activity
loaded Alchohol Use
loaded Smoking
loaded Diet
loaded Health Insurance
loaded Housing
loaded Income and Food Security
loaded Pesticide Use
loaded Occupational
loaded Glycohemoglobin
loaded Oscillometric BP


### Explore the Data

In [4]:
#saving the original data to a file
#from google.colab import files
#df.to_csv('df.csv', index=False)
# files.download('df.csv')

In [5]:
# Distinct column labels in the merged frame, followed by its overall shape
unique_columns = set(df.columns)
print(len(unique_columns))
print(unique_columns)
print(f"Final merged shape: {df.shape}")
df.head()


211
{'BMIWAIST', 'DBQ940', 'ALQ270', 'INDFMMPI', 'MCQ170L', 'DBQ930', 'BMIHIP', 'DIQ160', 'LBXNRBC', 'MCQ510B', 'BMDSTATS', 'OSQ230', 'RIDSTATR', 'ALQ170', 'MCQ035', 'MCQ510E', 'HIQ032D', 'ALQ151', 'DBQ370', 'PUQ110', 'DBQ073U', 'SMQ621', 'SEQN', 'DBQ073E', 'BPXOPLS3', 'SMD100MN', 'DBD030', 'PAD790U', 'MCQ220', 'OCQ215', 'BPAOCSZ', 'LBXEOPCT', 'MCQ510F', 'DMDHSEDZ', 'WTSAF2YR', 'DID040', 'RXQ050', 'HIQ032H', 'HIQ032I', 'LBXGH', 'LBXWBCSI', 'MCQ195', 'BMXRECUM', 'DBQ935', 'BPXOSY3', 'MCQ230B', 'DMDYRUSR', 'DBQ390', 'BPXOPLS1', 'IMQ070', 'DMDHRGND', 'HIQ032C', 'DMDHREDZ', 'LBDNENO', 'LBDEONO', 'WTMEC2YR', 'SMQ040', 'PUQ100', 'HIQ210', 'PAD790Q', 'HIQ032A', 'DIQ050', 'RIAGENDR', 'LBDBANO', 'BPQ080', 'SMAQUEX2', 'HIQ032F', 'DBQ945', 'MCQ160L', 'IMQ060', 'BPXOPLS2', 'DBD055', 'HIQ032E', 'SMD650', 'MCQ510A', 'MCQ170M', 'HIQ011', 'MCQ160B', 'AGQ030', 'MCQ550', 'BMIWT', 'LBXHGB', 'RIDEXMON', 'DMDBORN4', 'MCQ010', 'MCQ040', 'MCQ160F', 'BMIHEAD', 'DBQ424', 'LBXMOPCT', 'LBXHCT', 'DBQ421', 'DBQ330

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,...,BPAOCSZ,BPXOSY1,BPXODI1,BPXOSY2,BPXODI2,BPXOSY3,BPXODI3,BPXOPLS1,BPXOPLS2,BPXOPLS3
0,130378.0,12.0,2.0,1.0,43.0,,5.0,6.0,2.0,,...,4.0,135.0,98.0,131.0,96.0,132.0,94.0,82.0,79.0,82.0
1,130379.0,12.0,2.0,1.0,66.0,,3.0,3.0,2.0,,...,4.0,121.0,84.0,117.0,76.0,113.0,76.0,72.0,71.0,73.0
2,130380.0,12.0,2.0,2.0,44.0,,2.0,2.0,1.0,,...,4.0,111.0,79.0,112.0,80.0,104.0,76.0,84.0,83.0,77.0
3,130381.0,12.0,2.0,2.0,5.0,,5.0,7.0,1.0,71.0,...,,,,,,,,,,
4,130382.0,12.0,2.0,1.0,2.0,,3.0,3.0,2.0,34.0,...,,,,,,,,,,


### Clean the data

Drop columns in which more than half of the values are missing.

In [6]:
# A column is kept only if its share of missing values is at most this cutoff
threshold = 0.5

# Fraction of NaN entries per column
missing_ratio = df.isna().mean()

# Labels of the columns that satisfy the cutoff
columns_to_keep = missing_ratio.loc[missing_ratio.le(threshold)].index

df_cleaned = df[columns_to_keep]

print(f"Remaining columns: {df_cleaned.shape[1]}")

Remaining columns: 107


Drop columns with non-numeric values.

In [7]:
# One boolean per column: True when pandas classifies the column's dtype as numeric
is_numeric = [pd.api.types.is_numeric_dtype(dtype) for dtype in df_cleaned.dtypes]
df_cleaned = df_cleaned.loc[:, is_numeric]
print(df_cleaned.dtypes)
print(f"Remaining numeric columns: {len(df_cleaned.columns)}")

SEQN        float64
SDDSRVYR    float64
RIDSTATR    float64
RIAGENDR    float64
RIDAGEYR    float64
             ...   
BPXOSY3     float64
BPXODI3     float64
BPXOPLS1    float64
BPXOPLS2    float64
BPXOPLS3    float64
Length: 104, dtype: object
Remaining numeric columns: 104


Drop duplicate and highly correlated columns.

In [8]:
# Convert all numeric columns to float for uniformity
df_temp = df_cleaned.astype(float)

# Drop columns with duplicate values (there's none here but just in case.)
df_cleaned = df_cleaned.loc[:, ~df_temp.T.duplicated()]

print(f"Columns after dropping duplicates: {df_cleaned.shape[1]}")

# Two columns count as redundant when their absolute correlation exceeds this value
corr_threshold = 0.9

corr_matrix = df_cleaned.corr().abs()
# Keep only the strict upper triangle so every pair is inspected once and a
# column is never compared with itself; of a correlated pair, the later column
# is the one that gets dropped.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if (upper[column] > corr_threshold).any()]

# Reassign instead of dropping in place: df_cleaned came from a .loc slice, and
# mutating such a frame in place can trigger chained-assignment warnings.
df_cleaned = df_cleaned.drop(columns=to_drop)

print(f"Columns after dropping high corr col: {df_cleaned.shape[1]}")


Columns after dropping duplicates: 104
Columns after dropping high corr col: 88


In [30]:
# Number of columns left after cleaning
print(df_cleaned.shape[1])

88


Handle NULLs within columns

In [10]:
# TODO (to be implemented): handle the remaining NULL values within columns

Translate column code names to descriptions

In [None]:
# Derive the documentation page (.htm) address of every dataset from its
# .XPT download address; these pages are scraped for variable labels.
xpt_suffix = re.compile(r'\.XPT$', flags=re.IGNORECASE)
hml_urls = {}
for name, url in urls.items():
    hml_urls[name] = xpt_suffix.sub('.htm', url)
print(hml_urls)

{'Demographics': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm', 'Medical Conditions': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/MCQ_L.htm', 'Diabetes': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DIQ_L.htm', 'Hypertension': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/BPQ_L.htm', 'Prescription Meds': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/RXQ_RX_L.htm', 'Plasma glucose': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/GLU_L.htm', 'CBC': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/CBC_L.htm', 'Body Measures': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/BMX_L.htm', 'Immunization': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/IMQ_L.htm', 'Physical Activity': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/PAQ_L.htm', 'Alchohol Use': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/ALQ_L.htm', 'Smoking':

In [28]:
#scrape variable names and labels from htm files
#go through each url and check to see if any of our codes exist in it
#scrape the sas labels for each code
from bs4 import BeautifulSoup

def extract_labels(url, timeout=60):
    """Scrape labels for the columns of ``df_cleaned`` from one documentation page.

    Every ``<dl>`` element of the page is inspected: the text of its first
    ``<dd>`` is compared case-insensitively with the column names of the
    global ``df_cleaned``, and on a match the text of the second ``<dd>``
    is stored as that column's label.

    Args:
        url: Address of the .htm documentation page.
        timeout: Seconds to wait for the server; passed to ``requests.get``.

    Returns:
        Dict mapping column name -> label text for every column found on
        the page. Empty when the page could not be fetched.
    """
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException as err:
        # Report and give up on this page only; the caller merges an empty
        # dict and carries on with the other pages.
        print(f'theres an error in {url}: {err}')
        return {}
    soup = BeautifulSoup(resp.content, 'html.parser')

    # Lower-cased name -> original column name(s), built once so each <dl>
    # needs a single lookup instead of a scan over every column.
    codes_by_lower = {}
    for code in df_cleaned.columns:
        codes_by_lower.setdefault(code.lower(), []).append(code)

    result = {}
    for dl in soup.find_all('dl'):
        dds = dl.find_all('dd')
        # A <dl> with fewer than two <dd> cannot hold a (name, label) pair
        if len(dds) < 2:
            continue
        variable = dds[0].get_text(strip=True).lower()
        for code in codes_by_lower.get(variable, ()):
            result[code] = dds[1].get_text(strip=True)
            print(f'found {code} in {url}')
    return result

# Collect the labels found on every documentation page into one lookup table;
# when a code shows up on several pages, the label from the later page wins.
code_map = {}

for url in hml_urls.values():
    code_map.update(extract_labels(url))

print(code_map)


found SEQN in https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm
found SDDSRVYR in https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm
found RIDSTATR in https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm
found RIAGENDR in https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm
found RIDAGEYR in https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm
found RIDRETH1 in https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm
found RIDEXMON in https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm
found DMQMILIZ in https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm
found DMDBORN4 in https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm
found DMDEDUC2 in https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm
found DMDMARTZ in https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm
found DMDHHSIZ in https:/

In [None]:
# Persist the code -> label table as a two-column CSV so it can be reused
# without scraping the documentation pages again
label_rows = list(code_map.items())
cm = pd.DataFrame(label_rows, columns=['Variable', 'SAS label'])
cm.to_csv('sas_labels.csv', index=False)

### Separate the data

We want to split the data into X and y: X holds the features we use to make a prediction, and y is the variable we are predicting.

### Split the data

## Building the Model

## Data Visualization and Evaluation