# CS 439 Final Project - Disease Prediction ML Model


In [1]:
!pip install pandas requests pyreadstat

Defaulting to user installation because normal site-packages is not writeable
Collecting pyreadstat
  Downloading pyreadstat-1.2.8-cp312-cp312-win_amd64.whl.metadata (1.1 kB)
Downloading pyreadstat-1.2.8-cp312-cp312-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   -------------------------- ------------- 1.6/2.4 MB 8.4 MB/s eta 0:00:01
   ---------------------------------------- 2.4/2.4 MB 8.6 MB/s eta 0:00:00
Installing collected packages: pyreadstat
Successfully installed pyreadstat-1.2.8



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\paral\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd
import pyreadstat
import requests

## Data Processing

### Fetching the data

We are using data from NHANES (National Health and Nutrition Examination Survey). We are going to gather data that enacapsulates a wider range than traditional models, specifically data that isn't necessarily medical related.

links can be found at [https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?Cycle=2021-2023](https://)

In [3]:
from typing import ByteString
from io import BytesIO

def load_xpt(url):
  response = requests.get(url)
  if response.status_code != 200:
    raise Exception(f'failed to download {url}')
  return pd.read_sas(BytesIO(response.content), format='xport')

#base_url = 'https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/'
base_url = 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/'

urls = {
    'Demographics': base_url + 'DEMO_L.XPT',
    'Medical Conditions': base_url + 'MCQ_L.XPT',
    'Diabetes': base_url + 'DIQ_L.XPT',
    'Hypertension': base_url + 'BPQ_L.XPT',
    'Prescription Meds': base_url + 'RXQ_RX_L.XPT',
    'Plasma glucose': base_url + 'GLU_L.XPT',
    'CBC': base_url + 'CBC_L.XPT',
    'Body Measures': base_url + 'BMX_L.XPT',
    'Immunization': base_url + 'IMQ_L.XPT',
    'Physical Activity': base_url + 'PAQ_L.XPT',
    'Alchohol Use': base_url + 'ALQ_L.XPT',
    'Smoking': base_url + 'SMQ_L.XPT',
    'Diet': base_url + 'DBQ_L.XPT',
    'Health Insurance': base_url + 'HIQ_L.XPT',
    'Housing': base_url + 'HOQ_L.XPT',
    'Income and Food Security': base_url + 'INQ_L.XPT',
    'Pesticide Use': base_url + 'PUQMEC_L.XPT',
    'Occupational': base_url + 'OCQ_L.XPT',
    'Glycohemoglobin': base_url + 'GHB_L.XPT',
    'Oscillometric BP': base_url + 'BPXO_L.XPT'
}

data = {}
for name, url in urls.items():
  try:
    data[name] = load_xpt(url)
    print(f'loaded {name}')
  except Exception as e:
    print(f'Failed to load {name} : {e}')

df = data['Demographics'] #starting with demographics
for key in data:
  if key != 'Demographics':
    df = df.merge(data[key], on='SEQN', how='left')



loaded Demographics
loaded Medical Conditions
loaded Diabetes
loaded Hypertension
loaded Prescription Meds
loaded Plasma glucose
loaded CBC
loaded Body Measures
loaded Immunization
loaded Physical Activity
loaded Alchohol Use
loaded Smoking
loaded Diet
loaded Health Insurance
loaded Housing
loaded Income and Food Security
loaded Pesticide Use
loaded Occupational
loaded Glycohemoglobin
loaded Oscillometric BP


### Explore the Data

In [5]:
#saving the original data to a file
#from google.colab import files
#df.to_csv('df.csv', index=False)
# files.download('df.csv')

In [6]:
print(len(set(df.columns)))
print(set(df.columns))
print(f"Final merged shape: {df.shape}")
df.head()


211
{'BMXARMC', 'LBXPLTSI', 'BMXLEG', 'DID040', 'LBXNRBC', 'HIQ210', 'DBQ400', 'LBXMCVSI', 'MCQ010', 'ALQ130', 'RIDSTATR', 'MCQ510B', 'ALQ121', 'DMDHRAGZ', 'DBQ073U', 'DMDMARTZ', 'MCQ160M', 'BMIWT', 'BPAOCSZ', 'SMD650', 'PAD790Q', 'BMDBMIC', 'IMQ100', 'IND310', 'BPXOPLS1', 'LBXMCHSI', 'OCQ383', 'MCQ170M', 'ALQ142', 'MCQ510D', 'MCQ160P', 'DBD061', 'RIDRETH3', 'MCQ560', 'BPAOARM', 'DID060', 'DIQ050', 'DBD411', 'OSQ230', 'DBD381', 'MCQ149', 'INDFMPIR', 'SEQN', 'DBQ073A', 'MCQ500', 'PUQ100', 'BPXOPLS3', 'PAD680', 'BPQ030', 'LBXHCT', 'DBD030', 'DBQ935', 'LBXMPSI', 'OCQ210', 'SMQ020', 'WTMEC2YR', 'SMQ621', 'OCQ180', 'BPXODI1', 'LBXBAPCT', 'DMDEDUC2', 'DBQ930', 'BMIARML', 'DIQ070', 'MCQ160D', 'DMQMILIZ', 'LBDGLUSI', 'LBXMOPCT', 'LBXHGB', 'BPQ101D', 'DBQ360', 'AGQ030', 'HOD051', 'PAD790U', 'LBDLYMNO', 'WTINT2YR', 'PAD820', 'BMIARMC', 'DBD050', 'MCQ550', 'MCQ510E', 'DBQ390', 'HIQ032A', 'BPQ020', 'RXQ033', 'BMIHT', 'MCQ035', 'MCQ170L', 'MCQ160F', 'OCD150', 'BPXOSY1', 'PAD810Q', 'IMQ070', 'BPQ080

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,...,BPAOCSZ,BPXOSY1,BPXODI1,BPXOSY2,BPXODI2,BPXOSY3,BPXODI3,BPXOPLS1,BPXOPLS2,BPXOPLS3
0,130378.0,12.0,2.0,1.0,43.0,,5.0,6.0,2.0,,...,4.0,135.0,98.0,131.0,96.0,132.0,94.0,82.0,79.0,82.0
1,130379.0,12.0,2.0,1.0,66.0,,3.0,3.0,2.0,,...,4.0,121.0,84.0,117.0,76.0,113.0,76.0,72.0,71.0,73.0
2,130380.0,12.0,2.0,2.0,44.0,,2.0,2.0,1.0,,...,4.0,111.0,79.0,112.0,80.0,104.0,76.0,84.0,83.0,77.0
3,130381.0,12.0,2.0,2.0,5.0,,5.0,7.0,1.0,71.0,...,,,,,,,,,,
4,130382.0,12.0,2.0,1.0,2.0,,3.0,3.0,2.0,34.0,...,,,,,,,,,,


### Clean the data

clean out columns with too many null values

In [7]:
# threshold = more than 50% are NaNs
threshold = 0.5

missing_ratio = df.isnull().mean() # ratios of missing val

columns_to_keep = missing_ratio[missing_ratio <= threshold].index

df_cleaned = df[columns_to_keep]

print(f"Remaining columns: {len(df_cleaned.columns)}")

Remaining columns: 107


drop col with non numeric values

In [8]:
df_cleaned = df_cleaned.loc[:, df_cleaned.apply(pd.api.types.is_numeric_dtype)]
print(df_cleaned.dtypes)
print(f"Remaining numeric columns: {df_cleaned.shape[1]}")

SEQN        float64
SDDSRVYR    float64
RIDSTATR    float64
RIAGENDR    float64
RIDAGEYR    float64
             ...   
BPXOSY3     float64
BPXODI3     float64
BPXOPLS1    float64
BPXOPLS2    float64
BPXOPLS3    float64
Length: 104, dtype: object
Remaining numeric columns: 104


drop duplicate and highly correlated col

In [9]:
# Convert all numeric columns to float for uniformity
df_temp = df_cleaned.astype(float)

# Drop columns with duplicate values (there's none here but just in case.)
df_cleaned = df_cleaned.loc[:, ~df_temp.T.duplicated()]

print(f"Columns after dropping duplicates: {df_cleaned.shape[1]}")

corr_matrix = df_cleaned.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)] # > 90% similar

df_cleaned.drop(columns=to_drop, inplace=True)

print(f"Columns after dropping high corr col: {df_cleaned.shape[1]}")


Columns after dropping duplicates: 104
Columns after dropping high corr col: 88


In [10]:
# Your column list
columns = df_cleaned.columns.tolist()

# Create a DataFrame with blank descriptions
df_dict = pd.DataFrame({'Variable': columns, 'Description': [''] * len(columns)})

# Save as CSV
df_dict.to_csv("nhanes_variable_descriptions.csv", index=False)


Make dictionary of col code names to actual names

### Separate the data

We want to split the data into x and y. X will be what we use to make a prediction on the variable y

### Split the data

## Building the Model

## Data Visualization and Evaluation