In [5]:
"""
# Glucose Prediction Model - NHANES Dataset

## Introduction
This notebook presents a machine learning model for predicting fasting glucose and glycohemoglobin levels using routinely collected health data from the NHANES dataset. The goal is to refine the model by removing non-routinely collected clinical assessments and focusing on commonly available health metrics.
"""

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

# Define file paths
data_dir = "/Users/aakashsuresh/fairness/processed_data_nhanes_lab/"
files_to_load = [
    "fasting_questionnaire_processed.csv",
    "fasting_glucose_processed.csv",
    "glycohemoglobin_processed.csv",
    "biochemistry_profile_processed.csv",
    "iron_status_processed.csv",
    "c_reactive_protein_processed.csv",
    "cotinine_processed.csv",
]


# Load dataframes into a dictionary
dataframes = {}
for file in files_to_load:
    file_path = os.path.join(data_dir, file)
    df_name = file.replace("_processed.csv", "")
    dataframes[df_name] = pd.read_csv(file_path)

# Remove fasting_questionnaire temporarily because it has no unique counts 
del dataframes["fasting_questionnaire"]

# Iterate and merge all datasets
for name, df in dataframes.items():
    if merged_df is None:
        merged_df = df  # Initialize with the first dataset
    else:
        merged_df = pd.merge(merged_df, df, on="seqn", how="inner")

print(f"Data merged successfully. Shape: {merged_df.shape}")

# Drop rows with missing values after merge
merged_df = merged_df.dropna()

# Standardize numeric columns
scaler = StandardScaler()
numeric_cols = merged_df.select_dtypes(include=["float64", "int64"]).columns
merged_df[numeric_cols] = scaler.fit_transform(merged_df[numeric_cols])

print("Data cleaned and scaled.")

# Drop unnecessary features
drop_features = ["cotinine", "c_reactive_protein", "iron", "ferritin", "fasting_insulin"]
merged_df = merged_df.drop(columns=[col for col in drop_features if col in merged_df.columns], errors='ignore')

# Drop rows with missing values
data_cleaned = merged_df.dropna()

# Standardize numeric columns
scaler = StandardScaler()
numeric_cols = data_cleaned.select_dtypes(include=["float64", "int64"]).columns
data_cleaned[numeric_cols] = scaler.fit_transform(data_cleaned[numeric_cols])

print("Data cleaned and scaled. Shape:", data_cleaned.shape)

# Define features and target variables
X = data_cleaned.drop(columns=["lbxglu", "lbxgh"])  # Drop target columns from features
y = data_cleaned[["lbxglu", "lbxgh"]]  # Multi-output target

# Drop rows with missing target values
X = X.dropna()
y = y.loc[X.index]  # Align target with feature rows

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a multi-output regression model
base_model = RandomForestRegressor(n_estimators=100, random_state=42)
model = MultiOutputRegressor(base_model)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Calculate mean squared error for both targets
glu_mse = mean_squared_error(y_test["lbxglu"], y_pred[:, 0])
gh_mse = mean_squared_error(y_test["lbxgh"], y_pred[:, 1])

print(f"Fasting Glucose (lbxglu) MSE: {glu_mse:.4f}")
print(f"Glycohemoglobin (lbxgh) MSE: {gh_mse:.4f}")

"""
## Results and Discussion
- The model has been fine-tuned by removing non-routinely collected clinical assessments.
- The remaining features focus on routinely collected health data such as age, gender, BMI, waist circumference, blood pressure, cholesterol levels, triglycerides, and dietary intake data.
- The updated mean squared error values for fasting glucose and glycohemoglobin are presented above.
- Further improvements may involve optimizing hyperparameters and exploring additional routinely available health indicators.
"""

Data merged successfully. Shape: (0, 135)


ValueError: Found array with 0 sample(s) (shape=(0, 116)) while a minimum of 1 is required by StandardScaler.