In [1]:
import os
import pandas as pd

# Define file paths
data_dir = "/Users/aakashsuresh/fairness/processed_data_nhanes_lab/"
files_to_load = [
    "fasting_questionnaire_processed.csv",
    "fasting_glucose_processed.csv",
    "glycohemoglobin_processed.csv",
    "biochemistry_profile_processed.csv",
    "iron_status_processed.csv",
    "c_reactive_protein_processed.csv",
    "cotinine_processed.csv",
]

# Load dataframes into a dictionary
dataframes = {}
for file in files_to_load:
    file_path = os.path.join(data_dir, file)
    df_name = file.replace("_processed.csv", "")
    dataframes[df_name] = pd.read_csv(file_path)

# Merge datasets on SEQN (NHANES unique identifier)
merged_df = dataframes["fasting_questionnaire"]
for name, df in dataframes.items():
    if name != "fasting_questionnaire":
        merged_df = pd.merge(merged_df, df, on="seqn", how="inner")

# Standardize column names to lowercase
df.columns = df.columns.str.strip().str.lower()

print("Data merged successfully. Shape:", merged_df.shape)


Data merged successfully. Shape: (0, 77)


In [2]:
# Remove fasting_questionnaire temporarily because it has no unique counts 
del dataframes["fasting_questionnaire"]

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Merge remaining datasets on 'seqn' (ignoring fasting_questionnaire)
merged_df = None

# Iterate and merge all datasets
for name, df in dataframes.items():
    if merged_df is None:
        merged_df = df  # Initialize with the first dataset
    else:
        merged_df = pd.merge(merged_df, df, on="seqn", how="inner")

print(f"Data merged successfully. Shape: {merged_df.shape}")

# Drop rows with missing values after merge
merged_df = merged_df.dropna()

# Standardize numeric columns
scaler = StandardScaler()
numeric_cols = merged_df.select_dtypes(include=["float64", "int64"]).columns
merged_df[numeric_cols] = scaler.fit_transform(merged_df[numeric_cols])

print("Data cleaned and scaled.")

Data merged successfully. Shape: (4526, 59)
Data cleaned and scaled.


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

# Drop non-routinely collected clinical assessments
columns_to_remove = ["cotinine", "hs-crp", "iron", "ferritin", "fasting_insulin"]
filtered_columns = [col for col in merged_df.columns if not any(exclude in col.lower() for exclude in columns_to_remove)]

# Define the selected feature set
selected_features = [
    "age", "gender", "bmi", "waist_circumference", 
    "blood_pressure", "cholesterol", "triglycerides"
]

# Include dietary intake data if available
dietary_features = [col for col in merged_df.columns if "dietary" in col.lower()]
selected_features.extend(dietary_features)

# Ensure only selected features are used
X = merged_df[selected_features]
y = merged_df[["lbxglu", "lbxgh"]]  # Multi-output target

# Drop rows with missing values
X = X.dropna()
y = y.loc[X.index]  # Align target with feature rows

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a multi-output regression model
base_model = RandomForestRegressor(n_estimators=100, random_state=42)
model = MultiOutputRegressor(base_model)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Calculate mean squared error for both targets
glu_mse = mean_squared_error(y_test["lbxglu"], y_pred[:, 0])
gh_mse = mean_squared_error(y_test["lbxgh"], y_pred[:, 1])

print(f"Fasting Glucose (lbxglu) MSE: {glu_mse:.4f}")
print(f"Glycohemoglobin (lbxgh) MSE: {gh_mse:.4f}")

KeyError: "None of [Index(['age', 'gender', 'bmi', 'waist_circumference', 'blood_pressure',\n       'cholesterol', 'triglycerides'],\n      dtype='object')] are in the [columns]"

In [5]:
# Display all column names in merged_df
print("Columns in merged_df:")
print(merged_df.columns.tolist())

Columns in merged_df:
['seqn', 'wtsafprp', 'lbxglu', 'lbdglusi', 'lbxgh', 'lbxsatsi', 'lbdsatlc', 'lbxsal', 'lbdsalsi', 'lbxsapsi', 'lbxsassi', 'lbxsc3si', 'lbxsbu', 'lbdsbusi', 'lbxsclsi', 'lbxsck', 'lbxscr', 'lbdscrsi', 'lbxsgb', 'lbdsgbsi', 'lbxsgl', 'lbdsglsi', 'lbxsgtsi', 'lbdsgtlc', 'lbxsir', 'lbdsirsi', 'lbxsldsi', 'lbxsossi', 'lbxsph', 'lbdsphsi', 'lbxsksi', 'lbxsnasi', 'lbxstb', 'lbdstbsi', 'lbdstblc', 'lbxsca', 'lbdscasi', 'lbxsch', 'lbdschsi', 'lbxstp', 'lbdstpsi', 'lbxstr', 'lbdstrsi', 'lbxsua', 'lbdsuasi', 'lbxirn', 'lbdirnsi', 'lbxuib', 'lbduiblc', 'lbduibsi', 'lbdtib', 'lbdtibsi', 'lbdpct', 'lbxhscrp', 'lbdhrplc', 'lbxcot', 'lbdcotlc', 'lbxhcot', 'lbdhcolc']
