In [4]:
import os
import pandas as pd

# Define file paths
data_dir = "/Users/aakashsuresh/fairness/processed_data_nhanes_lab/"
files_to_load = [
    "fasting_questionnaire_processed.csv",
    "fasting_glucose_processed.csv",
    "glycohemoglobin_processed.csv",
    "biochemistry_profile_processed.csv",
    "iron_status_processed.csv",
    "c_reactive_protein_processed.csv",
    "cotinine_processed.csv",
]

# Load dataframes into a dictionary
dataframes = {}
for file in files_to_load:
    file_path = os.path.join(data_dir, file)
    df_name = file.replace("_processed.csv", "")
    dataframes[df_name] = pd.read_csv(file_path)

# Merge datasets on SEQN (NHANES unique identifier)
merged_df = dataframes["fasting_questionnaire"]
for name, df in dataframes.items():
    if name != "fasting_questionnaire":
        merged_df = pd.merge(merged_df, df, on="seqn", how="inner")

# Standardize column names to lowercase
df.columns = df.columns.str.strip().str.lower()

print("Data merged successfully. Shape:", merged_df.shape)


Data merged successfully. Shape: (0, 77)


In [6]:
for name, df in dataframes.items():
    print(f"{name} - SEQN dtype: {df['seqn'].dtype}, Unique count: {df['seqn'].nunique()}, Total rows: {len(df)}")

fasting_questionnaire - SEQN dtype: object, Unique count: 0, Total rows: 0
fasting_glucose - SEQN dtype: float64, Unique count: 4744, Total rows: 4744
glycohemoglobin - SEQN dtype: float64, Unique count: 9737, Total rows: 9737
biochemistry_profile - SEQN dtype: float64, Unique count: 9258, Total rows: 9258
iron_status - SEQN dtype: float64, Unique count: 9453, Total rows: 9453
c_reactive_protein - SEQN dtype: float64, Unique count: 11614, Total rows: 11614
cotinine - SEQN dtype: float64, Unique count: 11395, Total rows: 11395


In [7]:
# Remove fasting_questionnaire temporarily because it has no unique counts 
del dataframes["fasting_questionnaire"]


### Re-Merge because of problems before, handle missing data and scale it once handled 

In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Merge remaining datasets on 'seqn' (ignoring fasting_questionnaire)
merged_df = None

# Iterate and merge all datasets
for name, df in dataframes.items():
    if merged_df is None:
        merged_df = df  # Initialize with the first dataset
    else:
        merged_df = pd.merge(merged_df, df, on="seqn", how="inner")

print(f"Data merged successfully. Shape: {merged_df.shape}")

# Drop rows with missing values after merge
merged_df = merged_df.dropna()

# Standardize numeric columns
scaler = StandardScaler()
numeric_cols = merged_df.select_dtypes(include=["float64", "int64"]).columns
merged_df[numeric_cols] = scaler.fit_transform(merged_df[numeric_cols])

print("Data cleaned and scaled.")

Data merged successfully. Shape: (4526, 59)
Data cleaned and scaled.


### Check to see if we are ready for modeling 

In [10]:
print(merged_df.head())
print(f"Final shape: {merged_df.shape}")
print(f"Columns: {merged_df.columns}")

       seqn  wtsafprp    lbxglu  lbdglusi     lbxgh  lbxsatsi  lbdsatlc  \
0 -1.766579 -0.442652 -0.395739 -0.397905 -0.445597 -0.679824       0.0   
1 -1.765016 -0.578612 -0.229418 -0.228153 -0.173587 -0.679824       0.0   
2 -1.764346 -0.604835  1.184315  1.184782 -0.082917 -0.121378       0.0   
3 -1.763676 -0.374949 -0.534341 -0.532708 -0.445597 -0.527521       0.0   
4 -1.762559  0.299318 -0.451180 -0.452825 -0.264257 -0.222914       0.0   

     lbxsal  lbdsalsi  lbxsapsi  ...  lbduibsi    lbdtib  lbdtibsi    lbdpct  \
0  0.754519  0.754519  1.679067  ...  1.352088  1.550569  1.550056 -0.638062   
1 -0.685539 -0.685539 -0.004073  ... -0.281707 -0.824683 -0.825171 -0.389914   
2 -0.109516 -0.109516 -0.785531  ... -0.913200 -1.369823 -1.369788  0.189099   
3  0.754519  0.754519  1.298357  ...  1.336602  1.180653  1.180455 -0.968926   
4  0.754519  0.754519 -0.284596  ...  0.319674 -0.123789 -0.124017 -0.720778   

   lbxhscrp  lbdhrplc    lbxcot  lbdcotlc   lbxhcot  lbdhcolc  
0 -0

### Preliminary Modeling for Diabetes 

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

# Define features and target variables
X = merged_df.drop(columns=["lbxglu", "lbxgh"])  # Drop target columns from features
y = merged_df[["lbxglu", "lbxgh"]]  # Multi-output target

# Drop rows with missing target values
X = X.dropna()
y = y.loc[X.index]  # Align target with feature rows

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a multi-output regression model
base_model = RandomForestRegressor(n_estimators=100, random_state=42)
model = MultiOutputRegressor(base_model)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Calculate mean squared error for both targets
glu_mse = mean_squared_error(y_test["lbxglu"], y_pred[:, 0])
gh_mse = mean_squared_error(y_test["lbxgh"], y_pred[:, 1])

print(f"Fasting Glucose (lbxglu) MSE: {glu_mse:.4f}")
print(f"Glycohemoglobin (lbxgh) MSE: {gh_mse:.4f}")

Fasting Glucose (lbxglu) MSE: 0.0125
Glycohemoglobin (lbxgh) MSE: 0.2455
