In [1]:
# Import needed libraries 
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

## NaN Analysis

#### We want to analysis NaN's specifically in the dietary dataset so we can understand what is missing in our modeling and what exactly to focus on for features as well as remove/input NaN's to increase accuracy in our model. 

In [2]:
# Load the dataset 
file_path = '/Users/aakashsuresh/fairness/processed_data_new/nhanes_combined_diet.csv'
df = pd.read_csv(file_path)

In [4]:
# Display information about the dataset 
print("Initial dataset shape:", df.shape)
print("Initial missing values:\n", df.isnull().sum())

Initial dataset shape: (19931, 39)
Initial missing values:
 SEQN            0
DSDCOUNT        0
DSDANCNT        0
DSD010          0
DSD010AN        0
DSQTKCAL    15892
DSQTPROT    19546
DSQTCARB    17054
DSQTSUGR    17241
DSQTFIBE    19679
DSQTTFAT    18513
DSQTSFAT    19539
DSQTMFAT    19770
DSQTPFAT    19265
DSQTCHOL    19036
DSQTLYCO    18638
DSQTLZ      18639
DSQTVB1     16044
DSQTVB2     16032
DSQTNIAC    15861
DSQTVB6     14769
DSQTFA      14892
DSQTFDFE    14892
DSQTCHL     18340
DSQTVB12    14552
DSQTVC      14165
DSQTVK      17342
DSQTVD      13807
DSQTCALC    14860
DSQTPHOS    17980
DSQTMAGN    16155
DSQTIRON    17473
DSQTZINC    15187
DSQTCOPP    16603
DSQTSODI    18533
DSQTPOTA    17987
DSQTSELE    17117
DSQTCAFF    19894
DSQTIODI    16323
dtype: int64


#### Here, we can see that there are many missing values in this dataset, taken from some research through the CDC: As a general guideline, if 10% or less of the main outcome variable's data are missing, the dataset is typically considered acceptable for analysis without further adjustments. So we will filter our data if they are missing 10% of values or lower. We will also go through some general filtering processes, for example, if the columns are missing participant ID's or etc. we will also drop them. 

In [5]:
# 1. Remove columns with more than 10% missing values
threshold = 0.10
df = df.loc[:, df.isnull().mean() < threshold]
print("\nShape after removing columns with >10% missing values:", df.shape)


Shape after removing columns with >10% missing values: (19931, 5)


In [8]:
print(df.columns)

Index(['SEQN', 'DSDCOUNT', 'DSDANCNT', 'DSD010', 'DSD010AN'], dtype='object')


In [9]:
# 2. Drop rows with missing values in critical columns (e.g., 'participant_id')
critical_columns = ['SEQN', 'DSDCOUNT', 'DSDANCNT', 'DSD010', 'DSD010AN']  # Replace with actual critical columns
df = df.dropna(subset=critical_columns)
print("\nShape after dropping rows with missing critical values:", df.shape)


Shape after dropping rows with missing critical values: (19931, 5)


#### After dropping any missing columns, we want to impute these rows with numerical and categorical variables to allow for modeling to occur. 

In [10]:
# 3. Impute remaining missing values
# Separate numerical and categorical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

In [11]:
# Impute numerical columns with mean
num_imputer = SimpleImputer(strategy='mean')
df[numerical_cols] = num_imputer.fit_transform(df[numerical_cols])

#### After going through the dataset, I thought there was no categorical variables so just wanted to write the function below to analyze if there were any of these columns. 

In [16]:
# Identify categorical columns
categorical_cols = df.select_dtypes(include=["object", "category"]).columns

# Ensure categorical_cols is not empty
if len(categorical_cols) > 0:
    cat_imputer = SimpleImputer(strategy='most_frequent')

    # Ensure the DataFrame is not empty before applying transformation
    if not df[categorical_cols].empty:
        df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])
    else:
        print("No categorical data available for imputation.")
else:
    print("No categorical columns found in the dataset.")


No categorical columns found in the dataset.


#### Since there are no categorical features we want to normalize our numerical features so we can use it for our modeling. 

In [17]:
# 4. Normalize numerical features
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [18]:
# Display final information
print("\nFinal dataset shape:", df.shape)
print("Final missing values:\n", df.isnull().sum().sum())


Final dataset shape: (19931, 5)
Final missing values:
 0


In [20]:
# Save the cleaned dataset
output_path = '/Users/aakashsuresh/fairness/processed_data_new/cleaned_nhanes_combined_diet.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to {output_path}")


Cleaned dataset saved to /Users/aakashsuresh/fairness/processed_data_new/cleaned_nhanes_combined_diet.csv


## Label Analysis and Identification 

#### After looking at these NaN values, we want to analyze and see if there are any labels outside of glucose in diet to make our predictions. 

In [21]:
# Possible labels to consider
potential_labels = {
    "Diabetes": ["glucose", "HbA1c", "insulin_levels"],
    "Mental Health": ["mental_health", "stress_level", "sleep_hours"],
    "Lifestyle": ["exercise_frequency", "diet_quality", "smoking_status", "alcohol_intake"],
    "Vital Signs": ["heart_rate", "blood_pressure", "BMI", "cholesterol"]
}

In [22]:
# Identify which of these labels exist in the dataset
identified_labels = {category: [col for col in columns if col in df.columns] 
                     for category, columns in potential_labels.items()}

In [23]:
# Print identified labels
print("Identified relevant labels in the dataset:")
for category, labels in identified_labels.items():
    if labels:
        print(f"- {category}: {', '.join(labels)}")
    else:
        print(f"- {category}: No matching labels found")

Identified relevant labels in the dataset:
- Diabetes: No matching labels found
- Mental Health: No matching labels found
- Lifestyle: No matching labels found
- Vital Signs: No matching labels found


#### No matching labels found through this process so going to use fuzzy matching to identify labels. 

In [26]:
from fuzzywuzzy import process



In [44]:
# Define keywords for label identification
label_keywords = {
    "Diabetes": ["glucose", "sugar", "insulin", "hba1c", "diabetes"],
    "Mental Health": ["stress", "anxiety", "depression", "mental", "sleep"],
    "Lifestyle": ["exercise", "diet", "smoking", "alcohol", "activity"],
    "Vital Signs": ["heart", "blood_pressure", "bmi", "cholesterol", "pulse"]
}

In [45]:
# Fuzzy matching function
def find_matching_columns(df_columns, keywords):
    matched = []
    for keyword in keywords:
        # Extract best match and score
        match = process.extractOne(keyword, df_columns, score_cutoff=40) 
        if match:  # Ensure that a valid match is found
            matched.append(match[0])  # Append only the column name
    return list(set(matched))  # Remove duplicates

#### There are some matching labels that are present when doing fuzzy matching, but still the next step for the next week should most likely be feature engineering for new labels in modeling. 

In [46]:
# Identify relevant labels in dataset
identified_labels = {category: find_matching_columns(df.columns, keywords) 
                     for category, keywords in label_keywords.items()}

# Print identified labels
print("Identified relevant labels in the dataset:")
for category, labels in identified_labels.items():
    if labels:
        print(f"- {category}: {', '.join(labels)}")
    else:
        print(f"- {category}: No matching labels found")

Identified relevant labels in the dataset:
- Diabetes: SEQN
- Mental Health: SEQN, DSDANCNT
- Lifestyle: SEQN, DSDCOUNT
- Vital Signs: SEQN
