# Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.model_selection import train_test_split
import os

## Loading and Merging Data Across Survey Years
This section loads and merges data from multiple NHANES survey cycles (1999–2006) for lean fat mass, body measurements, and demographics into unified DataFrames for easier analysis.

In [2]:
survey_years = ["1999_2000", "2001_2002", "2003_2004", "2005_2006"]
data_types = ["lean_fat_data", "body_measures", "demographic_data"]

dataframes = {data_type: [] for data_type in data_types}

for year in survey_years:
    for data_type in data_types:
        file_path = f"csv/{year}/{data_type}.csv"
        try:
            df = pd.read_csv(file_path)
            dataframes[data_type].append(df)
        except FileNotFoundError:
            print(f"Warning: File not found {file_path}, skipping...")

lean_fat_data = pd.concat(dataframes["lean_fat_data"], ignore_index=True)
body_measurements = pd.concat(dataframes["body_measures"], ignore_index=True)
demographic_data = pd.concat(dataframes["demographic_data"], ignore_index=True)

print("Data successfully merged across survey years:")
print(f"- Lean Fat Data: {lean_fat_data.shape}")
print(f"- Body Measurements: {body_measurements.shape}")
print(f"- Demographic Data: {demographic_data.shape}")

Data successfully merged across survey years:
- Lean Fat Data: (145130, 4)
- Body Measurements: (39352, 11)
- Demographic Data: (41474, 5)


## Filtering Exam Records and Removing Missing Data
This step filters only valid exam records (exam_status == 1) from the lean fat mass dataset and removes missing values to ensure clean data for aggregation.

In [3]:
filtered_lean_fat_data = lean_fat_data[lean_fat_data['exam_status'] == 1].dropna()
print(f"Filtered Lean Fat Data: {filtered_lean_fat_data.shape}")

Filtered Lean Fat Data: (123555, 4)


## Aggregating Total Fat Mass (TFM) and Total Lean Mass (TLM) per Individual
This step groups data by individual ID (id) and calculates the most frequent (mode) or mean values for total fat mass (TFM) and total lean mass (TLM) to handle multiple exam entries per individual.

In [4]:
grouped_lean_fat_data = filtered_lean_fat_data.groupby('id')
aggregated_lean_fat_data = []

for individual_id, group in grouped_lean_fat_data:
    tfm_mode = mode(group['total_fat_mass'], keepdims=True)
    tfm_value = tfm_mode.mode[0] if tfm_mode.count[0] > 1 else group['total_fat_mass'].mean()

    tlm_mode = mode(group['total_lean_mass'], keepdims=True)
    tlm_value = tlm_mode.mode[0] if tlm_mode.count[0] > 1 else group['total_lean_mass'].mean()

    aggregated_lean_fat_data.append({'id': individual_id, 'total_fat_mass': tfm_value, 'total_lean_mass': tlm_value})

aggregated_lean_fat_df = pd.DataFrame(aggregated_lean_fat_data)
print(f"Aggregated Lean Fat Data: {aggregated_lean_fat_df.shape}")

Aggregated Lean Fat Data: (24711, 3)


## Merging Body Measurements, Demographics, and Aggregated Lean Fat Data
This step merges the body measurements, demographic data, and aggregated lean fat mass data using the common id column, ensuring a complete dataset.

In [5]:
merged_health_data = (
    body_measurements
    .merge(demographic_data, on='id', how='inner')
    .merge(aggregated_lean_fat_df, on='id', how='inner')
)

print(f"Merged Dataset Shape (before filtering): {merged_health_data.shape}")

Merged Dataset Shape (before filtering): (24711, 17)


## Filtering for Valid Age Range and Non-Amputated Individuals
This step removes individuals with amputations (amputation == 1) and keeps only those aged between 15 and 64 years.

In [6]:
filtered_health_data = merged_health_data[merged_health_data['amputation'] != 1]

filtered_health_data = filtered_health_data[
    (filtered_health_data['age_in_months'] / 12 >= 15) &
    (filtered_health_data['age_in_months'] / 12 < 64)
]

print(f"Dataset Shape after Filtering Age & Amputation: {filtered_health_data.shape}")

Dataset Shape after Filtering Age & Amputation: (15621, 17)


## Dropping Unnecessary Columns and some Cleanup
This step removes unnecessary columns (pregnancy_status and amputation) and drops any remaining missing values.

In [7]:
columns_to_remove = ['pregnancy_status', 'amputation']

cleaned_health_data = filtered_health_data.drop(columns=columns_to_remove, errors='ignore')

cleaned_health_data = cleaned_health_data.dropna()

print(f"Final Dataset Shape (after dropping NaNs): {cleaned_health_data.shape}")
cleaned_health_data.head()

Final Dataset Shape (after dropping NaNs): (15359, 15)


Unnamed: 0,id,weight,height,bmi,upper_leg_length,maximal_calf_circumference,upper_arm_length,arm_circumference,waist_circumference,thigh_circumference,gender,age_in_months,ethnicity,total_fat_mass,total_lean_mass
1,5.0,92.5,178.3,29.1,45.2,42.6,39.7,35.8,99.9,56.2,1.0,597.0,3.0,29215.5,64392.2
2,6.0,59.2,162.0,22.56,39.7,34.0,34.5,26.0,81.6,47.0,2.0,230.0,5.0,25022.8,35279.5
3,7.0,78.0,162.9,29.39,43.0,37.2,38.1,31.7,90.7,55.7,2.0,712.0,4.0,31481.0,47511.3
5,10.0,111.8,190.1,30.94,46.6,43.7,43.0,37.6,108.0,64.0,1.0,518.0,4.0,31265.3,81419.0
6,11.0,65.0,171.9,22.0,40.5,37.8,37.2,29.2,76.5,39.9,1.0,184.0,3.0,11616.4,54140.2


## Data Validation and Cleaning
This section performs data validation by checking for unrealistic values in key columns, such as negative or biologically implausible measurements.

In [8]:
valid_ranges = {
    "weight": (30, 300),  # in kg (realistic for adults)
    "height": (100, 250),  # in cm
    "bmi": (10, 60),  # valid BMI range
    "upper_leg_length": (30, 100),  # in cm
    "maximal_calf_circumference": (20, 60),  # in cm
    "upper_arm_length": (20, 50),  # in cm
    "arm_circumference": (15, 60),  # in cm
    "waist_circumference": (50, 200),  # in cm
    "thigh_circumference": (30, 90),  # in cm
}

# Function to filter out unreasonable values
def filter_unrealistic_values(df, valid_ranges):
    for column, (min_val, max_val) in valid_ranges.items():
        if column in df.columns:
            invalid_count = df[(df[column] < min_val) | (df[column] > max_val)].shape[0]
            if invalid_count > 0:
                print(f"Warning: {invalid_count} records removed due to unrealistic {column} values")
            df = df[(df[column] >= min_val) & (df[column] <= max_val)]
    return df

In [9]:
validated_health_data = filter_unrealistic_values(cleaned_health_data, valid_ranges)

print(f"Final dataset after validation: {validated_health_data.shape}")

Final dataset after validation: (15318, 15)


## Define BMI Groups & Stratification
This section categorizes BMI into clinical groups and creates a stratification column for better training/testing balance.

In [10]:
def classify_bmi(bmi_value):
    if bmi_value < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi_value < 25:
        return 'Healthy Weight'
    elif 25 <= bmi_value < 30:
        return 'Overweight'
    elif 30 <= bmi_value < 35:
        return 'Obese'
    else:
        return 'Extremely Obese'

validated_health_data['age_group'] = pd.cut(
    validated_health_data['age_in_months'] / 12,
    bins=[16, 26, 36, 46, 56, 65], 
    labels=['16-26', '26-36', '36-46', '46-56', '56-65'], 
    right=False
)

validated_health_data['bmi_group'] = validated_health_data['bmi'].apply(classify_bmi)

validated_health_data['stratify_group'] = (
    validated_health_data['age_group'].astype(str) + "_" +
    validated_health_data['gender'].astype(str) + "_" +
    validated_health_data['ethnicity'].astype(str) + "_" +
    validated_health_data['bmi_group'].astype(str)
)

print(f"Stratification groups created. Unique groups: {validated_health_data['stratify_group'].nunique()}")

Stratification groups created. Unique groups: 285


## Deviding into Train and Test Sets
This section devides dataset into train and test sets based on stratification column.

In [11]:
group_counts = validated_health_data['stratify_group'].value_counts()
validated_health_data['stratify_group'] = validated_health_data['stratify_group'].apply(lambda x: x if group_counts[x] >= 2 else 'Other')

train_set, test_set = train_test_split(validated_health_data, test_size=0.30, random_state=42, stratify=validated_health_data['stratify_group'])

## Saving the Data

In [12]:
train_path = "data/train.csv"
test_path = "data/test.csv"

os.makedirs(os.path.dirname(train_path), exist_ok=True)
train_set.to_csv(train_path, index=False)
test_set.to_csv(test_path, index=False)

print(f"Cleaned and validated train data saved successfully: {train_path}")
print(f"Cleaned and validated test data saved successfully: {test_path}")

Cleaned and validated train data saved successfully: data/train.csv
Cleaned and validated test data saved successfully: data/test.csv
