# Training and Testing the Models for TFM and TLM Prediction

In [17]:
import pandas as pd
import numpy as np
import os
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pyrenn as pr
import pickle

## Load Cleaned Data
This section retrieves the validated NHANES dataset from the processed CSV file and prepares it for feature selection and modeling.

In [3]:
data_path = "data/nhanes_cleaned_validated_data.csv"
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Dataset not found at {data_path}. Ensure the preprocessing step was completed.")

health_data = pd.read_csv(data_path)

print(f"Dataset loaded successfully. Shape: {health_data.shape}")

Dataset loaded successfully. Shape: (14740, 15)


## Define BMI Groups & Stratification
This section categorizes BMI into clinical groups and creates a stratification column for better training/testing balance.

In [4]:
def classify_bmi(bmi_value):
    if bmi_value < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi_value < 25:
        return 'Healthy Weight'
    elif 25 <= bmi_value < 30:
        return 'Overweight'
    elif 30 <= bmi_value < 35:
        return 'Obese'
    else:
        return 'Extremely Obese'

health_data['age_group'] = pd.cut(
    health_data['age_in_months'] / 12,
    bins=[16, 26, 36, 46, 56, 65], 
    labels=['16-26', '26-36', '36-46', '46-56', '56-65'], 
    right=False
)

health_data['bmi_group'] = health_data['bmi'].apply(classify_bmi)

health_data['stratify_group'] = (
    health_data['age_group'].astype(str) + "_" +
    health_data['gender'].astype(str) + "_" +
    health_data['ethnicity'].astype(str) + "_" +
    health_data['bmi_group'].astype(str)
)

print(f"Stratification groups created. Unique groups: {health_data['stratify_group'].nunique()}")

Stratification groups created. Unique groups: 252


## Define Feature Sets & Target
This section tests different feature sets to determine the best performing model.

In [13]:
feature_sets = {
    "set_1": ['weight', 'height', 'bmi', 'maximal_calf_circumference', 'arm_circumference', 'waist_circumference', 'thigh_circumference', 'gender', 'age_in_months', 'ethnicity'],
    "set_2": ['weight', 'height', 'bmi', 'arm_circumference', 'waist_circumference', 'thigh_circumference', 'gender', 'age_in_months', 'ethnicity'],
    "set_3": ['weight', 'height', 'bmi', 'arm_circumference', 'waist_circumference', 'gender', 'age_in_months', 'ethnicity'],
    "set_4": ['weight', 'height', 'bmi', 'arm_circumference', 'thigh_circumference', 'gender', 'age_in_months', 'ethnicity'],
    "set_5": ['weight', 'height', 'bmi', 'waist_circumference', 'thigh_circumference', 'gender', 'age_in_months', 'ethnicity'],
    "set_6": ['weight', 'height', 'bmi', 'arm_circumference', 'waist_circumference', 'thigh_circumference', 'gender', 'age_in_months'],
}

tfm_target = 'total_fat_mass'
tlm_target = 'total_lean_mass'

## Model Training & Evaluation
This section trains and evaluates multiple models with different feature sets and selects the best one.

In [12]:
def train_and_evaluate(features, dataset, target):
    X = dataset[features]
    y = dataset[target]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    group_counts = dataset['stratify_group'].value_counts()
    dataset['stratify_group'] = dataset['stratify_group'].apply(lambda x: x if group_counts[x] >= 2 else 'Other')

    train_set, test_set = train_test_split(dataset, test_size=0.30, random_state=42, stratify=dataset['stratify_group'])

    X_train = scaler.transform(train_set[features])
    X_test = scaler.transform(test_set[features])
    y_train = train_set[target].values.reshape(-1, 1)
    y_test = test_set[target].values.reshape(-1, 1)

    n_inputs = X_train.shape[1]
    n_outputs = 1
    hidden_neurons = 10
    nn = pr.CreateNN([n_inputs, hidden_neurons, n_outputs])

    nn = pr.train_LM(X_train.T, y_train.T, nn, k_max=300, E_stop=1e-10, dampfac=3.0, dampconst=10.0, verbose=True)

    y_pred_test = pr.NNOut(X_test.T, nn).reshape(-1, 1)

    pearson_corr, _ = pearsonr(y_test.flatten(), y_pred_test.flatten())
    errors = y_test - y_pred_test
    SEE = np.sqrt(np.sum(errors ** 2) / len(y_test))

    return {"model": nn, "scaler": scaler, "features": features, "R": pearson_corr, "SEE": SEE}

### Training, Evaluation and Saving of Model for Prediction of Total Fat Mass

In [14]:
results = []
for name, features in feature_sets.items():
    print(f"\nTraining model with feature set: {name}")
    result = train_and_evaluate(features, health_data, tfm_target)
    results.append(result)


Training model with feature set: set_1
Iteration:  0 		Error:  8845.74475941008 	scale factor:  3.0
Iteration:  1 		Error:  100.42119404238358 	scale factor:  0.3
Iteration:  2 		Error:  62.32431941933748 	scale factor:  0.03
Iteration:  3 		Error:  39.335358622146444 	scale factor:  0.03
Iteration:  4 		Error:  13.371090665799677 	scale factor:  0.03
Iteration:  5 		Error:  11.649800623595599 	scale factor:  0.03
Iteration:  6 		Error:  10.154866714249756 	scale factor:  0.03
Iteration:  7 		Error:  10.057725812079546 	scale factor:  0.03
Iteration:  8 		Error:  10.020474220350284 	scale factor:  0.03
Iteration:  9 		Error:  9.989741083236659 	scale factor:  0.03
Iteration:  10 		Error:  9.96171219274544 	scale factor:  0.03
Iteration:  11 		Error:  9.938746064048722 	scale factor:  0.03
Iteration:  12 		Error:  9.921843557083156 	scale factor:  0.03
Iteration:  13 		Error:  9.90931222688753 	scale factor:  0.03
Iteration:  14 		Error:  9.899469276742098 	scale factor:  0.03
Iteratio

In [15]:
best_model = max(results, key=lambda x: x['R'])
print(f"\nBest Model: {best_model['features']}")
print(f"Pearson's Correlation Coefficient (R): {best_model['R']:.4f}")
print(f"Standard Estimation Error (SEE): {best_model['SEE']:.4f}")


Best Model: ['weight', 'height', 'bmi', 'maximal_calf_circumference', 'arm_circumference', 'waist_circumference', 'thigh_circumference', 'gender', 'age_in_months', 'ethnicity']
Pearson's Correlation Coefficient (R): 0.9771
Standard Estimation Error (SEE): 2408.6379


In [19]:
tfm_model_path = "models/tfm_tlm/tfm.csv"
tfm_scaler_path = "models/tfm_tlm/tfm_scaler.pkl"

os.makedirs(os.path.dirname(tfm_model_path), exist_ok=True)

pr.saveNN(best_model['model'], tfm_model_path)

with open(tfm_scaler_path, "wb") as scaler_file:
    pickle.dump(best_model['scaler'], scaler_file)

print(f"Best model saved successfully at: {tfm_model_path}")
print(f"Scaler saved successfully at: {tfm_scaler_path}")

Best model saved successfully at: models/tfm_tlm/tfm.csv
Scaler saved successfully at: models/tfm_tlm/tfm_scaler.pkl


### Training, Evaluation and Saving of Model for Prediction of Total Lean Mass

In [20]:
results = []
for name, features in feature_sets.items():
    print(f"\nTraining model with feature set: {name}")
    result = train_and_evaluate(features, health_data, tlm_target)
    results.append(result)


Training model with feature set: set_1
Iteration:  0 		Error:  6614.877416602948 	scale factor:  3.0
Iteration:  1 		Error:  143.45127214013786 	scale factor:  0.3
Iteration:  2 		Error:  88.66173925575558 	scale factor:  0.03
Iteration:  3 		Error:  25.66626481853731 	scale factor:  0.03
Iteration:  4 		Error:  24.642097454205928 	scale factor:  0.003
Iteration:  5 		Error:  18.8245274447771 	scale factor:  0.003
Iteration:  6 		Error:  9.431363040788776 	scale factor:  0.003
Iteration:  7 		Error:  8.550639411147829 	scale factor:  0.003
Iteration:  8 		Error:  7.112087347272694 	scale factor:  0.003
Iteration:  9 		Error:  7.107131543593439 	scale factor:  0.003
Iteration:  10 		Error:  7.029674407970734 	scale factor:  0.003
Iteration:  11 		Error:  6.937844303831679 	scale factor:  0.03
Iteration:  12 		Error:  6.934766796587898 	scale factor:  0.03
Iteration:  13 		Error:  6.932741553445061 	scale factor:  0.03
Iteration:  14 		Error:  6.930928864422094 	scale factor:  0.03
Iter

In [21]:
best_model = max(results, key=lambda x: x['R'])
print(f"\nBest Model: {best_model['features']}")
print(f"Pearson's Correlation Coefficient (R): {best_model['R']:.4f}")
print(f"Standard Estimation Error (SEE): {best_model['SEE']:.4f}")


Best Model: ['weight', 'height', 'bmi', 'maximal_calf_circumference', 'arm_circumference', 'waist_circumference', 'thigh_circumference', 'gender', 'age_in_months', 'ethnicity']
Pearson's Correlation Coefficient (R): 0.9793
Standard Estimation Error (SEE): 2420.9886


In [23]:
tfm_model_path = "models/tfm_tlm/tlm.csv"
tfm_scaler_path = "models/tfm_tlm/tlm_scaler.pkl"

os.makedirs(os.path.dirname(tfm_model_path), exist_ok=True)

pr.saveNN(best_model['model'], tfm_model_path)

with open(tfm_scaler_path, "wb") as scaler_file:
    pickle.dump(best_model['scaler'], scaler_file)

print(f"Best model saved successfully at: {tfm_model_path}")
print(f"Scaler saved successfully at: {tfm_scaler_path}")

Best model saved successfully at: models/tfm_tlm/tlm.csv
Scaler saved successfully at: models/tfm_tlm/tlm_scaler.pkl
