# Training and Testing the Models for TFM and TLM Prediction

In [1]:
import pandas as pd
import numpy as np
import os
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
import pyrenn as pr
import pickle

## Load Cleaned Data
This section loads the pre-split training and test sets of the validated NHANES dataset from the processed CSV files, making them available for feature selection and modeling.

In [2]:
# Locations of the pre-split datasets produced by the preprocessing step.
train_path = "data/train.csv"
test_path = "data/test.csv"

# Fail fast, before any parsing, when either CSV is absent.
if not all(os.path.exists(csv_path) for csv_path in (train_path, test_path)):
    raise FileNotFoundError(f"Dataset not found at {train_path} or at {test_path}. Ensure the preprocessing step was completed.")

train, test = pd.read_csv(train_path), pd.read_csv(test_path)

# Report the shapes so a truncated or mis-split file is noticed immediately.
print(f"Train Set loaded successfully. Shape: {train.shape}")
print(f"Test Set loaded successfully. Shape: {test.shape}")

Train Set loaded successfully. Shape: (10722, 18)
Test Set loaded successfully. Shape: (4596, 18)


## Define Feature Sets & Target
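This section defines the candidate feature sets and the two target variables (total fat mass and total lean mass). The feature sets are compared in the following sections to determine the best-performing model.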

In [3]:
# Full pool of candidate predictors, in the column order every derived set keeps.
_all_features = [
    'weight',
    'height',
    'bmi',
    'maximal_calf_circumference',
    'arm_circumference',
    'waist_circumference',
    'thigh_circumference',
    'gender',
    'age_in_months',
    'ethnicity',
]


def _without(*dropped):
    """Return a new list of all candidate features except `dropped`, order preserved."""
    return [column for column in _all_features if column not in dropped]


# set_1 is the full pool; set_2 drops calf circumference; sets 3-6 each drop
# one further predictor from set_2 (thigh, waist, arm, ethnicity respectively).
feature_sets = {
    "set_1": _without(),
    "set_2": _without('maximal_calf_circumference'),
    "set_3": _without('maximal_calf_circumference', 'thigh_circumference'),
    "set_4": _without('maximal_calf_circumference', 'waist_circumference'),
    "set_5": _without('maximal_calf_circumference', 'arm_circumference'),
    "set_6": _without('maximal_calf_circumference', 'ethnicity'),
}

# Names of the two regression target columns.
tfm_target = 'total_fat_mass'
tlm_target = 'total_lean_mass'

## Model Training & Evaluation
This section trains and evaluates multiple models with different feature sets and selects the best one.

In [4]:
def train_and_evaluate(train, test, features, target, hidden_neurons=10, k_max=300, E_stop=1e-10, verbose=True):
    """Fit a single-hidden-layer pyrenn network on `train` and score it on `test`.

    The feature columns are standardized with a ``StandardScaler`` that is
    fitted on the training rows only and then applied unchanged to the test
    rows. The target column is handed to pyrenn as-is (no scaling is done
    in this function).

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that contain every column named in `features` plus `target`.
    features : list of str
        Predictor column names; their order defines the network input order.
    target : str
        Name of the column to predict.
    hidden_neurons : int, default 10
        Size of the single hidden layer passed to ``pr.CreateNN``.
    k_max : int, default 300
        Forwarded to ``pr.train_LM`` as ``k_max`` (iteration cap per the
        pyrenn documentation).
    E_stop : float, default 1e-10
        Forwarded to ``pr.train_LM`` as ``E_stop`` (error threshold at which
        training stops, per the pyrenn documentation).
    verbose : bool, default True
        Forwarded to ``pr.train_LM``; the default keeps the per-iteration
        log that the original notebook printed.

    Returns
    -------
    dict
        ``"model"``   : the trained pyrenn network,
        ``"scaler"``  : the fitted ``StandardScaler``,
        ``"features"``: the `features` argument (same object, not a copy),
        ``"R"``       : Pearson correlation between test targets and predictions,
        ``"SEE"``     : ``sqrt(sum(errors**2) / n_test)`` -- note this divides by
                        the number of test rows with no degrees-of-freedom
                        correction, i.e. it is the root-mean-square test error.
    """
    X_train = train[features]
    y_train = train[target].values.reshape(-1, 1)

    X_test = test[features]
    y_test = test[target].values.reshape(-1, 1)

    # Fit the scaler on training data only so no test statistics leak in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Topology: inputs -> one hidden layer -> single regression output.
    n_inputs = X_train_scaled.shape[1]
    n_outputs = 1
    nn = pr.CreateNN([n_inputs, hidden_neurons, n_outputs])

    # The arrays are transposed before being handed to pyrenn, so samples run
    # along the second axis in the train_LM / NNOut calls below.
    nn = pr.train_LM(
        X_train_scaled.T,
        y_train.T,
        nn,
        k_max=k_max,
        E_stop=E_stop,
        dampfac=3.0,
        dampconst=10.0,
        verbose=verbose,
    )

    y_pred_test = pr.NNOut(X_test_scaled.T, nn).reshape(-1, 1)

    pearson_corr, _ = pearsonr(y_test.flatten(), y_pred_test.flatten())
    errors = y_test - y_pred_test
    SEE = np.sqrt(np.sum(errors ** 2) / len(y_test))

    return {"model": nn, "scaler": scaler, "features": features, "R": pearson_corr, "SEE": SEE}

### Training, Evaluation and Saving of Model for Prediction of Total Fat Mass

In [5]:
# Fit one total-fat-mass network per candidate feature set and keep every
# result dict so the sets can be compared afterwards.
results = []
for set_name, set_features in feature_sets.items():
    print(f"\nTraining model with feature set: {set_name}")
    results.append(train_and_evaluate(train, test, set_features, tfm_target))


Training model with feature set: set_1
Iteration:  0 		Error:  1432.5339071123258 	scale factor:  3.0
Iteration:  1 		Error:  80.70507840269545 	scale factor:  0.3
Iteration:  2 		Error:  66.44606274187339 	scale factor:  0.03
Iteration:  3 		Error:  21.12373882194689 	scale factor:  0.03
Iteration:  4 		Error:  12.024634037389397 	scale factor:  0.03
Iteration:  5 		Error:  10.484500374951338 	scale factor:  0.03
Iteration:  6 		Error:  10.457824711424458 	scale factor:  0.3
Iteration:  7 		Error:  10.447478344015813 	scale factor:  0.3
Iteration:  8 		Error:  10.438102562373487 	scale factor:  0.3
Iteration:  9 		Error:  10.429459175863531 	scale factor:  0.3
Iteration:  10 		Error:  10.421534706680863 	scale factor:  0.03
Iteration:  11 		Error:  10.354186165910047 	scale factor:  0.03
Iteration:  12 		Error:  10.323702535636949 	scale factor:  0.03
Iteration:  13 		Error:  10.295046718089408 	scale factor:  0.03
Iteration:  14 		Error:  10.270299084555143 	scale factor:  0.03
Iter

In [6]:
# Keep the candidate with the highest Pearson R; on ties, max() returns the
# earliest entry in `results`.
best_model = max(results, key=lambda candidate: candidate['R'])

# One print call, one line per metric (same text as three separate prints).
print(
    f"\nBest Model: {best_model['features']}",
    f"Pearson's Correlation Coefficient (R): {best_model['R']:.4f}",
    f"Standard Estimation Error (SEE): {best_model['SEE']:.4f}",
    sep="\n",
)


Best Model: ['weight', 'height', 'bmi', 'maximal_calf_circumference', 'arm_circumference', 'waist_circumference', 'thigh_circumference', 'gender', 'age_in_months', 'ethnicity']
Pearson's Correlation Coefficient (R): 0.9776
Standard Estimation Error (SEE): 2375.6427


In [7]:
# Output locations for the selected total-fat-mass network and its scaler.
tfm_model_path = "models/tfm_tlm/tfm.csv"
tfm_scaler_path = "models/tfm_tlm/tfm_scaler.pkl"

# Both artifacts live in the same directory; create it on first run.
os.makedirs(os.path.dirname(tfm_model_path), exist_ok=True)

# Persist the network via pyrenn's own saver.
pr.saveNN(best_model['model'], tfm_model_path)

# Persist the fitted scaler so inference can reproduce the same standardization.
with open(tfm_scaler_path, "wb") as scaler_file:
    pickle.dump(best_model['scaler'], scaler_file)

print(f"Best model saved successfully at: {tfm_model_path}")
print(f"Scaler saved successfully at: {tfm_scaler_path}")

Best model saved successfully at: models/tfm_tlm/tfm.csv
Scaler saved successfully at: models/tfm_tlm/tfm_scaler.pkl


### Training, Evaluation and Saving of Model for Prediction of Total Lean Mass

In [8]:
# Fit one total-lean-mass network per candidate feature set. `results` is
# rebound to a fresh list here, so it holds only the TLM candidates afterwards.
results = []
for set_name, set_features in feature_sets.items():
    print(f"\nTraining model with feature set: {set_name}")
    results.append(train_and_evaluate(train, test, set_features, tlm_target))


Training model with feature set: set_1
Iteration:  0 		Error:  11738.011896021737 	scale factor:  3.0
Iteration:  1 		Error:  82.89645659710675 	scale factor:  0.3
Iteration:  2 		Error:  34.36884888750883 	scale factor:  0.03
Iteration:  3 		Error:  16.715130597906686 	scale factor:  0.003
Iteration:  4 		Error:  9.703983020036597 	scale factor:  0.03
Iteration:  5 		Error:  8.644899343422843 	scale factor:  0.003
Iteration:  6 		Error:  7.434814141836976 	scale factor:  0.03
Iteration:  7 		Error:  7.382805941613031 	scale factor:  0.03
Iteration:  8 		Error:  7.366570654969106 	scale factor:  0.03
Iteration:  9 		Error:  7.353778983061545 	scale factor:  0.03
Iteration:  10 		Error:  7.340958547794849 	scale factor:  0.03
Iteration:  11 		Error:  7.328737160939214 	scale factor:  0.03
Iteration:  12 		Error:  7.317960670809647 	scale factor:  0.03
Iteration:  13 		Error:  7.308578208502303 	scale factor:  0.03
Iteration:  14 		Error:  7.300202160929329 	scale factor:  0.03
Iteratio

In [9]:
# Keep the candidate with the highest Pearson R; on ties, max() returns the
# earliest entry in `results`.
best_model = max(results, key=lambda candidate: candidate['R'])

# One print call, one line per metric (same text as three separate prints).
print(
    f"\nBest Model: {best_model['features']}",
    f"Pearson's Correlation Coefficient (R): {best_model['R']:.4f}",
    f"Standard Estimation Error (SEE): {best_model['SEE']:.4f}",
    sep="\n",
)


Best Model: ['weight', 'height', 'bmi', 'maximal_calf_circumference', 'arm_circumference', 'waist_circumference', 'thigh_circumference', 'gender', 'age_in_months', 'ethnicity']
Pearson's Correlation Coefficient (R): 0.9797
Standard Estimation Error (SEE): 2400.0374


In [10]:
# Output locations for the selected total-lean-mass network and its scaler.
# These use tlm_* names; the original cell reused tfm_model_path /
# tfm_scaler_path, which overwrote the TFM path variables with the TLM paths.
tlm_model_path = "models/tfm_tlm/tlm.csv"
tlm_scaler_path = "models/tfm_tlm/tlm_scaler.pkl"

# Create the shared output directory if it does not exist yet.
os.makedirs(os.path.dirname(tlm_model_path), exist_ok=True)

# Persist the network via pyrenn's own saver.
pr.saveNN(best_model['model'], tlm_model_path)

# Persist the fitted scaler so inference can reproduce the same standardization.
with open(tlm_scaler_path, "wb") as scaler_file:
    pickle.dump(best_model['scaler'], scaler_file)

print(f"Best model saved successfully at: {tlm_model_path}")
print(f"Scaler saved successfully at: {tlm_scaler_path}")

Best model saved successfully at: models/tfm_tlm/tlm.csv
Scaler saved successfully at: models/tfm_tlm/tlm_scaler.pkl
