# REGRESSION FOR AGE AND BMI WITH KERAS NORMALIZATION

In [1]:
# Import required libraries for regression, preprocessing, and evaluation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from keras.callbacks import EarlyStopping
from keras.layers import Dense
from keras.layers import Normalization
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import set_random_seed

In [2]:
# Read the unscaled dataset from disk and preview its first rows.
csv_path = "not_scaling.csv"
model_df = pd.read_csv(csv_path)
model_df.head()

Unnamed: 0,hypertension,heart_disease,diabetes,problematic,gender,smoking_history,age,bmi,HbA1c_level,blood_glucose_level
0,0,1,0,0,0,0,80.0,25.19,6.6,140
1,0,0,0,0,0,1,54.0,27.32,6.6,80
2,0,0,0,0,1,0,28.0,27.32,5.7,158
3,0,0,0,0,0,2,36.0,23.45,5.0,155
4,1,1,0,0,1,2,76.0,20.14,4.8,155


In [3]:
# Column roles used throughout the notebook: the regression targets, the
# continuous input features, and the categorical input features.
target_cols = ["age", "bmi"]
continuous_cols = ["HbA1c_level", "blood_glucose_level"]
categorical_cols = [
    "hypertension",
    "heart_disease",
    "diabetes",
    "gender",
    "problematic",
    "smoking_history",
]

In [4]:
# Inputs: continuous columns followed by the categorical ones. Outputs: the targets.
feature_cols = continuous_cols + categorical_cols
X = model_df.loc[:, feature_cols]
y = model_df.loc[:, target_cols]

In [5]:
# Hold out 10% of the rows for testing (90% for training); the fixed seed keeps
# the split repeatable. No stratification is applied.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

In [6]:
# No explicit scaling is done here: the Keras Normalization layer standardizes
# the inputs later. Binary and categorical variables are one-hot encoded, and
# the resulting columns are normalized by Keras as well.

# Fit the encoder on the training split only, then reuse it for the test split.
ohe = OneHotEncoder(sparse_output=False)
X_train_cat = ohe.fit_transform(X_train[categorical_cols])
X_test_cat = ohe.transform(X_test[categorical_cols])
cat_cols = ohe.get_feature_names_out(categorical_cols)

# Continuous columns are passed through unchanged.
X_train_num = X_train.loc[:, continuous_cols]
X_test_num = X_test.loc[:, continuous_cols]

# Wrap the encoded arrays in DataFrames that reuse each split's row index so the
# column-wise concat below lines the rows up with the continuous columns.
X_train_cat_df = pd.DataFrame(X_train_cat, columns=cat_cols, index=X_train.index)
X_test_cat_df = pd.DataFrame(X_test_cat, columns=cat_cols, index=X_test.index)

# Final feature tables: continuous columns first, then the one-hot columns.
X_train_final_df = pd.concat([X_train_num, X_train_cat_df], axis="columns")
X_test_final_df = pd.concat([X_test_num, X_test_cat_df], axis="columns")

# Plain NumPy arrays are what gets fed to the Keras model.
X_train_final = X_train_final_df.to_numpy()
X_test_final = X_test_final_df.to_numpy()

In [None]:
# Seed the Python, NumPy and backend random generators so weight initialisation
# and batch shuffling are repeatable between runs (the train/test split is
# already seeded). Some GPU kernels may still be non-deterministic.
set_random_seed(42)

# The Normalization layer standardizes the input features inside the model
# instead of manual scaling; adapt() computes its statistics from the training
# features only.
normalizer = Normalization()
normalizer.adapt(X_train_final)

# Build a fully connected regression network with Keras Sequential.
# The output layer has one unit per regression target in target_cols.
model = Sequential([
    normalizer,
    Dense(256, activation='tanh'),
    Dense(256, activation='tanh'),
    Dense(128, activation='tanh'),
    Dense(len(target_cols))
])

model.compile(optimizer="Adam", loss='mse', metrics=['mae'])
model.summary()

# Stop when val_loss has not improved for 15 epochs. restore_best_weights rolls
# the model back to the best epoch; without it the model keeps the weights of
# the last epoch, i.e. `patience` epochs after the best validation loss.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1
)

# validation_split=0.2 holds out part of the training data for val_loss.
history = model.fit(
    X_train_final,
    y_train,
    epochs=200,
    batch_size=128,
    verbose=1,
    validation_split=0.2,
    callbacks=[early_stopping]  # Keras documents `callbacks` as a list of callbacks
)

2025-08-25 22:12:45.790980: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-08-25 22:12:45.791012: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-08-25 22:12:45.791040: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-08-25 22:12:45.791057: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-08-25 22:12:45.791066: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/200


2025-08-25 22:12:46.198818: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 675.9997 - mae: 19.7741 - val_loss: 299.6034 - val_mae: 11.8544
Epoch 2/200
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 291.5034 - mae: 11.7169 - val_loss: 279.4991 - val_mae: 11.4644
Epoch 3/200
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 276.9410 - mae: 11.4350 - val_loss: 273.5914 - val_mae: 11.3537
Epoch 4/200
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 276.4049 - mae: 11.4298 - val_loss: 271.2958 - val_mae: 11.3051
Epoch 5/200
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 272.6246 - mae: 11.3708 - val_loss: 264.2686 - val_mae: 11.2453
Epoch 6/200
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 262.7769 - mae: 11.2724 - val_loss: 254.9402 - val_mae: 11.1454
Epoch 7/200
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
# Predict on the held-out set. Column i of both arrays belongs to target_cols[i].
y_pred = model.predict(X_test_final)
y_test_array = y_test.to_numpy()

# One figure per target with three panels: ground truth vs predictions,
# residuals, and a comparison of the two distributions.
for col_idx, target in enumerate(target_cols):
    truth = y_test_array[:, col_idx]
    pred = y_pred[:, col_idx]

    r2 = r2_score(truth, pred)
    rmse = np.sqrt(mean_squared_error(truth, pred))

    fig, (ax_fit, ax_resid, ax_dist) = plt.subplots(1, 3, figsize=(15, 5))

    # 1. Ground truth vs predictions; the dashed diagonal marks a perfect fit.
    lo, hi = truth.min(), truth.max()
    ax_fit.scatter(truth, pred, alpha=0.6, s=20)
    ax_fit.plot([lo, hi], [lo, hi], 'r--', lw=2)
    ax_fit.set_xlabel(f'True {target}')
    ax_fit.set_ylabel(f'Predicted {target}')
    ax_fit.set_title(f'Ground Truth vs Predictions\nR^2 Score = {r2:.3f},  RMSE= {rmse:.3f}')
    ax_fit.grid(True, alpha=0.3)

    # 2. Residuals (true minus predicted) against the predicted value.
    residuals = truth - pred
    ax_resid.scatter(pred, residuals, alpha=0.6, s=20)
    ax_resid.axhline(y=0, color='r', linestyle='--')
    ax_resid.set_xlabel(f'Predicted {target}')
    ax_resid.set_ylabel('Residuals')
    ax_resid.set_title('Residuals Plot')
    ax_resid.grid(True, alpha=0.3)

    # 3. Overlaid density histograms of true and predicted values.
    ax_dist.hist(truth, bins=30, alpha=0.7, label='True', density=True)
    ax_dist.hist(pred, bins=30, alpha=0.7, label='Predicted', density=True)
    ax_dist.set_xlabel(f'{target}')
    ax_dist.set_ylabel('Density')
    ax_dist.set_title('Distribution Comparison')
    ax_dist.legend()
    ax_dist.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()