In [1]:
import pandas as pd

import numpy as np

import warnings


warnings.filterwarnings("ignore")

In [2]:
ppg_df = pd.read_csv("./ppg_sample_dataset.csv")

In [3]:
ppg_df.describe()

Unnamed: 0,mean_ppg,std_ppg,skew_ppg,kurtosis_ppg,mean_hr,std_hr,lf_power,hf_power,lf_hf_ratio,pulse_rate,age,height,weight,glucose
count,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0
mean,0.089223,3.196459,0.900563,5.980537,148.109473,33.849718,0.0,0.001046,0.0,138.839912,31.313433,174.880597,69.19403,115.014925
std,0.062424,1.25838,0.816291,8.641329,14.34102,8.988871,0.0,0.001015,0.0,15.069049,9.434317,8.585616,17.020699,18.736201
min,-0.112238,1.172114,-0.729203,-0.127534,95.130545,1.465148,0.0,0.000102,0.0,95.346819,22.0,154.0,42.0,88.0
25%,0.063786,2.33923,0.379478,0.676531,143.04626,29.118247,0.0,0.000429,0.0,131.101877,24.0,170.0,56.0,102.5
50%,0.107623,3.078835,0.966015,2.02524,148.807733,35.559795,0.0,0.000742,0.0,137.061053,29.0,178.0,62.0,110.0
75%,0.132484,3.686415,1.243527,8.338015,155.931423,40.510074,0.0,0.001255,0.0,148.979405,38.0,180.0,88.0,125.5
max,0.190586,7.521053,5.473804,36.586841,178.181579,45.525528,0.0,0.005792,0.0,172.81611,61.0,187.0,103.0,183.0


## Build an XGBoost DMatrix

In [4]:
from sklearn.model_selection import train_test_split

# Extract feature and target arrays
X, y = ppg_df.drop('glucose', axis=1), ppg_df[['glucose']]

In [5]:
# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
   X[col] = X[col].astype('category')

In [6]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [18]:
X_train

Unnamed: 0,mean_ppg,std_ppg,skew_ppg,kurtosis_ppg,mean_hr,std_hr,lf_power,hf_power,lf_hf_ratio,pulse_rate,age,gender,height,weight
48,0.12068,2.837507,0.966015,0.493703,143.145864,45.525528,0.0,0.001265,0.0,125.142701,31,Male,179,93
44,0.120915,2.315596,-0.080138,10.603849,150.758601,36.590369,0.0,0.000511,0.0,137.061053,37,Male,179,90
54,0.135509,1.352172,2.244834,8.457172,138.056857,43.957282,0.0,0.000198,0.0,125.142701,38,Male,180,53
3,0.069762,3.381581,1.097108,0.531415,124.505408,40.825147,0.0,0.000576,0.0,113.224348,29,Male,180,80
34,0.18892,2.57031,1.620743,2.972188,147.950014,45.051212,0.0,0.001448,0.0,131.101877,51,Male,187,83
39,0.098059,2.845309,-0.243325,11.122598,150.764238,32.696613,0.0,0.000739,0.0,143.020229,31,Female,165,60
10,0.012227,2.711081,1.246047,0.598143,160.178273,32.003488,0.0,0.000604,0.0,154.938582,24,Female,170,60
61,0.070484,4.114358,1.182811,1.148756,162.010655,26.644605,0.0,0.005792,0.0,148.979405,27,Male,173,57
19,0.12544,2.271488,-0.463761,27.171899,148.807733,30.334518,0.0,0.000939,0.0,143.020229,25,Female,172,60
36,0.110316,2.405944,1.446773,6.638995,139.316884,44.585685,0.0,0.000296,0.0,125.142701,48,Male,170,88


In [7]:
import xgboost as xgb

# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [15]:
# Define hyperparameters
params = {
    "objective": "reg:squarederror",
    "tree_method": "auto"
}

n = 100

evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
    verbose_eval=10,
    early_stopping_rounds=25
)

[0]	train-rmse:15.31059	validation-rmse:15.84926
[10]	train-rmse:1.90341	validation-rmse:14.05132
[20]	train-rmse:0.31791	validation-rmse:14.58534
[30]	train-rmse:0.05908	validation-rmse:14.71481
[32]	train-rmse:0.04254	validation-rmse:14.72737


In [11]:
from sklearn.metrics import mean_squared_error

preds = model.predict(dtest_reg)


rmse = mean_squared_error(y_test, preds)

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 217.344


In [17]:
model.save_model("xgb_model.json")