In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
import xgboost as xgb
from sklearn.impute import SimpleImputer
from glob import glob
import os
from tqdm import tqdm
from hyperopt import fmin, tpe, hp, Trials
from hyperopt.pyll.base import scope
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid
from keras.models import Sequential
from keras.layers import Input, LSTM, GRU, Dense
from keras.optimizers import Adam
from prophet import Prophet
import gc

from sklearn.preprocessing import OneHotEncoder

In [None]:
subject_files = glob("/content/drive/MyDrive/Sensor Data/*.xlsx")
print(f"Number of subject files found: {len(subject_files)}")

Number of subject files found: 54


In [None]:
class Conf :
    # dataloading values
    Subjects = 54 # define how many subjects you want to include in processing
    CGM_Inputs = 2000 # define how many CGM inputs (rows) to consider for training

    # when you set iterator as 5 it would take input as a 6 (always plus one than input)
    iterator = 3 # it would be recommended to set less than 10
    iterator_deeplearning = 2 # deep learnig take time so set it to less than 5


In [None]:
# Updated create_lag_features function
def create_lag_features(df, column, lags):
    df = df.copy()
    for lag in lags:
        df[f'{column}_lag_{lag}'] = df.groupby('subjectID')[column].shift(lag)
    return df

# Updated loop to process subject files
all_data = []

for subject_file in tqdm(sorted(subject_files)[:min(Conf.Subjects, len(subject_files))]):
    try:
        print(f"Processing {subject_file}...")

        # Load CGM and Bolus data
        cgm_data = pd.read_excel(subject_file, sheet_name='CGM', parse_dates=['date'])
        bolus_data = pd.read_excel(subject_file, sheet_name='Bolus', parse_dates=['date'])

        # Ensure 'date' columns are datetime64[ns]
        cgm_data['date'] = pd.to_datetime(cgm_data['date'], errors='coerce')
        bolus_data['date'] = pd.to_datetime(bolus_data['date'], errors='coerce')

        # Drop rows with invalid dates
        cgm_data.dropna(subset=['date'], inplace=True)
        bolus_data.dropna(subset=['date'], inplace=True)

        # Check for issues in date parsing
        if cgm_data['date'].dtype != 'datetime64[ns]' or bolus_data['date'].dtype != 'datetime64[ns]':
            raise ValueError(f"Date parsing failed for {subject_file}")

        # Sort data by 'date'
        cgm_data.sort_values('date', inplace=True)
        bolus_data.sort_values('date', inplace=True)

        # Merge data
        merged_data = pd.merge_asof(cgm_data, bolus_data, on='date', direction='nearest')

        # Ensure lagged features are created per subject
        merged_data['subjectID'] = os.path.basename(subject_file).replace("Subject", "").replace(".xlsx", "")
        lagged_data = create_lag_features(merged_data, 'mg/dl', lags=[1, 2, 3, 4, 5])
        lagged_data.dropna(inplace=True)

        if lagged_data.empty:
            print(f"No lagged data for {subject_file}")
            continue

        all_data.append(lagged_data)  # Append full data without slicing

    except Exception as ex:
        print(f"Error with {subject_file}: {ex}")

  0%|          | 0/54 [00:00<?, ?it/s]

Processing /content/drive/MyDrive/Sensor Data/Subject1.xlsx...


  2%|▏         | 1/54 [00:25<22:23, 25.34s/it]

No lagged data for /content/drive/MyDrive/Sensor Data/Subject1.xlsx
Processing /content/drive/MyDrive/Sensor Data/Subject10.xlsx...


  4%|▎         | 2/54 [00:41<17:11, 19.84s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject11.xlsx...


  6%|▌         | 3/54 [00:57<15:21, 18.07s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject12.xlsx...


  7%|▋         | 4/54 [01:13<14:20, 17.22s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject13.xlsx...


  9%|▉         | 5/54 [01:26<12:57, 15.86s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject14.xlsx...


 11%|█         | 6/54 [01:38<11:39, 14.57s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject15.xlsx...


 13%|█▎        | 7/54 [01:49<10:21, 13.23s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject16.xlsx...


 15%|█▍        | 8/54 [01:59<09:19, 12.15s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject17.xlsx...


 17%|█▋        | 9/54 [02:10<09:01, 12.04s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject18.xlsx...


 19%|█▊        | 10/54 [02:21<08:25, 11.49s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject19.xlsx...


 20%|██        | 11/54 [02:28<07:18, 10.19s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject2.xlsx...


 22%|██▏       | 12/54 [02:52<10:09, 14.51s/it]

No lagged data for /content/drive/MyDrive/Sensor Data/Subject2.xlsx
Processing /content/drive/MyDrive/Sensor Data/Subject20.xlsx...


 24%|██▍       | 13/54 [02:59<08:18, 12.15s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject21.xlsx...


 26%|██▌       | 14/54 [03:06<07:07, 10.69s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject22.xlsx...


 28%|██▊       | 15/54 [03:11<05:50,  8.98s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject23.xlsx...


 30%|██▉       | 16/54 [03:17<05:07,  8.10s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject24.xlsx...


 31%|███▏      | 17/54 [03:21<04:09,  6.74s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject25.xlsx...


 33%|███▎      | 18/54 [03:26<03:46,  6.29s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject26.xlsx...


 35%|███▌      | 19/54 [03:32<03:31,  6.03s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject27.xlsx...


 37%|███▋      | 20/54 [03:35<02:55,  5.15s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject28.xlsx...


 39%|███▉      | 21/54 [03:38<02:36,  4.73s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject29.xlsx...


 41%|████      | 22/54 [03:41<02:08,  4.03s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject3.xlsx...


 43%|████▎     | 23/54 [04:02<04:48,  9.31s/it]

No lagged data for /content/drive/MyDrive/Sensor Data/Subject3.xlsx
Processing /content/drive/MyDrive/Sensor Data/Subject30.xlsx...


 44%|████▍     | 24/54 [04:05<03:34,  7.14s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject31.xlsx...


 46%|████▋     | 25/54 [04:07<02:43,  5.65s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject32.xlsx...


 48%|████▊     | 26/54 [04:09<02:05,  4.49s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject33.xlsx...


 50%|█████     | 27/54 [04:11<01:48,  4.01s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject34.xlsx...


 52%|█████▏    | 28/54 [04:14<01:32,  3.56s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject35.xlsx...


 54%|█████▎    | 29/54 [04:16<01:16,  3.08s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject36.xlsx...


 56%|█████▌    | 30/54 [04:17<01:00,  2.51s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject37.xlsx...


 57%|█████▋    | 31/54 [04:19<00:50,  2.20s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject38.xlsx...


 59%|█████▉    | 32/54 [04:20<00:43,  1.99s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject39.xlsx...


 61%|██████    | 33/54 [04:21<00:36,  1.74s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject4.xlsx...


 63%|██████▎   | 34/54 [04:46<02:52,  8.64s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject40.xlsx...


 65%|██████▍   | 35/54 [04:47<02:01,  6.40s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject41.xlsx...


 67%|██████▋   | 36/54 [04:48<01:27,  4.89s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject42.xlsx...


 69%|██████▊   | 37/54 [04:49<01:03,  3.72s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject43.xlsx...


 70%|███████   | 38/54 [04:50<00:45,  2.87s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject44.xlsx...


 72%|███████▏  | 39/54 [04:52<00:35,  2.38s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject45.xlsx...


 74%|███████▍  | 40/54 [04:52<00:26,  1.88s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject46.xlsx...


 76%|███████▌  | 41/54 [04:53<00:19,  1.51s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject47.xlsx...


 78%|███████▊  | 42/54 [04:54<00:17,  1.42s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject48.xlsx...


 80%|███████▉  | 43/54 [04:55<00:14,  1.30s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject49.xlsx...


 81%|████████▏ | 44/54 [04:56<00:11,  1.17s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject5.xlsx...


 83%|████████▎ | 45/54 [05:17<01:05,  7.25s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject50.xlsx...


 85%|████████▌ | 46/54 [05:18<00:41,  5.21s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject51.xlsx...


 87%|████████▋ | 47/54 [05:18<00:26,  3.79s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject52.xlsx...


 89%|████████▉ | 48/54 [05:19<00:16,  2.78s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject53.xlsx...


 91%|█████████ | 49/54 [05:19<00:10,  2.08s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject54.xlsx...


 93%|█████████▎| 50/54 [05:20<00:06,  1.58s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject6.xlsx...


 94%|█████████▍| 51/54 [05:41<00:22,  7.64s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject7.xlsx...


 96%|█████████▋| 52/54 [06:02<00:22, 11.46s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject8.xlsx...


 98%|█████████▊| 53/54 [06:20<00:13, 13.42s/it]

Processing /content/drive/MyDrive/Sensor Data/Subject9.xlsx...


100%|██████████| 54/54 [06:38<00:00,  7.38s/it]


In [40]:
if all_data:
    final_data = pd.concat(all_data, axis=0).reset_index(drop=True)
    final_data = final_data.groupby('subjectID').head(2000).reset_index(drop=True)

    # No need to one-hot encode subjectID, it's enough to keep it as a categorical column
    # Leave subjectID as a column and drop it from features later if not needed
    X = final_data.drop(columns=['date', 'mg/dl'])
    y = final_data['mg/dl']

    print("Final Dataset Shape:", X.shape)
    print(X.tail())

Final Dataset Shape: (96838, 16)
       normal  carbInput  insulinCarbRatio  bgInput  recommended.carb  \
96833    7.96       41.0               7.0    204.0              5.86   
96834    7.96       41.0               7.0    204.0              5.86   
96835    7.96       41.0               7.0    204.0              5.86   
96836    7.96       41.0               7.0    204.0              5.86   
96837    7.96       41.0               7.0    204.0              5.86   

       recommended.net  recommended.correction  insulinSensitivityFactor  \
96833             7.96                     2.1                      40.0   
96834             7.96                     2.1                      40.0   
96835             7.96                     2.1                      40.0   
96836             7.96                     2.1                      40.0   
96837             7.96                     2.1                      40.0   

       targetBloodGlucose  insulinOnBoard subjectID  mg/dl_lag_1  mg/dl

In [41]:
X.tail()

Unnamed: 0,normal,carbInput,insulinCarbRatio,bgInput,recommended.carb,recommended.net,recommended.correction,insulinSensitivityFactor,targetBloodGlucose,insulinOnBoard,subjectID,mg/dl_lag_1,mg/dl_lag_2,mg/dl_lag_3,mg/dl_lag_4,mg/dl_lag_5
96833,7.96,41.0,7.0,204.0,5.86,7.96,2.1,40.0,110.0,0.255,9,204.0,204.0,150.0,150.0,133.0
96834,7.96,41.0,7.0,204.0,5.86,7.96,2.1,40.0,110.0,0.255,9,198.0,204.0,204.0,150.0,150.0
96835,7.96,41.0,7.0,204.0,5.86,7.96,2.1,40.0,110.0,0.255,9,198.0,198.0,204.0,204.0,150.0
96836,7.96,41.0,7.0,204.0,5.86,7.96,2.1,40.0,110.0,0.255,9,206.0,198.0,198.0,204.0,204.0
96837,7.96,41.0,7.0,204.0,5.86,7.96,2.1,40.0,110.0,0.255,9,206.0,206.0,198.0,198.0,204.0


In [42]:
# Determine the split point (e.g., 95% for training)
train_size = int(len(final_data) * 0.95)



# Split into training and testing sets based on time order
X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]



# Print the shapes of the resulting datasets
print("Dataset shape:", X.shape)
print("Train set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Dataset shape: (96838, 16)
Train set shape: (91996, 16) (91996,)
Test set shape: (4842, 16) (4842,)


In [43]:
# Create an initial DataFrame to store the results
ResultDF = pd.DataFrame(columns=["modelName", "subjects", "datasetLength", "MAE", "RMSE", "MAPE"])

In [48]:
# One-hot encode 'subjectID' with handling unknown categories
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Apply one-hot encoding to 'subjectID' in the training set
encoded_subjects_train = pd.DataFrame(encoder.fit_transform(X_train[['subjectID']]), columns=encoder.get_feature_names_out(['subjectID']))

# Apply one-hot encoding to 'subjectID' in the test set (with handling unknown categories)
encoded_subjects_test = pd.DataFrame(encoder.transform(X_test[['subjectID']]), columns=encoder.get_feature_names_out(['subjectID']))

# Drop the original 'subjectID' column from both train and test sets
X_train = X_train.drop(columns=['subjectID']).reset_index(drop=True)
X_test = X_test.drop(columns=['subjectID']).reset_index(drop=True)

# Concatenate the encoded subject features to both training and test data
X_train = pd.concat([X_train, encoded_subjects_train], axis=1)
X_test = pd.concat([X_test, encoded_subjects_test], axis=1)

# Train XGBoost with default parameters
model = xgb.XGBRegressor(
    verbosity=0,  # Set verbosity to 0 to suppress warnings
    objective='reg:squarederror',  # Default regression objective
    random_state=42  # Set random state for reproducibility
)

# Fit the model with training data
model.fit(X_train, y_train)

# Predict on the test set
preds = model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print("MAE:", mae)
print("RMSE:", rmse)
print("MAPE:", mape)

# Append result in dataframe
row_result = ["XGBoost", Conf.Subjects, Conf.Subjects * Conf.CGM_Inputs, mae, rmse, mape]

ResultDF.loc[len(ResultDF)] = row_result

# Clean up memory
del model
gc.collect()



MAE: 6.012086005606764
RMSE: 9.990418414115128
MAPE: 0.030266176265054102


5865

In [49]:
# Train Random Forest with default parameters
model = RandomForestRegressor(
    random_state=42
)

# Fit the model with training data
model.fit(X_train, y_train)

# Predict on the test set
preds = model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, preds)

print("MAE:", mae)
print("RMSE:", rmse)
print("MAPE:", mape)

# Append result in dataframe
row_result = ["Random Forest", Conf.Subjects, Conf.Subjects * Conf.CGM_Inputs, mae, rmse, mape]
ResultDF.loc[len(ResultDF)] = row_result

# Clean up
gc.collect()

MAE: 4.047717883654478
RMSE: 6.65983663690466
MAPE: 0.022814725951572863


54