<a href="https://colab.research.google.com/github/fabriziobasso/kaggle/blob/main/File_00_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **<h1 align="center"><font color='#001ddd'> GLUCOSE PREDICTION DATASET**</font></h1>

## **Dataset Description**
The dataset is from a study that collected data from young adults in the UK with type 1 diabetes, who used a continuous glucose monitor (CGM), an insulin pump and a smartwatch. These devices collected blood glucose readings, insulin dosage, carbohydrate intake, and activity data. The data collected was aggregated to five-minute intervals and formatted into samples. Each sample represents a point in time and includes the aggregated five-minute intervals from the previous six hours. The aim is to predict the blood glucose reading an hour into the future, for each of these samples.

The training set takes samples from the first three months of study data from nine of the participants and includes the future blood glucose value. These training samples appear in chronological order and overlap. The testing set takes samples from the remainder of the study period from fifteen of the participants (so unseen participants appear in the testing set). These testing samples do not overlap and are in a random order to avoid data leakage.

**Complexities to be aware of:**

This is medical data so there are missing values and noise in the data
the participants did not all use the same device models (CGM, insulin pump and smartwatch) so there may be differences in the collection method of the data
some participants in the test set do not appear in the training set

In [1]:
%%capture
# Connect to Colab:#
from google.colab import drive
import os
drive.mount('/content/drive')

!pip install category-encoders
!pip install optuna
!pip install optuna-integration
#!pip install scikit-learn==1.4
!pip install catboost
!pip install deeptables

!pip install keras-tuner --upgrade
!pip install keras-nlp
!pip install BorutaShap
!pip install scikit-lego
!!pip install --no-index -U --find-links=/kaggle/input/deeptables-v0-2-5/deeptables-0.2.5 deeptables==0.2.5

In [2]:
folder_script = models_folders = "/content/drive/MyDrive/Exercises/Studies_Structured_Data/Data/Glucose"
os.chdir(folder_script)

In [3]:
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.wrapper import PolynomialWrapper
from category_encoders.count import CountEncoder

# Setup notebook
from pathlib import Path
import ipywidgets as widgets
import pandas as pd
import numpy as np
from pickle import load, dump
import json
import joblib
#import calplot as cal
import missingno as msno
import category_encoders as ce

# Graphic Libraries:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.image as mpimg

# Palette Setup
colors = ['#FB5B68','#FFEB48','#2676A1','#FFBDB0',]
colormap_0 = mpl.colors.LinearSegmentedColormap.from_list("",colors)
palette_1 = sns.color_palette("coolwarm", as_cmap=True)
palette_2 = sns.color_palette("YlOrBr", as_cmap=True)
palette_3 = sns.light_palette("red", as_cmap=True)
palette_4 = sns.color_palette("viridis", as_cmap=True)
palette_5 = sns.color_palette("rocket", as_cmap=True)
palette_6 = sns.color_palette("GnBu", as_cmap=True)
palette_7 = sns.color_palette("tab20c", as_cmap=False)
palette_8 = sns.color_palette("Set2", as_cmap=False)

palette_custom = ['#fbb4ae','#b3cde3','#ccebc5','#decbe4','#fed9a6','#ffffcc','#e5d8bd','#fddaec','#f2f2f2']
palette_9 = sns.color_palette(palette_custom, as_cmap=False)


# Bloomberg
#from xbbg import blp
from catboost import CatBoostRegressor, Pool
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from xgboost.callback import EarlyStopping

import lightgbm as lgb
from lightgbm import (LGBMRegressor,
                      LGBMClassifier,
                      early_stopping,
                      record_evaluation,
                      log_evaluation)

# Time Management
from tqdm import tqdm
from datetime import date
from datetime import datetime
from pandas.tseries.offsets import BMonthEnd, QuarterEnd
import datetime
from pandas.tseries.offsets import BDay # BDay is business day, not birthday...
import datetime as dt
import click
import glob
import os
import gc
import re
import string

from ipywidgets import AppLayout
from ipywidgets import Dropdown, Layout, HTML, AppLayout, VBox, Label, HBox, BoundedFloatText, interact, Output

#from my_func import *

import optuna
from optuna.integration import TFKerasPruningCallback
from optuna.trial import TrialState
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_contour

os.environ["KERAS_BACKEND"] = "tensorflow"

import tensorflow as tf
import keras
from keras import ops
from keras import layers

from keras.layers import Input, LSTM, Dense, Lambda, RepeatVector, Reshape
from keras.models import Model
from keras.losses import MeanSquaredError
from keras.metrics import RootMeanSquaredError

from keras.utils import FeatureSpace, plot_model

# Import libraries for Hypertuning
import keras_tuner as kt
from keras_tuner.tuners import RandomSearch, GridSearch, BayesianOptimization

#from my_func import *

# preprocessing modules
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_validate, GroupKFold, GridSearchCV, RepeatedStratifiedKFold, cross_val_predict

from sklearn.preprocessing import (LabelEncoder,
                                   StandardScaler,
                                   MinMaxScaler,
                                   OrdinalEncoder,
                                   RobustScaler,
                                   PowerTransformer,
                                   OneHotEncoder,
                                   LabelEncoder,
                                   QuantileTransformer,
                                   PolynomialFeatures)

# metrics
import sklearn
from sklearn.metrics import (mean_squared_error,
                             root_mean_squared_error,
                             r2_score,
                             mean_absolute_error,
                             mean_absolute_percentage_error,
                             classification_report,
                             confusion_matrix,
                             ConfusionMatrixDisplay,
                             multilabel_confusion_matrix,
                             accuracy_score,
                             roc_auc_score,
                             auc,
                             roc_curve,
                             log_loss,
                             make_scorer)

# modeling algos
from sklearn.linear_model import (LogisticRegression,
                                  Lasso,
                                  ridge_regression,
                                  LinearRegression,
                                  Ridge,
                                  RidgeCV,
                                  ElasticNet,
                                  BayesianRidge,
                                  HuberRegressor,
                                  TweedieRegressor,
                                  QuantileRegressor,
                                  ARDRegression,
                                  TheilSenRegressor,
                                  PoissonRegressor,
                                  GammaRegressor)

from sklearn.ensemble import (AdaBoostRegressor,
                              AdaBoostClassifier,
                              RandomForestRegressor,
                              RandomForestClassifier,
                              VotingRegressor,
                              GradientBoostingRegressor,
                              GradientBoostingClassifier,
                              StackingRegressor,
                              HistGradientBoostingClassifier,
                              HistGradientBoostingRegressor,
                              ExtraTreesClassifier)

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import FunctionTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
%matplotlib inline

import seaborn as sns
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

from sklearn.multioutput import RegressorChain
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

import itertools
import warnings
import logging
from openpyxl import load_workbook

import statsmodels.api as sm
from pylab import rcParams
import scipy.stats as ss

warnings.filterwarnings('ignore')
#plt.style.use('fivethirtyeight')

# Setting rc parameters in seaborn for plots and graphs-
# Reference - https://matplotlib.org/stable/tutorials/introductory/customizing.html:-
# To alter this, refer to matplotlib.rcParams.keys()

sns.set({"axes.facecolor"       : "#ffffff",
         "figure.facecolor"     : "#ffffff",
         "axes.edgecolor"       : "#000000",
         "grid.color"           : "#ffffff",
         "font.family"          : ['Cambria'],
         "axes.labelcolor"      : "#000000",
         "xtick.color"          : "#000000",
         "ytick.color"          : "#000000",
         "grid.linewidth"       : 0.5,
         'grid.alpha'           :0.5,
         "grid.linestyle"       : "--",
         "axes.titlecolor"      : 'black',
         'axes.titlesize'       : 12,
         'axes.labelweight'     : "bold",
         'legend.fontsize'      : 7.0,
         'legend.title_fontsize': 7.0,
         'font.size'            : 7.5,
         'xtick.labelsize'      : 7.5,
         'ytick.labelsize'      : 7.5,
        });

sns.set_style("whitegrid",{"grid.linestyle":"--", 'grid.linewidth':0.2, 'grid.alpha':0.5})
# Set Style
mpl.rcParams['figure.dpi'] = 120;

# Making sklearn pipeline outputs as dataframe:-
pd.set_option('display.max_columns', 100);
pd.set_option('display.max_rows', 50);

sns.despine(left=True, bottom=True, top=False, right=False)

mpl.rcParams['axes.spines.left'] = True
mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.spines.top'] = False
mpl.rcParams['axes.spines.bottom'] = True

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



<Figure size 960x660 with 0 Axes>

## 1.0 Functions:

In [4]:
def encode_target(y_train, y_test, encoder_type='label', enc_strategy=False):
    """
    Encodes the target columns in the training and testing data
    using the specified encoder type.

    Parameters:
    y_train (pd.Series or pd.DataFrame): Training target data.
    y_test (pd.Series or pd.DataFrame): Testing target data.

    Returns:
    y_train_encoded (pd.Series): Encoded training target data.
    y_test_encoded (pd.Series): Encoded testing target data.
    """

    if encoder_type == 'label':
        encoder = LabelEncoder()
        y_train_encoded = encoder.fit_transform(y_train)
        y_test_encoded = encoder.transform(y_test)

        y_train_encoded = pd.Series(y_train_encoded, index=y_train.index, name="Target")
        y_test_encoded = pd.Series(y_test_encoded, index=y_test.index, name="Target")


    elif encoder_type == 'onehot':
        y_train_ = y_train.values.reshape(-1, 1)
        y_test_ = y_test.values.reshape(-1, 1)

        encoder = OneHotEncoder(sparse_output=False)
        y_train_encoded = encoder.fit_transform(y_train_)
        y_test_encoded = encoder.transform(y_test_)

        y_train_encoded = pd.DataFrame(y_train_encoded, index=y_train.index)
        y_test_encoded = pd.DataFrame(y_test_encoded, index=y_test.index)

    else:
        raise ValueError("Invalid encoder_type. Currently supported: 'label'.")

    if enc_strategy:
        return y_train_encoded, y_test_encoded, encoder

    else:
        return y_train_encoded, y_test_encoded

def encode_data(X_train, X_test, encoder_type='label', columns=None, map=None):
    """
    Encodes the training and testing data using the specified encoder type.

    Parameters:
    X_train (pd.DataFrame): Training data.
    X_test (pd.DataFrame): Testing data.
    encoder_type (str): Type of encoder ('label' or 'onehot'). Default is 'label'.
    columns (list): List of columns to encode. If None, all object type columns are encoded.

    Returns:
    X_train_encoded (pd.DataFrame): Encoded training data.
    X_test_encoded (pd.DataFrame): Encoded testing data.
    """

    if columns is None:
        # Default to all object type columns if no columns are specified
        columns = X_train.select_dtypes(include=['object']).columns.tolist()

    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()

    if encoder_type == 'label':
        for col in columns:
            le = LabelEncoder()
            X_train_encoded[col] = le.fit_transform(X_train[col])
            X_test_encoded[col] = le.transform(X_test[col])

    elif encoder_type == 'onehot':
        for col in columns:
            ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
            # Fit the encoder on the training data and transform both training and test data
            encoded_train = ohe.fit_transform(X_train[[col]])
            encoded_test = ohe.transform(X_test[[col]])

            # Create a DataFrame with the encoded data
            encoded_train_df = pd.DataFrame(encoded_train, columns=ohe.get_feature_names_out([col]))
            encoded_test_df = pd.DataFrame(encoded_test, columns=ohe.get_feature_names_out([col]))

            # Concatenate the new columns to the original dataframes and drop the original columns
            X_train_encoded = pd.concat([X_train_encoded.drop(col, axis=1), encoded_train_df], axis=1)
            X_test_encoded = pd.concat([X_test_encoded.drop(col, axis=1), encoded_test_df], axis=1)

    elif encoder_type == 'count_encoder':

          for col in columns:

                target_encoder = CountEncoder(cols=columns)
                X_train_encoded = target_encoder.fit_transform(X_train_encoded)
                X_test_encoded = target_encoder.transform(X_test_encoded)

    else:
        raise ValueError("Invalid encoder_type. Currently supported: 'label', 'onehot', 'target_encoder'.")

    return X_train_encoded, X_test_encoded

## **Importing the Dataset**

## **Files**
* activities.txt - a list of activity names that appear in the activity-X:XX columns
* sample_submission.csv - a sample submission file in the correct format
* test.csv - the test set
* train.csv - the training set

## **Columns**
* train.csv:
    * **id - row id** consisting of participant number and a count for that participant
    * **p_num** - participant number
    * **time** - time of day in the format HH:MM:SS
    * **bg-X:XX** - blood glucose reading in mmol/L, X:XX(H:SS) time in the past (e.g. bg-2:35, would be the blood glucose reading from 2 hours and 35 minutes before the time value for that row), recorded by the continuous glucose monitor
    * **insulin-X:XX** - total insulin dose received in units in the last 5 minutes, X:XX(H:SS) time in the past (e.g. insulin-2:35, would be the total insulin dose received between 2 hours and 40 minutes and 2 hours and 35 minutes before the time value for that row), recorded by the insulin pump
    * **carbs-X:XX** - total carbohydrate value consumed in grammes in the last 5 minutes, X:XX(H:SS) time in the past (e.g. carbs-2:35, would be the total carbohydrate value consumed between 2 hours and 40 minutes and 2 hours and 35 minutes before the time value for that row), recorded by the participant
    * **hr-X:XX** - mean heart rate in beats per minute in the last 5 minutes, X:XX(H:SS) time in the past (e.g. hr-2:35, would be the mean heart rate between 2 hours and 40 minutes and 2 hours and 35 minutes before the time value for that row), recorded by the smartwatch
    * **steps-X:XX** - total steps walked in the last 5 minutes, X:XX(H:SS) time in the past (e.g. * steps-2:35, would be the total steps walked between 2 hours and 40 minutes and 2 hours and 35 minutes before the time value for that row), recorded by the smartwatch
    * **cals-X:XX** - total calories burnt in the last 5 minutes, X:XX(H:SS) time in the past (e.g. cals-2:35, would be the total calories burned between 2 hours and 40 minutes and 2 hours and 35 minutes before the time value for that row), calculated by the smartwatch
    * **activity-X:XX** - self-declared activity performed in the last 5 minutes, X:XX(H:SS) time in the past (e.g. activity-2:35, would show a string name of the activity performed between 2 hours and 40 minutes and 2 hours and 35 minutes before the time value for that row), set on the smartwatch
    * **bg+1:00** - blood glucose reading in mmol/L an hour in the future, this is the value you will be predicting (not provided in test.csv)

In [5]:
df=pd.read_csv("/content/drive/MyDrive/Exercises/Studies_Structured_Data/Data/Glucose/train.csv", index_col=0)
df_test=pd.read_csv("/content/drive/MyDrive/Exercises/Studies_Structured_Data/Data/Glucose/test.csv", index_col=0)

train_index=df.index
test_index=df_test.index

df_test.shape,df.shape

((3644, 506), (177024, 507))

In [6]:
df.p_num.unique()

array(['p01', 'p02', 'p03', 'p04', 'p05', 'p06', 'p10', 'p11', 'p12'],
      dtype=object)

In [7]:
df.head(5)

Unnamed: 0_level_0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,bg-5:15,bg-5:10,bg-5:05,bg-5:00,bg-4:55,bg-4:50,bg-4:45,bg-4:40,bg-4:35,bg-4:30,bg-4:25,bg-4:20,bg-4:15,bg-4:10,bg-4:05,bg-4:00,bg-3:55,bg-3:50,bg-3:45,bg-3:40,bg-3:35,bg-3:30,bg-3:25,bg-3:20,bg-3:15,bg-3:10,bg-3:05,bg-3:00,bg-2:55,bg-2:50,bg-2:45,bg-2:40,bg-2:35,bg-2:30,bg-2:25,bg-2:20,bg-2:15,bg-2:10,bg-2:05,bg-2:00,...,activity-4:00,activity-3:55,activity-3:50,activity-3:45,activity-3:40,activity-3:35,activity-3:30,activity-3:25,activity-3:20,activity-3:15,activity-3:10,activity-3:05,activity-3:00,activity-2:55,activity-2:50,activity-2:45,activity-2:40,activity-2:35,activity-2:30,activity-2:25,activity-2:20,activity-2:15,activity-2:10,activity-2:05,activity-2:00,activity-1:55,activity-1:50,activity-1:45,activity-1:40,activity-1:35,activity-1:30,activity-1:25,activity-1:20,activity-1:15,activity-1:10,activity-1:05,activity-1:00,activity-0:55,activity-0:50,activity-0:45,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
p01_0,p01,06:10:00,,,9.6,,,9.7,,,9.2,,,8.7,,,8.4,,,8.1,,,8.3,,,9.6,,,11.1,,,11.8,,,12.8,,,13.9,,,14.2,,,14.2,,,15.4,,,17.2,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,13.4
p01_1,p01,06:25:00,,,9.7,,,9.2,,,8.7,,,8.4,,,8.1,,,8.3,,,9.6,,,11.1,,,11.8,,,12.8,,,13.9,,,14.2,,,14.2,,,15.4,,,17.2,,,18.2,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12.8
p01_2,p01,06:40:00,,,9.2,,,8.7,,,8.4,,,8.1,,,8.3,,,9.6,,,11.1,,,11.8,,,12.8,,,13.9,,,14.2,,,14.2,,,15.4,,,17.2,,,18.2,,,18.4,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15.5
p01_3,p01,06:55:00,,,8.7,,,8.4,,,8.1,,,8.3,,,9.6,,,11.1,,,11.8,,,12.8,,,13.9,,,14.2,,,14.2,,,15.4,,,17.2,,,18.2,,,18.4,,,18.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14.8
p01_4,p01,07:10:00,,,8.4,,,8.1,,,8.3,,,9.6,,,11.1,,,11.8,,,12.8,,,13.9,,,14.2,,,14.2,,,15.4,,,17.2,,,18.2,,,18.4,,,18.0,,,17.3,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12.7


In [8]:
df_test.head()

Unnamed: 0_level_0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,bg-5:15,bg-5:10,bg-5:05,bg-5:00,bg-4:55,bg-4:50,bg-4:45,bg-4:40,bg-4:35,bg-4:30,bg-4:25,bg-4:20,bg-4:15,bg-4:10,bg-4:05,bg-4:00,bg-3:55,bg-3:50,bg-3:45,bg-3:40,bg-3:35,bg-3:30,bg-3:25,bg-3:20,bg-3:15,bg-3:10,bg-3:05,bg-3:00,bg-2:55,bg-2:50,bg-2:45,bg-2:40,bg-2:35,bg-2:30,bg-2:25,bg-2:20,bg-2:15,bg-2:10,bg-2:05,bg-2:00,...,activity-4:05,activity-4:00,activity-3:55,activity-3:50,activity-3:45,activity-3:40,activity-3:35,activity-3:30,activity-3:25,activity-3:20,activity-3:15,activity-3:10,activity-3:05,activity-3:00,activity-2:55,activity-2:50,activity-2:45,activity-2:40,activity-2:35,activity-2:30,activity-2:25,activity-2:20,activity-2:15,activity-2:10,activity-2:05,activity-2:00,activity-1:55,activity-1:50,activity-1:45,activity-1:40,activity-1:35,activity-1:30,activity-1:25,activity-1:20,activity-1:15,activity-1:10,activity-1:05,activity-1:00,activity-0:55,activity-0:50,activity-0:45,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
p01_8459,p01,06:45:00,,9.2,,,10.2,,,10.3,,,10.2,,,11.7,,,13.5,,,15.3,,,15.0,,,14.4,,,13.6,,,12.6,,,11.9,,,11.4,,,11.9,,,12.9,,,13.8,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
p01_8460,p01,11:25:00,,,9.9,,,9.4,,,9.1,,,8.3,,,7.7,,,7.8,,,7.7,,,7.1,,,7.1,,,6.7,,,6.6,,,6.9,,,8.6,,,9.7,,,9.1,,,7.7,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Walk,Walk,Walk
p01_8461,p01,14:45:00,,5.5,,,5.5,,,5.2,,,5.2,,,,5.1,,,5.1,,,4.7,,,4.4,,,4.8,,,6.1,,,6.9,,,6.3,,,5.2,,,4.7,,,5.1,,,5.1,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
p01_8462,p01,04:30:00,,3.4,,,3.9,,,4.7,,,4.1,,,3.5,,,3.9,,,4.3,,,4.6,,,3.8,,,3.1,,,3.6,,,4.2,,,4.2,,,4.1,,,4.2,,,3.9,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
p01_8463,p01,04:20:00,,,8.3,,,10.0,,,12.2,,,12.8,,,12.8,,,12.2,,,11.4,,,12.1,,,11.8,,,10.7,,,9.6,,,8.9,,,8.4,,,7.3,,,5.8,,,4.3,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [9]:
df_all = pd.concat([df,df_test])

Sub-dataset are created for each main set of features to inpute missing values:

In [10]:
# Select columns containing the word "bg"
bg_col = df_all.filter(regex='bg|time|p_num')
insulin_col = df_all.filter(regex='insulin|time|p_num')
carb_col = df_all.filter(regex='carbs|time|p_num')
hr_col = df_all.filter(regex='hr|time|p_num')
step_col = df_all.filter(regex='steps|time|p_num')
cal_col = df_all.filter(regex='cal|time|p_num')
act_col = df_all.filter(regex='activity|time|p_num')

In [11]:
#act_col.unique()

## **1.0 - Fill bg-xx:xx NaN**

In [12]:
bg_col_T = bg_col.drop(columns=["p_num","time","bg+1:00"]).T.copy()
bg_col_T.interpolate(inplace=True)
bg_col_T.ffill(inplace=True)
bg_col_T.bfill(inplace=True)

bg_col_T.isna().sum().sum()

0

In [13]:
df_all[bg_col_T.T.columns] = np.round(bg_col_T.T,1).values

## **2.0 - Fill Insulin NaN**

In [14]:
insulin_col.isna().sum().sum()

688995

In [15]:
#msno.matrix(insulin_col,figsize=(12,5), fontsize=10);

**The operation below is repeated three times. For:"p11","p15","p21"**

**p11**

In [16]:
insulin_fillna = insulin_col[insulin_col["p_num"]=="p11"].drop(["p_num"],axis=1)#.reset_index()#.mean(axis=1)
print(insulin_fillna.shape)
display(insulin_fillna.head(),insulin_fillna.isna().sum().sum())

(24776, 73)


Unnamed: 0_level_0,time,insulin-5:55,insulin-5:50,insulin-5:45,insulin-5:40,insulin-5:35,insulin-5:30,insulin-5:25,insulin-5:20,insulin-5:15,insulin-5:10,insulin-5:05,insulin-5:00,insulin-4:55,insulin-4:50,insulin-4:45,insulin-4:40,insulin-4:35,insulin-4:30,insulin-4:25,insulin-4:20,insulin-4:15,insulin-4:10,insulin-4:05,insulin-4:00,insulin-3:55,insulin-3:50,insulin-3:45,insulin-3:40,insulin-3:35,insulin-3:30,insulin-3:25,insulin-3:20,insulin-3:15,insulin-3:10,insulin-3:05,insulin-3:00,insulin-2:55,insulin-2:50,insulin-2:45,insulin-2:40,insulin-2:35,insulin-2:30,insulin-2:25,insulin-2:20,insulin-2:15,insulin-2:10,insulin-2:05,insulin-2:00,insulin-1:55,insulin-1:50,insulin-1:45,insulin-1:40,insulin-1:35,insulin-1:30,insulin-1:25,insulin-1:20,insulin-1:15,insulin-1:10,insulin-1:05,insulin-1:00,insulin-0:55,insulin-0:50,insulin-0:45,insulin-0:40,insulin-0:35,insulin-0:30,insulin-0:25,insulin-0:20,insulin-0:15,insulin-0:10,insulin-0:05,insulin-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1
p11_0,06:05:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
p11_1,06:10:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
p11_2,06:15:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
p11_3,06:20:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
p11_4,06:25:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


677556

In [17]:
unique_times = insulin_fillna.time.unique()

for time in tqdm(unique_times):
    list_of_means = insulin_fillna[insulin_fillna.time==time].iloc[:,1:].mean(axis=0)

    for col in insulin_fillna.columns[1:]:

      mask = (insulin_fillna['time'] == time)
      insulin_fillna.loc[mask, col] = insulin_fillna.loc[mask, col].fillna(list_of_means[col])

100%|██████████| 288/288 [01:14<00:00,  3.85it/s]


In [18]:
insulin_fillna.head()

Unnamed: 0_level_0,time,insulin-5:55,insulin-5:50,insulin-5:45,insulin-5:40,insulin-5:35,insulin-5:30,insulin-5:25,insulin-5:20,insulin-5:15,insulin-5:10,insulin-5:05,insulin-5:00,insulin-4:55,insulin-4:50,insulin-4:45,insulin-4:40,insulin-4:35,insulin-4:30,insulin-4:25,insulin-4:20,insulin-4:15,insulin-4:10,insulin-4:05,insulin-4:00,insulin-3:55,insulin-3:50,insulin-3:45,insulin-3:40,insulin-3:35,insulin-3:30,insulin-3:25,insulin-3:20,insulin-3:15,insulin-3:10,insulin-3:05,insulin-3:00,insulin-2:55,insulin-2:50,insulin-2:45,insulin-2:40,insulin-2:35,insulin-2:30,insulin-2:25,insulin-2:20,insulin-2:15,insulin-2:10,insulin-2:05,insulin-2:00,insulin-1:55,insulin-1:50,insulin-1:45,insulin-1:40,insulin-1:35,insulin-1:30,insulin-1:25,insulin-1:20,insulin-1:15,insulin-1:10,insulin-1:05,insulin-1:00,insulin-0:55,insulin-0:50,insulin-0:45,insulin-0:40,insulin-0:35,insulin-0:30,insulin-0:25,insulin-0:20,insulin-0:15,insulin-0:10,insulin-0:05,insulin-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1
p11_0,06:05:00,0.080842,0.098657,0.080072,0.136675,0.080072,0.078766,0.076809,0.107706,0.236604,0.075,0.103302,0.075,0.075,0.075,0.075708,0.097642,0.075708,0.075,0.075,0.275,0.075,0.075,0.200236,0.075,0.075708,0.12783,0.075,0.124057,0.075,0.075,0.246698,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.182547,0.075,0.075,0.075,0.075,0.075,0.0917,0.127549,0.157738,0.13887,0.0917,0.0917,0.0917,0.0917,0.205928,0.090834,0.12857,0.090834,0.090834,0.156872,0.090834,0.090834,0.090834,0.089968,0.146572,0.108836,0.089968,0.116383,0.089968,0.089968,0.073585
p11_1,06:10:00,0.101689,0.086689,0.143218,0.087862,0.086047,0.144715,0.112095,0.285876,0.077553,0.102273,0.075,0.075,0.075,0.075682,0.096818,0.075682,0.075418,0.076231,0.26938,0.076971,0.077207,0.19806,0.077,0.07688,0.126978,0.076436,0.124496,0.077973,0.078395,0.243971,0.078,0.076907,0.075987,0.076153,0.076896,0.077625,0.078275,0.07904,0.078635,0.078253,0.181829,0.078287,0.077785,0.077133,0.077295,0.077278,0.094498,0.128727,0.156931,0.13898,0.093013,0.092973,0.093025,0.092549,0.202427,0.092904,0.128671,0.09094,0.091167,0.154502,0.09086,0.090662,0.089836,0.088513,0.142909,0.106545,0.088364,0.113715,0.087884,0.088098,0.0725,0.072335
p11_2,06:15:00,0.080072,0.136675,0.080072,0.078766,0.076809,0.107706,0.236604,0.075,0.103302,0.075,0.075,0.075,0.075708,0.097642,0.075708,0.075,0.075,0.275,0.075,0.075,0.200236,0.075,0.075708,0.12783,0.075,0.124057,0.075,0.075,0.246698,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.182547,0.075,0.075,0.075,0.075,0.075,0.0917,0.127549,0.157738,0.13887,0.0917,0.0917,0.0917,0.0917,0.205928,0.090834,0.12857,0.090834,0.090834,0.156872,0.090834,0.090834,0.090834,0.089968,0.146572,0.108836,0.089968,0.116383,0.089968,0.089968,0.073585,0.073585,0.160377
p11_3,06:20:00,0.135533,0.079978,0.078696,0.076776,0.1071,0.233611,0.075,0.102778,0.074998,0.074843,0.074485,0.074913,0.096372,0.074902,0.074143,0.074089,0.270533,0.07443,0.074641,0.197798,0.075,0.076035,0.128026,0.076826,0.125276,0.077143,0.076861,0.24493,0.0761,0.075806,0.075489,0.075248,0.075,0.075,0.075,0.07525,0.075961,0.076359,0.181337,0.075085,0.075304,0.076528,0.077926,0.077635,0.092559,0.126983,0.157033,0.139843,0.094319,0.093709,0.092735,0.091972,0.203837,0.090863,0.127833,0.090798,0.090804,0.155639,0.09085,0.09085,0.090848,0.089998,0.145554,0.108515,0.089976,0.115639,0.089041,0.088559,0.072567,0.072789,0.157944,0.152407
p11_4,06:25:00,0.080072,0.078766,0.076809,0.107706,0.236604,0.075,0.103302,0.075,0.075,0.075,0.075708,0.097642,0.075708,0.075,0.075,0.275,0.075,0.075,0.200236,0.075,0.075708,0.12783,0.075,0.124057,0.075,0.075,0.246698,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.182547,0.075,0.075,0.075,0.075,0.075,0.0917,0.127549,0.157738,0.13887,0.0917,0.0917,0.0917,0.0917,0.205928,0.090834,0.12857,0.090834,0.090834,0.156872,0.090834,0.090834,0.090834,0.089968,0.146572,0.108836,0.089968,0.116383,0.089968,0.089968,0.073585,0.073585,0.160377,0.154717,0.072792


In [19]:
#df_all.filter(regex='insulin|time|id|p_num')
insulin_fillna.isna().sum().sum(),insulin_fillna.shape,insulin_col.shape

(0, (24776, 73), (180668, 74))

**Replace values in df**

In [20]:
insulin_col.loc[insulin_fillna.index, insulin_fillna.columns] = insulin_fillna.values
insulin_col.loc[insulin_fillna.index, insulin_fillna.columns].isna().sum().sum(),insulin_col.shape

(0, (180668, 74))

In [21]:
insulin_col[insulin_col["insulin-5:55"].isna()==True].p_num.unique()

array(['p15', 'p21'], dtype=object)

In [22]:
#msno.matrix(insulin_col,figsize=(12,5), fontsize=10);

**-----------------------------p15-----------------------------**

In [23]:
p15 = insulin_col[insulin_col["p_num"]=="p15"].copy()
p15_sorted = insulin_col[insulin_col["p_num"]=="p15"].sort_values(by="time").copy()

In [24]:
#display(p15.head())
#display(p15_sorted.head())

In [25]:
p15_sorted.bfill(inplace=True)
p15_sorted.ffill(inplace=True)
p15_sorted.isna().sum().sum()

0

In [26]:
#p15_sorted = p15_sorted.loc[p15.index]
#p15_sorted

In [27]:
insulin_col.loc[p15_sorted.index, p15_sorted.columns] = p15_sorted.values
insulin_col.loc[p15_sorted.index, p15_sorted.columns].isna().sum().sum()

0

In [28]:
insulin_col[insulin_col["insulin-0:00"].isna()==True].p_num.unique()

array(['p06', 'p12', 'p21'], dtype=object)

In [29]:
df_all[insulin_col.columns] = insulin_col.values

**-----------------------------p21-----------------------------**

In [30]:
p21 = insulin_col[insulin_col["p_num"]=="p21"].copy()
p21_sorted = insulin_col[insulin_col["p_num"]=="p21"].sort_values(by="time").copy()

In [31]:
p21[p21["insulin-0:00"].isna()==True].index#.sum().sum()

Index(['p21_11', 'p21_66', 'p21_80', 'p21_200', 'p21_225'], dtype='object', name='id')

In [32]:
p21.loc["p21_11"] = p21.loc["p21_11"].fillna(method="ffill").values
p21.loc["p21_66","insulin-5:55":] = p21_sorted.iloc[207:212,2:].mean(axis=0).values
p21.loc["p21_80","insulin-5:55":] = p21_sorted.iloc[65:74,2:].mean(axis=0).values
p21.loc["p21_200","insulin-5:55":] = p21_sorted.iloc[74:83,2:].mean(axis=0).values
p21.loc["p21_225","insulin-5:55":] = p21_sorted.iloc[166:177,2:].mean(axis=0).values

In [33]:
#p21_sorted.iloc[166:177]#.mean(axis=0)
#p21_sorted.loc[:"p21_225"]

In [34]:
(insulin_col.index==insulin_col.index).all()

True

In [35]:
insulin_col.loc[p21.index, p21.columns] = p21.values
insulin_col.isna().sum().sum()

112

**-----------------------------p06-----------------------------**

In [36]:
p06 = insulin_col[insulin_col["p_num"]=="p06"].copy()
p06_sorted = insulin_col[insulin_col["p_num"]=="p06"].sort_values(by="time").copy()

In [37]:
p06[p06["insulin-0:00"].isna()==True].index#.sum().sum()

Index(['p06_8396'], dtype='object', name='id')

In [38]:
#p06_sorted.iloc[1670:1684]#.mean(axis=0)
#p06_sorted.loc[:"p06_8396"]

In [39]:
p06.loc["p06_8396","insulin-5:25":] = p06_sorted.iloc[1670:1684,8:].mean(axis=0).values

In [40]:
p06[p06["insulin-0:00"].isna()==True].index#.sum().sum()

Index([], dtype='object', name='id')

In [41]:
insulin_col.loc[p06.index, p06.columns] = p06.values
insulin_col.isna().sum().sum()

46

**-----------------------------p12-----------------------------**

In [42]:
p12 = insulin_col[insulin_col["p_num"]=="p12"].copy()
p12_sorted = insulin_col[insulin_col["p_num"]=="p12"].sort_values(by="time").copy()

In [43]:
p12[p12["insulin-0:00"].isna()==True].index#.sum().sum()

Index(['p12_25319'], dtype='object', name='id')

In [44]:
#p12_sorted.iloc[1670:1684]#.mean(axis=0)
#p12_sorted[p12_sorted.time=="21:00:00"]#.loc[:"p12_25319"]

In [45]:
p12.loc["p12_25319","insulin-3:45":] = p12_sorted[p12_sorted.time=="21:00:00"].iloc[:,28:].mean(axis=0).values

In [46]:
p12[p12["insulin-0:00"].isna()==True].index#.sum().sum()

Index([], dtype='object', name='id')

In [47]:
insulin_col.loc[p12.index, p12.columns] = p12.values
insulin_col.isna().sum().sum()

0

### Replace in original dataset:

In [48]:
df_all[insulin_col.columns] = insulin_col.values

In [49]:
insulin_col = df_all.filter(regex='insulin|time|id|p_num')
insulin_col.isna().sum().sum()

0

## **3.0 - Fill NaN for Carb Columns**

In [50]:
#carb_col_ = carb_col.fillna(0)
#carb_col_.iloc[28:50,2:].T.plot()

The most reasonable way to fill the missing values for these features seem to be with 0s.

In [51]:
df_all[df_all.filter(regex='carbs').columns] = df_all.filter(regex='carbs').fillna(0)

## **4.0 - Fill NaN for HR Columns**

In [52]:
#msno.matrix(hr_col,figsize=(12,5), fontsize=10);

In [53]:
#hr_col.reset_index().to_csv("hr.csv")

In [54]:
df_ = hr_col.copy()

# Get all column names
all_columns = df_.columns

# Select the columns that start with 'hr-'
columns_to_fill = [col for col in all_columns if col.startswith('hr-')]

# Group by 'p_num' and 'time', calculate the mean for each group
grouped_df = df_.groupby(['p_num', 'time'])[columns_to_fill].mean().reset_index()

# Merge the grouped DataFrame with the original DataFrame
merged_df = pd.merge(df_, grouped_df, on=['p_num', 'time'], how='left', suffixes=('', '_mean'))

# Select columns in `columns_to_fill` from `merged_df`
filled_df = merged_df[columns_to_fill]

# Iterate over columns and fill missing values
for col in columns_to_fill:
    mean_col = col + '_mean'  # Get the name of the corresponding mean column
    filled_df[col] = filled_df[col].fillna(merged_df[mean_col])

filled_df = filled_df.ffill(axis=1)
filled_df = filled_df.bfill(axis=1)

In [55]:
hr_col[filled_df.columns] = filled_df.values

In [56]:
df_ = hr_col.copy()

# Get all column names
all_columns = df_.columns

# Select the columns that start with 'hr-'
columns_to_fill = [col for col in all_columns if col.startswith('hr-')]

# Group by 'p_num' and 'time', calculate the mean for each group
grouped_df = df_.groupby(['p_num'])[columns_to_fill].mean().reset_index()

# Merge the grouped DataFrame with the original DataFrame
merged_df = pd.merge(df_, grouped_df, on=['p_num'], how='left', suffixes=('', '_mean'))

# Select columns in `columns_to_fill` from `merged_df`
filled_df = merged_df[columns_to_fill]

# Iterate over columns and fill missing values
for col in columns_to_fill:
    mean_col = col + '_mean'  # Get the name of the corresponding mean column
    filled_df[col] = filled_df[col].fillna(merged_df[mean_col])

In [57]:
filled_df = np.round(filled_df,1)
hr_col[filled_df.columns] = filled_df.values

In [58]:
#msno.matrix(hr_col,figsize=(12,5), fontsize=10);

### Replace in original dataset:

In [59]:
df_all[hr_col.columns] = hr_col.values

In [60]:
hr_col = df_all.filter(regex='hr|time|id|p_num')
hr_col.isna().sum().sum()

0

## **5.0 - Fill NaN for Step Columns**

In [61]:
step_col.groupby(["p_num","time"]).mean().iloc[30:80]
step_col.tail(10)

Unnamed: 0_level_0,p_num,time,steps-5:55,steps-5:50,steps-5:45,steps-5:40,steps-5:35,steps-5:30,steps-5:25,steps-5:20,steps-5:15,steps-5:10,steps-5:05,steps-5:00,steps-4:55,steps-4:50,steps-4:45,steps-4:40,steps-4:35,steps-4:30,steps-4:25,steps-4:20,steps-4:15,steps-4:10,steps-4:05,steps-4:00,steps-3:55,steps-3:50,steps-3:45,steps-3:40,steps-3:35,steps-3:30,steps-3:25,steps-3:20,steps-3:15,steps-3:10,steps-3:05,steps-3:00,steps-2:55,steps-2:50,steps-2:45,steps-2:40,steps-2:35,steps-2:30,steps-2:25,steps-2:20,steps-2:15,steps-2:10,steps-2:05,steps-2:00,steps-1:55,steps-1:50,steps-1:45,steps-1:40,steps-1:35,steps-1:30,steps-1:25,steps-1:20,steps-1:15,steps-1:10,steps-1:05,steps-1:00,steps-0:55,steps-0:50,steps-0:45,steps-0:40,steps-0:35,steps-0:30,steps-0:25,steps-0:20,steps-0:15,steps-0:10,steps-0:05,steps-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1
p24_251,p24,22:00:00,,,,0.0,0.0,0.0,23.0,51.0,0.0,0.0,26.0,177.0,313.0,239.0,0.0,0.0,,,,18.0,185.0,12.0,0.0,0.0,0.0,,0.0,45.0,,,,,62.0,,,62.0,,,105.0,146.0,10.0,13.0,23.0,6.0,16.0,141.0,135.0,364.0,333.0,389.0,241.0,185.0,311.0,397.0,349.0,115.0,0.0,0.0,406.0,317.0,260.0,77.0,233.0,6.0,0.0,254.0,0.0,0.0,6.0,0.0,22.0,0.0
p24_252,p24,08:20:00,,0.0,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,0.0
p24_253,p24,03:15:00,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,4.0,0.0,,,0.0,,,,0.0,,,,0.0,,8.0,,0.0,,11.0,0.0,0.0,,58.0,0.0,21.0,,10.0,6.0,0.0,4.0,9.0,,,,23.0,7.0,,21.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,0.0,,,,,,,,,
p24_254,p24,00:35:00,0.0,17.0,33.0,390.0,398.0,238.0,166.0,534.0,45.0,0.0,0.0,0.0,0.0,0.0,65.0,0.0,0.0,106.0,229.0,157.0,136.0,6.0,11.0,0.0,31.0,13.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,88.0,99.0,51.0,0.0,0.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,17.0,,31.0,55.0,47.0,0.0,,,,0.0,133.0,71.0,38.0,0.0,0.0,0.0,,,,0.0,,39.0,196.0,444.0,251.0
p24_255,p24,03:30:00,56.0,64.0,35.0,31.0,16.0,25.0,13.0,0.0,6.0,23.0,46.0,11.0,0.0,0.0,45.0,6.0,9.0,17.0,332.0,60.0,63.0,40.0,0.0,78.0,17.0,0.0,0.0,0.0,49.0,0.0,0.0,32.0,47.0,,,,,,0.0,70.0,0.0,,0.0,27.0,37.0,46.0,11.0,37.0,0.0,0.0,7.0,,7.0,,268.0,127.0,35.0,139.0,0.0,0.0,0.0,397.0,49.0,0.0,,0.0,,,,,,0.0
p24_256,p24,06:40:00,,0.0,0.0,4.0,69.0,0.0,,0.0,5.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,,,,,0.0,,,,,,,,0.0,0.0,,11.0,22.0,,0.0,0.0,,,,,0.0,10.0,,,0.0,,,,,,,,,,,,
p24_257,p24,12:30:00,0.0,,,,0.0,0.0,,,,,,0.0,0.0,,,0.0,,,,,0.0,,0.0,,,,0.0,,0.0,,,0.0,,,,,0.0,,0.0,0.0,,0.0,0.0,7.0,,0.0,6.0,0.0,0.0,,,,,0.0,0.0,0.0,21.0,7.0,0.0,4.0,37.0,0.0,19.0,27.0,115.0,197.0,42.0,111.0,80.0,0.0,9.0,0.0
p24_258,p24,03:45:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
p24_259,p24,06:10:00,,,0.0,0.0,,,,,,0.0,,,,0.0,,0.0,,0.0,,0.0,0.0,,,,,,0.0,0.0,,,,0.0,,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,
p24_260,p24,03:10:00,58.0,98.0,30.0,29.0,29.0,0.0,0.0,4.0,4.0,44.0,78.0,0.0,21.0,27.0,4.0,22.0,0.0,0.0,34.0,17.0,12.0,114.0,149.0,145.0,8.0,16.0,66.0,0.0,0.0,0.0,,0.0,0.0,6.0,0.0,0.0,,9.0,28.0,,,,,,0.0,0.0,78.0,0.0,,,,6.0,,8.0,,,,0.0,,,,,,,,,,0.0,,0.0,,


In [62]:
df_ = step_col.copy()

# Get all column names
all_columns = df_.columns

# Select the columns that start with 'hr-'
columns_to_fill = [col for col in all_columns if col.startswith('steps-')]

# Group by 'p_num' and 'time', calculate the mean for each group
grouped_df = df_.groupby(['p_num', 'time'])[columns_to_fill].mean().reset_index()

# Merge the grouped DataFrame with the original DataFrame
merged_df = pd.merge(df_, grouped_df, on=['p_num', 'time'], how='left', suffixes=('', '_mean'))

# Select columns in `columns_to_fill` from `merged_df`
filled_df = merged_df[columns_to_fill]

# Iterate over columns and fill missing values
for col in columns_to_fill:
    mean_col = col + '_mean'  # Get the name of the corresponding mean column
    filled_df[col] = filled_df[col].fillna(merged_df[mean_col])

filled_df = filled_df.ffill(axis=1)
filled_df = filled_df.bfill(axis=1)

In [63]:
filled_df = np.round(filled_df,0)
step_col[filled_df.columns] = filled_df.values

In [64]:
#msno.matrix(filled_df,figsize=(8,3), fontsize=8);

**-----------------------------p24-----------------------------**

In [65]:
p24 = step_col[step_col["p_num"]=="p24"].copy()
p24_sorted = step_col[step_col["p_num"]=="p24"].sort_values(by="time").copy()

In [66]:
step_col[step_col["steps-0:00"].isna()==True].index#.sum().sum()

Index(['p15_2', 'p15_4', 'p15_8', 'p15_9', 'p15_11', 'p15_14', 'p15_19',
       'p15_21', 'p15_22', 'p15_28',
       ...
       'p24_48', 'p24_50', 'p24_115', 'p24_131', 'p24_159', 'p24_184',
       'p24_198', 'p24_233', 'p24_245', 'p24_258'],
      dtype='object', name='id', length=326)

In [67]:
p24_sorted.ffill(inplace=True)
p24_sorted.bfill(inplace=True)

In [68]:
p24_sorted = p24_sorted.loc[p24.index]
step_col.loc[p24_sorted.index, p24_sorted.columns] = p24_sorted.values

In [69]:
p24_sorted.isna().sum().sum(),step_col.isna().sum().sum()

(0, 22536)

**-----------------------------p15-----------------------------**

In [70]:
p15 = step_col[step_col["p_num"]=="p15"].copy()
p15_sorted = step_col[step_col["p_num"]=="p15"].sort_values(by="time").copy()

In [71]:
#p12_sorted.iloc[1670:1684]#.mean(axis=0)
#p15_sorted[p12_sorted.time=="21:00:00"]#.loc[:"p12_25319"]
#p15_sorted[p15_sorted["steps-0:00"].isna()==True]
#p15_sorted

In [72]:
step_col[step_col["steps-0:00"].isna()==True].index#.sum().sum()

Index(['p15_2', 'p15_4', 'p15_8', 'p15_9', 'p15_11', 'p15_14', 'p15_19',
       'p15_21', 'p15_22', 'p15_28',
       ...
       'p22_163', 'p22_166', 'p22_167', 'p22_174', 'p22_175', 'p22_181',
       'p22_185', 'p22_190', 'p22_193', 'p22_197'],
      dtype='object', name='id', length=313)

In [73]:
p15_sorted.ffill(inplace=True)
p15_sorted.bfill(inplace=True)

In [74]:
p15_sorted = p15_sorted.loc[p15.index]
step_col.loc[p15_sorted.index, p15_sorted.columns] = p15_sorted.values

In [75]:
p15_sorted.isna().sum().sum(),step_col.isna().sum().sum()

(0, 13320)

In [76]:
#msno.matrix(p15_sorted,figsize=(12,5), fontsize=10);

In [77]:
#msno.matrix(step_col,figsize=(8,4), fontsize=8);

**-----------------------------p18-----------------------------**

In [78]:
p18 = step_col[step_col["p_num"]=="p18"].copy()
p18_sorted = step_col[step_col["p_num"]=="p18"].sort_values(by="time").copy()
p18_sorted.shape
#msno.matrix(p18_sorted,figsize=(8,4), fontsize=10);

(231, 74)

In [79]:
#p18_sorted.iloc[:50,:]

In [80]:
#p12_sorted.iloc[1670:1684]#.mean(axis=0)
#p15_sorted[p12_sorted.time=="21:00:00"]#.loc[:"p12_25319"]
#p18_sorted[p18_sorted["steps-0:00"].isna()==True]
#p15_sorted

In [81]:
step_col[step_col["steps-0:00"].isna()==True].index#.sum().sum()

Index(['p18_2', 'p18_9', 'p18_24', 'p18_32', 'p18_34', 'p18_42', 'p18_59',
       'p18_60', 'p18_61', 'p18_63',
       ...
       'p22_163', 'p22_166', 'p22_167', 'p22_174', 'p22_175', 'p22_181',
       'p22_185', 'p22_190', 'p22_193', 'p22_197'],
      dtype='object', name='id', length=185)

In [82]:
p18_sorted.ffill(inplace=True)
p18_sorted.bfill(inplace=True)
p18_sorted[p18_sorted["steps-0:00"].isna()==True]

Unnamed: 0_level_0,p_num,time,steps-5:55,steps-5:50,steps-5:45,steps-5:40,steps-5:35,steps-5:30,steps-5:25,steps-5:20,steps-5:15,steps-5:10,steps-5:05,steps-5:00,steps-4:55,steps-4:50,steps-4:45,steps-4:40,steps-4:35,steps-4:30,steps-4:25,steps-4:20,steps-4:15,steps-4:10,steps-4:05,steps-4:00,steps-3:55,steps-3:50,steps-3:45,steps-3:40,steps-3:35,steps-3:30,steps-3:25,steps-3:20,steps-3:15,steps-3:10,steps-3:05,steps-3:00,steps-2:55,steps-2:50,steps-2:45,steps-2:40,steps-2:35,steps-2:30,steps-2:25,steps-2:20,steps-2:15,steps-2:10,steps-2:05,steps-2:00,steps-1:55,steps-1:50,steps-1:45,steps-1:40,steps-1:35,steps-1:30,steps-1:25,steps-1:20,steps-1:15,steps-1:10,steps-1:05,steps-1:00,steps-0:55,steps-0:50,steps-0:45,steps-0:40,steps-0:35,steps-0:30,steps-0:25,steps-0:20,steps-0:15,steps-0:10,steps-0:05,steps-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1


In [83]:
p18_sorted = p18_sorted.loc[p18.index]
step_col.loc[p18_sorted.index, p18_sorted.columns] = p18_sorted.values
p18_sorted.isna().sum().sum(),step_col.isna().sum().sum()

(0, 9000)

In [84]:
#msno.matrix(p15_sorted,figsize=(12,5), fontsize=10);
#msno.matrix(step_col,figsize=(8,4), fontsize=8);

**-----------------------------p19-----------------------------**

In [85]:
p19 = step_col[step_col["p_num"]=="p19"].copy()
p19_sorted = step_col[step_col["p_num"]=="p19"].sort_values(by="time").copy()
p19_sorted.shape
#msno.matrix(p19_sorted,figsize=(8,3), fontsize=10);

(246, 74)

In [86]:
#p18_sorted.iloc[:50,:]

In [87]:
#p12_sorted.iloc[1670:1684]#.mean(axis=0)
#p15_sorted[p12_sorted.time=="21:00:00"]#.loc[:"p12_25319"]
#p18_sorted[p18_sorted["steps-0:00"].isna()==True]
#p15_sorted

In [88]:
step_col[step_col["steps-0:00"].isna()==True].index#.sum().sum()

Index(['p19_14', 'p19_24', 'p19_26', 'p19_33', 'p19_36', 'p19_52', 'p19_63',
       'p19_71', 'p19_81', 'p19_85',
       ...
       'p22_163', 'p22_166', 'p22_167', 'p22_174', 'p22_175', 'p22_181',
       'p22_185', 'p22_190', 'p22_193', 'p22_197'],
      dtype='object', name='id', length=125)

In [89]:
p19_sorted.ffill(inplace=True)
p19_sorted.bfill(inplace=True)
p19_sorted[p19_sorted["steps-0:00"].isna()==True]

Unnamed: 0_level_0,p_num,time,steps-5:55,steps-5:50,steps-5:45,steps-5:40,steps-5:35,steps-5:30,steps-5:25,steps-5:20,steps-5:15,steps-5:10,steps-5:05,steps-5:00,steps-4:55,steps-4:50,steps-4:45,steps-4:40,steps-4:35,steps-4:30,steps-4:25,steps-4:20,steps-4:15,steps-4:10,steps-4:05,steps-4:00,steps-3:55,steps-3:50,steps-3:45,steps-3:40,steps-3:35,steps-3:30,steps-3:25,steps-3:20,steps-3:15,steps-3:10,steps-3:05,steps-3:00,steps-2:55,steps-2:50,steps-2:45,steps-2:40,steps-2:35,steps-2:30,steps-2:25,steps-2:20,steps-2:15,steps-2:10,steps-2:05,steps-2:00,steps-1:55,steps-1:50,steps-1:45,steps-1:40,steps-1:35,steps-1:30,steps-1:25,steps-1:20,steps-1:15,steps-1:10,steps-1:05,steps-1:00,steps-0:55,steps-0:50,steps-0:45,steps-0:40,steps-0:35,steps-0:30,steps-0:25,steps-0:20,steps-0:15,steps-0:10,steps-0:05,steps-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1


In [90]:
p19_sorted = p19_sorted.loc[p19.index]
step_col.loc[p19_sorted.index, p19_sorted.columns] = p19_sorted.values
p19_sorted.isna().sum().sum(),step_col.isna().sum().sum()

(0, 6912)

In [91]:
#msno.matrix(p15_sorted,figsize=(12,5), fontsize=10);
#msno.matrix(step_col,figsize=(8,3), fontsize=10);

**-----------------------------p21-----------------------------**

In [92]:
step_col[step_col["steps-0:00"].isna()==True].index#.sum().sum()

Index(['p21_8', 'p21_11', 'p21_12', 'p21_14', 'p21_16', 'p21_25', 'p21_33',
       'p21_40', 'p21_43', 'p21_44', 'p21_59', 'p21_66', 'p21_68', 'p21_74',
       'p21_80', 'p21_84', 'p21_85', 'p21_93', 'p21_116', 'p21_125', 'p21_129',
       'p21_144', 'p21_147', 'p21_150', 'p21_151', 'p21_152', 'p21_153',
       'p21_166', 'p21_182', 'p21_187', 'p21_194', 'p21_198', 'p21_200',
       'p21_205', 'p21_207', 'p21_209', 'p21_211', 'p21_223', 'p21_225',
       'p21_226', 'p21_232', 'p21_235', 'p22_5', 'p22_9', 'p22_14', 'p22_15',
       'p22_26', 'p22_29', 'p22_31', 'p22_34', 'p22_42', 'p22_47', 'p22_49',
       'p22_50', 'p22_54', 'p22_58', 'p22_61', 'p22_64', 'p22_68', 'p22_69',
       'p22_73', 'p22_74', 'p22_75', 'p22_79', 'p22_83', 'p22_87', 'p22_91',
       'p22_94', 'p22_96', 'p22_101', 'p22_104', 'p22_106', 'p22_109',
       'p22_116', 'p22_119', 'p22_121', 'p22_127', 'p22_133', 'p22_135',
       'p22_137', 'p22_142', 'p22_143', 'p22_146', 'p22_154', 'p22_158',
       'p22_160', 'p22

In [93]:
p21 = step_col[step_col["p_num"]=="p21"].copy()
p21_sorted = step_col[step_col["p_num"]=="p21"].sort_values(by="time").copy()
print(p21_sorted.shape)
#msno.matrix(p21_sorted,figsize=(8,3), fontsize=10);

(236, 74)


In [94]:
#p21_sorted.iloc[:50,:]

In [95]:
#p12_sorted.iloc[1670:1684]#.mean(axis=0)
#p15_sorted[p12_sorted.time=="21:00:00"]#.loc[:"p12_25319"]
#p18_sorted[p18_sorted["steps-0:00"].isna()==True]
#p15_sorted

In [96]:
p21_sorted.ffill(inplace=True)
p21_sorted.bfill(inplace=True)
p21_sorted[p21_sorted["steps-0:00"].isna()==True]

Unnamed: 0_level_0,p_num,time,steps-5:55,steps-5:50,steps-5:45,steps-5:40,steps-5:35,steps-5:30,steps-5:25,steps-5:20,steps-5:15,steps-5:10,steps-5:05,steps-5:00,steps-4:55,steps-4:50,steps-4:45,steps-4:40,steps-4:35,steps-4:30,steps-4:25,steps-4:20,steps-4:15,steps-4:10,steps-4:05,steps-4:00,steps-3:55,steps-3:50,steps-3:45,steps-3:40,steps-3:35,steps-3:30,steps-3:25,steps-3:20,steps-3:15,steps-3:10,steps-3:05,steps-3:00,steps-2:55,steps-2:50,steps-2:45,steps-2:40,steps-2:35,steps-2:30,steps-2:25,steps-2:20,steps-2:15,steps-2:10,steps-2:05,steps-2:00,steps-1:55,steps-1:50,steps-1:45,steps-1:40,steps-1:35,steps-1:30,steps-1:25,steps-1:20,steps-1:15,steps-1:10,steps-1:05,steps-1:00,steps-0:55,steps-0:50,steps-0:45,steps-0:40,steps-0:35,steps-0:30,steps-0:25,steps-0:20,steps-0:15,steps-0:10,steps-0:05,steps-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1


In [97]:
p21_sorted = p21_sorted.loc[p21.index]
step_col.loc[p21_sorted.index, p21_sorted.columns] = p21_sorted.values
p21_sorted.isna().sum().sum(),step_col.isna().sum().sum()

(0, 3888)

In [98]:
#msno.matrix(p15_sorted,figsize=(12,5), fontsize=10);
#msno.matrix(step_col,figsize=(8,3), fontsize=8);

**-----------------------------p22-----------------------------**

In [99]:
step_col[step_col["steps-0:00"].isna()==True].index#.sum().sum()

Index(['p22_5', 'p22_9', 'p22_14', 'p22_15', 'p22_26', 'p22_29', 'p22_31',
       'p22_34', 'p22_42', 'p22_47', 'p22_49', 'p22_50', 'p22_54', 'p22_58',
       'p22_61', 'p22_64', 'p22_68', 'p22_69', 'p22_73', 'p22_74', 'p22_75',
       'p22_79', 'p22_83', 'p22_87', 'p22_91', 'p22_94', 'p22_96', 'p22_101',
       'p22_104', 'p22_106', 'p22_109', 'p22_116', 'p22_119', 'p22_121',
       'p22_127', 'p22_133', 'p22_135', 'p22_137', 'p22_142', 'p22_143',
       'p22_146', 'p22_154', 'p22_158', 'p22_160', 'p22_163', 'p22_166',
       'p22_167', 'p22_174', 'p22_175', 'p22_181', 'p22_185', 'p22_190',
       'p22_193', 'p22_197'],
      dtype='object', name='id')

In [100]:
p22 = step_col[step_col["p_num"]=="p22"].copy()
p22_sorted = step_col[step_col["p_num"]=="p22"].sort_values(by="time").copy()
print(p22_sorted.shape)
#msno.matrix(p22_sorted,figsize=(8,3), fontsize=10);

(201, 74)


In [101]:
#p22_sorted.iloc[:50,:]

In [102]:
#p12_sorted.iloc[1670:1684]#.mean(axis=0)
#p15_sorted[p12_sorted.time=="21:00:00"]#.loc[:"p12_25319"]
#p18_sorted[p18_sorted["steps-0:00"].isna()==True]
#p15_sorted

In [103]:
p22_sorted.ffill(inplace=True)
p22_sorted.bfill(inplace=True)
p22_sorted[p22_sorted["steps-0:00"].isna()==True]

Unnamed: 0_level_0,p_num,time,steps-5:55,steps-5:50,steps-5:45,steps-5:40,steps-5:35,steps-5:30,steps-5:25,steps-5:20,steps-5:15,steps-5:10,steps-5:05,steps-5:00,steps-4:55,steps-4:50,steps-4:45,steps-4:40,steps-4:35,steps-4:30,steps-4:25,steps-4:20,steps-4:15,steps-4:10,steps-4:05,steps-4:00,steps-3:55,steps-3:50,steps-3:45,steps-3:40,steps-3:35,steps-3:30,steps-3:25,steps-3:20,steps-3:15,steps-3:10,steps-3:05,steps-3:00,steps-2:55,steps-2:50,steps-2:45,steps-2:40,steps-2:35,steps-2:30,steps-2:25,steps-2:20,steps-2:15,steps-2:10,steps-2:05,steps-2:00,steps-1:55,steps-1:50,steps-1:45,steps-1:40,steps-1:35,steps-1:30,steps-1:25,steps-1:20,steps-1:15,steps-1:10,steps-1:05,steps-1:00,steps-0:55,steps-0:50,steps-0:45,steps-0:40,steps-0:35,steps-0:30,steps-0:25,steps-0:20,steps-0:15,steps-0:10,steps-0:05,steps-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1


In [104]:
p22_sorted = p22_sorted.loc[p22.index]
step_col.loc[p22_sorted.index, p22_sorted.columns] = p22_sorted.values
p22_sorted.isna().sum().sum(),step_col.isna().sum().sum()

(0, 0)

In [105]:
#msno.matrix(p15_sorted,figsize=(12,5), fontsize=10);
#msno.matrix(step_col,figsize=(8,3), fontsize=8);

### Replace in original dataset:

In [106]:
df_all[step_col.columns] = step_col.values

In [107]:
step_col = df_all.filter(regex='step|time|id|p_num')
step_col.isna().sum().sum()

0

## **6.0 - Fill NaN for Cal**

In [108]:
#cal_col.groupby(["p_num","time"]).mean().iloc[0:30]
#cal_col.tail(10)#,cal_col.shape
gc.collect()

4

In [109]:
df_ = cal_col.copy()

# Get all column names
all_columns = df_.columns

# Select the columns that start with 'hr-'
columns_to_fill = [col for col in all_columns if col.startswith('cals-')]

# Group by 'p_num' and 'time', calculate the mean for each group
grouped_df = df_.groupby(['p_num', 'time'])[columns_to_fill].mean().reset_index()

# Merge the grouped DataFrame with the original DataFrame
merged_df = pd.merge(df_, grouped_df, on=['p_num', 'time'], how='left', suffixes=('', '_mean'))

# Select columns in `columns_to_fill` from `merged_df`
filled_df = merged_df[columns_to_fill]

# Iterate over columns and fill missing values
for col in columns_to_fill:
    mean_col = col + '_mean'  # Get the name of the corresponding mean column
    filled_df[col] = filled_df[col].fillna(merged_df[mean_col])

filled_df = filled_df.ffill(axis=1)
filled_df = filled_df.bfill(axis=1)

In [110]:
filled_df = np.round(filled_df,2)
cal_col[filled_df.columns] = filled_df.values

In [111]:
#msno.matrix(cal_col,figsize=(8,3), fontsize=8)

**-----------------------------p15-----------------------------**

In [112]:
p15 = cal_col[cal_col["p_num"]=="p15"].copy()
p15_sorted = cal_col[cal_col["p_num"]=="p15"].sort_values(by="time").copy()

In [113]:
cal_col[cal_col["cals-0:00"].isna()==True].index#.sum().sum()

Index(['p15_2', 'p15_9', 'p15_14', 'p15_19', 'p15_21', 'p15_22', 'p15_35',
       'p15_38', 'p15_48', 'p15_52',
       ...
       'p19_152', 'p19_158', 'p19_169', 'p19_175', 'p19_183', 'p19_201',
       'p19_218', 'p19_220', 'p19_238', 'p19_244'],
      dtype='object', name='id', length=143)

In [114]:
#msno.matrix(p15_sorted,figsize=(8,3), fontsize=8)

In [115]:
#p15_sorted

In [116]:
p15_sorted.ffill(inplace=True)
p15_sorted.bfill(inplace=True)

In [117]:
p15_sorted = p15_sorted.loc[p15.index]
cal_col.loc[p15_sorted.index, p15_sorted.columns] = p15_sorted.values

In [118]:
p15_sorted.isna().sum().sum(),cal_col.isna().sum().sum()

(0, 5616)

In [119]:
cal_col[cal_col["cals-0:00"].isna()==True].index

Index(['p18_2', 'p18_9', 'p18_24', 'p18_32', 'p18_34', 'p18_42', 'p18_59',
       'p18_60', 'p18_61', 'p18_63', 'p18_70', 'p18_72', 'p18_74', 'p18_77',
       'p18_82', 'p18_84', 'p18_87', 'p18_88', 'p18_96', 'p18_103', 'p18_104',
       'p18_106', 'p18_107', 'p18_108', 'p18_109', 'p18_111', 'p18_114',
       'p18_115', 'p18_122', 'p18_124', 'p18_127', 'p18_128', 'p18_129',
       'p18_134', 'p18_137', 'p18_145', 'p18_147', 'p18_160', 'p18_164',
       'p18_179', 'p18_180', 'p18_182', 'p18_184', 'p18_194', 'p18_200',
       'p18_210', 'p18_211', 'p18_212', 'p18_213', 'p18_215', 'p18_221',
       'p18_226', 'p18_227', 'p19_14', 'p19_24', 'p19_26', 'p19_33', 'p19_36',
       'p19_52', 'p19_71', 'p19_81', 'p19_85', 'p19_95', 'p19_112', 'p19_127',
       'p19_138', 'p19_142', 'p19_149', 'p19_152', 'p19_158', 'p19_169',
       'p19_175', 'p19_183', 'p19_201', 'p19_218', 'p19_220', 'p19_238',
       'p19_244'],
      dtype='object', name='id')

**-----------------------------p18-----------------------------**

In [120]:
p18 = cal_col[cal_col["p_num"]=="p18"].copy()
p18_sorted = cal_col[cal_col["p_num"]=="p18"].sort_values(by="time").copy()

In [121]:
#msno.matrix(p18_sorted,figsize=(8,3), fontsize=8)

In [122]:
#p18_sorted

In [123]:
p18_sorted.ffill(inplace=True)
p18_sorted.bfill(inplace=True)

In [124]:
p18_sorted = p18_sorted.loc[p18.index]
cal_col.loc[p18_sorted.index, p18_sorted.columns] = p18_sorted.values

In [125]:
p18_sorted.isna().sum().sum(),cal_col.isna().sum().sum()

(0, 1800)

In [126]:
cal_col[cal_col["cals-0:00"].isna()==True].index

Index(['p19_14', 'p19_24', 'p19_26', 'p19_33', 'p19_36', 'p19_52', 'p19_71',
       'p19_81', 'p19_85', 'p19_95', 'p19_112', 'p19_127', 'p19_138',
       'p19_142', 'p19_149', 'p19_152', 'p19_158', 'p19_169', 'p19_175',
       'p19_183', 'p19_201', 'p19_218', 'p19_220', 'p19_238', 'p19_244'],
      dtype='object', name='id')

**-----------------------------p19-----------------------------**

In [127]:
p19 = cal_col[cal_col["p_num"]=="p19"].copy()
p19_sorted = cal_col[cal_col["p_num"]=="p19"].sort_values(by="time").copy()

In [128]:
#msno.matrix(p19_sorted,figsize=(8,3), fontsize=8)

In [129]:
#p18_sorted

In [130]:
p19_sorted.ffill(inplace=True)
p19_sorted.bfill(inplace=True)

In [131]:
p19_sorted = p19_sorted.loc[p19.index]
cal_col.loc[p19_sorted.index, p19_sorted.columns] = p19_sorted.values

In [132]:
p19_sorted.isna().sum().sum(),cal_col.isna().sum().sum()

(0, 0)

In [133]:
cal_col[cal_col["cals-0:00"].isna()==True].index

Index([], dtype='object', name='id')

### Replace in original dataset:

In [134]:
df_all[cal_col.columns] = cal_col.values

In [135]:
cal_col = df_all.filter(regex='cals|time|id|p_num')
cal_col.isna().sum().sum()

0

## **6.0 - Fill NaN for Activity**

In [136]:
#cal_col.groupby(["p_num","time"]).mean().iloc[0:30]
#cal_col.tail(10)#,cal_col.shape
gc.collect()

0

The "Activities" features set covers the following:
* Indoor climbing
* Run
* Strength training
* Swim
* Bike
* Dancing
* Stairclimber
* Spinning
* Walking
* HIIT
* Outdoor Bike
* Walk
* Aerobic Workout
* Tennis
* Workout
* Hike
* Zumba
* Sport
* Yoga
* Swimming
* Weights
* Running

*Most Effective:*

* **HIIT** (High-Intensity Interval Training): HIIT involves short bursts of intense exercise followed by brief recovery periods. Research shows HIIT can be very effective at improving insulin sensitivity and lowering blood glucose levels, sometimes even more so than continuous moderate-intensity exercise.
* **Strength Training**: Building muscle mass helps your body use glucose more effectively. Strength training can also improve insulin sensitivity and help with weight management, which is crucial for diabetes control.
* **Aerobic Workouts** (including Running, Swimming, Biking, Spinning, and brisk Walking): Aerobic exercise increases your heart rate and breathing, which improves your body's ability to use insulin and lower blood glucose.

*Moderately Effective:*

* **Dancing** (including Zumba): Dancing combines aerobic activity with entertainment, making it a fun way to get moving. It can improve insulin sensitivity and help with weight management.
* **Stairclimber**: This provides a good cardiovascular workout and helps build leg strength.
* **Tennis**: This is a moderately intense activity that combines aerobic exercise with bursts of speed and agility.
* **Hike**: Hiking can be a moderate-intensity activity that also incorporates strength training depending on the terrain.

*Less Effective (but still beneficial):*

* **Yoga**: While yoga has many health benefits, including stress reduction and flexibility, it may not be as effective as other activities for directly lowering blood glucose. However, certain types of yoga that involve more dynamic movements can be more beneficial.
* **Indoor Climbing**: This can build strength and endurance, but its impact on blood glucose may be less pronounced than other activities.

In [137]:
act_to_replace = {"Indoor climbing":1,
                  "Run":3,
                  "Strength training":3,
                  "Swim":3,
                  "Bike":3,
                  "Dancing":2,
                  "Stairclimber":2,
                  "Spinning":3,
                  "Walking":2,
                  "HIIT":3,
                  "Outdoor Bike":3,
                  "Walk":2,
                  "Aerobic Workout":3,
                  "Tennis":2,
                  "Workout":3,
                  "Hike":2,
                  "Zumba":2,
                  "Sport":2,
                  "Yoga":1,
                  "Swimming":3,
                  "Weights":3,
                  "Running":3}
act_col.replace(act_to_replace, inplace=True)
act_col.fillna(0, inplace=True)
act_col.isna().sum().sum()

0

### Replace in original dataset:

In [138]:
df_all[act_col.columns] = act_col.values

In [139]:
act_col = df_all.filter(regex='activity|time|id|p_num')
cal_col.isna().sum().sum()

0

In [140]:
df_all.isna().sum().sum()
#msno.matrix(df_all,figsize=(8,3), fontsize=8);

3644

# Save new Datasets:

In [141]:
new_train = df_all.loc[train_index,:]
new_test = df_all.loc[test_index,:]
new_test.drop(columns=["bg+1:00"], inplace=True)

In [142]:
new_train['time'] = pd.to_datetime(new_train['time'], format='%H:%M:%S')
new_test['time'] = pd.to_datetime(new_test['time'], format='%H:%M:%S')

new_train['hour'] = new_train['time'].dt.hour
new_train['minute'] = new_train['time'].dt.minute
new_train.drop('time', axis=1, inplace=True)

new_test['hour'] = new_test['time'].dt.hour
new_test['minute'] = new_test['time'].dt.minute
new_test.drop('time', axis=1, inplace=True)

In [143]:
new_train["obv_n"]  = new_train.reset_index()["id"].apply(lambda x: x.split("_")[1]).values.astype(float)
new_test["obv_n"] = new_test.reset_index()["id"].apply(lambda x: x.split("_")[1]).values.astype(float)

In [144]:
new_train.reset_index(inplace=True)
new_train.to_csv("/content/drive/MyDrive/Exercises/Studies_Structured_Data/Data/Glucose/new_train.csv", index=False)

In [145]:
new_test.reset_index(inplace=True)
new_test.to_csv("/content/drive/MyDrive/Exercises/Studies_Structured_Data/Data/Glucose/new_test.csv", index=False)