<a href="https://colab.research.google.com/github/fabriziobasso/Colab_backup/blob/main/File_02_EDA_ext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <h1 align="center"><font color='#001ddd'><b>GLUCOSE PREDICTION DATASET</b></font></h1>

## **Dataset Description**
The dataset is from a study that collected data from young adults in the UK with type 1 diabetes, who used a continuous glucose monitor (CGM), an insulin pump and a smartwatch. These devices collected blood glucose readings, insulin dosage, carbohydrate intake, and activity data. The data collected was aggregated to five-minute intervals and formatted into samples. Each sample represents a point in time and includes the aggregated five-minute intervals from the previous six hours. The aim is to predict the blood glucose reading an hour into the future, for each of these samples.

The training set takes samples from the first three months of study data from nine of the participants and includes the future blood glucose value. These training samples appear in chronological order and overlap. The testing set takes samples from the remainder of the study period from fifteen of the participants (so unseen participants appear in the testing set). These testing samples do not overlap and are in a random order to avoid data leakage.

**Complexities to be aware of:**

* This is medical data, so there are missing values and noise in the data.
* The participants did not all use the same device models (CGM, insulin pump and smartwatch), so there may be differences in the collection method of the data.
* Some participants in the test set do not appear in the training set.

In [None]:
%%capture
# Connect to Colab:#
from google.colab import drive
import os
drive.mount('/content/drive')

!pip install category-encoders
!pip install optuna
!pip install optuna-integration
#!pip install scikit-learn==1.4
!pip install catboost
!pip install deeptables

!pip install keras-tuner --upgrade
!pip install keras-nlp
!pip install BorutaShap
!pip install scikit-lego
!!pip install --no-index -U --find-links=/kaggle/input/deeptables-v0-2-5/deeptables-0.2.5 deeptables==0.2.5

In [None]:
# Both names point at the same Drive folder; it becomes the working directory,
# so relative paths used later in the notebook resolve inside it.
folder_script = models_folders = "/content/drive/MyDrive/Exercises/Studies_Structured_Data/Data/Glucose"
os.chdir(folder_script)

In [None]:
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.wrapper import PolynomialWrapper
from category_encoders.count import CountEncoder

# Setup notebook
from pathlib import Path
import ipywidgets as widgets
import pandas as pd
import numpy as np
from pickle import load, dump
import json
import joblib
#import calplot as cal
import missingno as msno
import category_encoders as ce

# Graphic Libraries:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.image as mpimg

# Palette Setup
colors = ['#FB5B68','#FFEB48','#2676A1','#FFBDB0',]
colormap_0 = mpl.colors.LinearSegmentedColormap.from_list("",colors)
palette_1 = sns.color_palette("coolwarm", as_cmap=True)
palette_2 = sns.color_palette("YlOrBr", as_cmap=True)
palette_3 = sns.light_palette("red", as_cmap=True)
palette_4 = sns.color_palette("viridis", as_cmap=True)
palette_5 = sns.color_palette("rocket", as_cmap=True)
palette_6 = sns.color_palette("GnBu", as_cmap=True)
palette_7 = sns.color_palette("tab20c", as_cmap=False)
palette_8 = sns.color_palette("Set2", as_cmap=False)

palette_custom = ['#fbb4ae','#b3cde3','#ccebc5','#decbe4','#fed9a6','#ffffcc','#e5d8bd','#fddaec','#f2f2f2']
palette_9 = sns.color_palette(palette_custom, as_cmap=False)


# Bloomberg
#from xbbg import blp
from catboost import CatBoostRegressor, Pool
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from xgboost.callback import EarlyStopping

import lightgbm as lgb
from lightgbm import (LGBMRegressor,
                      LGBMClassifier,
                      early_stopping,
                      record_evaluation,
                      log_evaluation)

# Time Management
from tqdm import tqdm
from datetime import date
from datetime import datetime
from pandas.tseries.offsets import BMonthEnd, QuarterEnd
import datetime
from pandas.tseries.offsets import BDay # BDay is business day, not birthday...
import datetime as dt
import click
import glob
import os
import gc
import re
import string

from ipywidgets import AppLayout
from ipywidgets import Dropdown, Layout, HTML, AppLayout, VBox, Label, HBox, BoundedFloatText, interact, Output

#from my_func import *

import optuna
from optuna.integration import TFKerasPruningCallback
from optuna.trial import TrialState
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_contour

os.environ["KERAS_BACKEND"] = "tensorflow"

import tensorflow as tf
import keras
from keras import ops
from keras import layers

from keras.layers import Input, LSTM, Dense, Lambda, RepeatVector, Reshape
from keras.models import Model
from keras.losses import MeanSquaredError
from keras.metrics import RootMeanSquaredError

from keras.utils import FeatureSpace, plot_model

# Import libraries for Hypertuning
import keras_tuner as kt
from keras_tuner.tuners import RandomSearch, GridSearch, BayesianOptimization

#from my_func import *

# preprocessing modules
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_validate, GroupKFold, GridSearchCV, RepeatedStratifiedKFold, cross_val_predict

from sklearn.preprocessing import (LabelEncoder,
                                   StandardScaler,
                                   MinMaxScaler,
                                   OrdinalEncoder,
                                   RobustScaler,
                                   PowerTransformer,
                                   OneHotEncoder,
                                   LabelEncoder,
                                   QuantileTransformer,
                                   PolynomialFeatures)

# metrics
import sklearn
from sklearn.metrics import (mean_squared_error,
                             root_mean_squared_error,
                             r2_score,
                             mean_absolute_error,
                             mean_absolute_percentage_error,
                             classification_report,
                             confusion_matrix,
                             ConfusionMatrixDisplay,
                             multilabel_confusion_matrix,
                             accuracy_score,
                             roc_auc_score,
                             auc,
                             roc_curve,
                             log_loss,
                             make_scorer)

# modeling algos
from sklearn.linear_model import (LogisticRegression,
                                  Lasso,
                                  ridge_regression,
                                  LinearRegression,
                                  Ridge,
                                  RidgeCV,
                                  ElasticNet,
                                  BayesianRidge,
                                  HuberRegressor,
                                  TweedieRegressor,
                                  QuantileRegressor,
                                  ARDRegression,
                                  TheilSenRegressor,
                                  PoissonRegressor,
                                  GammaRegressor)

from sklearn.ensemble import (AdaBoostRegressor,
                              AdaBoostClassifier,
                              RandomForestRegressor,
                              RandomForestClassifier,
                              VotingRegressor,
                              GradientBoostingRegressor,
                              GradientBoostingClassifier,
                              StackingRegressor,
                              HistGradientBoostingClassifier,
                              HistGradientBoostingRegressor,
                              ExtraTreesClassifier)

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import FunctionTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
%matplotlib inline

import seaborn as sns
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

from sklearn.multioutput import RegressorChain
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

import itertools
import warnings
import logging
from openpyxl import load_workbook

import statsmodels.api as sm
from pylab import rcParams
import scipy.stats as ss

warnings.filterwarnings('ignore')
#plt.style.use('fivethirtyeight')

# Setting rc parameters in seaborn for plots and graphs-
# Reference - https://matplotlib.org/stable/tutorials/introductory/customizing.html:-
# To alter this, refer to matplotlib.rcParams.keys()
# NOTE(review): the dict is passed to sns.set positionally rather than via
# rc=...; confirm this is the intended way to apply these overrides.

sns.set({"axes.facecolor"       : "#ffffff",
         "figure.facecolor"     : "#ffffff",
         "axes.edgecolor"       : "#000000",
         "grid.color"           : "#ffffff",
         "font.family"          : ['Cambria'],
         "axes.labelcolor"      : "#000000",
         "xtick.color"          : "#000000",
         "ytick.color"          : "#000000",
         "grid.linewidth"       : 0.5,
         'grid.alpha'           :0.5,
         "grid.linestyle"       : "--",
         "axes.titlecolor"      : 'black',
         'axes.titlesize'       : 12,
         'axes.labelweight'     : "bold",
         'legend.fontsize'      : 7.0,
         'legend.title_fontsize': 7.0,
         'font.size'            : 7.5,
         'xtick.labelsize'      : 7.5,
         'ytick.labelsize'      : 7.5,
        });

# Switch to the "whitegrid" style with a thin dashed grid (applied after the
# settings above).
sns.set_style("whitegrid",{"grid.linestyle":"--", 'grid.linewidth':0.2, 'grid.alpha':0.5})
# Figure resolution
mpl.rcParams['figure.dpi'] = 120;

# Pandas display limits: show at most 100 columns and 50 rows.
pd.set_option('display.max_columns', 100);
pd.set_option('display.max_rows', 50);

# Called without a figure or axes argument.
sns.despine(left=True, bottom=True, top=False, right=False)

# Default spine visibility for new axes: keep left and bottom, hide right and top.
mpl.rcParams['axes.spines.left'] = True
mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.spines.top'] = False
mpl.rcParams['axes.spines.bottom'] = True

## 1.0 Functions:

In [None]:
def encode_target(y_train, y_test, encoder_type='label', enc_strategy=False):
    """
    Encode the target of the training and testing data with the chosen encoder.

    The encoder is fitted on ``y_train`` only and then applied to ``y_test``.

    Parameters:
    y_train (pd.Series or pd.DataFrame): Training target data.
    y_test (pd.Series or pd.DataFrame): Testing target data.
    encoder_type (str): 'label' (LabelEncoder) or 'onehot' (OneHotEncoder).
        Default is 'label'.
    enc_strategy (bool): If True, the fitted encoder is returned as a third
        element so the encoding can be inverted or reused. Default is False.

    Returns:
    y_train_encoded (pd.Series or pd.DataFrame): Encoded training target.
        A Series named "Target" for 'label'; a DataFrame with one column per
        category for 'onehot'. The index of ``y_train`` is preserved.
    y_test_encoded (pd.Series or pd.DataFrame): Encoded testing target, same
        layout, indexed like ``y_test``.
    encoder: The fitted encoder (only when ``enc_strategy`` is True).

    Raises:
    ValueError: If ``encoder_type`` is not 'label' or 'onehot'.
    """
    # Fail fast, before any fitting work, on an unsupported encoder name.
    if encoder_type not in ('label', 'onehot'):
        raise ValueError("Invalid encoder_type. Currently supported: 'label', 'onehot'.")

    if encoder_type == 'label':
        encoder = LabelEncoder()
        y_train_encoded = encoder.fit_transform(y_train)
        y_test_encoded = encoder.transform(y_test)

        # Re-attach the original indices so the result aligns with the features.
        y_train_encoded = pd.Series(y_train_encoded, index=y_train.index, name="Target")
        y_test_encoded = pd.Series(y_test_encoded, index=y_test.index, name="Target")

    else:  # 'onehot'
        # OneHotEncoder expects a 2-D array: one column holding the target.
        y_train_ = y_train.values.reshape(-1, 1)
        y_test_ = y_test.values.reshape(-1, 1)

        encoder = OneHotEncoder(sparse_output=False)
        y_train_encoded = encoder.fit_transform(y_train_)
        y_test_encoded = encoder.transform(y_test_)

        y_train_encoded = pd.DataFrame(y_train_encoded, index=y_train.index)
        y_test_encoded = pd.DataFrame(y_test_encoded, index=y_test.index)

    if enc_strategy:
        return y_train_encoded, y_test_encoded, encoder

    return y_train_encoded, y_test_encoded

def encode_data(X_train, X_test, encoder_type='label', columns=None, map=None):
    """
    Encode categorical columns of the training and testing data.

    Every encoder is fitted on the training data only and then applied to the
    test data. The inputs are not modified; encoded copies are returned.

    Parameters:
    X_train (pd.DataFrame): Training data.
    X_test (pd.DataFrame): Testing data.
    encoder_type (str): 'label', 'onehot' or 'count_encoder'. Default is 'label'.
    columns (list): Columns to encode. If None, all object-dtype columns of
        ``X_train`` are encoded.
    map: Unused. Kept only so existing callers keep working (note that the
        name shadows the ``map`` builtin inside this function).

    Returns:
    X_train_encoded (pd.DataFrame): Encoded training data.
    X_test_encoded (pd.DataFrame): Encoded testing data.

    Raises:
    ValueError: If ``encoder_type`` is not one of the supported names.
    """
    # Fail fast, before copying any data, on an unsupported encoder name.
    if encoder_type not in ('label', 'onehot', 'count_encoder'):
        raise ValueError("Invalid encoder_type. Currently supported: 'label', 'onehot', 'count_encoder'.")

    if columns is None:
        # Default to all object type columns if no columns are specified
        columns = X_train.select_dtypes(include=['object']).columns.tolist()
    columns = list(columns)

    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()

    if encoder_type == 'label':
        for col in columns:
            le = LabelEncoder()
            X_train_encoded[col] = le.fit_transform(X_train[col])
            # NOTE: LabelEncoder.transform raises on categories unseen in training.
            X_test_encoded[col] = le.transform(X_test[col])

    elif encoder_type == 'onehot':
        for col in columns:
            ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
            # Fit the encoder on the training data and transform both training and test data
            encoded_train = ohe.fit_transform(X_train[[col]])
            encoded_test = ohe.transform(X_test[[col]])

            # Carry the original indices over: pd.concat(axis=1) aligns on the
            # index, so a fresh RangeIndex would scatter rows into NaN-padded
            # ones whenever the inputs are not indexed 0..n-1.
            new_names = ohe.get_feature_names_out([col])
            encoded_train_df = pd.DataFrame(encoded_train, columns=new_names, index=X_train.index)
            encoded_test_df = pd.DataFrame(encoded_test, columns=new_names, index=X_test.index)

            # Concatenate the new columns to the original dataframes and drop the original columns
            X_train_encoded = pd.concat([X_train_encoded.drop(col, axis=1), encoded_train_df], axis=1)
            X_test_encoded = pd.concat([X_test_encoded.drop(col, axis=1), encoded_test_df], axis=1)

    else:  # 'count_encoder'
        # One encoder handles all requested columns in a single pass. Refitting
        # once per column would re-encode already-encoded values (counts of counts).
        if columns:
            count_encoder = CountEncoder(cols=columns)
            X_train_encoded = count_encoder.fit_transform(X_train_encoded)
            X_test_encoded = count_encoder.transform(X_test_encoded)

    return X_train_encoded, X_test_encoded

## **Importing the Dataset**

## **Files**
* activities.txt - a list of activity names that appear in the activity-X:XX columns
* sample_submission.csv - a sample submission file in the correct format
* test.csv - the test set
* train.csv - the training set

## **Columns**
* train.csv:
    * **id** - row id consisting of participant number and a count for that participant
    * **p_num** - participant number
    * **time** - time of day in the format HH:MM:SS
    * **bg-X:XX** - blood glucose reading in mmol/L, X:XX(H:MM) time in the past (e.g. bg-2:35, would be the blood glucose reading from 2 hours and 35 minutes before the time value for that row), recorded by the continuous glucose monitor
    * **insulin-X:XX** - total insulin dose received in units in the last 5 minutes, X:XX(H:MM) time in the past (e.g. insulin-2:35, would be the total insulin dose received between 2 hours and 40 minutes and 2 hours and 35 minutes before the time value for that row), recorded by the insulin pump
    * **carbs-X:XX** - total carbohydrate value consumed in grammes in the last 5 minutes, X:XX(H:MM) time in the past (e.g. carbs-2:35, would be the total carbohydrate value consumed between 2 hours and 40 minutes and 2 hours and 35 minutes before the time value for that row), recorded by the participant
    * **hr-X:XX** - mean heart rate in beats per minute in the last 5 minutes, X:XX(H:MM) time in the past (e.g. hr-2:35, would be the mean heart rate between 2 hours and 40 minutes and 2 hours and 35 minutes before the time value for that row), recorded by the smartwatch
    * **steps-X:XX** - total steps walked in the last 5 minutes, X:XX(H:MM) time in the past (e.g. steps-2:35, would be the total steps walked between 2 hours and 40 minutes and 2 hours and 35 minutes before the time value for that row), recorded by the smartwatch
    * **cals-X:XX** - total calories burnt in the last 5 minutes, X:XX(H:MM) time in the past (e.g. cals-2:35, would be the total calories burned between 2 hours and 40 minutes and 2 hours and 35 minutes before the time value for that row), calculated by the smartwatch
    * **activity-X:XX** - self-declared activity performed in the last 5 minutes, X:XX(H:MM) time in the past (e.g. activity-2:35, would show a string name of the activity performed between 2 hours and 40 minutes and 2 hours and 35 minutes before the time value for that row), set on the smartwatch
    * **bg+1:00** - blood glucose reading in mmol/L an hour in the future, this is the value you will be predicting (not provided in test.csv)

In [None]:
# Load the train and test tables from Drive.
# NOTE(review): only the test file is read with index_col=0; confirm that the
# first column of the train file is not also a saved index.
df_train=pd.read_csv("/content/drive/MyDrive/Exercises/Studies_Structured_Data/Data/Glucose/final_train.csv")
df_test=pd.read_csv("/content/drive/MyDrive/Exercises/Studies_Structured_Data/Data/Glucose/final_test.csv", index_col=0)

# Display the (rows, columns) of the test table.
df_test.shape

In [None]:
df_train.p_num.unique()

In [None]:
df_train.head(5)

In [None]:
df_test.head(5)

In [None]:
df_train.head()
df_train.columns

In [None]:
# Stack the train rows followed by the test rows into a single frame.
df_all = pd.concat([df_train,df_test])

Sub-datasets are created for each main set of features to impute missing values:

In [None]:
# # Select columns containing the word "bg"
# bg_col_train = df_train.filter(regex='bg|time|p_num|hour|minute')
# bg_col_test = df_test.filter(regex='bg|time|p_num|hour|minute')

# insulin_col_train = df_train.filter(regex='insulin|bg+1:00')
# insulin_col_test = df_test.filter(regex='insulin|bg+1:00')
# insulin_col_train["bg+1:00"] = df_train["bg+1:00"]

# carb_col_train = df_train.filter(regex='carbs|time|p_num|bg+1:00|hour|minute')
# carb_col_test = df_test.filter(regex='carbs|time|p_num|bg+1:00|hour|minute')
# carb_col_train["bg+1:00"] = df_train["bg+1:00"]

# hr_col_train = df_train.filter(regex='hr|time|p_num|bg+1:00|hour|minute')
# hr_col_test = df_test.filter(regex='hr|time|p_num|bg+1:00|hour|minute')
# hr_col_train["bg+1:00"] = df_train["bg+1:00"]

# cal_col_train = df_train.filter(regex='cal|time|p_num|bg+1:00|hour|minute')
# cal_col_test = df_test.filter(regex='cal|time|p_num|bg+1:00|hour|minute')
# cal_col_train["bg+1:00"] = df_train["bg+1:00"]


# act_col_train = df_train.filter(regex='activity|time|p_num|bg+1:00|hour|minute')
# act_col_test = df_test.filter(regex='activity|time|p_num|bg+1:00|hour|minute')
# act_col_train["bg+1:00"] = df_train["bg+1:00"]

## **Outliers**

-----------------------------------------------
### Insulin

In [None]:
# insulin_col_train.min().min(),insulin_col_test.min().min()

In [None]:
# insulin_col_test.clip(upper=10.0, inplace=True)
# insulin_col_train.clip(upper=10.0, inplace=True)
#insulin_col_train

In [None]:
#insulin_col_train

In [None]:
# insulin_col_train.plot(kind="scatter", x="insulin_av31", y="insulin_av30", alpha=0.6)
# plt.xlabel("Insulin 0:00")
# plt.ylabel("Insulin 1:00")
# plt.show()

In [None]:
# fig, ax =  plt.subplots(figsize=(15,5))
# ax.boxplot(insulin_col_train)
# #plt.yscale('log')
# plt.show()

In [None]:
# insulin_col_train["bg+1:00"] = df_train["bg+1:00"]
# insulin_col_train.corr()["bg+1:00"][:-1].plot()
# plt.show()

In [None]:
# insulin_col_train.drop(columns="bg+1:00", inplace=True)

In [None]:
# insulin_col_test.plot(kind="scatter", x="insulin_av31", y="insulin_av20", alpha=0.6)
# plt.xlabel("Insulin 0:00")
# plt.ylabel("Insulin 1:00")
# plt.show()

In [None]:
# insulin_col_test.head()

In [None]:
# all_ins_col = list(insulin_col_test.columns)

# obs_index = set()

# for c in all_ins_col:
#   susp = insulin_col_train[insulin_col_train[c]>10]
#   new_ind = set(susp.index)
#   obs_index = obs_index.union(new_ind)

# obs_index = list(obs_index)
# print(len(obs_index))

-----------------------------------------------
### Steps

In [None]:
# step_col_train = df_train.filter(regex='steps')
# step_col_test = df_test.filter(regex='steps')
# step_col_train["bg+1:00"] = df_train["bg+1:00"]

# step_col_train.min().min(),step_col_train.min().min()

In [None]:
# insulin_col_test.clip(upper=10.0, inplace=True)
# insulin_col_train.clip(upper=10.0, inplace=True)
#insulin_col_train

In [None]:
#insulin_col_train

In [None]:
# step_col_train.plot(kind="scatter", x="steps_av31", y="steps_av19", alpha=0.6)
# plt.xlabel("steps 0:00")
# plt.ylabel("steps 1:00")
# plt.show()

In [None]:
# fig, ax =  plt.subplots(figsize=(15,5))
# ax.boxplot(step_col_train)
# #plt.yscale('log')
# plt.show()

In [None]:
# step_col_test.plot(kind="scatter", x="steps_av31", y="steps_av19", alpha=0.6)
# plt.xlabel("Steps 0:00")
# plt.ylabel("Steps 1:00")
# plt.show()

### Brake

In [None]:
# brake_col_train = df_train.filter(regex='brake')
# brake_col_test = df_test.filter(regex='brake')
# #step_col_train["bg+1:00"] = df_train["bg+1:00"]
# display(brake_col_train.sample(3))
# brake_col_train.min().min(),brake_col_test.min().min()

In [None]:
# brake_col_train.plot(kind="scatter", x="brake31", y="brake30", alpha=0.6)
# plt.xlabel("brake 0:00")
# plt.ylabel("brake 1:00")
# plt.show()

In [None]:
# fig, ax =  plt.subplots(figsize=(15,5))
# ax.boxplot(brake_col_train)
# #plt.yscale('log')
# plt.show()

In [None]:
# brake_col_train["bg+1:00"] = df_train["bg+1:00"]
# brake_col_train.corr()["bg+1:00"][:-1].plot()
# plt.show()

In [None]:
# brake_col_train.drop(columns="bg+1:00", inplace=True)

In [None]:
gc.collect()

## Scale the Data:

Data are scaled in groups.

In [None]:
# Column roles used by the rest of this section.
static_fields = ["p_num"] #"id",
target = ["bg+1:00"]
# Every df_train column that is neither static nor the target.
ts_fields = list(df_train.drop(columns=static_fields+target))

In [None]:
# Split the training table into features and target.
X_train = df_train.drop(columns=target).copy()
y_train = df_train[target].copy()

# The test table may or may not carry a target column; errors="ignore" makes
# the drop a no-op when it is absent instead of raising KeyError.
X_test = df_test.drop(columns=target, errors="ignore").copy()

In [None]:
# Feature-group prefixes, each used as a regex to select that group's columns.
# The list is fixed by hand; a value derived from the column names used to be
# computed here and immediately overwritten, so that dead step was removed.
ts_fields_group = ['steps_av','insulin_av','activity','cals_av', 'brake', 'carbs_av','bg','hr','intake']

In [None]:
X_test.filter(regex="bg")

In [None]:
# One sub-frame per feature group: every column whose name matches the group
# string (used as a regex) is selected.
scaling_groups = {group: X_train.filter(regex=group) for group in ts_fields_group}
scaling_groups_test = {group: X_test.filter(regex=group) for group in ts_fields_group}

# Start as plain copies of the selections above.
# NOTE(review): the loop that would overwrite these with scaled values is
# commented out further down, so as written they stay unscaled - confirm
# this is intended.
scaled_groups = {group: X_train.filter(regex=group) for group in ts_fields_group}
scaled_groups_test = {group: X_test.filter(regex=group) for group in ts_fields_group}

In [None]:
# Create the function for the transformation
def log_transform(x, c=1):
    """Return ``np.log1p(x + c)``, i.e. ``log(1 + x + c)``.

    ``c`` is the constant added to ``x`` before ``log1p`` is applied; with the
    default ``c=1`` the result is ``log(x + 2)``.
    """
    shifted = x + c
    return np.log1p(shifted)

# Scaler chosen for each feature group: a normal-output QuantileTransformer by
# default, overridden below for specific groups.
# NOTE(review): in the visible notebook this mapping is only consumed by a
# commented-out scaling loop.
scaling_strat = {group: QuantileTransformer(output_distribution='normal', subsample=None, random_state=42) for group in ts_fields_group}
scaling_strat["steps_av"] = StandardScaler() #Pipeline([('log_trans', FunctionTransformer(func=log_transform, kw_args={'c': 0})), ('quantile', QuantileTransformer(output_distribution='normal', subsample=None))])
scaling_strat["carbs_av"] = MinMaxScaler()
scaling_strat["activity"] = MinMaxScaler()
scaling_strat['bg'] = StandardScaler()
#scaling_strat['intake'] = QuantileTransformer(output_distribution='normal', subsample=25000, random_state=42) #FunctionTransformer(func=log_transform, kw_args={'c': 10})
#scaling_strat['carbs_av'] = PowerTransformer() #QuantileTransformer(output_distribution='normal', subsample=25000, random_state=42) #FunctionTransformer(func=log_transform, kw_args={'c': 10})
#scaling_strat['insulin_av'] = QuantileTransformer(output_distribution='normal', subsample=None, random_state=42) #PowerTransformer() #FunctionTransformer(func=log_transform, kw_args={'c': 10})

# Display the final group -> scaler mapping.
scaling_strat

In [None]:
# log_features = []

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('log', FunctionTransformer(func=np.log1p), log_features),
#         ('minmax', MinMaxScaler(), minmax_features)
#     ])

In [None]:
# n_bins=30
# for group in ts_fields_group:

#   stand_scaler = StandardScaler()

#   data_train_ = scaling_groups[group].values.reshape(-1,1)
#   data_train = scaling_strat[group].fit_transform(data_train_)

#   data_train_s = pd.DataFrame(data_train.reshape(-1,32),index=scaling_groups[group].index, columns=scaling_groups[group].columns)
#   data_train_s = pd.DataFrame(stand_scaler.fit_transform(data_train_s),index=data_train_s.index, columns=data_train_s.columns)

#   scaled_groups[group] = data_train_s

#   data_test_ = scaling_groups_test[group].values.reshape(-1,1)
#   data_test = scaling_strat[group].transform(data_test_)
#   data_test_s = pd.DataFrame(data_test.reshape(-1,32),index=scaling_groups_test[group].index, columns=scaling_groups_test[group].columns)
#   data_test_s = pd.DataFrame(stand_scaler.transform(data_test_s),index=data_test_s.index, columns=data_test_s.columns)
#   scaled_groups_test[group] = data_test_s

#   fig, axs = plt.subplots(1, 2, sharey=True, tight_layout=True, figsize=(6, 3))

#   # We can set the number of bins with the *bins* keyword argument.
#   axs[0].hist(data_train_, bins=n_bins)
#   axs[1].hist(data_train, bins=n_bins)
#   axs[0].set_title("Unscaled", fontsize=8)
#   axs[1].set_title("Scaled", fontsize=8)
#   plt.suptitle(group, fontsize=12)
#   plt.show()

In [None]:
# Static (non time-series) columns are carried over unchanged.
X_train_scaled_ = X_train[static_fields].copy()
X_test_scaled_ = X_test[static_fields].copy()

# Rebuild the feature table: static columns first, then each group's columns
# in ts_fields_group order.
X_train_scaled = pd.concat([scaled_groups[group] for group in ts_fields_group], axis=1)
X_train_scaled = pd.concat([X_train_scaled_,X_train_scaled], axis=1)

X_test_scaled = pd.concat([scaled_groups_test[group] for group in ts_fields_group], axis=1)
X_test_scaled = pd.concat([X_test_scaled_,X_test_scaled], axis=1)

# Print the shapes before and after regrouping, for a manual comparison.
print(X_train.shape,X_train_scaled.shape)
print(X_test.shape,X_test_scaled.shape)

In [None]:
gc.collect()

## Post Scaling Analysis:

#### Insulin

In [None]:
# insulin_col_train = X_train_scaled.filter(regex='insulin').copy()
# insulin_col_test = X_test_scaled.filter(regex='insulin').copy()

# insulin_col_train.min().min()

In [None]:
# insulin_col_train.plot(kind="scatter", x="insulin_av31", y="insulin_av30", alpha=0.6)
# plt.xlabel("Insulin 0:00")
# plt.ylabel("Insulin 1:00")
# plt.show()

In [None]:
# fig, ax =  plt.subplots(figsize=(15,5))
# ax.boxplot(insulin_col_train)
# #plt.yscale('log')
# plt.show()

In [None]:
# insulin_col_train["bg+1:00"] = df_train["bg+1:00"]
# insulin_col_train.corr()["bg+1:00"][:-1].plot()
# plt.show()

In [None]:
# insulin_col_train.drop(columns="bg+1:00", inplace=True)
# scaler=StandardScaler()

# insulin_col_train = pd.DataFrame(scaler.fit_transform(insulin_col_train),index=insulin_col_train.index, columns=insulin_col_train.columns)

In [None]:
# fig, ax =  plt.subplots(figsize=(15,5))
# ax.boxplot(insulin_col_train)
# #plt.yscale('log')
# plt.show()

In [None]:
# insulin_col_train["bg+1:00"] = df_train["bg+1:00"]
# insulin_col_train.corr()["bg+1:00"][:-1].plot()
# plt.show()

-----------------------------------------------
### Steps

In [None]:
# step_col_train = X_train_scaled.filter(regex='steps')
# step_col_test = X_test_scaled.filter(regex='steps')
# #step_col_train["bg+1:00"] = df_train["bg+1:00"]

# step_col_train.min().min(),step_col_test.min().min()

In [None]:
# insulin_col_test.clip(upper=10.0, inplace=True)
# insulin_col_train.clip(upper=10.0, inplace=True)
#insulin_col_train

In [None]:
#insulin_col_train

In [None]:
# step_col_train.plot(kind="scatter", x="steps_av31", y="steps_av19", alpha=0.6)
# plt.xlabel("steps 0:00")
# plt.ylabel("steps 1:00")
# plt.show()

In [None]:
# fig, ax =  plt.subplots(figsize=(15,5))
# ax.boxplot(step_col_train)
# #plt.yscale('log')
# plt.show()

In [None]:
# step_col_train["bg+1:00"] = df_train["bg+1:00"]
# step_col_train.corr()["bg+1:00"][:-1].plot()
# plt.show()

In [None]:
# step_col_train.drop(columns="bg+1:00", inplace=True)
# scaler=StandardScaler()

# step_col_train = pd.DataFrame(scaler.fit_transform(step_col_train),index=step_col_train.index, columns=step_col_train.columns)

In [None]:
# fig, ax =  plt.subplots(figsize=(15,5))
# ax.boxplot(step_col_train.iloc[:,:-1])
# #plt.yscale('log')
# plt.show()

In [None]:
# step_col_train["bg+1:00"] = df_train["bg+1:00"]
# step_col_train.corr()["bg+1:00"][:-1].plot()
# plt.show()

## Add back Target:

In [None]:
# Re-attach the target: the training values are assigned positionally
# (via .values), and the test column is filled with NaN.
X_train_scaled["bg+1:00"] = y_train.values
X_test_scaled["bg+1:00"] = np.nan
print(X_train.shape,X_train_scaled.shape)
print(X_test.shape,X_test_scaled.shape)

In [None]:
# Histogram of the training target using 30 bins.
plt.hist(y_train, bins=30, color="salmon")
plt.xlabel("bg+1:00")
plt.ylabel("Frequency")
plt.title("Target Distribution")
plt.show()

## Save the Data

In [None]:
# reset_index() turns the index into a regular column, which is then written
# to the CSV because index=False suppresses only the new default index.
# Paths are relative to the current working directory.
X_train_scaled.reset_index().to_csv("X_train_scaled.csv",index=False)
X_test_scaled.reset_index().to_csv("X_test_scaled.csv",index=False)

In [None]:
# Float columns are picked from the training frame; the same column list is
# then cast to float32 in both frames.
num_cols = X_train_scaled.select_dtypes(include="float").columns

X_train_scaled[num_cols] = X_train_scaled[num_cols].astype("float32")
X_test_scaled[num_cols] = X_test_scaled[num_cols].astype("float32")

In [None]:
gc.collect()

In [None]:
# Write the float32 versions of both frames under separate "_smaller" file names.
X_train_scaled.reset_index().to_csv("X_train_scaled_smaller.csv",index=False)
X_test_scaled.reset_index().to_csv("X_test_scaled_smaller.csv",index=False)