In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw training data. Forward slashes are accepted as path separators
# on Windows, macOS and Linux alike, whereas the previous backslash-separated
# path only resolved on Windows.
df_train = pd.read_csv("../data/raw/train.csv")

df_train.head()

In [None]:
# Dimensions of the training frame as (number of rows, number of columns)
df_train.shape

How many observations for each patient?

In [None]:
# Summary statistics of the `id` column, computed separately for each patient
(
    df_train
    .loc[:, ["id", "p_num"]]
    .groupby("p_num")
    .describe()
)

**Comment**:</br>
The number of observations per patient differs.

What time range do we have in our dataset?

In [None]:
# Distinct values of the `time` column, in ascending order
time_ranges = sorted(df_train["time"].unique())

print("Number of time Observations:", len(time_ranges))
# header and list are printed on separate lines
print("Time Ranges:", time_ranges, sep="\n")

**Comment:**</br>
We have 288 distinct time values, from 00:00 to 23:55 in 5-minute steps, which together cover a whole day (24*60 / 5 = 288).</br>
However, observations are missing at some time points for some patients.


Convert the time strings (format HH:MM:SS) into `datetime.time` objects for easier manipulation

In [None]:
# Parse the "HH:MM:SS" strings with a single vectorised call instead of a
# per-row `apply`, then keep only the time-of-day part (`datetime.time` objects).
# Casting to `str` first keeps the cell re-runnable: values that were already
# converted render back to "HH:MM:SS" and are simply parsed again.
df_train["time"] = pd.to_datetime(df_train["time"].astype(str), format="%H:%M:%S").dt.time
df_train.head()

# Check the number of missing values for each group of columns

In [None]:
# All column names of the dataset, in their original order
columns = df_train.columns.tolist()
# Maps each prefix of interest to the list of columns whose name starts with it
group_cols = dict()

interest_cols = ["hr","bg","insulin","carbs","steps","cals","activity"]
for interest_col in interest_cols:
    # Columns that start with the current prefix (e.g. "hr", "bg", ...)
    cols_match_interest = [col for col in columns if col.startswith(interest_col)]
    group_cols[interest_col] = cols_match_interest

    # Remove the matched columns from the pool. Filtering the list (rather than
    # taking a set difference) keeps the dataset order, so the groups and the
    # printed example column are the same on every run.
    matched = set(cols_match_interest)
    columns = [col for col in columns if col not in matched]

# Print one line per group in the format announced by the header, followed by
# the total number of grouped columns.
c = 0
print("The group of colums are ",end=":")
print("[common_pattern,example_col, number]")
for key in group_cols.keys():
    c = c + len(group_cols[key])
    # An empty group has no example column; print None instead of failing
    example_col = group_cols[key][0] if group_cols[key] else None
    print(f"[{key},{example_col},{len(group_cols[key])} ]")
print(c)

In [None]:
plt.figure(figsize=(20,5))
cols= group_cols["bg"]

# Percentage of missing values per bg column, ordered by column name
data = pd.DataFrame(100 * df_train[cols].isna().sum() / df_train.shape[0],index=cols,columns=["missing_rate"]).sort_index()
sns.barplot(data,x=data.index,hue=data.missing_rate,y=data.missing_rate,palette="Spectral",legend=False)
plt.xticks(rotation=90);
plt.xlabel("Blood Glucose levels at different times");
# Same y-axis label as the other missing-rate plots of this notebook
plt.ylabel("Missing ratio in %");

**Comment:** </br>
* According to the above plot, the target column **(bg+1:00)** has **no** missing values.</br>
* However, the past bg columns have different missing-value ratios. In fact, a "strange" pattern can be spotted on the plot. </br>
* Depending on the starting point (bg-0:00 / bg-0:05 / bg-0:10), the missing-value ratios increase/decrease at 15-minute intervals.
* We can also form groups of columns, the first group can be the "red" ones having missing values ratios within [1.8-7.5]
* The second group can be the "green" ones having missing values ratios within [11.2-14]
* The third group can be the "mauve" ones having missing values ratios within [14.2-16]


In [None]:
fig, ax = plt.subplots(1,2,figsize=(20,5))

# (key in group_cols, x-axis label) for the left and the right panel
panels = [
    ("steps", "Steps related columns"),
    ("insulin", "Insulin related columns"),
]

for panel_ax, (family, x_label) in zip(ax, panels):
    cols = group_cols[family]
    # percentage of missing values per column, highest first
    missing_pct = 100 * df_train[cols].isna().sum() / df_train.shape[0]
    data = pd.DataFrame(missing_pct, index=cols, columns=["missing_rate"]).sort_values(ascending=False, by="missing_rate")
    sns.barplot(data, x=data.index, hue=data.missing_rate, y=data.missing_rate, palette="Spectral", legend=False, ax=panel_ax)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("Missing ratio in %")
    panel_ax.tick_params(axis="x", rotation=90, size=8)




In [None]:
fig, ax = plt.subplots(1,2,figsize=(20,5))

# (key in group_cols, x-axis label) for the left and the right panel.
# Both panels are drawn by the same loop body instead of two copied sections.
panels = [
    ("cals", "Calories related columns"),
    ("hr", "Heart Rate related columns"),
]

for panel_ax, (family, x_label) in zip(ax, panels):
    cols = group_cols[family]
    # percentage of missing values per column, highest first
    missing_pct = 100 * df_train[cols].isna().sum() / df_train.shape[0]
    data = pd.DataFrame(missing_pct, index=cols, columns=["missing_rate"]).sort_values(ascending=False, by="missing_rate")
    sns.barplot(data, x=data.index, hue=data.missing_rate, y=data.missing_rate, palette="Spectral", legend=False, ax=panel_ax)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("Missing ratio in %")
    panel_ax.tick_params(axis="x", rotation=90, size=8)


In [None]:
fig, ax = plt.subplots(1,2,figsize=(30,5))

# (key in group_cols, x-axis label) for the left and the right panel
panels = [
    ("activity", "activity related columns"),
    ("carbs", "Carbs related columns"),
]

for panel_ax, (family, x_label) in zip(ax, panels):
    cols = group_cols[family]
    # percentage of missing values per column, highest first
    missing_pct = 100 * df_train[cols].isna().sum() / df_train.shape[0]
    data = pd.DataFrame(missing_pct, index=cols, columns=["missing_rate"]).sort_values(ascending=False, by="missing_rate")
    sns.barplot(data, x=data.index, hue=data.missing_rate, y=data.missing_rate, palette="Spectral", legend=False, ax=panel_ax)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("Missing ratio in %")
    panel_ax.tick_params(axis="x", rotation=90, size=8)




From the previous plots, we can see that the **Carbohydrate** & **activity** columns all have a missing ratio close to 100%. </br>Unless we find another complementary data source, we will consider deleting them and imputing the missing values in the other columns.</br> More on that later.</br>

Plot the Blood glucose evolution over time

In [None]:
# Mean of the target column for each distinct value of `time`
data = df_train.loc[:, ["bg+1:00", "time"]].groupby("time").mean()

# Line plot of that mean against the `time` index
fig = px.line(
    data,
    x=data.index,
    y="bg+1:00",
    title="1 HOUR ahead Blood Glucose over time",
)

fig.show()

**Comment:</br>**
As expected, the glucose levels at "sleep time" between 00H and 06H are on average decreasing, most likely due to fasting and reduced activity.</br>
Therefore, the time feature might be helpful to predict the 1 hour ahead **blood glucose**.

Plot the blood Glucose levels for 9 individuals within the data set.

In [None]:

# Mean of the target column for each (patient, time) pair, as a flat table
data = (
    df_train[["bg+1:00", "time", "p_num"]]
    .groupby(["p_num", "time"])
    .mean()
    .reset_index()
)

# One coloured line per patient
fig = px.line(
    data,
    x="time",
    y="bg+1:00",
    color="p_num",
    title="1 HOUR ahead Blood Glucose over time",
)

fig.show()

**Comment:</br>**
The blood glucose levels are quite different from one participant to another, which is conceivable given that they may be doing different activities, having different carbohydrate intakes, etc.

Plot the average blood glucose levels and the average carbs consumed over time


In [None]:
# Signals to compare; naming them explicitly avoids relying on the position of
# the `time` column after `reset_index()`.
signal_cols = ["bg+1:00","cals-0:00","carbs-0:00",'insulin-0:00','hr-0:00','steps-0:00',"bg-0:00"]

# Mean of every signal for each distinct value of `time`
data = df_train[["time", *signal_cols]].groupby("time").mean().reset_index()

# Min-max scale each signal to [0, 1] so that all of them share one y-axis
data[signal_cols] = data[signal_cols].apply(lambda col:((col - col.min()) / (col.max() - col.min())))

# The title names what is actually drawn: several normalized signals, not only
# the 1-hour-ahead blood glucose.
fig = px.line(data, x="time", y=signal_cols, title='Min-max normalized averages of blood glucose and related signals over time')

fig.show()


**Comment:</br>**
* Heart rate and steps at 0:00 appear to follow the same trend over the 24 hours across all patients, which is expected since the heart rate increases when the body takes more steps. We will see later on whether they are indeed correlated with one another.
* Blood glucose vs insulin levels (~1h before):</br>



Analyze the correlation between each groups of columns (eg. bg, hr,etc) with the target

In [None]:
# Correlation of every bg column (the target itself included) with the target,
# shown as a single-row heatmap ordered from highest to lowest correlation
plt.figure(figsize=(20,3))
cols = sorted(group_cols["bg"])
data = (
    df_train[cols]
    .corr()[["bg+1:00"]]
    .sort_values(by="bg+1:00", ascending=False)
    .T
)
plt.title("Correlation Heatmap between the target and lag glucose levels")
heatmap = sns.heatmap(data, cmap="Spectral", linewidths=0.5, annot=True, fmt='.2f')
# rotate every cell annotation by 90 degrees (vertical text)
for text in heatmap.texts:
    text.set_rotation(90)

**Comment:**</br>
The further back in time we go, the lower the correlation between the 1-hour-ahead glucose level and the past bg value.
The correlation between the 1-hour-ahead glucose level and the bg 1 hour before is the highest, meaning it can be a very good feature to predict the target.

In [None]:
# Correlation of every insulin column with the target, shown as a single-row
# heatmap ordered from highest to lowest correlation
plt.figure(figsize=(20,3))
cols = ["bg+1:00"] + sorted(group_cols["insulin"])
data = (
    df_train[cols]
    .corr()[["bg+1:00"]]
    .sort_values(by="bg+1:00", ascending=False)
    .T
)
plt.title("Correlation Heatmap between the target and lag insulin levels")
heatmap = sns.heatmap(data, cmap="Spectral", linewidths=0.5, annot=True, fmt='.2f')
# rotate every cell annotation by 90 degrees (vertical text)
for text in heatmap.texts:
    text.set_rotation(90)

**Comment:**</br>
According to the plot above, there is no linear correlation between the target variable and insulin levels at different times.</br>
However, according to the literature, the effect of "insulin" decays exponentially over time. Therefore, transforming the insulin variable in a way that approximates the physiological decay of insulin might help to predict the target variable.</br>

In addition, it is established in the literature that "insulin" has a non-linear effect on blood glucose. Therefore, the absence of linear correlation is not a surprise.

In [None]:
# Correlation of every heart-rate column with the target, shown as a single-row
# heatmap ordered from highest to lowest correlation
plt.figure(figsize=(20,3))
cols = ["bg+1:00"] + sorted(group_cols["hr"])
data = (
    df_train[cols]
    .corr()[["bg+1:00"]]
    .sort_values(by="bg+1:00", ascending=False)
    .T
)
plt.title("Correlation Heatmap between the target and lag hr levels")
heatmap = sns.heatmap(data, cmap="Spectral", linewidths=0.5, annot=True, fmt='.2f')
# rotate every cell annotation by 90 degrees (vertical text)
for text in heatmap.texts:
    text.set_rotation(90)

**Comment:**
Like with the insulin levels, there is no linear correlation between the target and the heart rates variables.

In [None]:
# Correlation of every calories column with the target, shown as a single-row
# heatmap ordered from highest to lowest correlation
plt.figure(figsize=(20,3))
cols = ["bg+1:00"] + sorted(group_cols["cals"])
data = (
    df_train[cols]
    .corr()[["bg+1:00"]]
    .sort_values(by="bg+1:00", ascending=False)
    .T
)
plt.title("Correlation Heatmap between the target and lag cals levels")
heatmap = sns.heatmap(data, cmap="Spectral", linewidths=0.5, annot=True, fmt='.2f')
# rotate every cell annotation by 90 degrees (vertical text)
for text in heatmap.texts:
    text.set_rotation(90)

**Comment:**
Like with the insulin levels, there is no linear correlation between the target and the calories variables.</br>
In fact, this effect is not surprising given that the cals … (**TODO:** this sentence was left unfinished; complete it.)

In [None]:
# Correlation of every carbs column with the target, shown as a single-row
# heatmap ordered from highest to lowest correlation
plt.figure(figsize=(20,3))
cols = ["bg+1:00"] + sorted(group_cols["carbs"])
data = df_train[cols].corr()[["bg+1:00"]]
data = data.sort_values(by="bg+1:00",ascending=False).T
# The title previously said "cals" (copied from the calories cell) although the
# plotted columns are the carbs ones.
plt.title("Correlation Heatmap between the target and lag carbs levels")
heatmap = sns.heatmap(data,cmap="Spectral",linewidths=0.5,annot=True,fmt='.2f')
# rotate every cell annotation by 90 degrees (vertical text)
for text in heatmap.texts:
    text.set_rotation(90)

**Comment:**
The above plot shows that there is a small positive correlation between the amount of carbs (food) ingested and the target variable. Moreover, according to our plot, the amount of food ingested between 2H and 3H tends to have a higher correlation with the target variable than the rest.</br>

Like the insulin, it has a non-linear effect on the blood glucose within the blood.

In [None]:
# Correlation of every steps column with the target, shown as a single-row
# heatmap in sorted column-name order (not sorted by correlation value)
plt.figure(figsize=(20,3))
cols = ["bg+1:00"] + sorted(group_cols["steps"])
data = (
    df_train[cols]
    .corr()[["bg+1:00"]]
    .T
)
plt.title("Correlation Heatmap between the target and lag steps levels")
heatmap = sns.heatmap(data, cmap="Spectral", linewidths=0.5, annot=True, fmt='.2f')
# rotate every cell annotation by 90 degrees (vertical text)
for text in heatmap.texts:
    text.set_rotation(90)

**Comment:**
The above plot shows that there is a small correlation between the amount of carbs (food) ingested and the target variable. Moreover, according to our plot, the amount of food ingested between 2H and 3H tends to have a higher correlation with the target variable than the rest. (**TODO:** this comment appears to be copied from the carbs heatmap; update it to describe the steps correlations shown above.)


**EDA Take aways:** </br>
* Some missing-value imputation strategies will be necessary for blood glucose, insulin, cals, steps, hr.
* Carbs and activity related columns will be discarded since the ratio of missing values is close to 100%
* Past bg columns have a positive correlation with the target variable
* The other columns have a small (even close to 0) correlation with the target variable, which might be explained by an inherent non-linear relationship between them and the target variable (e.g. insulin).
* Feature transformations might be useful, e.g. for insulin, to mimic the decay behavior of insulin within the blood stream.
* The bg values of different patients fluctuate around different intervals