In [74]:
% matplotlib notebook
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
from IPython.display import Image
from sklearn.preprocessing import Imputer

# Add modules path
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from Translator import Translator
import preprocessor as pp

# Add modules path
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Define language and translator functions
language = "es"
translator = Translator(language)
to_lang = translator.translate_to_language
to_col = translator.translate_to_column

# Load data
raw_data = pd.read_csv("sample.txt", header=0, skiprows=1, delimiter="\t", index_col=0, usecols=list(range(0, 9)),
                       parse_dates=to_lang(["Datetime"]), decimal=",",
                       date_parser=lambda x: pd.to_datetime(x, format="%Y/%m/%d %H:%M"))
# Translate column names
raw_data.columns = (to_col(raw_data.columns))

# Divide in blocks, extend dataset and clean data
block_data = pp.define_blocks(raw_data)
cleaned_block_data = pp.clean_processed_data(block_data)

## Add information of previous day

In [75]:
extended_data = pp.extend_data(cleaned_block_data)
extended_data[["Datetime", "Day_Block", "Glucose_Mean_Day", "Glucose_Mean_Prev_Day"]].iloc[[1, 200, 400, 600, 800, 1000]]

Unnamed: 0,Datetime,Day_Block,Glucose_Mean_Day,Glucose_Mean_Prev_Day
1,2016-03-31 17:44:00,2016-03-31,98.948718,
200,2016-04-02 01:15:00,2016-04-01,116.715152,98.948718
400,2016-04-03 12:43:00,2016-04-03,140.438095,128.056738
600,2016-04-05 11:32:00,2016-04-05,93.044199,125.390476
800,2016-04-06 11:12:00,2016-04-06,111.1,93.044199
1000,2016-04-07 16:40:00,2016-04-07,119.063636,111.1


## Fill previous day values in the first day with the mean 


In [76]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputed_cols = imp.fit_transform(
    extended_data[["Glucose_Mean_Prev_Day",
                   "Glucose_Std_Prev_Day", "Glucose_Min_Prev_Day",
                   "Glucose_Max_Prev_Day"]].values)
extended_data.loc[:, ["Glucose_Mean_Prev_Day",
                      "Glucose_Std_Prev_Day", "Glucose_Min_Prev_Day",
                      "Glucose_Max_Prev_Day"]] = imputed_cols
extended_data[["Datetime", "Day_Block", "Glucose_Mean_Day", "Glucose_Mean_Prev_Day"]].iloc[
    [1, 200, 400, 600, 800, 1000]]

Unnamed: 0,Datetime,Day_Block,Glucose_Mean_Day,Glucose_Mean_Prev_Day
1,2016-03-31 17:44:00,2016-03-31,98.948718,121.370527
200,2016-04-02 01:15:00,2016-04-01,116.715152,98.948718
400,2016-04-03 12:43:00,2016-04-03,140.438095,128.056738
600,2016-04-05 11:32:00,2016-04-05,93.044199,125.390476
800,2016-04-06 11:12:00,2016-04-06,111.1,93.044199
1000,2016-04-07 16:40:00,2016-04-07,119.063636,111.1


## Obtain values of glucose of previous day at the same time (Rounded to quarter)

In [77]:
import datetime

#Round datetime to nearest quarter hour 
rounded_quarters = extended_data[["Datetime", "Glucose_Auto"]].copy()
rounded_quarters["Datetime"] = rounded_quarters["Datetime"].apply(
    lambda dt: datetime.datetime(dt.year, dt.month, dt.day, dt.hour,15*(dt.minute // 15)))
rounded_quarters.iloc[[1, 200, 400, 600, 800, 1000]]

Unnamed: 0,Datetime,Glucose_Auto
1,2016-03-31 17:30:00,55.0
200,2016-04-02 01:15:00,83.0
400,2016-04-03 12:30:00,134.0
600,2016-04-05 11:30:00,60.0
800,2016-04-06 11:00:00,94.0
1000,2016-04-07 16:30:00,109.0


In [78]:
rounded_quarters[["Prev_Day_Datetime"]] = rounded_quarters[["Datetime"]].apply(lambda row: row - datetime.timedelta(days=1))
joined = rounded_quarters.merge(rounded_quarters, how='left', left_on='Prev_Day_Datetime', right_on='Datetime',
                       suffixes=('', '_Prev_Day'))
joined[["Datetime", "Glucose_Auto", "Datetime_Prev_Day", "Glucose_Auto_Prev_Day"]].iloc[[1, 200, 400, 600, 800, 1000]]

Unnamed: 0,Datetime,Glucose_Auto,Datetime_Prev_Day,Glucose_Auto_Prev_Day
1,2016-03-31 17:30:00,55.0,NaT,
200,2016-04-01 21:30:00,53.0,2016-03-31 21:30:00,115.0
400,2016-04-02 16:45:00,188.0,2016-04-01 16:45:00,141.0
600,2016-04-03 16:15:00,111.0,2016-04-02 16:15:00,165.0
800,2016-04-05 11:30:00,60.0,2016-04-04 11:30:00,166.0
1000,2016-04-06 09:45:00,125.0,2016-04-05 09:45:00,77.0


In [79]:
extended_data["Glucose_Auto_Prev_Day"] = joined["Glucose_Auto_Prev_Day"]

## Delete all rows that does not contain values of glucose of previous day

In [80]:
extended_data.dropna(inplace='True', subset=["Glucose_Auto_Prev_Day"])
extended_data[["Datetime", "Glucose_Auto", "Glucose_Auto_Prev_Day"]].iloc[[1, 200, 400, 600, 800, 1000]]

Unnamed: 0,Datetime,Glucose_Auto,Glucose_Auto_Prev_Day
130,2016-04-01 17:23:00,153.0,64.0
333,2016-04-02 19:51:00,217.0,123.0
537,2016-04-04 22:57:00,140.0,151.0
738,2016-04-06 01:10:00,125.0,136.0
938,2016-04-07 03:20:00,190.0,135.0
1139,2016-04-09 00:56:00,178.0,70.0


## Calculate difference of glucose with previous day

In [81]:
extended_data["Delta_Glucose_Prev_Day"] = abs(extended_data["Glucose_Auto"] - extended_data["Glucose_Auto_Prev_Day"])
extended_data[["Datetime", "Glucose_Auto", "Glucose_Auto_Prev_Day", "Delta_Glucose_Prev_Day"]].iloc[[1, 200, 400, 600, 800, 1000]]

Unnamed: 0,Datetime,Glucose_Auto,Glucose_Auto_Prev_Day,Delta_Glucose_Prev_Day
130,2016-04-01 17:23:00,153.0,64.0,89.0
333,2016-04-02 19:51:00,217.0,123.0,94.0
537,2016-04-04 22:57:00,140.0,151.0,11.0
738,2016-04-06 01:10:00,125.0,136.0,11.0
938,2016-04-07 03:20:00,190.0,135.0,55.0
1139,2016-04-09 00:56:00,178.0,70.0,108.0


## Add new label to diagnosis (Severe hyperglycemia) (>240)

In [82]:
def label_map(value):
    
    hypoglycemia_threshold = 70
    hyperglycemia_threshold = 180
    severe_hyperglycemia_threshold = 240

    if value < hypoglycemia_threshold:
        return 'Hypoglycemia'
    elif value > hyperglycemia_threshold:
        if value > severe_hyperglycemia_threshold:
            return 'Severe_Hyperglycemia'
        else:
            return 'Hyperglycemia'
    else:
        return 'In_Range'


# Add label to each entry (Diagnosis)
extended_data["Diagnosis"] = extended_data["Glucose_Auto"].apply(label_map)
extended_data[["Datetime", "Glucose_Auto", "Diagnosis"]].ix[[167, 200, 312, 322]]

Unnamed: 0,Datetime,Glucose_Auto,Diagnosis
167,2016-04-01 21:11:00,66.0,Hypoglycemia
200,2016-04-02 01:15:00,83.0,In_Range
312,2016-04-02 16:50:00,188.0,Hyperglycemia
322,2016-04-02 18:05:00,249.0,Severe_Hyperglycemia


In [83]:
from sklearn.preprocessing import LabelBinarizer
import numpy as np
# Binarize labels in a one-vs-all fashion (Severe_Hyperglycemia, Hyperglycemia, Hypoglycemia and Normal)
# to get binary labels
lb = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
lb.fit(extended_data["Diagnosis"])
labels = pd.DataFrame(index=extended_data.index)
for x in lb.classes_:
    labels[x + "_Diagnosis"] = np.nan
labels.loc[:, [x + "_Diagnosis" for x in lb.classes_]] = lb.transform(extended_data["Diagnosis"])
labels.ix[[167, 200, 312, 322]]

Unnamed: 0,Hyperglycemia_Diagnosis,Hypoglycemia_Diagnosis,In_Range_Diagnosis,Severe_Hyperglycemia_Diagnosis
167,0,1,0,0
200,0,0,1,0
312,1,0,0,0
322,0,0,0,1


### Logic OR between hyperglycemia and severe_hyperglycemia in hyperglycemia column

In [84]:
labels["Hyperglycemia_Diagnosis"] = labels["Hyperglycemia_Diagnosis"] | labels["Severe_Hyperglycemia_Diagnosis"]
labels.ix[[167, 200, 312, 322]]

Unnamed: 0,Hyperglycemia_Diagnosis,Hypoglycemia_Diagnosis,In_Range_Diagnosis,Severe_Hyperglycemia_Diagnosis
167,0,1,0,0
200,0,0,1,0
312,1,0,0,0
322,1,0,0,1


## Add diagnosis of next block

In [85]:
# Join labels to data
extended_data = pd.concat([extended_data, labels], axis=1, join_axes=[extended_data.index])
extended_data.columns.values

array(['Datetime', 'Glucose_Auto', 'Hour', 'Block', 'Day_Block',
       'Last_Meal', 'Overlapped_Block', 'Carbo_Block',
       'Rapid_Insulin_Block', 'Glucose_Mean_Block', 'Glucose_Std_Block',
       'Glucose_Min_Block', 'Glucose_Max_Block', 'Glucose_Mean_Day',
       'Glucose_Std_Day', 'Glucose_Min_Day', 'Glucose_Max_Day', 'MAGE',
       'Weekday', 'Minutes_Last_Meal', 'Last_Meal_Hour',
       'Glucose_Mean_Prev_Block', 'Glucose_Std_Prev_Block',
       'Glucose_Min_Prev_Block', 'Glucose_Max_Prev_Block',
       'Rapid_Insulin_Prev_Block', 'Carbo_Prev_Block',
       'Glucose_Mean_Prev_Day', 'Glucose_Std_Prev_Day',
       'Glucose_Min_Prev_Day', 'Glucose_Max_Prev_Day', 'Diagnosis',
       'Glucose_Auto_Prev_Day', 'Delta_Glucose_Prev_Day',
       'Hyperglycemia_Diagnosis', 'Hypoglycemia_Diagnosis',
       'In_Range_Diagnosis', 'Severe_Hyperglycemia_Diagnosis'], dtype=object)

In [86]:
# Group by blocks and get aggregated diagnosis for current block
def logical_or(x):
    return 1 if np.sum(x) > 0 else 0
new_columns = extended_data.groupby(['Day_Block', 'Block']).agg(
    {'Hypoglycemia_Diagnosis': logical_or, 'In_Range_Diagnosis': logical_or,'Hyperglycemia_Diagnosis': logical_or,
     'Severe_Hyperglycemia_Diagnosis': logical_or})
new_columns

Unnamed: 0_level_0,Unnamed: 1_level_0,Hyperglycemia_Diagnosis,Severe_Hyperglycemia_Diagnosis,Hypoglycemia_Diagnosis,In_Range_Diagnosis
Day_Block,Block,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-04-01,3,0,0,0,1
2016-04-01,4,0,0,1,1
2016-04-01,5,0,0,1,1
2016-04-01,6,0,0,1,1
2016-04-02,0,1,0,1,1
2016-04-02,1,0,0,1,1
2016-04-02,2,0,0,1,1
2016-04-02,3,1,1,1,1
2016-04-02,4,1,1,0,1
2016-04-03,0,1,0,0,1


In [87]:
# Join aggregated data to dataset
new_columns.rename(columns={'Hyperglycemia_Diagnosis': 'Hyperglycemia_Diagnosis_Block',
                    'Hypoglycemia_Diagnosis': 'Hypoglycemia_Diagnosis_Block',
                    'In_Range_Diagnosis': 'In_Range_Diagnosis_Block',
                    'Severe_Hyperglycemia_Diagnosis': 'Severe_Hyperglycemia_Diagnosis_Block'}, inplace=True)
new_columns = new_columns.reset_index(level=[0, 1])
new_data = pd.merge(extended_data, new_columns, on=["Block", "Day_Block"], how='left')
new_data[["Datetime", "Block", "Glucose_Auto", "Hyperglycemia_Diagnosis", 'Hyperglycemia_Diagnosis_Block']].tail(50)

Unnamed: 0,Datetime,Block,Glucose_Auto,Hyperglycemia_Diagnosis,Hyperglycemia_Diagnosis_Block
1972,2016-04-13 18:55:00,6,90.0,0,0
1973,2016-04-13 19:10:00,5,76.0,0,0
1974,2016-04-13 19:10:00,6,76.0,0,0
1975,2016-04-13 19:25:00,5,64.0,0,0
1976,2016-04-13 19:25:00,6,64.0,0,0
1977,2016-04-13 19:25:00,7,64.0,0,1
1978,2016-04-13 19:25:00,7,64.0,0,1
1979,2016-04-13 19:41:00,6,55.0,0,0
1980,2016-04-13 19:41:00,7,55.0,0,1
1981,2016-04-13 19:56:00,6,51.0,0,0


In [88]:
# Add label corresponding to the next block (offset = 1)
offset = 1
counter = 0
previous = np.nan
new_data.loc[:, "Hyperglycemia_Diagnosis_Next_Block"] = np.nan
new_data.loc[:, "Hypoglycemia_Diagnosis_Next_Block"] = np.nan
new_data.loc[:, "In_Range_Diagnosis_Next_Block"] = np.nan
new_data.loc[:, "Severe_Hyperglycemia_Diagnosis_Next_Block"] = np.nan

# Reverse iteration
for block in new_data[["Day_Block", "Block", "Hyperglycemia_Diagnosis_Block", "Hypoglycemia_Diagnosis_Block",
                                    "In_Range_Diagnosis_Block", "Severe_Hyperglycemia_Diagnosis_Block"]]\
        .drop_duplicates().iloc[::-1].itertuples():
    if counter >= offset:
        mask = (new_data["Day_Block"] == block[1]) & (new_data["Block"] == block[2])
        new_data.loc[mask, "Hyperglycemia_Diagnosis_Next_Block"] = next[3]
        new_data.loc[mask, "Hypoglycemia_Diagnosis_Next_Block"] = next[4]
        new_data.loc[mask, "In_Range_Diagnosis_Next_Block"] = next[5]
        new_data.loc[mask, "Severe_Hyperglycemia_Diagnosis_Next_Block"] = next[6]
    next = block
    counter += 1
new_data[["Datetime", "Block", "Glucose_Auto", "Hyperglycemia_Diagnosis_Block", 'Hyperglycemia_Diagnosis_Next_Block']].tail(50)

Unnamed: 0,Datetime,Block,Glucose_Auto,Hyperglycemia_Diagnosis_Block,Hyperglycemia_Diagnosis_Next_Block
1972,2016-04-13 18:55:00,6,90.0,0,1.0
1973,2016-04-13 19:10:00,5,76.0,0,0.0
1974,2016-04-13 19:10:00,6,76.0,0,1.0
1975,2016-04-13 19:25:00,5,64.0,0,0.0
1976,2016-04-13 19:25:00,6,64.0,0,1.0
1977,2016-04-13 19:25:00,7,64.0,1,1.0
1978,2016-04-13 19:25:00,7,64.0,1,1.0
1979,2016-04-13 19:41:00,6,55.0,0,1.0
1980,2016-04-13 19:41:00,7,55.0,1,1.0
1981,2016-04-13 19:56:00,6,51.0,0,1.0


In [89]:
new_data.to_csv(path_or_buf='extended.csv')

## Drop information current day


In [90]:
cleaned_extended_data = pp.clean_extended_data(new_data)
cleaned_extended_data.drop(["Glucose_Mean_Day", "Glucose_Std_Day",
          "Glucose_Min_Day", "Glucose_Max_Day"], inplace=True, axis=1)
list(cleaned_extended_data.columns.values)

['Datetime',
 'Glucose_Auto',
 'Hour',
 'Block',
 'Day_Block',
 'Last_Meal',
 'Overlapped_Block',
 'MAGE',
 'Weekday',
 'Minutes_Last_Meal',
 'Last_Meal_Hour',
 'Glucose_Mean_Prev_Block',
 'Glucose_Std_Prev_Block',
 'Glucose_Min_Prev_Block',
 'Glucose_Max_Prev_Block',
 'Rapid_Insulin_Prev_Block',
 'Carbo_Prev_Block',
 'Glucose_Mean_Prev_Day',
 'Glucose_Std_Prev_Day',
 'Glucose_Min_Prev_Day',
 'Glucose_Max_Prev_Day',
 'Diagnosis',
 'Glucose_Auto_Prev_Day',
 'Delta_Glucose_Prev_Day',
 'Hyperglycemia_Diagnosis',
 'Hypoglycemia_Diagnosis',
 'In_Range_Diagnosis',
 'Severe_Hyperglycemia_Diagnosis',
 'Hyperglycemia_Diagnosis_Block',
 'Severe_Hyperglycemia_Diagnosis_Block',
 'Hypoglycemia_Diagnosis_Block',
 'In_Range_Diagnosis_Block',
 'Hyperglycemia_Diagnosis_Next_Block',
 'Hypoglycemia_Diagnosis_Next_Block',
 'In_Range_Diagnosis_Next_Block',
 'Severe_Hyperglycemia_Diagnosis_Next_Block']

## Drop rows with unknown labels (Data corresponding to last block) and column labels corresponding to current entry and block

In [90]:
cleaned_extended_data.dropna(inplace='True', subset=["Hyperglycemia_Diagnosis_Next_Block",
                                                     "Hypoglycemia_Diagnosis_Next_Block",
                                                     "In_Range_Diagnosis_Next_Block",
                                                     "Severe_Hyperglycemia_Diagnosis_Next_Block"])
cleaned_extended_data.drop(["Diagnosis", "Hyperglycemia_Diagnosis",
                            "Hypoglycemia_Diagnosis", "In_Range_Diagnosis", "Severe_Hyperglycemia_Diagnosis",
                            "Hyperglycemia_Diagnosis_Block", "Hypoglycemia_Diagnosis_Block",
                            "In_Range_Diagnosis_Block", "Severe_Hyperglycemia_Diagnosis_Block"], inplace=True, axis=1)
list(cleaned_extended_data.columns.values)

['Datetime',
 'Glucose_Auto',
 'Hour',
 'Block',
 'Day_Block',
 'Last_Meal',
 'Overlapped_Block',
 'MAGE',
 'Weekday',
 'Minutes_Last_Meal',
 'Last_Meal_Hour',
 'Glucose_Mean_Prev_Block',
 'Glucose_Std_Prev_Block',
 'Glucose_Min_Prev_Block',
 'Glucose_Max_Prev_Block',
 'Rapid_Insulin_Prev_Block',
 'Carbo_Prev_Block',
 'Glucose_Mean_Prev_Day',
 'Glucose_Std_Prev_Day',
 'Glucose_Min_Prev_Day',
 'Glucose_Max_Prev_Day',
 'Glucose_Auto_Prev_Day',
 'Delta_Glucose_Prev_Day',
 'Hyperglycemia_Diagnosis_Next_Block',
 'Hypoglycemia_Diagnosis_Next_Block',
 'In_Range_Diagnosis_Next_Block',
 'Severe_Hyperglycemia_Diagnosis_Next_Block']