In [254]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

pd.options.display.max_columns = None

from scripts import (
    load_uci, load_tidepool_dummy, 
    load_so_pump_raw, load_so_pump_clean,
    load_so_cgm    
)

## CGM data

In [255]:
cgm = load_so_cgm()
print(cgm.shape)
cgm.head()

(25285, 14)


Unnamed: 0,Index,Timestamp (YYYY-MM-DDThh:mm:ss),Event Type,Event Subtype,Patient Info,Device Info,Source Device ID,Glucose Value (mg/dL),Insulin Value (u),Carb Value (grams),Duration (hh:mm:ss),Glucose Rate of Change (mg/dL/min),Transmitter Time (Long Integer),Transmitter ID
9,10,2019-01-14T00:02:42,EGV,,,,iPhone,128,,,,,1943354.0,41W22H
10,11,2019-01-14T00:07:41,EGV,,,,iPhone,129,,,,,1943654.0,41W22H
11,12,2019-01-14T00:12:41,EGV,,,,iPhone,130,,,,,1943954.0,41W22H
12,13,2019-01-14T00:17:42,EGV,,,,iPhone,131,,,,,1944254.0,41W22H
13,14,2019-01-14T00:22:41,EGV,,,,iPhone,132,,,,,1944554.0,41W22H


In [256]:
cgm['Event Subtype'].nunique()

1

In [257]:
cgm['Event Type'].unique()

array(['EGV', 'Calibration'], dtype=object)

### Drop the non-EGV events

In [258]:
cgm = cgm[cgm['Event Type'] == 'EGV']

In [259]:
print('cgm.shape:', cgm.shape)
cgm.head()

cgm.shape: (25170, 14)


Unnamed: 0,Index,Timestamp (YYYY-MM-DDThh:mm:ss),Event Type,Event Subtype,Patient Info,Device Info,Source Device ID,Glucose Value (mg/dL),Insulin Value (u),Carb Value (grams),Duration (hh:mm:ss),Glucose Rate of Change (mg/dL/min),Transmitter Time (Long Integer),Transmitter ID
9,10,2019-01-14T00:02:42,EGV,,,,iPhone,128,,,,,1943354.0,41W22H
10,11,2019-01-14T00:07:41,EGV,,,,iPhone,129,,,,,1943654.0,41W22H
11,12,2019-01-14T00:12:41,EGV,,,,iPhone,130,,,,,1943954.0,41W22H
12,13,2019-01-14T00:17:42,EGV,,,,iPhone,131,,,,,1944254.0,41W22H
13,14,2019-01-14T00:22:41,EGV,,,,iPhone,132,,,,,1944554.0,41W22H


In [260]:
cgm['Timestamp (YYYY-MM-DDThh:mm:ss)'].min(), cgm['Timestamp (YYYY-MM-DDThh:mm:ss)'].max()

('2019-01-14T00:02:42', '2019-04-13T09:04:08')

### Remove unnecessary columns

In [261]:
cgm = cgm.drop(columns=['Index', 'Event Type', 'Event Subtype', 'Patient Info', 'Device Info', 
                        'Source Device ID', 'Insulin Value (u)', 'Carb Value (grams)',
                        'Duration (hh:mm:ss)', 'Glucose Rate of Change (mg/dL/min)',
                        'Transmitter Time (Long Integer)', 'Transmitter ID'])
cgm = cgm.rename(mapper={'Timestamp (YYYY-MM-DDThh:mm:ss)': 'Timestamp'},
                         axis=1)
print('cgm.shape:', cgm.shape)
cgm.head()

cgm.shape: (25170, 2)


Unnamed: 0,Timestamp,Glucose Value (mg/dL)
9,2019-01-14T00:02:42,128
10,2019-01-14T00:07:41,129
11,2019-01-14T00:12:41,130
12,2019-01-14T00:17:42,131
13,2019-01-14T00:22:41,132


In [262]:
cgm['Timestamp'] = cgm['Timestamp'] \
                    .apply(lambda x: 
                        pd.Timestamp(x).to_datetime64())
cgm.head()

Unnamed: 0,Timestamp,Glucose Value (mg/dL)
9,2019-01-14 00:02:42,128
10,2019-01-14 00:07:41,129
11,2019-01-14 00:12:41,130
12,2019-01-14 00:17:42,131
13,2019-01-14 00:22:41,132


In [263]:
cgm.Timestamp.iloc[cgm.shape[0] - 1] - cgm.Timestamp.iloc[0]

Timedelta('89 days 09:01:26')

### Convert timestamp to int and apply check mean value as an initial guess

In [264]:
print('cgm.shape:', cgm.shape)
cgm['Timestamp'] = (cgm.Timestamp.astype(int)/(10**9)).astype(int)

# Remove Glucose Values that are not numbers and convert them to numbers
cgm = cgm[~cgm['Glucose Value (mg/dL)'].str.contains("[a-zA-Z]").fillna(False)]
cgm['Glucose Value (mg/dL)'] = cgm['Glucose Value (mg/dL)'].astype(float)

print('cgm.shape:', cgm.shape)
cgm.head()

cgm.shape: (25170, 2)
cgm.shape: (25141, 2)


Unnamed: 0,Timestamp,Glucose Value (mg/dL)
9,1547424162,128.0
10,1547424461,129.0
11,1547424761,130.0
12,1547425062,131.0
13,1547425361,132.0


In [265]:
cgm.dtypes

Timestamp                  int64
Glucose Value (mg/dL)    float64
dtype: object

In [266]:
def split_train_validate(df, target_name, test_fraction=0.2):

    test_size = int(df.shape[0] * test_fraction)
    df_train = df.iloc[:df.shape[0]-test_size,  :]
    df_val   = df.iloc[ df.shape[0]-test_size:, :]
    X_train  = df_train.drop(columns=[target_name])
    y_train  = df_train[target_name]
    X_val    = df_val.drop(columns=[target_name])
    y_val    = df_val[target_name]

    print('X_train.shape:', X_train.shape, 'y_train.shape:', y_train.shape)
    print('X_val.shape:', X_val.shape, 'y_val.shape:', y_val.shape)
    
    return X_train, X_val, y_train, y_val

X_train, X_val, y_train, y_val = split_train_validate(cgm, 
                                                     'Glucose Value (mg/dL)', 
                                                     0.2)

X_train.shape: (20113, 1) y_train.shape: (20113,)
X_val.shape: (5028, 1) y_val.shape: (5028,)


In [269]:
mean_value_train = y_train.mean()
print('mean_value_train:', mean_value_train)

mean_value_train: 138.45000745786308


In [274]:
y_pred = [abs(y - mean_value_train) for y in y_val]
mean_value_ypred = np.average(y_pred)
mae = np.average([abs(y - mean_value_ypred) for y in y_pred], axis = 0)
print('Average of the training y values gives MAE:', mae)

Average of the training y values gives MAE: 25.52558788655618


### Create column with glucose values 30 minutes into the future

In [None]:
cgm.head()

In [275]:
def wrangle(df, minutes):
    seconds = minutes * 60
    xs = df['Timestamp'].values
    ys = []
    for i in range(df.shape[0]):
        y = np.interp(xs[i],
                      df['Timestamp'],
                      df['Glucose Value (mg/dL)'])
        ys.append(y)
    print('df.shape:', df.shape)
    print('len(ys):', len(ys))
    df[str(minutes) + ' minutes'] = ys
    return df

In [276]:
cgm = wrangle(cgm, 30)
print('cgm.shape:', cgm.shape)
cgm.head()

df.shape: (25141, 2)
len(ys): 25141
cgm.shape: (25141, 3)


Unnamed: 0,Timestamp,Glucose Value (mg/dL),30 minutes
9,1547424162,128.0,128.0
10,1547424461,129.0,129.0
11,1547424761,130.0,130.0
12,1547425062,131.0,131.0
13,1547425361,132.0,132.0


In [280]:
X_train, X_val, y_train, y_val = split_train_validate(cgm, '30 minutes', 0.2)

X_train.shape: (20113, 2) y_train.shape: (20113,)
X_val.shape: (5028, 2) y_val.shape: (5028,)


In [281]:
param_grid = {
#     'learning_rate': [0.06],
#     'n_estimators':  [80],
#     'max_depth': [3],
#     'subsample': [0.9],
#     'colsample_bytree': [0.3],
#     'gamma': [4, 5]
    
    'learning_rate': [0.01, 0.03, 0.06, 0.09],
    'n_estimators':  [70, 75, 80, 85, 90],
    'max_depth': [2, 3, 4],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.2, 0.3, 0.4],
    'gamma': [3, 4, 5, 6]
}

gridsearch = GridSearchCV(XGBRegressor(),
                          param_grid=param_grid, 
                          # scoring='roc_auc', 
                          cv=3, n_jobs=-1,
                          return_train_score=True, verbose=10)

In [282]:
gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 2160 candidates, totalling 6480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.01, 0.03, 0.06, 0.09], 'n_estimators': [70, 75, 80, 85, 90], 'max_depth': [2, 3, 4], 'subsample': [0.7, 0.8, 0.9], 'colsample_bytree': [0.2, 0.3, 0.4], 'gamma': [3, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=10)

In [283]:
y_pred = gridsearch.predict(X_val)
print('mse:', mean_absolute_error(y_val, y_pred))

mse: 11.719491803086553


### Since we have CGM data from 2019-01-14 thru 2019-04-13, we can take the pump data from this date range as well

## Pump data

In [103]:
pump = load_so_pump_clean()
print(pump.shape)
pump.head()

(4507, 7)


Unnamed: 0,Datetime,Type,Value,Unit,Description,Other Info,Comment
0,2018-08-19 00:00:00,Insulin Summary,12.6,units,End of Day Basal Total,Basal,
1,2018-08-19 00:00:00,Insulin Summary,16.7,units,Bolus-End of Day General Bolus Total,Bolus,
2,2018-08-19 10:28:00,Basal Insulin,1.3,units,Pod activated.Basal rate set to 1.30 units/hour.,Basal,Pod activated.Basal rate set to 1.30 units/hour.
3,2018-08-19 10:30:00,Basal Insulin,1.0,units,Basal rate set to 1.00 units/hour.,Basal,Basal rate set to 1.00 units/hour.
4,2018-08-19 10:44:00,Bolus Insulin,2.0,units,Bolus-General Bolus.,Bolus,Suggested Bolus: ; Programmed Meal: ; Programm...


In [104]:
pump.Datetime.min(), pump.Datetime.max()

(Timestamp('2018-08-19 00:00:00'), Timestamp('2019-04-12 20:18:00'))

In [105]:
pump.Type.value_counts()

Bolus Insulin      2129
Basal Insulin      1191
Insulin Summary     521
Glucose             494
Pump Alarm          152
Notes                20
Name: Type, dtype: int64

In [106]:
pump[pump.Type == 'Bolus Insulin'][:5]

Unnamed: 0,Datetime,Type,Value,Unit,Description,Other Info,Comment
4,2018-08-19 10:44:00,Bolus Insulin,2.0,units,Bolus-General Bolus.,Bolus,Suggested Bolus: ; Programmed Meal: ; Programm...
5,2018-08-19 13:02:00,Bolus Insulin,1.2,units,Bolus-General Bolus.,Bolus,Suggested Bolus: ; Programmed Meal: ; Programm...
7,2018-08-19 15:57:00,Bolus Insulin,2.5,units,Bolus-General Bolus.,Bolus,Suggested Bolus: ; Programmed Meal: ; Programm...
8,2018-08-19 16:48:00,Bolus Insulin,2.5,units,Bolus-General Bolus.,Bolus,Suggested Bolus: ; Programmed Meal: ; Programm...
9,2018-08-19 17:45:00,Bolus Insulin,3.0,units,Bolus-General Bolus.,Bolus,Suggested Bolus: ; Programmed Meal: ; Programm...


In [107]:
pump[pump.Type == 'Basal Insulin'][:5]

Unnamed: 0,Datetime,Type,Value,Unit,Description,Other Info,Comment
2,2018-08-19 10:28:00,Basal Insulin,1.3,units,Pod activated.Basal rate set to 1.30 units/hour.,Basal,Pod activated.Basal rate set to 1.30 units/hour.
3,2018-08-19 10:30:00,Basal Insulin,1.0,units,Basal rate set to 1.00 units/hour.,Basal,Basal rate set to 1.00 units/hour.
10,2018-08-19 18:27:00,Basal Insulin,0.0,units,Basal suspended.Basal rate set to 0.00 units/h...,Basal,Basal suspended.Basal rate set to 0.00 units/h...
12,2018-08-19 18:59:00,Basal Insulin,1.0,units,Basal resumed.Basal rate set to 1.00 units/hour.,Basal,Basal resumed.Basal rate set to 1.00 units/hour.
15,2018-08-19 22:49:00,Basal Insulin,0.0,units,Basal suspended.Basal rate set to 0.00 units/h...,Basal,Basal suspended.Basal rate set to 0.00 units/h...


In [108]:
pump[pump.Type == 'Insulin Summary'][:5]

Unnamed: 0,Datetime,Type,Value,Unit,Description,Other Info,Comment
0,2018-08-19,Insulin Summary,12.6,units,End of Day Basal Total,Basal,
1,2018-08-19,Insulin Summary,16.7,units,Bolus-End of Day General Bolus Total,Bolus,
17,2018-08-20,Insulin Summary,12.5,units,Bolus-End of Day General Bolus Total,Bolus,
18,2018-08-20,Insulin Summary,20.15,units,End of Day Basal Total,Basal,
34,2018-08-21,Insulin Summary,22.1,units,End of Day Basal Total,Basal,


In [109]:
pump[pump.Type == 'Glucose'][:5]

Unnamed: 0,Datetime,Type,Value,Unit,Description,Other Info,Comment
6,2018-08-19 14:36:00,Glucose,5.6,mmol/L,,,
40,2018-08-21 10:49:00,Glucose,4.9,mmol/L,,,
65,2018-08-22 14:05:00,Glucose,9.0,mmol/L,,,
95,2018-08-24 20:26:00,Glucose,5.3,mmol/L,,,
109,2018-08-25 12:18:00,Glucose,8.9,mmol/L,,,


In [110]:
pump[pump.Type == 'Pump Alarm'].Description.unique()

array(['OmniPod Alarm: Suspend end advisory alarm',
       'OmniPod Alarm: Pod expiration advisory alarm',
       'OmniPod Alarm: Low reservoir advisory alarm',
       'OmniPod Alarm: Pod expired hazard alarm',
       'OmniPod Alarm: Clock reset alarm', 'OmniPod Alarm: Pod alarm',
       'OmniPod Alarm: Last Pod status hazard alarm'], dtype=object)

In [111]:
pump[pump.Type == 'Notes'].Description.unique()

array(['Date changed to: 10/21/2018', 'Time changed to: 15:05',
       'Time changed to: 19:00', 'Basal rate set to 1.00 units/hour.',
       'General Bolus.Delivered: 1.00.', 'General Bolus.Delivered: 1.50.',
       'Pod deactivated.Basal rate set to 0.00 units/hour.',
       'Pod activated.Basal rate set to 1.00 units/hour.',
       'General Bolus.Delivered: 2.10.', 'General Bolus.Delivered: 1.10.',
       'General Bolus.Delivered: 3.00.', 'General Bolus.Delivered: 2.50.',
       'General Bolus.Delivered: 3.60.', 'General Bolus.Delivered: 1.60.',
       'General Bolus.Delivered: 5.60.',
       'Temporary basal rate set to 0.75 units/hour.Temp percent change: -25%.',
       'Time changed to: 22:14'], dtype=object)

### Use Bolus/Basal Insulin rows, and use data only after 10/21/2018 or before - never across. There was a date/time change on the pump on that day

## Data Cleaning

NameError: name 'train_test_split' is not defined