In [70]:
import numpy as np
import pandas as pd
import pickle
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, \
                            mean_squared_error
from xgboost import XGBRegressor

pd.options.display.max_columns = None

from scripts.data_processing import (
    load_uci, load_tidepool_dummy, 
    load_so_pump,
    load_so_cgm    
)

## CGM data

In [71]:
cgm = load_so_cgm()
print(cgm.shape)
cgm.head()

(92055, 3)


Unnamed: 0,timestamp,measurement,below_threshold
9,2018-04-26T00:02:36,80,False
10,2018-04-26T00:07:37,83,False
11,2018-04-26T00:12:36,86,False
12,2018-04-26T00:17:37,89,False
13,2018-04-26T00:22:37,93,False


In [72]:
cgm.dtypes

timestamp          object
measurement         int64
below_threshold      bool
dtype: object

In [73]:
cgm.timestamp = pd.to_datetime(cgm.timestamp)

In [74]:
cgm.timestamp = cgm['timestamp'].astype(np.int64) // 10**9
cgm.head()

Unnamed: 0,timestamp,measurement,below_threshold
9,1524700956,80,False
10,1524701257,83,False
11,1524701556,86,False
12,1524701857,89,False
13,1524702157,93,False


In [75]:
cgm.timestamp.iloc[cgm.shape[0] - 1] - cgm.timestamp.iloc[0]

30445292

### Convert timestamp to int and apply check mean value as an initial guess

In [76]:
cgm.dtypes

timestamp          int64
measurement        int64
below_threshold     bool
dtype: object

In [77]:
def split_train_validate(df, target_name, test_fraction=0.2):

    test_size = int(df.shape[0] * test_fraction)
    df_train = df.iloc[:df.shape[0]-test_size,  :]
    df_val   = df.iloc[ df.shape[0]-test_size:, :]
    X_train  = df_train.drop(columns=[target_name])
    y_train  = df_train[target_name]
    X_val    = df_val.drop(columns=[target_name])
    y_val    = df_val[target_name]

    print('X_train.shape:', X_train.shape, 'y_train.shape:', y_train.shape)
    print('X_val.shape:', X_val.shape, 'y_val.shape:', y_val.shape)
    
    return X_train, X_val, y_train, y_val

X_train, X_val, y_train, y_val = split_train_validate(cgm, 
                                                     'measurement', 
                                                     0.2)

X_train.shape: (73644, 2) y_train.shape: (73644,)
X_val.shape: (18411, 2) y_val.shape: (18411,)


In [98]:
def baseline(y_train, y_val):
    
    mean_value_train = y_train.mean()
    y_pred = [mean_value_train] * len(y_val)
    
    mae = np.average([abs(y - mean_value_train) for y in y_val], axis = 0)
    return mae

print('Average of the training y against validation y gives MAE:', baseline(y_train, y_val))
print('Average of the training y against itself gives MAE:', baseline(y_train, y_train))

Average of the training y against validation y gives MAE: 39.56040929712985
Average of the training y against itself gives MAE: 36.21612231964571


### Create column with glucose values 30 minutes into the future

In [79]:
cgm.head()

Unnamed: 0,timestamp,measurement,below_threshold
9,1524700956,80,False
10,1524701257,83,False
11,1524701556,86,False
12,1524701857,89,False
13,1524702157,93,False


In [80]:
def wrangle(df, minutes):
    seconds = minutes * 60
    xs = df['timestamp'].values
    ys = []
    for i in range(df.shape[0]):
        y = np.interp(xs[i] + seconds,
                      df['timestamp'],
                      df['measurement'])
        ys.append(y)
    print('df.shape:', df.shape)
    print('len(ys):', len(ys))
    df[str(minutes) + ' minutes'] = ys
    return df

In [81]:
cgm = wrangle(cgm, 30)
print('cgm.shape:', cgm.shape)
cgm.head()

df.shape: (92055, 3)
len(ys): 92055
cgm.shape: (92055, 4)


Unnamed: 0,timestamp,measurement,below_threshold,30 minutes
9,1524700956,80,False,95.0
10,1524701257,83,False,92.0
11,1524701556,86,False,94.0
12,1524701857,89,False,92.993333
13,1524702157,93,False,90.996667


In [82]:
cgm.tail(10)

Unnamed: 0,timestamp,measurement,below_threshold,30 minutes
25284,1555143548,99,False,90.0
25285,1555143848,97,False,88.0
25286,1555144148,96,False,86.0
25287,1555144448,95,False,87.0
25288,1555144748,93,False,87.0
25289,1555145048,92,False,87.0
25290,1555145348,90,False,87.0
25291,1555145648,88,False,87.0
25292,1555145948,86,False,87.0
25293,1555146248,87,False,87.0


### Must lop off the last 30 minutes of data, since we don't have data to predict the last 30 minutes. This is why we're getting a linear interpolated value of 87 for the last few data points

In [83]:
X_train, X_val, y_train, y_val = split_train_validate(cgm, '30 minutes', 0.2)

X_train.shape: (73644, 3) y_train.shape: (73644,)
X_val.shape: (18411, 3) y_val.shape: (18411,)


In [84]:
param_grid = {
# XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#        colsample_bytree=1, gamma=0, importance_type='gain',
#        learning_rate=0.06, max_delta_step=0, max_depth=2,
#        min_child_weight=1, missing=None, n_estimators=60, n_jobs=1,
#        nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
#        reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
#        subsample=1)
# gave us +/- 18.66 mg/dL
    
    'learning_rate': [0.05, 0.06, 0.07, 0.08],
    'n_estimators':  [40, 50, 60],
    'max_depth': [2],
#     'subsample': [0.5, 0.75, 0.9],
#     'colsample_bytree': [0.1, 0.2, 0.3, 0.4],
#     'gamma': [0, 1, 2]
}

gridsearch = GridSearchCV(XGBRegressor(),
                          param_grid=param_grid, 
                          # scoring='roc_auc', 
                          cv=3, n_jobs=-1,
                          return_train_score=True, verbose=10)

In [85]:
gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  25 out of  36 | elapsed:    4.1s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  29 out of  36 | elapsed:    4.5s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  33 out of  36 | elapsed:    5.0s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    5.2s finished
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.05, 0.06, 0.07, 0.08], 'n_estimators': [40, 50, 60], 'max_depth': [2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=10)

In [86]:
gridsearch.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.07, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=60, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [87]:
gridsearch.cv_results_['mean_train_score'].mean()

0.7364833869488868

In [88]:
gridsearch.cv_results_['mean_test_score'].mean()

0.7309524626328895

In [89]:
y_pred = gridsearch.predict(X_val)
print('mae:', mean_absolute_error(y_val, y_pred))
print('rmse:', np.sqrt(mean_squared_error(y_val, y_pred)))

mae: 18.868568153223297
rmse: 25.38692352389278


In [90]:
print('len(y_pred):', len(y_pred))

len(y_pred): 18411


### Write out the model and the related data to the filesystem

In [91]:
with open('diabetesmanager/data/private/model', 'wb') as f:
    pickle.dump(gridsearch, f)

In [99]:
cgm.head()

Unnamed: 0,timestamp,measurement,below_threshold,30 minutes
9,1524700956,80,False,95.0
10,1524701257,83,False,92.0
11,1524701556,86,False,94.0
12,1524701857,89,False,92.993333
13,1524702157,93,False,90.996667


In [93]:
# Calculate the shift for y_pred so it matches the Glucose values
def shift_predictions_concat(df, minutes, y_pred):
    seconds = minutes * 60
    first_timestamp = df.timestamp.iloc[0]
    timestamp = df.timestamp.values
    shift = None
    for i in range(df.shape[0]):
        t = timestamp[i] - first_timestamp
        if t < seconds:
            continue
        shift = i
        break

    df2 = df.copy().iloc[shift:, :].dropna()
    
    y_pred2 = y_pred[:df2.shape[0]]
    y_pred = y_pred.reset_index(drop=True)

    df2 = df2.reset_index(drop=True)
    df2 = pd.concat([df2, y_pred2], axis=1, ignore_index=True)
    df2 = df2.drop(df2.columns[2], axis=1)
    df2 = df2.drop(df2.columns[3], axis=1)
    df2 = df2.rename(columns={0: 'timestamp', 
                              1: 'Measured', 
                              3: 'Prediction'})

    return df2

y_pred_all = gridsearch.predict(cgm.drop(columns=['30 minutes'], axis=1))
output = shift_predictions_concat(cgm, 30, pd.Series(y_pred_all))
print('output.shape:', output.shape)

with open('diabetesmanager/data/private/table', 'wb') as f:
    pickle.dump(output, f)

output.shape: (92049, 3)


In [94]:
output.tail()

Unnamed: 0,timestamp,Measured,Prediction
92044,1555145048,92,87.0
92045,1555145348,90,87.0
92046,1555145648,88,87.0
92047,1555145948,86,87.0
92048,1555146248,87,87.0


In [95]:
print('mae:', mean_absolute_error(output['Measured'], output['Prediction']))
print('rmse:', np.sqrt(mean_squared_error(output['Measured'], output['Prediction'])))

mae: 17.25957972778266
rmse: 24.17658897135064


### Prediction using TPOT

In [100]:
import pandas as pd
from tpot import TPOTRegressor

  return f(*args, **kwds)


### TPOT requires numerical data. So we have to convert categorical values to numerical values.

### TPOT requires floating-point values. So we have to convert numerical values to floating point.

In [115]:
def numerical_to_floats(df):
    df2 = df.copy()
    for col in df2.select_dtypes(include=np.number).columns:
        df2[col] = df[col].astype(float)
    return df2

cgm_numeric = cgm.copy()
cgm_numeric['below_threshold'] = cgm_numeric['below_threshold'].astype('int')
cgm_numeric = numerical_to_floats(cgm_numeric)
cgm_numeric.head()

Unnamed: 0,timestamp,measurement,below_threshold,30 minutes
9,1524701000.0,80.0,0.0,95.0
10,1524701000.0,83.0,0.0,92.0
11,1524702000.0,86.0,0.0,94.0
12,1524702000.0,89.0,0.0,92.993333
13,1524702000.0,93.0,0.0,90.996667


### TPOT cannot work with null values - so handle nulls

In [117]:
cgm_numeric.isnull().sum()

timestamp          0
measurement        0
below_threshold    0
30 minutes         0
dtype: int64

In [118]:
X_train, X_val, y_train, y_val = split_train_validate(cgm_numeric, 
                                                     'measurement', 
                                                     0.2)

X_train.shape: (73644, 3) y_train.shape: (73644,)
X_val.shape: (18411, 3) y_val.shape: (18411,)


In [135]:
%%time

np.random.seed(42)

tpot = TPOTRegressor(generations=5, population_size=20, 
                     verbosity=2, n_jobs=-1)
tpot.fit(X_train, y_train)

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=120, style=ProgressStyle(descript…

Generation 1 - Current best internal CV score: -525.2669452690468
Generation 2 - Current best internal CV score: -525.217126003612
Generation 3 - Current best internal CV score: -524.9416450561697
Generation 4 - Current best internal CV score: -524.9416450561697
Generation 5 - Current best internal CV score: -524.2923316389206

Best pipeline: LassoLarsCV(DecisionTreeRegressor(input_matrix, max_depth=4, min_samples_leaf=20, min_samples_split=17), normalize=False)
CPU times: user 6.62 s, sys: 400 ms, total: 7.02 s
Wall time: 22min 3s


In [136]:
print(tpot.score(X_val, y_val))
y_pred = tpot.predict(X_val)

-643.4797287912282


In [137]:
print('Mean Average Error:', mean_absolute_error(y_val, y_pred))
print('rmse:', np.sqrt(mean_squared_error(y_val, y_pred)))

Mean Average Error: 18.172980027792356
rmse: 25.366902230884016


In [138]:
y_pred = tpot.predict(X_train)
print('Mean Average Error:', mean_absolute_error(y_train, y_pred))
print('rmse:', np.sqrt(mean_squared_error(y_train, y_pred)))

Mean Average Error: 16.347211117536034
rmse: 22.834901465960648
