In [None]:
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error
)
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from xgboost import XGBRegressor

from scripts.data_processing import (
    load_uci, load_tidepool_dummy,
    load_so_pump,
    load_so_cgm
)

pd.options.display.max_columns = None

In [None]:
def split_train_validate(df, target_name, test_fraction=0.2):
    """Split a frame by position into a training part and a trailing hold-out.

    The last ``test_fraction`` of the rows (in their current order) becomes
    the validation set; rows are never shuffled, so for data sorted by time
    the hold-out is the most recent stretch.

    Parameters
    ----------
    df : pandas.DataFrame
        Features plus the target column.
    target_name : str
        Name of the column to predict.
    test_fraction : float, default 0.2
        Share of rows (0..1) reserved for validation. The row count is
        truncated towards zero, so small frames may get an empty hold-out.

    Returns
    -------
    X_train, X_val, y_train, y_val
        Feature frames and target series for both parts.

    Raises
    ------
    ValueError
        If ``test_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= test_fraction <= 1:
        raise ValueError(
            f'test_fraction must be between 0 and 1, got {test_fraction!r}')

    n_rows = df.shape[0]
    test_size = int(n_rows * test_fraction)
    # Slice at an explicit positive index: with test_size == 0 the negative
    # form ``iloc[0:-0]`` is empty and ``iloc[-0:]`` is the whole frame,
    # which would silently swap the two sets.
    split_at = n_rows - test_size
    df_train = df.iloc[:split_at]
    df_val   = df.iloc[split_at:]
    print(f'train size: {len(df_train)}')
    print(f'test size: {len(df_val)}')

    X_train  = df_train.drop(columns=[target_name])
    y_train  = df_train[target_name]

    X_val    = df_val.drop(columns=[target_name])
    y_val    = df_val[target_name]

    return X_train, X_val, y_train, y_val


def preprocess(df, minutes=30, n_historical_cols=2):
    """Build model features and the future-value target from raw readings.

    Steps: convert ``timestamp`` to integer seconds, add one
    ``prev_trend_<k>`` column per look-back step, add the
    ``<minutes>_minutes`` target via ``append_future_value_col``, then drop
    rows where the target or the deepest look-back feature is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``timestamp`` column and a ``measurement``
        column. The caller's frame is not modified.
    minutes : int, default 30
        Forecast horizon; also names the target column.
    n_historical_cols : int, default 2
        Number of look-back steps to turn into ``prev_trend_<k>`` features.

    Returns
    -------
    pandas.DataFrame
        A new frame with the feature and target columns added.

    Raises
    ------
    AssertionError
        If 10 + ``n_historical_cols`` or more rows had to be dropped.
    """
    # Work on a copy: the conversion below is not idempotent (a second pass
    # would divide the already-converted seconds by 10**9 again), and the
    # helper columns must not leak into the caller's frame.
    df = df.copy()

    # convert datetime to integer seconds since the epoch
    df['timestamp'] = df['timestamp'].astype(np.int64) // 10**9

    for x in range(1, n_historical_cols + 1):
        prev_meas = df['measurement'].shift(x)
        prev_time = df['timestamp'].shift(x)
        # NOTE(review): this is the earlier measurement divided by the
        # elapsed seconds, not a slope (measurement delta / elapsed) --
        # confirm that this is the intended feature. Left unchanged so that
        # already-trained models keep seeing the same inputs.
        df[f'prev_trend_{x}'] = prev_meas.divide(df['timestamp'] - prev_time)

    # value `minutes` into the future, used as the prediction target
    df = append_future_value_col(df, minutes)

    # remove rows that lack the target or the deepest look-back feature
    og_len = len(df)
    keep = df[f'{minutes}_minutes'].notna()
    if n_historical_cols > 0:
        keep &= df[f'prev_trend_{n_historical_cols}'].notna()
    df = df.loc[keep]
    n_dropped = og_len - len(df)
    # Sanity check: only a handful of edge rows should be lost.
    assert n_dropped < (10 + n_historical_cols), (
        f'dropped {n_dropped} rows, expected fewer than '
        f'{10 + n_historical_cols}')

    return df


def append_future_value_col(df, minutes):
    """Add a ``<minutes>_minutes`` column holding the future measurement.

    For every row the measurement at ``timestamp + minutes * 60`` is
    estimated by linear interpolation over the frame's own
    (timestamp, measurement) pairs. Rows whose look-ahead point lies beyond
    the last timestamp are set to NaN rather than keeping the clamped edge
    value that ``np.interp`` would return.

    ``df`` is modified in place and also returned. ``timestamp`` is expected
    in seconds and in ascending order (``np.interp`` does not check this).
    """
    horizon = minutes * 60
    target_col = f'{minutes}_minutes'
    times = df['timestamp']

    df[target_col] = np.interp(times + horizon, times, df['measurement'])

    # Anything later than this has no real data `horizon` seconds ahead.
    cutoff = times.max() - horizon
    df.loc[times > cutoff, target_col] = np.nan

    return df

### Compute baseline RMSE

In [None]:
def baseline_rmse(df, minutes=30):
    """RMSE of the naive forecast "the value will not change".

    The frame is run through ``preprocess`` and the current ``measurement``
    is used as the prediction for the ``<minutes>_minutes`` target.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw readings accepted by ``preprocess``.
    minutes : int, default 30
        Forecast horizon.

    Returns
    -------
    float
        Root mean squared error of the naive forecast.

    Raises
    ------
    ValueError
        If no rows remain after preprocessing.
    """
    # if we predict the same value as the current value
    df = preprocess(df, minutes)
    if len(df) == 0:
        raise ValueError('no rows left after preprocessing; cannot compute RMSE')

    # Vectorized on the underlying array instead of the builtin sum(), which
    # walks the Series one Python object at a time.
    errors = (df[f'{minutes}_minutes'] - df['measurement']).to_numpy()

    return np.sqrt(np.mean(np.square(errors)))


# Reference score to beat: RMSE of predicting "no change" over a 30-minute horizon.
baseline_rmse(df=load_so_cgm(), minutes=30)

### Build Model

In [None]:
# Load the CGM readings and derive the features plus the '30_minutes' target.
df = preprocess(load_so_cgm())

# Hold out the trailing rows for validation; a copy is handed over so that
# `df` stays available unchanged for inspection in later cells.
X_train, X_val, y_train, y_val = split_train_validate(df.copy(), target_name='30_minutes')

In [None]:
# Peek at the preprocessed frame: look-back features and the '30_minutes' target.
df.head()

In [None]:
# XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#        colsample_bytree=1, gamma=0, importance_type='gain',
#        learning_rate=0.06, max_delta_step=0, max_depth=2,
#        min_child_weight=1, missing=None, n_estimators=60, n_jobs=1,
#        nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
#        reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
#        subsample=1)
# gave us +/- 18.66 mg/dL

In [None]:
# Hyper-parameter candidates; the commented-out entries are not searched.
param_grid = {
    'learning_rate': [0.05, 0.06, 0.07, 0.08],
    'n_estimators':  [40, 50, 60],
    'max_depth': [2],
#     'subsample': [0.5, 0.75, 0.9],
#     'colsample_bytree': [0.1, 0.2, 0.3, 0.4],
#     'gamma': [0, 1, 2]
}

# The rows are time-ordered and the hold-out is the trailing slice, so the
# inner folds must also validate on data that comes after their training
# data. An integer `cv` would use unshuffled K-fold, which trains on later
# rows and scores on earlier ones.
# Scoring is negative MSE so that candidates are ranked by the same error
# measure (RMSE) that is reported on the validation set.
gridsearch = GridSearchCV(XGBRegressor(),
                          param_grid=param_grid,
                          scoring='neg_mean_squared_error',
                          cv=TimeSeriesSplit(n_splits=3), n_jobs=-1,
                          return_train_score=True, verbose=10)

In [None]:
# Run the cross-validated search on the training split only; X_val / y_val stay unseen.
gridsearch.fit(X_train, y_train)

In [None]:
# Show the estimator for the best-scoring parameter combination.
gridsearch.best_estimator_

In [None]:
# Averages of the per-candidate mean CV scores over the WHOLE grid (train
# first, then test). This is a rough over-fitting check, not the score of the
# selected candidate -- see gridsearch.best_score_ for that.
print(gridsearch.cv_results_['mean_train_score'].mean(),
      gridsearch.cv_results_['mean_test_score'].mean())

In [None]:
# Score the fitted search object on the held-out validation rows; both
# metrics are in the units of the target column.
y_pred = gridsearch.predict(X_val)
print('mae:', mean_absolute_error(y_val, y_pred))
print('rmse:', np.sqrt(mean_squared_error(y_val, y_pred)))

In [None]:
def make_prediction(model, minutes=30):
    """Predict future values for the CGM data with a fitted model.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted on the feature columns that ``preprocess`` produces.
        NOTE(review): the model should have been trained for the same
        horizon as ``minutes`` -- nothing here checks that.
    minutes : int, default 30
        Forecast horizon.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp`` (the time the prediction refers to, i.e. the
        reading's time plus the horizon, in seconds) and
        ``predicted_value``.
    """
    # pull in user data
    df = load_so_cgm()
    # process -- pass the horizon through so that the target column dropped
    # below actually exists for horizons other than the default
    df = preprocess(df, minutes)
    # NOTE(review): preprocess() removes rows whose future value is unknown,
    # so the most recent `minutes` of readings get no prediction here.
    features = df.drop(columns=[f'{minutes}_minutes'])
    # predict
    predictions = model.predict(features)

    result = features[['timestamp']].assign(predicted_value=list(predictions))
    # shift each timestamp to the moment the prediction is about
    result['timestamp'] = result['timestamp'] + 60 * minutes

    # TODO: make JSON or write to DB
    return result

In [None]:
# Produce predictions with the fitted search object (default 30-minute horizon).
make_prediction(gridsearch)

### Write out model

In [None]:
# Persist the whole fitted GridSearchCV object (not only best_estimator_).
# The 'diabetesmanager' directory must already exist relative to the
# notebook's working directory.
with open('diabetesmanager/model.pkl', 'wb') as f:
    pickle.dump(gridsearch, f)

### Load model

In [None]:
# SECURITY: unpickling can execute arbitrary code -- only load model files
# that were written by this notebook / a trusted source.
# Rebinds `gridsearch` to the deserialized object.
with open('diabetesmanager/model.pkl', 'rb') as f:
    gridsearch = pickle.load(f)