In [37]:
from json import load, dumps
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from datetime import datetime, time, timedelta
import dateutil.tz
from operator import itemgetter

In [38]:
f = open('raw_data.json')
data = load(f)
print(f"data size: {len(data)}")

data size: 14803


In [39]:
print("First data point: ")
print(dumps(data[0], indent=4, sort_keys=True))

First data point: 
{
    "_id": "62915c74587b0c23c18f2e8f",
    "date": 1653607324193,
    "dateString": "2022-05-26T23:22:04.193Z",
    "delta": 5.999,
    "device": "xDrip-DexcomG5",
    "direction": "Flat",
    "filtered": 0,
    "noise": 1,
    "rssi": 100,
    "sgv": 81,
    "sysTime": "2022-05-26T23:22:04.193Z",
    "type": "sgv",
    "unfiltered": 0,
    "utcOffset": -420
}


In [40]:
def compute_X_and_Y(feature_size: int, target_size: int):
    X = []
    Y = []
    for i in range(0, len(data) - feature_size - target_size):
        x = []

        utc_offset = data[i].get('utcOffset')
        utc_date_from_timestamp = datetime.utcfromtimestamp(data[i].get('date') / 1000)
        date_in_clients_local_time = utc_date_from_timestamp.replace(
            tzinfo=dateutil.tz.tzoffset(None, offset=utc_offset * 60)) + timedelta(minutes=utc_offset
        )
        difference = date_in_clients_local_time.timestamp() - datetime.combine(date_in_clients_local_time, time.min).timestamp()
        minutes_since_day_started = difference / 60

        x.append(minutes_since_day_started)

        features = data[i:i + feature_size]
        max_time_difference = 0
        for idx, feature in enumerate(features):
            if idx < len(features) - 1:
                next_feature = features[idx + 1]
                max_time_difference = max(next_feature.get('date') - feature.get('date'), max_time_difference)
            x.append(feature.get('sgv'))
        if max_time_difference / 300000 >= 1.9:
            continue

        X.append(x)
        y = []
        targets = data[i + feature_size:i + feature_size + target_size]
        for target in targets:
            y.append(target.get('sgv'))
        Y.append(y)
    
    # Split data
    X_train, X_cv, Y_train, Y_cv = train_test_split(X, Y, test_size=0.2, train_size=0.8)
    # X_train, X_cv, Y_train, Y_cv = train_test_split(x, y, test_size=0.25, train_size=0.75)

    return X_train, Y_train, X_cv, Y_cv


In [41]:
models_to_test = [
    # {
    #     "feature_size": int,
    #     "target_sizes": {
    #           "target_size": int,
    #           "traning_score": float,
    #           "cv_score": float, 
    #     },
    # }
]

for feature_size in range(8, 96, 4):
    models_to_test.append({
        "feature_size": feature_size,
        "target_sizes": []
    })
    for target_size in range(2, 30):
        models_to_test[-1]['target_sizes'].append({
            "target_size": target_size,
        })


In [42]:
for model_to_test in models_to_test:
    feature_size, target_sizes = itemgetter('feature_size', 'target_sizes')(model_to_test)
    for target_size_item in target_sizes:
        target_size = target_size_item['target_size']
        [X_train, Y_train, X_cv, Y_cv] = compute_X_and_Y(feature_size, target_size)
        estimator = XGBRegressor(objective="reg:squarederror", n_estimators=1000)
        model = MultiOutputRegressor(estimator=estimator)
        model.fit(X_train, Y_train)
        print("/////////////////////////////////////////////////")
        print(f"Model feature_size: {feature_size}, target_size: {target_size}")
        # Training Score
        training_score = model.score(X_train, Y_train)
        print(f"Training score: ", training_score)

        cv_score = model.score(X_cv, Y_cv)
        print(f"Cross validation score: {cv_score}")
        model_to_test['training_score'] = training_score
        model_to_test['cv_score'] = cv_score

        if (cv_score < 0.9): break

/////////////////////////////////////////////////
Model feature_size: 8, target_size: 2
Training score:  0.9997592368155417
Cross validation score: 0.9453285410381747
/////////////////////////////////////////////////
Model feature_size: 8, target_size: 3
Training score:  0.9996656984438878
Cross validation score: 0.921494018846524
/////////////////////////////////////////////////
Model feature_size: 8, target_size: 4
Training score:  0.9995626559000627
Cross validation score: 0.8979366444221898
/////////////////////////////////////////////////
Model feature_size: 12, target_size: 2
Training score:  0.9999111493857673
Cross validation score: 0.947567441229506
/////////////////////////////////////////////////
Model feature_size: 12, target_size: 3
Training score:  0.9998801095575226
Cross validation score: 0.928722472290315
/////////////////////////////////////////////////
Model feature_size: 12, target_size: 4
Training score:  0.9998583068273936
Cross validation score: 0.905950050846159