In [10]:
!pip install -U azureml-fsspec mltable

Requirement already up-to-date: azureml-fsspec in /anaconda/envs/azureml_py38/lib/python3.8/site-packages (0.1.0b2)
Requirement already up-to-date: mltable in /anaconda/envs/azureml_py38/lib/python3.8/site-packages (0.1.0b4)


In [13]:
from azureml.fsspec import AzureMachineLearningFileSystem
import numpy as np
import pandas as pd

In [2]:
from azureml.core import Workspace, Dataset, Datastore

subscription = 'f48a2553-c966-4d06-8faa-c5096da10254'
resource_group = 'rg-fecdata'
workspace = 'fecaml'
datastore_name = 'amitoosweet'
path_on_datastore = 'clarityUploads'

# long-form Datastore uri format:
uri = f'azureml://subscriptions/{subscription}/resourcegroups/{resource_group}/workspaces/{workspace}/datastores/{datastore_name}/paths/{path_on_datastore}'

uri

'azureml://subscriptions/f48a2553-c966-4d06-8faa-c5096da10254/resourcegroups/rg-fecdata/workspaces/fecaml/datastores/amitoosweet/paths/clarityUploads'

In [3]:
# instantiate file system using datastore URI
fs = AzureMachineLearningFileSystem(uri)

pandas_frames = []
for path in fs.ls():
    print(f'Reading from {path}')
    with fs.open(path) as f:
        pandas_frames.append(pd.read_csv(f, header=0))

print(f'concating {len(pandas_frames)} frames')
full_df = pd.concat(pandas_frames)


In [4]:
# pull out EGV events only
egv_events = full_df[full_df['Event Type'] == "EGV"]
egv_events = egv_events[['Timestamp (YYYY-MM-DDThh:mm:ss)', 'Source Device ID',
       'Glucose Value (mg/dL)', 'Transmitter Time (Long Integer)', 'Transmitter ID']]

egv_events = egv_events.drop_duplicates(keep='last', subset='Timestamp (YYYY-MM-DDThh:mm:ss)')
egv_events.columns = ['timestamp_str', 'source_device', 'glucose', 'transmittertime', 'transmitter_id']

In [10]:
# interpolate gaps (linear) to get clean 5 mins 
egv_events['timestamp'] = pd.to_datetime(egv_events['timestamp_str'], format='%Y-%m-%dT%H:%M:%S')
egv_events['timestamp_diff'] = egv_events.timestamp.diff()

In [51]:
gaps = egv_events[egv_events.timestamp_diff >= np.timedelta64(6, 'm')].index

Unnamed: 0,timestamp_str,source_device,glucose,transmittertime,transmitter_id,timestamp,timestamp_diff
1214,2022-11-12T02:14:33,Android G6,176,367574.0,8FELKW,2022-11-12 02:14:33,0 days 00:25:00
2816,2022-11-17T17:44:53,Android G6,204,855374.0,8FELKW,2022-11-17 17:44:53,0 days 02:10:01


In [69]:
new_time_slots = []

for low, high in zip(gaps-1, gaps):
    start_event = egv_events.loc[low]
    end_event = egv_events.loc[high]
    start = start_event.timestamp
    end = end_event.timestamp
    start_time = start
    previous_start_time = start
    transmitter_id = start_event['transmitter_id'] if start_event['transmitter_id'] == end_event['transmitter_id'] else "Switch"
    while end - start_time >= np.timedelta64(5, 'm'):
        start_time += np.timedelta64(5, 'm')
        new_time_slots.append({
            'timestamp_str': str(start_time), 
            'source_device': 'Interpolate',
            'glucose': np.nan,
            'trasmittertime': -1,
            'transmitter_id': transmitter_id,
            'timestamp': start_time,
            'timestamp_diff': start_time - previous_start_time
        })
        previous_start_time = start_time


In [73]:
# create basic rows here
new_data = pd.DataFrame(new_time_slots)
egv_events_with_interpolate = pd.concat([egv_events, new_data])
egv_events_with_interpolate.sort_values(by='timestamp', inplace=True)
egv_events_with_interpolate.set_index('timestamp').glucose.interpolate(method='cubic', inplace=True)
egv_events_with_interpolate.reset_index()

# assert all diffs < 6 mins
assert egv_events_with_interpolate.timestamp.diff().max() < np.timedelta64(6, 'm')