In [1]:
# forecast continuous glucose monitor (CGM) readings with xgboost
from numpy import asarray
import pandas as pd
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from xgboost import XGBRegressor
from matplotlib import pyplot


In [2]:
import pandas as pd
from sklearn.multioutput import MultiOutputRegressor
import numpy as np
import xml.etree.cElementTree as et
from datetime import datetime
from pandas.core.tools.datetimes import to_datetime
from datetime import timedelta

In [3]:
# get some noised linear data
# X = np.random.random((1000, 10))
#a = np.random.random((10, 3))
#y = np.dot(X, a) + np.random.normal(0, 1e-3, (1000, 3))
#print(y.shape)
# fitting
#multioutputregressor = MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror')).fit(X, y)

# predicting
#print(np.mean((multioutputregressor.predict(X) - y)**2, axis=0))  # 0.004, 0.003, 0.005

In [4]:
def read_xml_data(filename, selected_items):
  """Export selected top-level XML sections to one CSV file each.

  For every direct child of the document root whose tag appears in
  ``selected_items``, the attributes of each of its sub-elements become one
  row, and the rows are written to ``<tag>.csv`` in the current working
  directory (no index column is written).

  Parameters
  ----------
  filename : path of the XML file to parse.
  selected_items : container of tag names to export.
  """
  root = et.parse(filename).getroot()
  for child in root:
    if child.tag not in selected_items:
      continue
    # Collect one dict per sub-element and build the frame once:
    # DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
    records = [dict(elem.attrib) for elem in child]
    df = pd.DataFrame(records)
    # Attribute values are written as text; no timestamp parsing happens here.
    df.to_csv(child.tag + '.csv', index=False)

In [5]:
def read_ts_file(filename):
  """Load a CSV whose first column is a day-first timestamp and use that column as the index."""
  return pd.read_csv(filename, index_col=0, parse_dates=[0], dayfirst=True)

In [6]:
def align_timeseries(align_df, source_df, source_columns, default_vals, dest_columns=None, include_index=False):
  """Snap the rows of ``source_df`` onto the nearest timestamps of ``align_df``.

  Parameters
  ----------
  align_df : frame whose (unique) datetime index defines the target grid.
  source_df : frame with a datetime index holding the values to align.
  source_columns : list of columns from ``source_df`` to carry over.
  default_vals : list of fill values, one per source column, used for grid
      rows that receive no source row.
  dest_columns : names for the carried-over columns. If omitted (or empty) the
      source names are used. If it has one more entry than ``source_columns``,
      the first entry names the column that stores the original source
      timestamp; otherwise that column is called ``'source_ts'``.
  include_index : keep the source-timestamp column in the result.

  Returns
  -------
  (df_align, df_dup)
      ``df_align`` is a sorted copy of ``align_df`` with the destination
      columns added. ``df_dup`` has one row per source row that lost a
      collision (two source rows mapping to the same grid timestamp); it is an
      empty frame when there were no collisions.

  Raises
  ------
  ValueError
      If the lengths of ``dest_columns`` / ``default_vals`` do not match
      ``source_columns``.
  """
  # sort_index returns new frames, so the callers' inputs are left untouched
  df_align = align_df.sort_index()
  df2 = source_df.sort_index()

  source_columns = list(source_columns)
  default_vals = list(default_vals)
  if dest_columns is None or len(dest_columns) == 0:
    dest_columns = list(source_columns)
  else:
    dest_columns = list(dest_columns)
  if len(dest_columns) == len(source_columns):
    dest_columns = ['source_ts'] + dest_columns  # add column for the source timestamp
  if len(dest_columns) != len(source_columns) + 1:
    raise ValueError('dest_columns must name each source column, optionally preceded by the timestamp column')
  if len(default_vals) != len(source_columns):
    raise ValueError('default_vals must contain one value per source column')
  ts_col, value_cols = dest_columns[0], dest_columns[1:]

  # Blank destination cells. The timestamp column is created with a datetime dtype
  # (NaT) rather than float NaN so that Timestamps can be stored without a dtype change.
  ts_dtype = df2.index.dtype if isinstance(df2.index, pd.DatetimeIndex) else 'datetime64[ns]'
  df_align[ts_col] = pd.Series(pd.NaT, index=df_align.index, dtype=ts_dtype)
  for col, default in zip(value_cols, default_vals):
    df_align[col] = default

  def _store(ts, vals):
    # write one aligned source row into the grid row labelled ts
    for col, val in zip(dest_columns, vals):
      df_align.at[ts, col] = val

  dup_rows = []
  if len(df2):
    src_values = df2[source_columns]
    # Index.get_loc(..., method='nearest') was removed in pandas 2.0;
    # get_indexer resolves every source timestamp in a single call.
    positions = df_align.index.get_indexer(df2.index, method='nearest')
    for i, pos in enumerate(positions):
      ts2 = df2.index[i]
      ts1 = df_align.index[pos]
      # positional row access, so repeated source timestamps still yield one row each
      vals = [ts2] + list(src_values.iloc[i])
      ts_old = df_align.at[ts1, ts_col]
      if pd.isna(ts_old):
        _store(ts1, vals)
        continue
      # Collision: the grid row already holds a source row. Keep whichever is
      # closer to ts1 and record the other one in the duplicates frame.
      loser = df_align.loc[ts1].astype(object)
      if abs(ts1 - ts2) > abs(ts1 - ts_old):
        # new row is farther away: the grid keeps the old one, the new one is the loser
        for col, val in zip(dest_columns, vals):
          loser[col] = val
      else:
        # new row is at least as close: it replaces the old one (snapshotted in loser)
        _store(ts1, vals)
      dup_rows.append(loser)

  # Built once from the collected rows (DataFrame.append was removed in pandas 2.0);
  # each loser is its own row, so several collisions on one timestamp do not overwrite each other.
  df_dup = pd.DataFrame(dup_rows) if dup_rows else pd.DataFrame()

  if not include_index:
    df_align = df_align.drop(columns=ts_col)

  return df_align, df_dup

In [7]:
def timedf(df):
  """Build an empty frame indexed by a regular 5-minute grid spanning ``df``.

  Parameters
  ----------
  df : frame with a timestamp index; only its first and last index values are used.

  Returns
  -------
  DataFrame with no columns and an index named ``'timestamp'`` that starts at
  ``df.index[0]`` and steps by 5 minutes up to ``df.index[-1]`` plus 4 minutes.
  For an empty ``df`` an empty frame with the same index name is returned.
  """
  if len(df.index) == 0:
    return pd.DataFrame(index=pd.DatetimeIndex([], name='timestamp'))
  # The end is pushed out by 4 minutes - presumably so the slot containing the
  # last reading is still counted; TODO confirm.
  # '5min' replaces the '5T' alias, which pandas 2.2 deprecated.
  timestamp = pd.date_range(start=df.index[0], end=df.index[-1] + timedelta(minutes=4), freq='5min')
  return pd.DataFrame({'timestamp': timestamp}).set_index('timestamp')

In [8]:
def find_gaps(df, greaterthan=5, units='m'):
  """List the gaps between consecutive index timestamps of ``df``.

  Parameters
  ----------
  df : frame with a datetime index, in the order the gaps should be measured.
  greaterthan : number of time units a step must exceed to count as a gap.
  units : numpy timedelta unit code, e.g. 'm' = minutes, 'h' = hours.

  Returns
  -------
  DataFrame with columns ``From``, ``To`` and ``Duration``, one row per gap,
  sorted by ``Duration`` in descending order. When there is no gap (or fewer
  than two rows) an empty frame with those columns is returned.
  """
  columns = ['From', 'To', 'Duration']
  index = df.index
  if len(index) < 2:
    return pd.DataFrame(columns=columns)
  # Step sizes between neighbours, computed in one vectorised subtraction
  # (DataFrame.append, used previously to grow the result, was removed in pandas 2.0).
  starts, ends = index[:-1], index[1:]
  durations = ends - starts
  is_gap = durations > np.timedelta64(greaterthan, units)
  gaps_df = pd.DataFrame({
      'From': starts[is_gap],
      'To': ends[is_gap],
      'Duration': durations[is_gap],
  })
  # The columns always exist here, so sorting is safe even when no gap was found
  return gaps_df.sort_values(by='Duration', ascending=False)

In [9]:
def read_files():
  """Load the glucose, meal and bolus CSV exports and print each frame in that order."""
  # glucose: rename the columns before printing
  glucose_df = read_ts_file('glucose_level.csv').rename(columns={"ts": "timestamp", "value": "glucose"})
  print(glucose_df)
  # meal and bolus are printed as loaded
  for csv_name in ('meal.csv', 'bolus.csv'):
    print(read_ts_file(csv_name))

## Reading the training data

In [10]:
# Export the glucose_level, bolus and meal sections of the training XML to one CSV file each.
# NOTE(review): the input path is a hard-coded local path.
read_xml_data(filename='c://aadm/575-ws-training.xml', selected_items=['glucose_level','bolus','meal'])

In [11]:
# Load the exported glucose readings for training and name the reading column "glucose"
glucose_train = read_ts_file('glucose_level.csv').rename(
    columns={"ts": "timestamp", "value": "glucose"}
)

In [12]:
# Build the full 5-minute grid spanning the training readings and report
# how many grid slots exceed the number of readings actually present
time_df = timedf(glucose_train)
print('Missing intervals: ', len(time_df) - len(glucose_train) )

Missing intervals:  629


In [13]:
# Put the training readings on a regular 5-minute grid; slots without a reading become NaN.
# '5min' is used instead of the '5T' alias, which pandas 2.2 deprecated.
glucose_train = glucose_train.resample('5min').mean()
glucose_train.head()

Unnamed: 0_level_0,glucose
ts,Unnamed: 1_level_1
2021-12-07 16:25:00,101.0
2021-12-07 16:30:00,100.0
2021-12-07 16:35:00,100.0
2021-12-07 16:40:00,99.0
2021-12-07 16:45:00,98.0


## Reading the testing data

In [14]:
# Export the glucose_level, bolus and meal sections of the testing XML to one CSV file each.
# The CSV names depend only on the section tag, so this overwrites the files written for the training XML.
# NOTE(review): the input path is a hard-coded local path.
read_xml_data(filename='c://aadm/575-ws-testing.xml', selected_items=['glucose_level','bolus','meal'])

In [15]:
# Load the exported glucose readings for testing and name the reading column "glucose"
glucose_test = read_ts_file('glucose_level.csv').rename(
    columns={"ts": "timestamp", "value": "glucose"}
)
glucose_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2745 entries, 2022-01-17 00:04:00 to 2022-01-26 23:57:00
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   glucose  2745 non-null   int64
dtypes: int64(1)
memory usage: 42.9 KB


In [16]:
# Build the full 5-minute grid spanning the test readings and report
# how many grid slots exceed the number of readings actually present
time_df = timedf(glucose_test)
print('Missing intervals: ', len(time_df) - len(glucose_test) )

Missing intervals:  135


In [17]:
# Collect every gap between consecutive test readings that exceeds find_gaps' default of 5 minutes
gaps_df = find_gaps(glucose_test)
#print(gaps_df)

In [18]:
# Put the test readings on a regular 5-minute grid; slots without a reading become NaN.
# '5min' is used instead of the '5T' alias, which pandas 2.2 deprecated.
glucose_test = glucose_test.resample('5min').mean()
glucose_test.head()

Unnamed: 0_level_0,glucose
ts,Unnamed: 1_level_1
2022-01-17 00:00:00,135.0
2022-01-17 00:05:00,143.0
2022-01-17 00:10:00,152.0
2022-01-17 00:15:00,159.0
2022-01-17 00:20:00,166.0


In [19]:
#impute_mean(cgmtestmiss, 'glucose')
#cgmtestclean=cgmtestmiss
#cgm559testclean.info()

In [20]:
#cgmtestclean= cgmtestmiss.interpolate(method="polynomial",order=3)
#cgm559testclean.info()

In [21]:
# Cast the resampled glucose frames to float32 arrays and wrap them in plain DataFrames.
# Min-max scaling is currently disabled, so the values keep their original scale.
# NOTE(review): if scaling is re-enabled, fit the scaler on the training data only
# and reuse that fitted scaler for the test data.
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
datatrain = glucose_train.to_numpy().astype('float32')
datatest = glucose_test.to_numpy().astype('float32')
train_data = pd.DataFrame(datatrain)
test_data = pd.DataFrame(datatest)

In [22]:
# (rows, columns) of the train and test frames
print(train_data.shape)
print(test_data.shape)

(11611, 1)
(2880, 1)


In [23]:
# convert time series into supervised learning problem
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of lagged inputs and future outputs.

    Parameters
    ----------
    data : sequence of observations; anything ``pd.DataFrame`` accepts (flat
        list, nested list, 1-D or 2-D array, Series, DataFrame). Each resulting
        column is treated as one variable.
    n_in : number of lag steps (t-n_in ... t-1) used as inputs.
    n_out : number of steps (t, t+1, ... t+n_out-1) used as outputs.
    dropnan : drop every row that contains a NaN (the shifted edges, plus any
        window touching a missing observation).

    Returns
    -------
    DataFrame with ``(n_in + n_out) * n_vars`` columns named ``var<j>(t-<i>)``,
    ``var<j>(t)`` and ``var<j>(t+<i>)``.
    """
    df = pd.DataFrame(data)
    # Count variables on the constructed frame: this also covers 1-D arrays/Series
    # (which have no shape[1]) and nested lists (which hold more than one variable).
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        suffix = '(t)' if i == 0 else '(t+%d)' % i
        names += ['var%d%s' % (j + 1, suffix) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

In [24]:
#Past history: One hour, Prediction horizon: 30 minutes 
n_lag=12
n_seq=6

In [25]:
# Turn each series into rows of n_lag past values followed by n_seq values to predict;
# rows containing NaN (missing readings or the shifted edges) are dropped
data1=series_to_supervised(train_data, n_in=n_lag, n_out=n_seq, dropnan=True)
data2=series_to_supervised(test_data, n_in=n_lag, n_out=n_seq, dropnan=True)
# Plain numpy arrays for slicing into inputs and targets
train=data1.values
test=data2.values
#print(data1.shape)
print("test shape:",test.shape)
print("train shape:",train.shape)

test shape: (2575, 18)
train shape: (10625, 18)


In [26]:
# train is already a numpy array, so no conversion is needed
#train = asarray(train)
# split into input columns (the first n_lag) and output columns (the rest)
X, y = train[:, 0:n_lag], train[:, n_lag:]
print(y.shape)
# fit model; start_time is read again in a later cell to report the elapsed time
import time
start_time = time.time()
#model = XGBRegressor(objective='reg:squarederror', n_estimators=1000)
# MultiOutputRegressor fits one XGBRegressor per target column of y
multioutputregressor = MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror',n_estimators=1000)).fit(X, y)
# in-sample residuals: prediction minus actual on the training rows
a=multioutputregressor.predict(X)-y
a=DataFrame(a)
a.head()

(10625, 6)


Unnamed: 0,0,1,2,3,4,5
0,-0.042648,-0.114166,-0.408859,-0.184128,-0.451088,-0.640091
1,-0.101837,-0.180885,-0.203911,-0.308075,-0.663795,-0.291092
2,-0.029579,-0.22863,-0.072884,-0.475853,-0.040695,-0.115196
3,0.01432,0.159096,0.220589,0.1091,0.706985,0.310745
4,0.08551,0.007645,0.420319,0.527588,0.162544,-0.595467


In [27]:
# Same input/output column split as for the training array
testX, testy = test[:,0:n_lag], test[:, n_lag:]

In [28]:
# Root-mean-square error on the test rows, computed separately for each output column (axis=0)
b = np.sqrt(np.mean((multioutputregressor.predict(testX)-testy)**2,axis=0))
from math import sqrt
b=DataFrame(b)
#b.head()
# Average of the per-column RMSE values, shown as the cell output
b.mean()

0    11.957644
dtype: float32

In [29]:
# Wall-clock time since start_time was set just before the model fit; it also
# includes everything executed (and any idle time) between that cell and this one
print("--- %s seconds ---" % (time.time() - start_time))

--- 20.63851237297058 seconds ---
