### Script for processing Glucose multivariate data

In [476]:
import pandas as pd
import numpy as np
import xml.etree.cElementTree as et
from datetime import datetime
from pandas.core.tools.datetimes import to_datetime
from datetime import timedelta
# forecast monthly births with xgboost
from numpy import asarray
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
import matplotlib.pyplot as plt

### Function to read the xml formatted datafile

In [477]:
def read_xml_data(filename, selected_items):
  tree=et.parse(filename)
  root=tree.getroot()
  #extract selected items
  for child in root:
    if child.tag in selected_items:
      df = pd.DataFrame()
      for elem in child:      
        df1 = pd.DataFrame(elem.attrib, index=[0])
        #df = df.append(df1)
        df = pd.concat([df, df1])
      #First column is the timestamp (dayfirst)      
      #df.iloc[:,0] = pd.to_datetime(df.iloc[:,0], dayfirst=True)
      #write to csv file using the timestamp as index      
      df.to_csv(child.tag+'.csv', index=0)  

In [478]:
def read_ts_file(filename):
  #reads csv file where the first column is a timestamp and the index column
  df = pd.read_csv (filename, parse_dates=[0], dayfirst=True, index_col=0)
  return df

### Function for  alignment of two time series 

In [479]:
def align_timeseries(align_df, source_df, source_columns, default_vals, dest_columns=[], include_index=False):
  # Generates a new dataframe with source_df data aligned to align_df
  # source_columns : list of columns from source_df to be included in result
  # dest_columns = column names of source_columns, if ommited column names will be the same as in source_df
  #                if specified, the first column has to be the name of the index in the destination
  # include_index: if source index must be included in the result
  # default_vals = list of default values for source_columns
 
  df_align = align_df.copy()
  df_align = df_align.sort_index()
  df2 = source_df.copy()
  df2 = df2.sort_index()
  df_dup = pd.DataFrame()
  if (dest_columns==[]) : dest_columns = source_columns.copy()
  if (len(dest_columns) == len(source_columns)) : dest_columns = ['source_ts'] + dest_columns # add column for index
  #add blank cells in destination
  df_align[dest_columns] = [np.nan] + default_vals
  for i in range(len(df2)):
      ts2 = df2.index[i]
      ts1_loc = df_align.index.get_loc(ts2, method='nearest')
      ts1 = df_align.index[ts1_loc]
      vals = [ts2]+list(df2.loc[ts2, source_columns])
      ts_old = df_align.loc[ts1, dest_columns[0]]
      if not pd.isna(ts_old): #there are duplicates
        if (abs(ts1.value-ts2.value) > abs(ts1.value-ts_old.value)): 
          #difference is greater, do not replace        
          df_dup = df_dup.append(df_align.loc[ts1])
          df_dup.loc[ts1, dest_columns] = vals
        else: #replace
          df_dup = df_dup.append(df_align.loc[ts1])
          df_align.loc[ts1, dest_columns] = vals
        continue
      df_align.loc[ts1, dest_columns] = vals
  
  if not(include_index) : df_align = df_align.drop( columns = dest_columns[0])

  return df_align, df_dup

### Function to create an equally spaced 5 minutes time series

In [480]:
def timedf(df):
  #creates a 5 minute interval timeseries dataframe based in index of df
  # df must have a timestamp index
  #time_df: result dataframe with timestamp index
  timestamp = pd.date_range(start=df.index[0], end=df.index[-1]  + timedelta(minutes=4), freq='5T')
  time_df = pd.DataFrame({'timestamp':timestamp})
  time_df.set_index('timestamp', inplace=True)
  return time_df

### Function to find out the number of gaps and theirs length  in a time series

In [481]:
def find_gaps(df, greaterthan=5, units='m'):
  # find gaps relative to index, index must be a datetime field
  # greaterthan is the number of time units to be considered a gap
  # units 'm'=minutes, 'h'=hours
  i = 0
  gaps_df = pd.DataFrame()
  while i < len(df) - 1:
    ts = df.index[i]
    next_ts = df.index[i+1]
    duration = next_ts - ts
    if duration > np.timedelta64(greaterthan, units): 
      begin_gap = ts
      end_gap = next_ts
      gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
    i = i + 1
  gaps_df.sort_values(by=['Duration'], ascending=False, inplace=True)
  return gaps_df

In [482]:
#def read_files():
#  #read glucose
#  glucose_df = read_ts_file('glucose_level.csv')
#  glucose_df.rename(columns={"ts": "timestamp", "value": "glucose"}, inplace=True)
#  print(glucose_df)
#read meal
#  meal_df = read_ts_file('meal.csv')
#  print(meal_df)
#  #read bolus
#  meal_df = read_ts_file('bolus.csv')
# print(meal_df)

### Function to impute by the mean

In [483]:
def impute_mean(df, column, by='hour'):
  # impute with mean by hours
  # in the future by could be another mean grouping criterion
  df[by] = df.index.hour
  df[column] = df.groupby(by)[column].apply(lambda x: x.fillna(x.mean()))
  df.drop(by, axis=1, inplace=True)

### Function to convert time series into supervised learning problem

In [484]:
# convert time series into supervised learning problem
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars=1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

## Reading  Training  Data

In [485]:
#selected_items = ['glucose_level','bolus','meal']
read_xml_data(filename='c://aadm/584-ws-training.xml', selected_items=['glucose_level','meal'])

In [486]:
glucose_df = read_ts_file('glucose_level.csv')
glucose_df.rename(columns={"ts": "timestamp", "value": "glucose"}, inplace=True)
print(glucose_df)

                     glucose
ts                          
2025-05-14 00:03:00       48
2025-05-14 00:08:00       48
2025-05-14 00:13:00       53
2025-05-14 00:18:00       63
2025-05-14 00:23:00       69
...                      ...
2025-06-28 23:36:00      206
2025-06-28 23:41:00      208
2025-06-28 23:46:00      213
2025-06-28 23:51:00      224
2025-06-28 23:56:00      235

[12150 rows x 1 columns]


In [487]:
glucose_df.describe()

Unnamed: 0,glucose
count,12150.0
mean,192.484444
std,65.442789
min,40.0
25%,145.0
50%,183.0
75%,230.0
max,400.0


# Explore gaps

In [488]:
#Finding the length of the complete time series
time_df = timedf(glucose_df)
print('Missing intervals: ', len(time_df) - len(glucose_df) )

Missing intervals:  1098


In [489]:
# All gaps 
gaps_df = find_gaps(glucose_df)
#print(gaps_df)

  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'D

  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'D

In [490]:
# Find gaps greater than 5 hours
#gaps_df2 = find_gaps(glucose_df, greaterthan=5, units='h')
#print(gaps_df2)

# Impute using average per time


In [491]:
glucose_full=glucose_df.resample('5T').mean()
glucose_full.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 13248 entries, 2025-05-14 00:00:00 to 2025-06-28 23:55:00
Freq: 5T
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   glucose  12150 non-null  float64
dtypes: float64(1)
memory usage: 207.0 KB


In [492]:
#impute_mean(glucose_full, 'glucose')

In [493]:
#trainmiss[trainmiss['glucose'].isna()]
#trainmiss.info()

In [494]:
#trainmiss.loc[dfnans.index]
#trainmiss.info()

In [495]:
#glucose_full= glucose_full.interpolate(method="spline", order=3)
#glucose_full.head()
#missing_minutes =list(df2sp2[df2sp2['glucose'].isna()].index)
#missing_minutes
#cgmtrainclean.info()

In [496]:
glucose_full.describe()

Unnamed: 0,glucose
count,12150.0
mean,192.484444
std,65.442789
min,40.0
25%,145.0
50%,183.0
75%,230.0
max,400.0


In [497]:
#import matplotlib.pyplot as plt 
#import rpy2
#import rpy2.rinterface
#%load_ext rpy2.ipython
#import rpy2.robjects as robjects
#from rpy2.robjects.packages import importr
#splines = importr('splines') 
#x_train=np.arange(len(glucose_full))
#y_train=glucose_full['glucose']
#r_y = robjects.FloatVector(y_train)
#r_x = robjects.FloatVector(x_train)
#r_smooth_spline = robjects.r['smooth.spline'] #extract R function# run smoothing function
#spline1 = r_smooth_spline(x=r_x,y=r_y, spar=.01)
#ySpline=np.array(robjects.r['predict'](spline1,robjects.FloatVector(x_train)).rx2('y'))
#print(ySpline)
#plt.figure(figsize=(12,6))
#plt.scatter(x_train,y_train,c="blue")
#plt.plot(x_train,ySpline,c='red')

In [498]:
#Imputation using Kalman smoothing
from tsmoothie.smoother import *
from tsmoothie.utils_func import create_windows
#smoother = KalmanSmoother(component='level_trend', 
#                          component_noise={'level':0.1, 'trend':0.1})
smoother = KalmanSmoother(component='level_season', 
                         component_noise={'level':0.1, 'season':0.1},n_seasons=7)
#smoother = SplineSmoother(n_knots=100, spline_type='cubic_spline')
smoother.smooth(glucose_full[['glucose']].T)
glucosekf=smoother.smooth_data[0]
#print(len(glucosekf))
glucose_full['glucose']=glucosekf
#glucose_full['glucose']=ySpline
#glucose_full.head()
#smoother.smooth_data[0].mean()

In [499]:
glucose_full.describe()

Unnamed: 0,glucose
count,13248.0
mean,193.666522
std,62.53071
min,52.890448
25%,147.529595
50%,184.454927
75%,231.020403
max,401.823941


# Process meals   (training)

In [500]:
meal_df = read_ts_file('meal.csv')
print(meal_df)

                          type  carbs
ts                                   
2025-05-15 06:30:00  Breakfast     60
2025-05-15 10:18:00      Snack     20
2025-05-15 12:00:00      Lunch     40
2025-05-15 19:28:00     Dinner     60
2025-05-16 06:39:00  Breakfast     60
...                        ...    ...
2025-06-26 12:00:00      Lunch     60
2025-06-26 18:27:00     Dinner     60
2025-06-26 20:30:00      Snack     15
2025-06-27 18:30:00     Dinner     60
2025-06-27 21:00:00      Snack     15

[95 rows x 2 columns]


In [501]:
#df_aligned2, df_dups2 = align_timeseries(glucose_full, meal_df, ['carbs'], [0], dest_columns=['meal_carbs'], include_index=True)

In [502]:
#df_aligned2.describe()
#print(df_dups2). There are no duplicates

# correcting  meal values

In [503]:
def eff_carbs(aligned_df, meals_df):
  eff_carbs_df = aligned_df.copy()
  eff_carbs_df.sort_index()
  end = eff_carbs_df.index[-1]
  meals_df.sort_index()
  binc= 0.111
  bdecr= 0.028
  eff_carbs_df['eff_carbs'] = 0
  ts_inc = list(range(15,60,5)) #increasing phase
  ts_dec = list(range(60,240,5)) #decreasing phase
  for i in range(len(meals_df)):
    ts_meal = meals_df.index[i]
    ts_loc = eff_carbs_df.index.get_loc(ts_meal, method='nearest')
    ts_eff_carbs = eff_carbs_df.index[ts_loc]
    cmeal = meals_df.loc[ts_meal, 'carbs']
    #increasing phase
    #idx = ts_eff_carbs
    for j in range(len(ts_inc)):
      idx = ts_eff_carbs + timedelta(minutes=ts_inc[j])
      if idx > end:
        break
      eff_carbs_df.loc[idx, 'eff_carbs'] += (j+1)*binc*cmeal
     #decreasing phase
    for j in range(len(ts_dec)):
      idx = ts_eff_carbs + timedelta(minutes=ts_dec[j])
      if idx > end:
        break
      val = cmeal*(1-(j+1)*bdecr)
      if val<=0: 
        break
      eff_carbs_df.loc[idx, 'eff_carbs'] += val
  
  return eff_carbs_df

In [504]:
corrected_meals_df = eff_carbs(glucose_full, meal_df)

  ts_loc = eff_carbs_df.index.get_loc(ts_meal, method='nearest')


In [505]:
corrected_meals_df.describe()

Unnamed: 0,glucose,eff_carbs
count,13248.0,13248.0
mean,193.666522,8.691924
std,62.53071,16.203337
min,52.890448,0.0
25%,147.529595,0.0
50%,184.454927,0.0
75%,231.020403,9.6
max,401.823941,60.939


### Clean training data including cgm and corrected meals

In [506]:
cgmtrainclean=corrected_meals_df.loc[:,['glucose','eff_carbs']]
cgmtrainclean.info()
#cgmtrainclean.to_csv('corrected_meals.csv')

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 13248 entries, 2025-05-14 00:00:00 to 2025-06-28 23:55:00
Freq: 5T
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   glucose    13248 non-null  float64
 1   eff_carbs  13248 non-null  float64
dtypes: float64(2)
memory usage: 826.5 KB


## Reading Testing data

In [507]:
#selected_items = ['glucose_level','bolus','meal']
read_xml_data(filename='c://aadm/584-ws-testing.xml', selected_items=['glucose_level','meal'])

In [508]:
glucose_df = read_ts_file('glucose_level.csv')
glucose_df.rename(columns={"ts": "timestamp", "value": "glucose"}, inplace=True)
print(glucose_df)

                     glucose
ts                          
2025-06-29 00:01:00      243
2025-06-29 00:06:00      253
2025-06-29 00:11:00      262
2025-06-29 00:16:00      269
2025-06-29 00:21:00      269
...                      ...
2025-07-09 09:12:00       71
2025-07-09 09:17:00       74
2025-07-09 09:22:00       72
2025-07-09 09:27:00       78
2025-07-09 09:32:00       79

[2665 rows x 1 columns]


In [509]:
#Finding the length of the complete time series
time_df = timedf(glucose_df)
print('Missing intervals: ', len(time_df) - len(glucose_df) )

Missing intervals:  331


In [510]:
# All gaps 
gaps_df = find_gaps(glucose_df)
#print(gaps_df)

  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'D

In [511]:
glucose_full=glucose_df.resample('5T').mean()
glucose_full.describe()

Unnamed: 0,glucose
count,2665.0
mean,170.340713
std,60.825535
min,41.0
25%,127.0
50%,162.0
75%,207.0
max,400.0


In [512]:
#impute_mean(glucose_full, 'glucose')

In [513]:
#testmiss.loc[dfnans.index]
#testmiss.info()

In [514]:
#glucose_full= glucose_full.interpolate(method="spline",order=3)

In [515]:
#import rpy2
#import rpy2.rinterface
#%load_ext rpy2.ipython
#import rpy2.robjects as robjects
#from rpy2.robjects.packages import importr
#splines = importr('splines') 
#x_train=np.arange(len(glucose_full))
#y_train=glucose_full['glucose']
#r_y = robjects.FloatVector(y_train)
#r_x = robjects.FloatVector(x_train)
#r_smooth_spline = robjects.r['smooth.spline'] #extract R function# run smoothing function
#spline1 = r_smooth_spline(x=r_x,y=r_y, spar=.01)
#ySpline=np.array(robjects.r['predict'](spline1,robjects.FloatVector(x_train)).rx2('y'))
#print(ySpline)
#plt.figure(figsize=(12,6))
#plt.scatter(x_train,y_train,c="blue")
#plt.plot(x_train,ySpline,c='red')

In [516]:
#Imputation using Kalman filter
#smoother = KalmanSmoother(component='level_trend', 
#                          component_noise={'level':0.1, 'trend':0.1})
smoother = KalmanSmoother(component='level_season', 
                          component_noise={'level':0.1, 'season':0.1},n_seasons=7)
#smoother = SplineSmoother(n_knots=100, spline_type='cubic_spline')
smoother.smooth(glucose_full[['glucose']].T)
glucosekf=smoother.smooth_data[0]
glucose_full['glucose']=glucosekf
#glucose_full['glucose']=ySpline
#smoother.smooth_data[0].mean()
glucose_full.describe()

Unnamed: 0,glucose
count,2995.0
mean,173.125089
std,58.826487
min,53.872246
25%,130.698708
50%,164.180142
75%,209.974661
max,390.599955


In [517]:
meal_df = read_ts_file('meal.csv')
#print(meal_df)
#df_aligned2, df_dups2 = align_timeseries(glucose_full, meal_df, ['carbs'], [0], dest_columns=['meal_carbs'], include_index=True)

### Cleaning testing data including cgm and corrected meals

In [518]:
#print(df_aligned2)
corrected_meals_df = eff_carbs(glucose_full, meal_df)
cgmtestclean=corrected_meals_df.loc[:,['glucose','eff_carbs']]
cgmtestclean.describe()
#cgmtestclean.to_csv('corrected_meals.csv')

  ts_loc = eff_carbs_df.index.get_loc(ts_meal, method='nearest')


Unnamed: 0,glucose,eff_carbs
count,2995.0,2995.0
mean,173.125089,9.703339
std,58.826487,16.864175
min,53.872246,0.0
25%,130.698708,0.0
50%,164.180142,0.0
75%,209.974661,14.64
max,390.599955,59.94


## Data Normalization

In [519]:
#Normalizing the data
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
datatrain = np.array(cgmtrainclean.values.astype('float32'))
datatest = np.array(cgmtestclean.values.astype('float32'))
#print(datatest)
#print(testmax[0])
#print(testmin[0])
values = cgmtrainclean.values
values = values.astype('float32')
# normalize features

In [520]:
#Normalizing Testing
values1 = cgmtestclean.values
values1 = values1.astype('float32')


In [521]:
# Transforming the training time series data into a supervised data
train = series_to_supervised(values, 12, 6)
train.head()

Unnamed: 0,var1(t-12),var2(t-12),var1(t-11),var2(t-11),var1(t-10),var2(t-10),var1(t-9),var2(t-9),var1(t-8),var2(t-8),...,var1(t+1),var2(t+1),var1(t+2),var2(t+2),var1(t+3),var2(t+3),var1(t+4),var2(t+4),var1(t+5),var2(t+5)
12,54.608406,0.0,53.356991,0.0,57.555222,0.0,61.376179,0.0,63.596989,0.0,...,70.683212,0.0,74.369041,0.0,78.734406,0.0,93.729004,0.0,101.416901,0.0
13,53.356991,0.0,57.555222,0.0,61.376179,0.0,63.596989,0.0,64.827087,0.0,...,74.369041,0.0,78.734406,0.0,93.729004,0.0,101.416901,0.0,108.942863,0.0
14,57.555222,0.0,61.376179,0.0,63.596989,0.0,64.827087,0.0,67.082718,0.0,...,78.734406,0.0,93.729004,0.0,101.416901,0.0,108.942863,0.0,114.584061,0.0
15,61.376179,0.0,63.596989,0.0,64.827087,0.0,67.082718,0.0,61.966164,0.0,...,93.729004,0.0,101.416901,0.0,108.942863,0.0,114.584061,0.0,120.729813,0.0
16,63.596989,0.0,64.827087,0.0,67.082718,0.0,61.966164,0.0,57.502823,0.0,...,101.416901,0.0,108.942863,0.0,114.584061,0.0,120.729813,0.0,122.613068,0.0


In [522]:
# Transforming the training time series data into a supervised data
#train = series_to_supervised(scaled, 12, 6)
#print(train.shape)
# drop columns we don't want to predict
train.drop(train.columns[[25,27,29,31,33,35]], axis=1, inplace=True)
train.head()
#n_features=1
#X1=train.iloc[:,0:24:2]
#print(X1.shape)
#X1=np.array(X1).reshape(X1.shape[0], X1.shape[1], n_features)
#X2=train.iloc[:,1:24:2]
#X2=np.array(X2).reshape(X2.shape[0], X2.shape[1], n_features)3#

Unnamed: 0,var1(t-12),var2(t-12),var1(t-11),var2(t-11),var1(t-10),var2(t-10),var1(t-9),var2(t-9),var1(t-8),var2(t-8),...,var1(t-2),var2(t-2),var1(t-1),var2(t-1),var1(t),var1(t+1),var1(t+2),var1(t+3),var1(t+4),var1(t+5)
12,54.608406,0.0,53.356991,0.0,57.555222,0.0,61.376179,0.0,63.596989,0.0,...,59.861172,0.0,61.715797,0.0,64.342842,70.683212,74.369041,78.734406,93.729004,101.416901
13,53.356991,0.0,57.555222,0.0,61.376179,0.0,63.596989,0.0,64.827087,0.0,...,61.715797,0.0,64.342842,0.0,70.683212,74.369041,78.734406,93.729004,101.416901,108.942863
14,57.555222,0.0,61.376179,0.0,63.596989,0.0,64.827087,0.0,67.082718,0.0,...,64.342842,0.0,70.683212,0.0,74.369041,78.734406,93.729004,101.416901,108.942863,114.584061
15,61.376179,0.0,63.596989,0.0,64.827087,0.0,67.082718,0.0,61.966164,0.0,...,70.683212,0.0,74.369041,0.0,78.734406,93.729004,101.416901,108.942863,114.584061,120.729813
16,63.596989,0.0,64.827087,0.0,67.082718,0.0,61.966164,0.0,57.502823,0.0,...,74.369041,0.0,78.734406,0.0,93.729004,101.416901,108.942863,114.584061,120.729813,122.613068


In [523]:
# Transforming the testing time series data into a supervised data
test = series_to_supervised(values1, 12, 6)
#print(test.shape)
# drop columns we don't want to predict
n_features=1
test.drop(test.columns[[25,27,29,31,33,35]], axis=1, inplace=True)
#Xtest1=test.iloc[:,0:24:2]
#print(Xtest1)
#Xtest1=np.array(Xtest1).reshape(Xtest1.shape[0], Xtest1.shape[1], n_features)
#Xtest2=test.iloc[:,1:24:2]
#Xtest2=np.array(Xtest2).reshape(Xtest2.shape[0], Xtest2.shape[1], n_features)
test=np.array(test)

In [524]:
#Past history: One hour, Prediction horizon: 30 minutes 
n_lag=24
n_seq=6
epochs=50

In [525]:
# fit model
import time
start_time = time.time()
#model.fit([X1, X2], y, epochs=400, verbose=0)
# transform list into array
train = asarray(train)
# split into input and output columns
X, y = train[:, 0:n_lag], train[:, n_lag:]
print(X.shape)
# fit model
import time
start_time = time.time()
#model = XGBRegressor(objective='reg:squarederror', n_estimators=1000)
multioutputregressor = MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror',n_estimators=1000)).fit(X, y)
# predicting
a=multioutputregressor.predict(X)-y
a=DataFrame(a)
a.head()

(13231, 24)


Unnamed: 0,0,1,2,3,4,5
0,0.009811,-0.014046,-0.008057,-0.007812,-0.032692,-0.053207
1,-0.003502,-0.042351,-0.026894,-0.246574,-0.059395,-0.415283
2,-0.038536,-0.05291,-0.027222,-0.114098,-0.397797,0.001373
3,0.013855,-0.038597,-0.075539,-0.008461,-0.052246,-0.19574
4,-0.14489,-0.195404,-0.25209,-0.435394,-0.022743,-0.103981


In [526]:
testX, testy = test[:,0:n_lag], test[:, n_lag:]

In [527]:
b = np.sqrt(np.mean((multioutputregressor.predict(testX)-testy)**2,axis=0))
from math import sqrt
b=DataFrame(b)
b.head()
b.mean()

0    7.640965
dtype: float32

In [528]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 87.71026611328125 seconds ---
