### Script for processing Glucose multivariate data

In [1]:
import pandas as pd
import numpy as np
import xml.etree.cElementTree as et
from datetime import datetime
from pandas.core.tools.datetimes import to_datetime
from datetime import timedelta

### Function to read the xml formatted datafile

In [2]:
def read_xml_data(filename, selected_items):
  tree=et.parse(filename)
  root=tree.getroot()
  #extract selected items
  for child in root:
    if child.tag in selected_items:
      df = pd.DataFrame()
      for elem in child:      
        df1 = pd.DataFrame(elem.attrib, index=[0])
        #df = df.append(df1)
        df = pd.concat([df, df1])
      #First column is the timestamp (dayfirst)      
      #df.iloc[:,0] = pd.to_datetime(df.iloc[:,0], dayfirst=True)
      #write to csv file using the timestamp as index      
      df.to_csv(child.tag+'.csv', index=0)  

In [3]:
def read_ts_file(filename):
  #reads csv file where the first column is a timestamp and the index column
  df = pd.read_csv (filename, parse_dates=[0], dayfirst=True, index_col=0)
  return df

### Function for  alignment of two time series 

In [4]:
def align_timeseries(align_df, source_df, source_columns, default_vals, dest_columns=[], include_index=False):
  # Generates a new dataframe with source_df data aligned to align_df
  # source_columns : list of columns from source_df to be included in result
  # dest_columns = column names of source_columns, if ommited column names will be the same as in source_df
  #                if specified, the first column has to be the name of the index in the destination
  # include_index: if source index must be included in the result
  # default_vals = list of default values for source_columns
 
  df_align = align_df.copy()
  df_align = df_align.sort_index()
  df2 = source_df.copy()
  df2 = df2.sort_index()
  df_dup = pd.DataFrame()
  if (dest_columns==[]) : dest_columns = source_columns.copy()
  if (len(dest_columns) == len(source_columns)) : dest_columns = ['source_ts'] + dest_columns # add column for index
  #add blank cells in destination
  df_align[dest_columns] = [np.nan] + default_vals
  for i in range(len(df2)):
      ts2 = df2.index[i]
      ts1_loc = df_align.index.get_loc(ts2, method='nearest')
      ts1 = df_align.index[ts1_loc]
      vals = [ts2]+list(df2.loc[ts2, source_columns])
      ts_old = df_align.loc[ts1, dest_columns[0]]
      if not pd.isna(ts_old): #there are duplicates
        if (abs(ts1.value-ts2.value) > abs(ts1.value-ts_old.value)): 
          #difference is greater, do not replace        
          df_dup = df_dup.append(df_align.loc[ts1])
          df_dup.loc[ts1, dest_columns] = vals
        else: #replace
          df_dup = df_dup.append(df_align.loc[ts1])
          df_align.loc[ts1, dest_columns] = vals
        continue
      df_align.loc[ts1, dest_columns] = vals
  
  if not(include_index) : df_align = df_align.drop( columns = dest_columns[0])

  return df_align, df_dup

### Function to create an equally spaced 5 minutes time series

In [5]:
def timedf(df):
  #creates a 5 minute interval timeseries dataframe based in index of df
  # df must have a timestamp index
  #time_df: result dataframe with timestamp index
  timestamp = pd.date_range(start=df.index[0], end=df.index[-1]  + timedelta(minutes=4), freq='5T')
  time_df = pd.DataFrame({'timestamp':timestamp})
  time_df.set_index('timestamp', inplace=True)
  return time_df

### Function to find out the number of gaps and theirs length  in a time series

In [6]:
def find_gaps(df, greaterthan=5, units='m'):
  # find gaps relative to index, index must be a datetime field
  # greaterthan is the number of time units to be considered a gap
  # units 'm'=minutes, 'h'=hours
  i = 0
  gaps_df = pd.DataFrame()
  while i < len(df) - 1:
    ts = df.index[i]
    next_ts = df.index[i+1]
    duration = next_ts - ts
    if duration > np.timedelta64(greaterthan, units): 
      begin_gap = ts
      end_gap = next_ts
      gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
    i = i + 1
  gaps_df.sort_values(by=['Duration'], ascending=False, inplace=True)
  return gaps_df

In [7]:
#def read_files():
#  #read glucose
#  glucose_df = read_ts_file('glucose_level.csv')
#  glucose_df.rename(columns={"ts": "timestamp", "value": "glucose"}, inplace=True)
#  print(glucose_df)
#read meal
#  meal_df = read_ts_file('meal.csv')
#  print(meal_df)
#  #read bolus
#  meal_df = read_ts_file('bolus.csv')
# print(meal_df)

### Function to impute by the mean

In [8]:
def impute_mean(df, column, by='hour'):
  # impute with mean by hours
  # in the future by could be another mean grouping criterion
  df[by] = df.index.hour
  df[column] = df.groupby(by)[column].apply(lambda x: x.fillna(x.mean()))
  df.drop(by, axis=1, inplace=True)

### Function to convert time series into supervised learning problem

In [9]:
# convert time series into supervised learning problem
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars=1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

## Reading  Training  Data

In [10]:
#selected_items = ['glucose_level','bolus','meal']
read_xml_data(filename='c://aadm/570-ws-training.xml', selected_items=['glucose_level','meal'])

In [11]:
glucose_df = read_ts_file('glucose_level.csv')
glucose_df.rename(columns={"ts": "timestamp", "value": "glucose"}, inplace=True)
print(glucose_df)

                     glucose
ts                          
2021-12-07 16:29:00      101
2021-12-07 16:34:00      100
2021-12-07 16:39:00      100
2021-12-07 16:44:00       99
2021-12-07 16:49:00       98
...                      ...
2022-01-16 23:39:00      113
2022-01-16 23:44:00      118
2022-01-16 23:49:00      120
2022-01-16 23:54:00      123
2022-01-16 23:59:00      128

[10982 rows x 1 columns]


In [12]:
glucose_df.describe()

Unnamed: 0,glucose
count,10982.0
mean,187.492989
std,62.328928
min,46.0
25%,142.0
50%,189.0
75%,232.0
max,377.0


# Explore gaps

In [13]:
#Finding the length of the complete time series
time_df = timedf(glucose_df)
print('Missing intervals: ', len(time_df) - len(glucose_df) )

Missing intervals:  629


In [14]:
# All gaps 
gaps_df = find_gaps(glucose_df)
#print(gaps_df)

  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'D

In [15]:
# Find gaps greater than 5 hours
#gaps_df2 = find_gaps(glucose_df, greaterthan=5, units='h')
#print(gaps_df2)

# Impute using average per time


In [16]:
glucose_full=glucose_df.resample('5T').mean()
glucose_full.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 11611 entries, 2021-12-07 16:25:00 to 2022-01-16 23:55:00
Freq: 5T
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   glucose  10982 non-null  float64
dtypes: float64(1)
memory usage: 181.4 KB


In [17]:
#impute_mean(glucose_full, 'glucose')

In [18]:
#trainmiss[trainmiss['glucose'].isna()]
#trainmiss.info()

In [19]:
#trainmiss.loc[dfnans.index]
#trainmiss.info()

In [20]:
glucose_full= glucose_full.interpolate(method="polynomial",order=3)
#glucose_full.head()
#missing_minutes =list(df2sp2[df2sp2['glucose'].isna()].index)
#missing_minutes
#cgmtrainclean.info()

In [21]:
glucose_full.describe()

Unnamed: 0,glucose
count,11611.0
mean,190.536748
std,63.216232
min,46.0
25%,145.5
50%,192.0
75%,236.0
max,377.0


In [22]:
#import matplotlib.pyplot as plt 
#import rpy2
#import rpy2.rinterface
#%load_ext rpy2.ipython
#import rpy2.robjects as robjects
#from rpy2.robjects.packages import importr
#splines = importr('splines') 
#x_train=np.arange(len(glucose_full))
#y_train=glucose_full['glucose']
#r_y = robjects.FloatVector(y_train)
#r_x = robjects.FloatVector(x_train)
#r_smooth_spline = robjects.r['smooth.spline'] #extract R function# run smoothing function
#spline1 = r_smooth_spline(x=r_x,y=r_y, spar=.01)
#ySpline=np.array(robjects.r['predict'](spline1,robjects.FloatVector(x_train)).rx2('y'))
#print(ySpline)
#plt.figure(figsize=(12,6))
#plt.scatter(x_train,y_train,c="blue")
#plt.plot(x_train,ySpline,c='red')

In [23]:
#Imputation using Kalman smoothing
from tsmoothie.smoother import *
from tsmoothie.utils_func import create_windows
#smoother = KalmanSmoother(component='level_trend', 
#                          component_noise={'level':0.1, 'trend':0.1})
#smoother = KalmanSmoother(component='level_season', 
#                          component_noise={'level':0.1, 'season':0.1},n_seasons=7)
#smoother = SplineSmoother(n_knots=100, spline_type='cubic_spline')
#smoother.smooth(glucose_full[['glucose']].T)
#glucosekf=smoother.smooth_data[0]
#print(len(glucosekf))
#glucose_full['glucose']=glucosekf
#glucose_full['glucose']=ySpline
#glucose_full.head()
#smoother.smooth_data[0].mean()

In [24]:
glucose_full.describe()

Unnamed: 0,glucose
count,11611.0
mean,190.536748
std,63.216232
min,46.0
25%,145.5
50%,192.0
75%,236.0
max,377.0


# Process meals   (training)

In [25]:
meal_df = read_ts_file('meal.csv')
print(meal_df)

                          type  carbs
ts                                   
2021-12-07 18:28:00     Dinner     65
2021-12-07 21:33:00      Snack      5
2021-12-08 06:14:00  Breakfast     97
2021-12-08 12:04:00      Lunch    120
2021-12-08 16:50:00      Snack     38
...                        ...    ...
2022-01-15 21:03:00     Dinner    130
2022-01-16 08:28:00  Breakfast     75
2022-01-16 13:46:00      Snack     90
2022-01-16 18:47:00      Snack     60
2022-01-16 20:14:00     Dinner    150

[136 rows x 2 columns]


In [26]:
#df_aligned2, df_dups2 = align_timeseries(glucose_full, meal_df, ['carbs'], [0], dest_columns=['meal_carbs'], include_index=True)

In [27]:
#df_aligned2.describe()
#print(df_dups2). There are no duplicates

# correcting  meal values

In [28]:
def eff_carbs(aligned_df, meals_df):
  eff_carbs_df = aligned_df.copy()
  eff_carbs_df.sort_index()
  end = eff_carbs_df.index[-1]
  meals_df.sort_index()
  binc= 0.111
  bdecr= 0.028
  eff_carbs_df['eff_carbs'] = 0
  ts_inc = list(range(15,60,5)) #increasing phase
  ts_dec = list(range(60,240,5)) #decreasing phase
  for i in range(len(meals_df)):
    ts_meal = meals_df.index[i]
    ts_loc = eff_carbs_df.index.get_loc(ts_meal, method='nearest')
    ts_eff_carbs = eff_carbs_df.index[ts_loc]
    cmeal = meals_df.loc[ts_meal, 'carbs']
    #increasing phase
    #idx = ts_eff_carbs
    for j in range(len(ts_inc)):
      idx = ts_eff_carbs + timedelta(minutes=ts_inc[j])
      if idx > end:
        break
      eff_carbs_df.loc[idx, 'eff_carbs'] += (j+1)*binc*cmeal
     #decreasing phase
    for j in range(len(ts_dec)):
      idx = ts_eff_carbs + timedelta(minutes=ts_dec[j])
      if idx > end:
        break
      val = cmeal*(1-(j+1)*bdecr)
      if val<=0: 
        break
      eff_carbs_df.loc[idx, 'eff_carbs'] += val
  
  return eff_carbs_df

In [29]:
corrected_meals_df = eff_carbs(glucose_full, meal_df)

  ts_loc = eff_carbs_df.index.get_loc(ts_meal, method='nearest')


In [30]:
corrected_meals_df.describe()

Unnamed: 0,glucose,eff_carbs
count,11611.0,11611.0
mean,190.536748,26.867094
std,63.216232,38.320737
min,46.0,0.0
25%,145.5,0.0
50%,192.0,0.0
75%,236.0,46.8
max,377.0,199.8


### Clean training data including cgm and corrected meals

In [31]:
cgmtrainclean=corrected_meals_df.loc[:,['glucose','eff_carbs']]
cgmtrainclean.info()
#cgmtrainclean.to_csv('corrected_meals.csv')

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 11611 entries, 2021-12-07 16:25:00 to 2022-01-16 23:55:00
Freq: 5T
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   glucose    11611 non-null  float64
 1   eff_carbs  11611 non-null  float64
dtypes: float64(2)
memory usage: 530.2 KB


## Reading Testing data

In [32]:
#selected_items = ['glucose_level','bolus','meal']
read_xml_data(filename='c://aadm/570-ws-testing.xml', selected_items=['glucose_level','meal'])

In [33]:
glucose_df = read_ts_file('glucose_level.csv')
glucose_df.rename(columns={"ts": "timestamp", "value": "glucose"}, inplace=True)
print(glucose_df)

                     glucose
ts                          
2022-01-17 00:04:00      135
2022-01-17 00:09:00      143
2022-01-17 00:14:00      152
2022-01-17 00:19:00      159
2022-01-17 00:24:00      166
...                      ...
2022-01-26 23:37:00      204
2022-01-26 23:42:00      208
2022-01-26 23:47:00      212
2022-01-26 23:52:00      218
2022-01-26 23:57:00      224

[2745 rows x 1 columns]


In [34]:
#Finding the length of the complete time series
time_df = timedf(glucose_df)
print('Missing intervals: ', len(time_df) - len(glucose_df) )

Missing intervals:  135


In [35]:
# All gaps 
gaps_df = find_gaps(glucose_df)
#print(gaps_df)

  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)
  gaps_df = gaps_df.append({'From': begin_gap, 'To': end_gap, 'Duration': duration}, ignore_index=True)


In [36]:
glucose_full=glucose_df.resample('5T').mean()
glucose_full.describe()

Unnamed: 0,glucose
count,2745.0
mean,214.764299
std,66.396614
min,60.0
25%,167.0
50%,221.0
75%,264.0
max,388.0


In [37]:
#impute_mean(glucose_full, 'glucose')

In [38]:
#testmiss.loc[dfnans.index]
#testmiss.info()

In [39]:
glucose_full= glucose_full.interpolate(method="polynomial",order=3)

In [40]:
#import rpy2
#import rpy2.rinterface
#%load_ext rpy2.ipython
#import rpy2.robjects as robjects
#from rpy2.robjects.packages import importr
#splines = importr('splines') 
#x_train=np.arange(len(glucose_full))
#y_train=glucose_full['glucose']
#r_y = robjects.FloatVector(y_train)
#r_x = robjects.FloatVector(x_train)
#r_smooth_spline = robjects.r['smooth.spline'] #extract R function# run smoothing function
#spline1 = r_smooth_spline(x=r_x,y=r_y, spar=.01)
#ySpline=np.array(robjects.r['predict'](spline1,robjects.FloatVector(x_train)).rx2('y'))
#print(ySpline)
#plt.figure(figsize=(12,6))
#plt.scatter(x_train,y_train,c="blue")
#plt.plot(x_train,ySpline,c='red')

In [41]:
#Imputation using Kalman filter
#smoother = KalmanSmoother(component='level_trend', 
#                          component_noise={'level':0.1, 'trend':0.1})
#smoother = KalmanSmoother(component='level_season', 
#                          component_noise={'level':0.1, 'season':0.1},n_seasons=7)
#smoother = SplineSmoother(n_knots=100, spline_type='cubic_spline')
#smoother.smooth(glucose_full[['glucose']].T)
#glucosekf=smoother.smooth_data[0]
#glucose_full['glucose']=glucosekf
#glucose_full['glucose']=ySpline
#smoother.smooth_data[0].mean()
glucose_full.describe()

Unnamed: 0,glucose
count,2880.0
mean,217.25525
std,67.367325
min,60.0
25%,168.0
50%,222.0
75%,266.0
max,388.0


In [42]:
meal_df = read_ts_file('meal.csv')
#print(meal_df)
#df_aligned2, df_dups2 = align_timeseries(glucose_full, meal_df, ['carbs'], [0], dest_columns=['meal_carbs'], include_index=True)

### Cleaning testing data including cgm and corrected meals

In [43]:
#print(df_aligned2)
corrected_meals_df = eff_carbs(glucose_full, meal_df)
cgmtestclean=corrected_meals_df.loc[:,['glucose','eff_carbs']]
cgmtestclean.describe()
#cgmtestclean.to_csv('corrected_meals.csv')

  ts_loc = eff_carbs_df.index.get_loc(ts_meal, method='nearest')


Unnamed: 0,glucose,eff_carbs
count,2880.0,2880.0
mean,217.25525,29.744569
std,67.367325,42.17335
min,60.0,0.0
25%,168.0,0.0
50%,222.0,0.0
75%,266.0,53.16
max,388.0,174.825


## Data Normalization

In [45]:
#Normalizing the data
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
datatrain = np.array(cgmtrainclean.values.astype('float32'))
datatest = np.array(cgmtestclean.values.astype('float32'))
#print(datatest)
testmax=datatest.max(axis=0)[0]
testmin=datatest.min(axis=0)[0]
#print(testmax[0])
#print(testmin[0])
values = cgmtrainclean.values
values = values.astype('float32')
# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

In [46]:
#Normalizing Testing
values1 = cgmtestclean.values
values1 = values1.astype('float32')
# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled1 = scaler.fit_transform(values1)

In [47]:
# Transforming the training time series data into a supervised data
train = series_to_supervised(scaled, 12, 6)
train.head()

Unnamed: 0,var1(t-12),var2(t-12),var1(t-11),var2(t-11),var1(t-10),var2(t-10),var1(t-9),var2(t-9),var1(t-8),var2(t-8),...,var1(t+1),var2(t+1),var1(t+2),var2(t+2),var1(t+3),var2(t+3),var1(t+4),var2(t+4),var1(t+5),var2(t+5)
12,0.390659,0.0,0.385474,0.0,0.395845,0.0,0.409673,0.0,0.423501,0.0,...,0.430416,0.0,0.440787,0.0,0.435601,0.0,0.42523,0.0,0.414859,0.0
13,0.385474,0.0,0.395845,0.0,0.409673,0.0,0.423501,0.0,0.435601,0.0,...,0.440787,0.0,0.435601,0.0,0.42523,0.0,0.414859,0.0,0.407945,0.0
14,0.395845,0.0,0.409673,0.0,0.423501,0.0,0.435601,0.0,0.449429,0.0,...,0.435601,0.0,0.42523,0.0,0.414859,0.0,0.407945,0.0,0.404488,0.0
15,0.409673,0.0,0.423501,0.0,0.435601,0.0,0.449429,0.0,0.461529,0.0,...,0.42523,0.0,0.414859,0.0,0.407945,0.0,0.404488,0.0,0.394116,0.0
16,0.423501,0.0,0.435601,0.0,0.449429,0.0,0.461529,0.0,0.458072,0.0,...,0.414859,0.0,0.407945,0.0,0.404488,0.0,0.394116,0.0,0.369917,0.0


In [48]:
# Transforming the training time series data into a supervised data
#train = series_to_supervised(scaled, 12, 6)
#print(train.shape)
# drop columns we don't want to predict
train.drop(train.columns[[25,27,29,31,33,35]], axis=1, inplace=True)
train.head()
n_features=1
X1=train.iloc[:,0:24:2]
print(X1.shape)
X1=np.array(X1).reshape(X1.shape[0], X1.shape[1], n_features)
X2=train.iloc[:,1:24:2]
X2=np.array(X2).reshape(X2.shape[0], X2.shape[1], n_features)

(12064, 12)


In [49]:
# Transforming the testing time series data into a supervised data
test = series_to_supervised(scaled1, 12, 6)
#print(test.shape)
# drop columns we don't want to predict
test.drop(test.columns[[25,27,29,31,33,35]], axis=1, inplace=True)
Xtest1=test.iloc[:,0:24:2]
print(Xtest1)
Xtest1=np.array(Xtest1).reshape(Xtest1.shape[0], Xtest1.shape[1], n_features)
Xtest2=test.iloc[:,1:24:2]
Xtest2=np.array(Xtest2).reshape(Xtest2.shape[0], Xtest2.shape[1], n_features)
test=np.array(test)

      var1(t-12)  var1(t-11)  var1(t-10)  var1(t-9)  var1(t-8)  var1(t-7)  \
12      0.437786    0.447962    0.458138   0.468314   0.478490   0.488665   
13      0.447962    0.458138    0.468314   0.478490   0.488665   0.501385   
14      0.458138    0.468314    0.478490   0.488665   0.501385   0.514105   
15      0.468314    0.478490    0.488665   0.501385   0.514105   0.519193   
16      0.478490    0.488665    0.501385   0.514105   0.519193   0.519193   
...          ...         ...         ...        ...        ...        ...   
2866    0.435242    0.435242    0.437786   0.442874   0.445418   0.450506   
2867    0.435242    0.437786    0.442874   0.445418   0.450506   0.458138   
2868    0.437786    0.442874    0.445418   0.450506   0.458138   0.460682   
2869    0.442874    0.445418    0.450506   0.458138   0.460682   0.460682   
2870    0.445418    0.450506    0.458138   0.460682   0.460682   0.458138   

      var1(t-6)  var1(t-5)  var1(t-4)  var1(t-3)  var1(t-2)  var1(t-1)  
12

In [50]:
#Past history: One hour, Prediction horizon: 30 minutes 
n_lag=12
n_seq=6
epochs=50

In [51]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers import TimeDistributed
from keras.layers.convolutional import MaxPooling1D
import datetime   
import math

In [52]:
n_features=1
timesteps=n_lag
n_seq=6
#X= np.array(train.iloc[:, 0:24])
#X1=train.iloc[:,0:24:2].reshape(train.shape[0], train.shape[1], n_features)
#X2=train.iloc[:,1:24:2].reshape(train.shape[0], train.shape[1], n_features)
y= np.array(train.iloc[:, 24:36])
print(y)
print(y.shape)
rows_x = len(y)
print("rows-x=",rows_x)
#X = np.reshape(X, (rows_x, timesteps, 1)) 
#print(X.shape)

[[0.4753573  0.43041557 0.44078672 0.43560112 0.42522997 0.4148588 ]
 [0.43041557 0.44078672 0.43560112 0.42522997 0.4148588  0.40794468]
 [0.44078672 0.43560112 0.42522997 0.4148588  0.40794468 0.4044876 ]
 ...
 [0.48399997 0.4857285  0.4891855  0.49437112 0.49955672 0.5064708 ]
 [0.4857285  0.4891855  0.49437112 0.49955672 0.5064708  0.51338494]
 [0.4891855  0.49437112 0.49955672 0.5064708  0.51338494 0.5202991 ]]
(12064, 6)
rows-x= 12064


In [53]:
# first input model
from keras.layers import Input
from keras.models import Model
from keras.layers import concatenate
visible1 = Input(shape=(timesteps, n_features))
cnn1 = Conv1D(filters=64, kernel_size=2, activation='relu')(visible1)
cnn1 = MaxPooling1D(pool_size=2)(cnn1)
cnn1 = Flatten()(cnn1)
# second input model
visible2 = Input(shape=(timesteps, n_features))
cnn2 = Conv1D(filters=64, kernel_size=2, activation='relu')(visible2)
cnn2 = MaxPooling1D(pool_size=2)(cnn2)
cnn2 = Flatten()(cnn2)
# merge input models
merge = concatenate([cnn1, cnn2])
dense = Dense(100, activation='relu')(merge)
output = Dense(n_seq)(dense)
model = Model(inputs=[visible1, visible2], outputs=output)
model.summary()
model.compile(optimizer='adam', loss='mse')

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 12, 1)]      0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 12, 1)]      0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 11, 64)       192         ['input_1[0][0]']                
                                                                                                  
 conv1d_1 (Conv1D)              (None, 11, 64)       192         ['input_2[0][0]']                
                                                                                              

In [54]:
# fit model
import time
start_time = time.time()
model.fit([X1, X2], y, epochs=400, verbose=0)

<keras.callbacks.History at 0x2871757e7f0>

In [55]:
a=model.predict([Xtest1,Xtest2],batch_size=n_seq)
print(a)

[[0.5177825  0.52166766 0.5253947  0.52844757 0.53191566 0.5355246 ]
 [0.5307024  0.53596574 0.54068846 0.54458135 0.5485003  0.5522363 ]
 [0.57233286 0.58438873 0.59471446 0.60362583 0.61164975 0.61788017]
 ...
 [0.4503875  0.45160803 0.45275477 0.45347363 0.45542794 0.4571422 ]
 [0.4545902  0.45652965 0.4582899  0.45964447 0.46209952 0.46433738]
 [0.45921302 0.46181843 0.46411154 0.46600047 0.4691932  0.47201678]]


In [56]:
print(testmax)
print(testmin)

400.0
6.910993


In [57]:
forec1=a*(testmax-testmin)+testmin
print(forec1)

[[210.44562 211.97282 213.43788 214.63794 216.0012  217.41985]
 [215.52429 217.59325 219.44969 220.97995 222.52045 223.98903]
 [231.88876 236.62779 240.68672 244.18968 247.3438  249.79291]
 ...
 [183.95338 184.43315 184.88393 185.1665  185.93472 186.60858]
 [185.60541 186.36778 187.05972 187.5922  188.55725 189.43692]
 [187.42259 188.44675 189.34814 190.09067 191.34569 192.45561]]


In [58]:
test=pd.DataFrame(test)
testy=test.loc[:,24:30]
actual=testy*(testmax-testmin)+testmin
print(actual)

              24          25          26          27          28          29
0     225.000000  233.000015  237.000000  241.000000  246.000015  248.000000
1     233.000015  237.000000  241.000000  246.000015  248.000000  250.000000
2     237.000000  241.000000  246.000015  248.000000  250.000000  252.000000
3     241.000000  246.000015  248.000000  250.000000  252.000000  256.000000
4     246.000015  248.000000  250.000000  252.000000  256.000000  259.000000
...          ...         ...         ...         ...         ...         ...
2854  186.000000  187.000000  188.000000  187.000000  186.000000  185.000000
2855  187.000000  188.000000  187.000000  186.000000  185.000000  182.999985
2856  188.000000  187.000000  186.000000  185.000000  182.999985  182.000000
2857  187.000000  186.000000  185.000000  182.999985  182.000000  180.000000
2858  186.000000  185.000000  182.999985  182.000000  180.000000  177.000000

[2859 rows x 6 columns]


In [59]:
forec1=np.array(forec1)
actual=np.array(actual)
diff=actual-forec1
print(diff.shape)
#np.sqrt(np.mean((diff)**2,axis=0))
np.sqrt(np.mean((diff)**2,axis=0)).mean()

(2859, 6)


18.084686