In [1]:
import os

In [2]:
import json

In [3]:
import numpy as np

In [4]:
import pandas as pd

In [5]:
import tensorflow as tf

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [7]:
import joblib

In [8]:
# folder that holds the electricity datasets; it defaults to the original
# location and can be overridden with the CBIDMLTSF_DATA_FOLDER environment
# variable, so the notebook can run on another machine without editing this cell
data_folder = os.environ.get(
    'CBIDMLTSF_DATA_FOLDER',
    '/home/developer/gcp/cbidmltsf/datasets/electricity',
)

In [9]:
# periods (length of one full cycle) used by the sin/cos positional encodings
hours_in_day, days_in_week = 24, 7
days_in_month, days_in_year = 30, 365
# NOTE(review): with a period of 30, a day_of_month of 31 gets the same sin/cos
# encoding as day 1 - confirm this is intended
# week_of_year and month_of_year become redundant once day_of_year is encoded,
# so their periods are deliberately not evaluated:
# weeks_in_year = 52
# months_in_year = 12

In [10]:
# Split of the time series into seen (train, eval) and unseen (test) data,
# following the setup used in academic papers:
#
#   seen data   (243 days): '2014-01-01 00:00:00' to '2014-08-31 23:00:00', 243*24 = 5832 readings
#   unseen data (7 days):   '2014-09-01 00:00:00' to '2014-09-07 23:00:00', 7*24 = 168 readings
#
# Seen data is split 0.9/0.1 between training and evaluation:
#
#   train data: '2014-01-01 00:00:00' to '2014-08-07 15:00:00', 5248 readings
#   eval data:  '2014-08-07 16:00:00' to '2014-08-31 23:00:00', 584 readings

# fraction of the seen data used for training; the remainder is used for evaluation
train_eval_limit = 0.9

# number of hourly readings ("lectures") in the 243 days of seen data
no_lectures_seen_data = 243 * 24  # 5832

# number of leading readings of the seen data that form the training set
train_interval_end = int(train_eval_limit * no_lectures_seen_data)  # 5248

In [11]:
# m: dimensionality of the encoder input
# t: dimensionality of the decoder output
m = t = 168

# combined length of the encoder input and the decoder output
span = m + t

In [12]:
# columns to be included in the SLDB, grouped by role
sldb_columns = [
    # identification of the reading and the scaled target variable
    'date', 'token_id', 'kw_scaled',
    # one sin/cos pair per positional encoding
    'sin_hour_day', 'cos_hour_day',
    'sin_day_week', 'cos_day_week',
    'sin_day_month', 'cos_day_month',
    'sin_day_year', 'cos_day_year',
]

In [13]:
# load the pickled dataframe with the hourly electricity data from the data folder
complete_pickle_path = '{}/hourly_electricity_complete.pkl'.format(data_folder)
output = pd.read_pickle(complete_pickle_path)

In [14]:
# preview the loaded dataframe
output

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
0,3.172589,1,2012-01-01 00:00:00,8760.0,365,0,6,1,1,52,1
1,4.124365,1,2012-01-01 01:00:00,8761.0,365,1,6,1,1,52,1
2,4.758883,1,2012-01-01 02:00:00,8762.0,365,2,6,1,1,52,1
3,4.441624,1,2012-01-01 03:00:00,8763.0,365,3,6,1,1,52,1
4,4.758883,1,2012-01-01 04:00:00,8764.0,365,4,6,1,1,52,1
...,...,...,...,...,...,...,...,...,...,...,...
10464243,8405.405405,370,2014-12-31 20:00:00,35060.0,1460,20,2,31,365,1,12
10464244,8283.783784,370,2014-12-31 21:00:00,35061.0,1460,21,2,31,365,1,12
10464245,7594.594595,370,2014-12-31 22:00:00,35062.0,1460,22,2,31,365,1,12
10464246,6932.432432,370,2014-12-31 23:00:00,35063.0,1460,23,2,31,365,1,12


In [15]:
# keep only the range of days used by other academic papers:
# days_from_start in [1096, 1346)
days_from_start = output['days_from_start']
filtered_output = output.loc[(days_from_start >= 1096) & (days_from_start < 1346)].copy()

In [16]:
# preview the dataframe restricted to the selected range of days
filtered_output

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
17544,2.538071,1,2014-01-01 00:00:00,26304.0,1096,0,2,1,1,1,1
17545,2.855330,1,2014-01-01 01:00:00,26305.0,1096,1,2,1,1,1,1
17546,2.855330,1,2014-01-01 02:00:00,26306.0,1096,2,2,1,1,1,1
17547,2.855330,1,2014-01-01 03:00:00,26307.0,1096,3,2,1,1,1,1
17548,2.538071,1,2014-01-01 04:00:00,26308.0,1096,4,2,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
10461482,20824.324324,370,2014-09-07 19:00:00,32299.0,1345,19,6,7,250,36,9
10461483,19527.027027,370,2014-09-07 20:00:00,32300.0,1345,20,6,7,250,36,9
10461484,20202.702703,370,2014-09-07 21:00:00,32301.0,1345,21,6,7,250,36,9
10461485,19851.351351,370,2014-09-07 22:00:00,32302.0,1345,22,6,7,250,36,9


In [17]:
# per-customer data, keyed by the individual customer_id
data = {}

In [18]:
# one scaler per individual customer_id, in one dictionary per scaler type:
# min_max manages the MinMaxScaler objects, standard the StandardScaler objects
min_max, standard = {}, {}

In [19]:
# first and last token_id of the range of customers (both ends are included)
start = 320
end = 330

In [20]:
# every token_id from start to end, both included
token_ids = list(np.arange(start, end + 1))
token_ids

[320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330]

In [21]:
# customer identifier for each token_id: 'MT_' followed by the zero-padded id
customer_ids = list(map('MT_{:03d}'.format, token_ids))
customer_ids

['MT_320',
 'MT_321',
 'MT_322',
 'MT_323',
 'MT_324',
 'MT_325',
 'MT_326',
 'MT_327',
 'MT_328',
 'MT_329',
 'MT_330']

In [22]:
# prototype code in separate cells

In [23]:
# token_id of the single customer processed by the prototype cells below
token_id = 320

In [24]:
# get the customer identifier: 'MT_' followed by the token_id zero-padded to three digits
customer_id = 'MT_{:03d}'.format(token_id)

In [25]:
# rows of the selected customer only, copied into a temporary dataframe that
# is used to build the sub-series/examples
is_selected_customer = filtered_output['token_id'] == token_id
data_df = filtered_output.loc[is_selected_customer].copy()

In [26]:
# preview the rows of the selected customer
data_df

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
9099871,45.089186,320,2014-01-01 00:00:00,26304.0,1096,0,2,1,1,1,1
9099872,44.531773,320,2014-01-01 01:00:00,26305.0,1096,1,2,1,1,1,1
9099873,46.205407,320,2014-01-01 02:00:00,26306.0,1096,2,2,1,1,1,1
9099874,47.877648,320,2014-01-01 03:00:00,26307.0,1096,3,2,1,1,1,1
9099875,46.344760,320,2014-01-01 04:00:00,26308.0,1096,4,2,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
9105866,150.165831,320,2014-09-07 19:00:00,32299.0,1345,19,6,7,250,36,9
9105867,152.550167,320,2014-09-07 20:00:00,32300.0,1345,20,6,7,250,36,9
9105868,130.726031,320,2014-09-07 21:00:00,32301.0,1345,21,6,7,250,36,9
9105869,120.523969,320,2014-09-07 22:00:00,32302.0,1345,22,6,7,250,36,9


In [27]:
# expand with positional encodings: each calendar column is mapped to a
# sin/cos pair over the period of its cycle
positional_encodings = (
    # (source column, period of the cycle, sin column, cos column)
    ('hour_of_day', hours_in_day, 'sin_hour_day', 'cos_hour_day'),
    ('day_of_week', days_in_week, 'sin_day_week', 'cos_day_week'),
    ('day_of_month', days_in_month, 'sin_day_month', 'cos_day_month'),
    ('day_of_year', days_in_year, 'sin_day_year', 'cos_day_year'),
)
for source_column, period, sin_column, cos_column in positional_encodings:
    angle = 2*np.pi*data_df[source_column]/period
    data_df[sin_column] = np.sin(angle)
    data_df[cos_column] = np.cos(angle)

In [28]:
# preview the dataframe with the sin/cos positional encoding columns added
data_df

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week,sin_day_month,cos_day_month,sin_day_year,cos_day_year
9099871,45.089186,320,2014-01-01 00:00:00,26304.0,1096,0,2,1,1,1,1,0.000000,1.000000,0.974928,-0.222521,0.207912,0.978148,0.017213,0.999852
9099872,44.531773,320,2014-01-01 01:00:00,26305.0,1096,1,2,1,1,1,1,0.258819,0.965926,0.974928,-0.222521,0.207912,0.978148,0.017213,0.999852
9099873,46.205407,320,2014-01-01 02:00:00,26306.0,1096,2,2,1,1,1,1,0.500000,0.866025,0.974928,-0.222521,0.207912,0.978148,0.017213,0.999852
9099874,47.877648,320,2014-01-01 03:00:00,26307.0,1096,3,2,1,1,1,1,0.707107,0.707107,0.974928,-0.222521,0.207912,0.978148,0.017213,0.999852
9099875,46.344760,320,2014-01-01 04:00:00,26308.0,1096,4,2,1,1,1,1,0.866025,0.500000,0.974928,-0.222521,0.207912,0.978148,0.017213,0.999852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9105866,150.165831,320,2014-09-07 19:00:00,32299.0,1345,19,6,7,250,36,9,-0.965926,0.258819,-0.781831,0.623490,0.994522,0.104528,-0.917584,-0.397543
9105867,152.550167,320,2014-09-07 20:00:00,32300.0,1345,20,6,7,250,36,9,-0.866025,0.500000,-0.781831,0.623490,0.994522,0.104528,-0.917584,-0.397543
9105868,130.726031,320,2014-09-07 21:00:00,32301.0,1345,21,6,7,250,36,9,-0.707107,0.707107,-0.781831,0.623490,0.994522,0.104528,-0.917584,-0.397543
9105869,120.523969,320,2014-09-07 22:00:00,32302.0,1345,22,6,7,250,36,9,-0.500000,0.866025,-0.781831,0.623490,0.994522,0.104528,-0.917584,-0.397543


In [29]:
# power usage readings of the training interval (the first train_interval_end
# rows), which are the only ones used to fit the scaler
lectures_train_data = data_df.power_usage.iloc[:train_interval_end]

In [30]:
# the scaler is fitted only on train data, and the power usage time series
# has to be passed to it as a (?, 1) NumPy array
lectures_train_data_array = np.reshape(np.array(lectures_train_data), (-1, 1))

In [31]:
# get MinMaxScaler on train data, store it in a dictionary
min_max_scaler = MinMaxScaler()
min_max = min_max_scaler.fit(lectures_train_data_array)

In [32]:
# get an array from the variable time series (seen and unseen)
all_data_variable_array = np.array(data_df.power_usage).reshape(-1, 1)

In [33]:
# apply the scaler over all data (seen and unseen)
# rescale, and squeeze to drop the extra dimension, then assign to the new column kw_scaled
data_df['kw_scaled'] = np.squeeze(min_max.transform(all_data_variable_array))

In [34]:
# preview the dataframe with the kw_scaled column added
data_df

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week,sin_day_month,cos_day_month,sin_day_year,cos_day_year,kw_scaled
9099871,45.089186,320,2014-01-01 00:00:00,26304.0,1096,0,2,1,1,1,1,0.000000,1.000000,0.974928,-0.222521,0.207912,0.978148,0.017213,0.999852,0.256956
9099872,44.531773,320,2014-01-01 01:00:00,26305.0,1096,1,2,1,1,1,1,0.258819,0.965926,0.974928,-0.222521,0.207912,0.978148,0.017213,0.999852,0.253737
9099873,46.205407,320,2014-01-01 02:00:00,26306.0,1096,2,2,1,1,1,1,0.500000,0.866025,0.974928,-0.222521,0.207912,0.978148,0.017213,0.999852,0.263404
9099874,47.877648,320,2014-01-01 03:00:00,26307.0,1096,3,2,1,1,1,1,0.707107,0.707107,0.974928,-0.222521,0.207912,0.978148,0.017213,0.999852,0.273062
9099875,46.344760,320,2014-01-01 04:00:00,26308.0,1096,4,2,1,1,1,1,0.866025,0.500000,0.974928,-0.222521,0.207912,0.978148,0.017213,0.999852,0.264209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9105866,150.165831,320,2014-09-07 19:00:00,32299.0,1345,19,6,7,250,36,9,-0.965926,0.258819,-0.781831,0.623490,0.994522,0.104528,-0.917584,-0.397543,0.863875
9105867,152.550167,320,2014-09-07 20:00:00,32300.0,1345,20,6,7,250,36,9,-0.866025,0.500000,-0.781831,0.623490,0.994522,0.104528,-0.917584,-0.397543,0.877647
9105868,130.726031,320,2014-09-07 21:00:00,32301.0,1345,21,6,7,250,36,9,-0.707107,0.707107,-0.781831,0.623490,0.994522,0.104528,-0.917584,-0.397543,0.751592
9105869,120.523969,320,2014-09-07 22:00:00,32302.0,1345,22,6,7,250,36,9,-0.500000,0.866025,-0.781831,0.623490,0.994522,0.104528,-0.917584,-0.397543,0.692665


In [35]:
# at this moment, persist only the time series that produces the test dataset

In [41]:
# the persisted time series carries exactly the SLDB columns; copy that list
# instead of repeating it, so the two definitions cannot drift apart
columns_to_persist = list(sldb_columns)

In [43]:
# keep the tail of the series that starts (span - 1) readings before the end
# of the seen data, i.e. the last (span - 1) seen readings plus everything
# that follows them
first_row_to_persist = no_lectures_seen_data - span + 1
data_to_persist = data_df[columns_to_persist].iloc[first_row_to_persist:]

In [44]:
# preview the rows and columns selected for persistence
data_to_persist

Unnamed: 0,date,token_id,kw_scaled,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week,sin_day_month,cos_day_month,sin_day_year,cos_day_year
9105368,2014-08-18 01:00:00,320,0.410926,0.258819,0.965926,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
9105369,2014-08-18 02:00:00,320,0.365127,0.500000,0.866025,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
9105370,2014-08-18 03:00:00,320,0.331876,0.707107,0.707107,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
9105371,2014-08-18 04:00:00,320,0.311730,0.866025,0.500000,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
9105372,2014-08-18 05:00:00,320,0.301258,0.965926,0.258819,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
...,...,...,...,...,...,...,...,...,...,...,...
9105866,2014-09-07 19:00:00,320,0.863875,-0.965926,0.258819,-0.781831,0.62349,0.994522,0.104528,-0.917584,-0.397543
9105867,2014-09-07 20:00:00,320,0.877647,-0.866025,0.500000,-0.781831,0.62349,0.994522,0.104528,-0.917584,-0.397543
9105868,2014-09-07 21:00:00,320,0.751592,-0.707107,0.707107,-0.781831,0.62349,0.994522,0.104528,-0.917584,-0.397543
9105869,2014-09-07 22:00:00,320,0.692665,-0.500000,0.866025,-0.781831,0.62349,0.994522,0.104528,-0.917584,-0.397543


In [45]:
# path to persist the time series dataframe; the file name is built from
# token_id so that it always matches the customer whose data is persisted
path = '/home/developer/gcp/cbidmltsf/timeseries/LD2011-2014/MT{:03d}_test.pkl'.format(token_id)

In [46]:
# write the selected time series to disk as a pickle file
data_to_persist.to_pickle(path)