In [1]:
# make SLDB datasets for BSCTRFM from individual time series
# on traffic dataset

In [2]:
import numpy as np
import pandas as pd

In [3]:
import json

In [4]:
import os

In [5]:
import tensorflow as tf

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

import joblib

In [7]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Wrap a list of floats / doubles in a tf.train.Feature carrying a FloatList."""
    float_list = tf.train.FloatList(value=list_of_values)
    return tf.train.Feature(float_list=float_list)

In [8]:
# the main source is the traffic dataset PEMS-SF from UCI

In [9]:
# it resides in
dataset_path = '/home/developer/gcp/cbidmltsf/datasets/traffic'

In [10]:
os.listdir(dataset_path)

['PEMS-SF', 'separated_raw']

In [11]:
# a SLDB is produced from separated time series (raw or preprocessed)

# SLDB contents are:
# TFRecord files for training
# TFRecord files for evaluation (if eval required)
# time series pickles for testing

In [12]:
# constant values for positional encodings
hours_in_day = 24
days_in_week = 7
# days_in_month = 30
# days_in_year = 365

In [13]:
# a constant to make sin/cos functions from hours_from_start (the 'age' covariate)
# hourly lectures on first day: from 1 to 23
# hourly lectures on remaining 172 days: from 0 to 23, then
total_hours = 23 + 172*24
total_hours

4151

In [14]:
# no dates available for traffic dataset, then work only on values and covariates

In [15]:
# define global dataset intervals (they might not be precise when missing values exist)

# split the time series in seen (train, eval) and unseen (test) data
# according to academic papers:

# 23 hours + 165 days on seen data, 7 days on unseen data 

# seen data:      lecture 0 to lecture 3982, initial 23 + 165*24 = 3983 lectures

# train/eval split is 0.9/0.1, then

# train data:     lecture 0 to lecture 3584, for 3585 lectures
no_lectures_train_dataset = 3585

# eval data:      lecture 3585 to lecture 3982, for 398 lectures
no_lectures_eval_dataset = 398

# unseen data:    lecture 3983 to lecture 4150, 7*24 = 168 lectures


In [16]:
# build sub-series to be persisted as serialized training examples

# dimensionality of the encoder input
m = 168

# dimensionality of the decoder output 
t = 168

In [17]:
# columns to be included in the SLDB

# use 7D encoder (age, hour-day, day-week)

sldb_columns = [
    # no timestamp information for traffic dataset, work only on pos encoding covariates
    # 'date',
    'id',
    # car lane occupancy was recorded as a variable from 0 to 1
    'occupancy_scaled',
    'sin_hours_from_start',
    'cos_hours_from_start',
    'sin_hour_day',
    'cos_hour_day',
    'sin_day_week',
    'cos_day_week',
    # 'sin_day_month',
    # 'cos_day_month',
    # 'sin_day_year',
    # 'cos_day_year'    
]

In [18]:
sldb = {
    'ts': 'PEMS-SF_SEPARATED_FULL',
    'embedding': {
        'hourly': 168
    },
    'tau': {
        'hourly': 1
    },
    'no_targets': 168,
    'BSCTRFM': 1,
    'preprocessed': 0
}

In [19]:
sldb

{'ts': 'PEMS-SF_SEPARATED_FULL',
 'embedding': {'hourly': 168},
 'tau': {'hourly': 1},
 'no_targets': 168,
 'BSCTRFM': 1,
 'preprocessed': 0}

In [20]:
# a string with the basic specifications of the SLDB, as part of the SLDB identifier

# add the suffix '11D', '07D', etc., to differentiate this SLDB from the original one, which is 09D

# add the suffix MMX to indicate the scaler used was MinMax
# add the suffix STD to indicate the scaler used was Standard

sldb_specs = 'BSCTRFM_{:03d}_{:03d}_07DB_MMX'.format(sldb['embedding']['hourly'], sldb['no_targets'])
sldb_specs

'BSCTRFM_168_168_07DB_MMX'

In [21]:
# get the identifier for the SLDB (source time series name plus the SLDB specs)
sldb_identifier = '{}_{}'.format(sldb['ts'], sldb_specs)
sldb_identifier

'PEMS-SF_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX'

In [22]:
sldb_dir = '/home/developer/gcp/cbidmltsf/sldbs/{}'.format(sldb_identifier)
sldb_dir

'/home/developer/gcp/cbidmltsf/sldbs/PEMS-SF_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX'

In [23]:
# get a path to the scalers sub-directory
scalers_dir = '{}/scalers'.format(sldb_dir)
scalers_dir

'/home/developer/gcp/cbidmltsf/sldbs/PEMS-SF_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX/scalers'

In [24]:
encoder_input_columns = [
    'occupancy_scaled',
    'sin_hours_from_start',
    'cos_hours_from_start',
    'sin_hour_day',
    'cos_hour_day',
    'sin_day_week',
    'cos_day_week'
]

In [25]:
# both the encoder input and the decoder input use the same columns from the source sub_series dataframe
decoder_input_columns = encoder_input_columns

In [26]:
target_columns = ['occupancy_scaled']

In [27]:
id_columns = ['id']

In [28]:
# a dictionary to manage data per individual station_id
data = dict()

In [29]:
# a dictionary to store the number of examples per station_id, stage
count = dict()

In [30]:
# the number of cores available for training in Cloud TPU
num_cores = 8

In [31]:
# are we training over raw data or preprocessed data?
state = 'raw'

In [32]:
# build a list with all the station ids

In [33]:
data_folder = '{}/PEMS-SF'.format(dataset_path)

In [34]:
def process_list(s, variable_type=int, delimiter=None):
    """Convert one line in the PEMS format into a list of typed values.

    Square brackets are removed from `s`, the remaining text is split on
    `delimiter` (on runs of whitespace when `delimiter` is None) and every
    token is converted with `variable_type`.
    """
    stripped = s.replace('[', '').replace(']', '')
    # str.split(None) behaves exactly like str.split(): split on whitespace runs
    tokens = stripped.split(delimiter)
    return list(map(variable_type, tokens))

In [35]:
def read_single_list(filename):
    """Parse the first line of a PEMS-custom file in data_folder into a list."""
    file_path = os.path.join(data_folder, filename)
    with open(file_path, 'r') as handle:
        first_line = handle.readlines()[0]
    return process_list(first_line)

In [36]:
# station ids as parsed from the 'stations_list' file (copied into a fresh list)
station_ids = list(read_single_list('stations_list'))
len(station_ids)

963

In [37]:
# from the stations list build the main data dictionary
for station_id in station_ids:
    # one pickle per station, stored under separated_<state>/ST_<station_id>.pkl
    station_data_path = '{}/separated_{}/ST_{}.pkl'.format(dataset_path, state, station_id)
    # keep the loaded object in the data dictionary, keyed by station id
    data[station_id] = pd.read_pickle(station_data_path)

In [38]:
len(data.keys())

963

In [39]:
# isolated-code cells to verify functionality before launching the batch job

In [40]:
data[400000]

Unnamed: 0,values,sensor_day,time_on_day,day_of_week,id,hours_from_start
0,0.019333,0,1,4,400000,1
1,0.020200,0,2,4,400000,2
2,0.022450,0,3,4,400000,3
3,0.029283,0,4,4,400000,4
4,0.055483,0,5,4,400000,5
...,...,...,...,...,...,...
4146,0.038100,172,19,5,400000,4147
4147,0.033550,172,20,5,400000,4148
4148,0.027783,172,21,5,400000,4149
4149,0.019467,172,22,5,400000,4150


In [42]:
# use pd.to_numeric() to convert objects in the time series to numbers
# this is not required for the electricity dataset!

In [41]:
for station_id in station_ids[:1]:

    # per-station containers for the examples of each stage
    # (the test dataset is not passed to the SLDB, it is persisted as a time series)
    examples = {'train': [], 'eval': []}

    # work on a reference to the dataframe kept in the data dictionary
    data_df = data[station_id]

    # a sub-dictionary to keep the number of examples per station_id, stage
    count[station_id] = dict()

    # expand with positional encodings: one sin/cos pair per periodic covariate,
    # each row is (source column, sin column, cos column, period)
    positional_encodings = (
        ('hours_from_start', 'sin_hours_from_start', 'cos_hours_from_start', total_hours),
        ('time_on_day', 'sin_hour_day', 'cos_hour_day', hours_in_day),
        ('day_of_week', 'sin_day_week', 'cos_day_week', days_in_week),
    )
    for source_column, sin_column, cos_column, period in positional_encodings:
        angle = 2*np.pi*pd.to_numeric(data_df[source_column])/period
        data_df[sin_column] = np.sin(angle)
        data_df[cos_column] = np.cos(angle)

    print('Added positional encodings to {}.'.format(station_id))


Added positional encodings to 400000.


In [42]:
data_df.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            4141, 4142, 4143, 4144, 4145, 4146, 4147, 4148, 4149, 4150],
           dtype='int64', length=4151)

In [46]:
# DISCARD USE OF ABSOLUTE INDEXES TO SLICE THE TIME SERIES WITH .loc
# USE RELATIVE INDEXES TO SLICE THE TIME SERIES WITH .iloc
# train_start_index = data_df.index[0]
# train_start_index

In [48]:
# DISCARD USE OF ABSOLUTE INDEXES TO SLICE THE TIME SERIES WITH .loc
# USE RELATIVE INDEXES TO SLICE THE TIME SERIES WITH .iloc
# train_end_index = train_start_index + no_lectures_train_dataset
# train_end_index

In [49]:
# DISCARD USE OF ABSOLUTE INDEXES TO SLICE THE TIME SERIES WITH .loc
# USE RELATIVE INDEXES TO SLICE THE TIME SERIES WITH .iloc
# time series for train dataset
# subtract 1 from the end index because Pandas slicing includes the tail lecture
# data_df.loc[train_start_index:train_end_index - 1]

In [50]:
# all datasets per station_id keep their indexes from the original time series
# then use iloc (integer location) instead of loc to avoid calculating the original indexes
data_df.iloc[:no_lectures_train_dataset]

Unnamed: 0,values,sensor_day,time_on_day,day_of_week,id,hours_from_start,sin_hours_from_start,cos_hours_from_start,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week
0,0.019333,0,1,4,400000,1,0.001514,0.999999,0.258819,9.659258e-01,-0.433884,-0.900969
1,0.020200,0,2,4,400000,2,0.003027,0.999995,0.500000,8.660254e-01,-0.433884,-0.900969
2,0.022450,0,3,4,400000,3,0.004541,0.999990,0.707107,7.071068e-01,-0.433884,-0.900969
3,0.029283,0,4,4,400000,4,0.006055,0.999982,0.866025,5.000000e-01,-0.433884,-0.900969
4,0.055483,0,5,4,400000,5,0.007568,0.999971,0.965926,2.588190e-01,-0.433884,-0.900969
...,...,...,...,...,...,...,...,...,...,...,...,...
3580,0.056800,149,5,3,400000,3581,-0.759656,0.650325,0.965926,2.588190e-01,0.433884,-0.900969
3581,0.126300,149,6,3,400000,3582,-0.758671,0.651474,1.000000,6.123234e-17,0.433884,-0.900969
3582,0.083983,149,7,3,400000,3583,-0.757684,0.652622,0.965926,-2.588190e-01,0.433884,-0.900969
3583,0.062917,149,8,3,400000,3584,-0.756695,0.653768,0.866025,-5.000000e-01,0.433884,-0.900969


In [51]:
# DISCARD USE OF ABSOLUTE INDEXES TO SLICE THE TIME SERIES WITH .loc
# USE RELATIVE INDEXES TO SLICE THE TIME SERIES WITH .iloc
# because Pandas slicing includes the tail lecture
# eval_start_index = train_end_index
# eval_start_index

In [52]:
# DISCARD USE OF ABSOLUTE INDEXES TO SLICE THE TIME SERIES WITH .loc
# USE RELATIVE INDEXES TO SLICE THE TIME SERIES WITH .iloc
# eval_end_index = eval_start_index + no_lectures_eval_dataset
# eval_end_index

In [53]:
# DISCARD USE OF ABSOLUTE INDEXES TO SLICE THE TIME SERIES WITH .loc
# USE RELATIVE INDEXES TO SLICE THE TIME SERIES WITH .iloc
# time series for eval dataset
# subtract 1 from the end index because Pandas slicing includes the tail lecture
# data_df.loc[eval_start_index:eval_end_index - 1]

In [54]:
# all datasets per station_id keep their indexes from the original time series
# then use iloc (integer location) instead of loc to avoid calculating the original indexes
data_df.iloc[no_lectures_train_dataset:no_lectures_train_dataset + no_lectures_eval_dataset]

Unnamed: 0,values,sensor_day,time_on_day,day_of_week,id,hours_from_start,sin_hours_from_start,cos_hours_from_start,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week
3585,0.064617,149,10,3,400000,3586,-0.754712,0.656056,5.000000e-01,-0.866025,0.433884,-0.900969
3586,0.062400,149,11,3,400000,3587,-0.753718,0.657197,2.588190e-01,-0.965926,0.433884,-0.900969
3587,0.059717,149,12,3,400000,3588,-0.752723,0.658338,1.224647e-16,-1.000000,0.433884,-0.900969
3588,0.060133,149,13,3,400000,3589,-0.751725,0.659476,-2.588190e-01,-0.965926,0.433884,-0.900969
3589,0.057683,149,14,3,400000,3590,-0.750726,0.660613,-5.000000e-01,-0.866025,0.433884,-0.900969
...,...,...,...,...,...,...,...,...,...,...,...,...
3978,0.039050,165,19,5,400000,3979,-0.257418,0.966300,-9.659258e-01,0.258819,-0.974928,-0.222521
3979,0.035333,165,20,5,400000,3980,-0.255955,0.966689,-8.660254e-01,0.500000,-0.974928,-0.222521
3980,0.028650,165,21,5,400000,3981,-0.254491,0.967075,-7.071068e-01,0.707107,-0.974928,-0.222521
3981,0.022067,165,22,5,400000,3982,-0.253027,0.967459,-5.000000e-01,0.866025,-0.974928,-0.222521


In [55]:
# get a series for the occupancy variable on the training dataset, to fit the scaler
# lectures_train_data = data_df['values'].loc[train_start_index:train_end_index - 1]

lectures_train_data = data_df['values'].iloc[:no_lectures_train_dataset]
lectures_train_data

0       0.019333
1       0.020200
2       0.022450
3       0.029283
4       0.055483
          ...   
3580    0.056800
3581    0.126300
3582    0.083983
3583    0.062917
3584    0.063867
Name: values, Length: 3585, dtype: float64

In [56]:
# fit a scaler only on train data
# it is required to pass the occupancy time series to a (?, 1) NumPy array
lectures_train_data_array = np.array(lectures_train_data).reshape(-1, 1)
lectures_train_data_array.shape

(3585, 1)

In [57]:
# get MinMaxScaler on train data, store it in a dictionary
scaler_type = 'min_max'
scaler = MinMaxScaler()
fitted_scaler = scaler.fit(lectures_train_data_array)
print('Scaler {} generated on training data for {}'.format(scaler_type, station_id))

Scaler min_max generated on training data for 400000


In [58]:
# persist the scaler
# make sure the scalers sub-directory exists before writing into it;
# nothing earlier in the notebook creates it, so a fresh run would fail here
os.makedirs(scalers_dir, exist_ok=True)
scaler_filename = '{}/{}_{}.save'.format(scalers_dir, scaler_type, station_id)
joblib.dump(fitted_scaler, scaler_filename)
print('Scaler {} persisted for {}'.format(scaler_type, station_id))

Scaler min_max persisted for 400000


In [59]:
# get an array from the variable time series (seen and unseen)
# replace data_df.values with data_df['values']
# to select only the values of the 'values' column, and not all the columns values
all_data_variable_array = np.array(data_df['values']).reshape(-1, 1)

In [60]:
all_data_variable_array.shape

(4151, 1)

In [61]:
# apply the scaler over all data (seen and unseen)
# rescale, and squeeze to drop the extra dimension, then assign to the new column occupancy_scaled
data_df['occupancy_scaled'] = np.squeeze(fitted_scaler.transform(all_data_variable_array))

In [63]:
data_df[['occupancy_scaled']]

Unnamed: 0,occupancy_scaled
0,0.037424
1,0.039929
2,0.046431
3,0.066179
4,0.141894
...,...
4146,0.091658
4147,0.078509
4148,0.061844
4149,0.037809


In [64]:
# at this moment, the individual time series are ready to be window-rolled to produce
# sub-series/examples to serialize

# BSCTRFM inference process is not direct, but iterative, therefore
# no TFRecord SLDB is required for test dataset,

In [65]:
# the time series used to build the test dataset must go
# from lecture t3648 to lecture t4150 (starting counter from t0)
# in order to extract 168 features with targets
# (the last element in the decoder output)
# ranging from lecture t3983 to lecture t4150

# derive the bounds from the dataset constants instead of hard-coding them:
# the first test sub-series (m + t lectures long) must end on the first unseen lecture,
# whose position is no_lectures_train_dataset + no_lectures_eval_dataset,
# and the test time series runs up to the last lecture of the full series
test_ts_start = no_lectures_train_dataset + no_lectures_eval_dataset - (m + t) + 1
test_ts_end = total_hours - 1

# rolling over the time series for the test dataset:
# remember the model was trained with data up to t3584, then no seen data is used for test stage
# eval data is indeed used for test stage, but eval does not modify the training process in CloudTPU
# first encoder input: t3648 to t3815
# first decoder input: t3815 to t3982
# first decoder output: t3816 to t3983,
# then predictions from t3983 to t3983+23 (one-day-ahead) are built iteratively
# and process is repeated for the following six days, to complete a week

In [66]:
# DISCARD USE OF ABSOLUTE INDEXES TO SLICE THE TIME SERIES WITH .loc
# USE RELATIVE INDEXES TO SLICE THE TIME SERIES WITH .iloc
# test_ts_start_index = train_start_index + test_start_offset
# test_end_index = train_start_index + test_end_offset

In [67]:
# DISCARD USE OF ABSOLUTE INDEXES TO SLICE THE TIME SERIES WITH .loc
# USE RELATIVE INDEXES TO SLICE THE TIME SERIES WITH .iloc
# test_time_series = data_df[sldb_columns].loc[test_ts_start_index:test_end_index]
# test_time_series

In [68]:
test_time_series = data_df[sldb_columns].iloc[test_ts_start:test_ts_end + 1]
test_time_series

Unnamed: 0,id,occupancy_scaled,sin_hours_from_start,cos_hours_from_start,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week
3648,400000,0.010404,-6.888165e-01,0.724936,0.258819,0.965926,-0.781831,0.623490
3649,400000,0.009777,-6.877184e-01,0.725978,0.500000,0.866025,-0.781831,0.623490
3650,400000,0.015509,-6.866187e-01,0.727018,0.707107,0.707107,-0.781831,0.623490
3651,400000,0.035642,-6.855175e-01,0.728056,0.866025,0.500000,-0.781831,0.623490
3652,400000,0.122387,-6.844147e-01,0.729093,0.965926,0.258819,-0.781831,0.623490
...,...,...,...,...,...,...,...,...
4146,400000,0.091658,-6.054586e-03,0.999982,-0.965926,0.258819,-0.974928,-0.222521
4147,400000,0.078509,-4.540952e-03,0.999990,-0.866025,0.500000,-0.974928,-0.222521
4148,400000,0.061844,-3.027307e-03,0.999995,-0.707107,0.707107,-0.974928,-0.222521
4149,400000,0.037809,-1.513655e-03,0.999999,-0.500000,0.866025,-0.974928,-0.222521


In [76]:
test_time_series.index[0]

3648

In [78]:
# report the first and last index labels of the test time series;
# the number of lectures is taken from the length of the slice, not from label arithmetic,
# so it stays correct even if the index labels are not contiguous
print('Test interval for staion {}: from index {} to index {} for {} lectures'.format(
    station_id,
    test_time_series.index[0],
    test_time_series.index[-1],
    len(test_time_series)))

Test interval for staion 400000: from index 3648 to index 4150 for 503 lectures


In [79]:
# path to persist the time series dataframe corresponding to test dataset
path = '{}/test/ST_{}.pkl'.format(sldb_dir, station_id)
path

'/home/developer/gcp/cbidmltsf/sldbs/PEMS-SF_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX/test/ST_400000.pkl'

In [80]:
# make sure the test sub-directory exists before writing the pickle;
# nothing earlier in the notebook creates it, so a fresh run would fail here
os.makedirs(os.path.dirname(path), exist_ok=True)
test_time_series.to_pickle(path)
print('Test dataset persisted as a time series pickle for {}'.format(station_id))

Test dataset persisted as a time series pickle for 400000


In [74]:
# global numeralia for traffic dataset

# 963 time series, one for each traffic sensor or station_id
# 4151 lectures on each time series
# 23 hourly lectures for the first day, 24 lectures for the following 172 days

# time series for train dataset:
# 3585 lectures
# from t0 to t3584 (for first station, subsequent stations add an offset to starting index)
# 3585 - (168 + 168) + 1 = 3250 sub-series/examples 

# time series for eval dataset:
# 398 lectures
# from t3585 to t3982 (for first station, subsequent stations add an offset to starting index)
# 398 - (168 + 168) + 1 = 63 sub-series/examples

# time series for test dataset:
# 503 lectures
# from t3648 to t4150 (for first station, subsequent stations add an offset to starting index)
# 503 - (168 + 168) + 1 = 168 sub-series/examples


In [81]:
# DISCARD USE OF ABSOLUTE INDEXES TO SLICE THE TIME SERIES WITH .loc
# USE RELATIVE INDEXES TO SLICE THE TIME SERIES WITH .iloc
# make SLDB training dataset
# get an iterable with all the possible sub-series for training examples
# train_starting_indexes = np.arange(train_start_index, train_end_index - (m + t) + 1)
# train_starting_indexes

In [82]:
# make SLDB training dataset
# get an iterable with all the possible sub-series for training examples
train_starting_indexes = np.arange(no_lectures_train_dataset - (m + t) + 1)
train_starting_indexes

array([   0,    1,    2, ..., 3247, 3248, 3249])

In [83]:
len(train_starting_indexes)

3250

In [85]:
# verify sub-series generation for the last starting index
for train_starting_index in train_starting_indexes[-1:]:

    # do not subtract 1 at the end of the slice because iloc excludes the end position
    sub_series_df = data_df[sldb_columns].iloc[train_starting_index:train_starting_index + (m + t)]
    
sub_series_df

Unnamed: 0,id,occupancy_scaled,sin_hours_from_start,cos_hours_from_start,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week
3249,400000,0.162557,-0.978653,0.205517,5.000000e-01,-8.660254e-01,0.974928,-0.222521
3250,400000,0.160534,-0.978341,0.206999,2.588190e-01,-9.659258e-01,0.974928,-0.222521
3251,400000,0.164772,-0.978027,0.208479,1.224647e-16,-1.000000e+00,0.974928,-0.222521
3252,400000,0.161401,-0.977710,0.209959,-2.588190e-01,-9.659258e-01,0.974928,-0.222521
3253,400000,0.163761,-0.977391,0.211439,-5.000000e-01,-8.660254e-01,0.974928,-0.222521
...,...,...,...,...,...,...,...,...
3580,400000,0.145699,-0.759656,0.650325,9.659258e-01,2.588190e-01,0.433884,-0.900969
3581,400000,0.346547,-0.758671,0.651474,1.000000e+00,6.123234e-17,0.433884,-0.900969
3582,400000,0.224256,-0.757684,0.652622,9.659258e-01,-2.588190e-01,0.433884,-0.900969
3583,400000,0.163375,-0.756695,0.653768,8.660254e-01,-5.000000e-01,0.433884,-0.900969


In [86]:
encoder_input_df = sub_series_df[encoder_input_columns][:m]
encoder_input_df

Unnamed: 0,occupancy_scaled,sin_hours_from_start,cos_hours_from_start,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week
3249,0.162557,-0.978653,0.205517,5.000000e-01,-8.660254e-01,0.974928,-0.222521
3250,0.160534,-0.978341,0.206999,2.588190e-01,-9.659258e-01,0.974928,-0.222521
3251,0.164772,-0.978027,0.208479,1.224647e-16,-1.000000e+00,0.974928,-0.222521
3252,0.161401,-0.977710,0.209959,-2.588190e-01,-9.659258e-01,0.974928,-0.222521
3253,0.163761,-0.977391,0.211439,-5.000000e-01,-8.660254e-01,0.974928,-0.222521
...,...,...,...,...,...,...,...
3412,0.142857,-0.898824,0.438311,9.659258e-01,2.588190e-01,0.433884,-0.900969
3413,0.376842,-0.898159,0.439671,1.000000e+00,6.123234e-17,0.433884,-0.900969
3414,0.736538,-0.897493,0.441030,9.659258e-01,-2.588190e-01,0.433884,-0.900969
3415,0.655380,-0.896824,0.442388,8.660254e-01,-5.000000e-01,0.433884,-0.900969


In [87]:
decoder_input_df = sub_series_df[decoder_input_columns][m-1:m-1+t]
decoder_input_df

Unnamed: 0,occupancy_scaled,sin_hours_from_start,cos_hours_from_start,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week
3416,0.252529,-0.896153,0.443745,7.071068e-01,-7.071068e-01,0.433884,-0.900969
3417,0.158848,-0.895481,0.445101,5.000000e-01,-8.660254e-01,0.433884,-0.900969
3418,0.166121,-0.894806,0.446456,2.588190e-01,-9.659258e-01,0.433884,-0.900969
3419,0.164002,-0.894129,0.447809,1.224647e-16,-1.000000e+00,0.433884,-0.900969
3420,0.160052,-0.893450,0.449162,-2.588190e-01,-9.659258e-01,0.433884,-0.900969
...,...,...,...,...,...,...,...
3579,0.037328,-0.760639,0.649175,8.660254e-01,5.000000e-01,0.433884,-0.900969
3580,0.145699,-0.759656,0.650325,9.659258e-01,2.588190e-01,0.433884,-0.900969
3581,0.346547,-0.758671,0.651474,1.000000e+00,6.123234e-17,0.433884,-0.900969
3582,0.224256,-0.757684,0.652622,9.659258e-01,-2.588190e-01,0.433884,-0.900969


In [88]:
target_df = sub_series_df[target_columns][m:m+t]
target_df

Unnamed: 0,occupancy_scaled
3417,0.158848
3418,0.166121
3419,0.164002
3420,0.160052
3421,0.133658
...,...
3580,0.145699
3581,0.346547
3582,0.224256
3583,0.163375


In [89]:
id_df = sub_series_df[id_columns][:1]
id_df

Unnamed: 0,id
3249,400000


In [90]:
# DISCARD USE OF ABSOLUTE INDEXES TO SLICE THE TIME SERIES WITH .loc
# USE RELATIVE INDEXES TO SLICE THE TIME SERIES WITH .iloc
# make SLDB evaluation dataset
# get an iterable with all the possible sub-series for evaluation examples
# eval_starting_indexes = np.arange(eval_start_index, eval_end_index - (m + t) + 2)
# eval_starting_indexes

In [91]:
# make SLDB evaluation dataset
# get an iterable with all the possible sub-series for evaluation examples
eval_starting_indexes = np.arange(no_lectures_train_dataset,
                                  no_lectures_train_dataset + no_lectures_eval_dataset - (m + t) + 1)
                                  
eval_starting_indexes

array([3585, 3586, 3587, 3588, 3589, 3590, 3591, 3592, 3593, 3594, 3595,
       3596, 3597, 3598, 3599, 3600, 3601, 3602, 3603, 3604, 3605, 3606,
       3607, 3608, 3609, 3610, 3611, 3612, 3613, 3614, 3615, 3616, 3617,
       3618, 3619, 3620, 3621, 3622, 3623, 3624, 3625, 3626, 3627, 3628,
       3629, 3630, 3631, 3632, 3633, 3634, 3635, 3636, 3637, 3638, 3639,
       3640, 3641, 3642, 3643, 3644, 3645, 3646, 3647])

In [92]:
len(eval_starting_indexes)

63

In [93]:
# verify sub-series generation for the last starting index
for eval_starting_index in eval_starting_indexes[-1:]:

    # do not subtract 1 at the end of the slice because iloc excludes the end position
    sub_series_df = data_df[sldb_columns].iloc[eval_starting_index:eval_starting_index + (m + t)]
    
sub_series_df

Unnamed: 0,id,occupancy_scaled,sin_hours_from_start,cos_hours_from_start,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week
3647,400000,0.011078,-0.689913,0.723892,0.000000,1.000000,-0.781831,0.623490
3648,400000,0.010404,-0.688816,0.724936,0.258819,0.965926,-0.781831,0.623490
3649,400000,0.009777,-0.687718,0.725978,0.500000,0.866025,-0.781831,0.623490
3650,400000,0.015509,-0.686619,0.727018,0.707107,0.707107,-0.781831,0.623490
3651,400000,0.035642,-0.685517,0.728056,0.866025,0.500000,-0.781831,0.623490
...,...,...,...,...,...,...,...,...
3978,400000,0.094403,-0.257418,0.966300,-0.965926,0.258819,-0.974928,-0.222521
3979,400000,0.083662,-0.255955,0.966689,-0.866025,0.500000,-0.974928,-0.222521
3980,400000,0.064348,-0.254491,0.967075,-0.707107,0.707107,-0.974928,-0.222521
3981,400000,0.045323,-0.253027,0.967459,-0.500000,0.866025,-0.974928,-0.222521


In [94]:
# NOW ASSEMBLE THE TESTED, SEPARATED CODE CELLS INTO AN ITERATIVE CYCLE,
# USE NEXT VERSION OF JUPYTER NOTEBOOK