### make use of TFT Google Research code to download and pre-process the electricity dataset
#### from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py

In [1]:
import gc
import glob
import os
import shutil
import sys
import numpy as np
import pandas as pd
import pyunpack
import wget

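### to skip the download and pre-processing steps, jump to the markdown section further below that loads the persisted complete dataset (`hourly_electricity_complete.pkl`)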

In [2]:
def download_from_url(url, output_path):
    """Download the file at `url` and store it at `output_path`.

    Args:
        url: Web address of the file to fetch.
        output_path: Local destination handed to `wget.download`.
    """
    print(f'Pulling data from {url} to {output_path}')
    wget.download(url, output_path)
    print('done')

In [3]:
def recreate_folder(path):
    """Deletes a folder (if present) and recreates it empty.

    Args:
        path: Folder to wipe and recreate. It does not need to exist yet;
          missing parent folders are created as well.
    """
    # `shutil.rmtree` raises FileNotFoundError on a missing folder, which made
    # the very first run (nothing to delete yet) fail.
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path)

In [4]:
def unzip(zip_path, output_file, data_folder):
    """Extract an archive and verify that the expected file was produced.

    Args:
        zip_path: Path of the archive to extract.
        output_file: Path that must exist once extraction has finished.
        data_folder: Folder the archive is extracted into.

    Raises:
        ValueError: If `output_file` does not exist after extraction.
    """
    print(f'Unzipping file: {zip_path}')
    archive = pyunpack.Archive(zip_path)
    archive.extractall(data_folder)

    # Extraction counts as successful only if the expected file is on disk.
    if os.path.exists(output_file):
        return
    raise ValueError(f'Error in unzipping process! {output_file} not found.')

In [5]:
def download_and_unzip(url, zip_path, csv_path, data_folder):
    """Fetch a zipped csv file from the web and extract it.

    Args:
        url: Web address of the zip archive.
        zip_path: Local path the archive is downloaded to.
        csv_path: Path where the extracted csv file is expected.
        data_folder: Folder the archive is extracted into.
    """
    download_from_url(url=url, output_path=zip_path)
    unzip(zip_path=zip_path, output_file=csv_path, data_folder=data_folder)
    print('Done.')

In [6]:
def serialize_structure(structure):
    """Join the tokens of a 1d structure into a single string.

    Args:
        structure: Array-like of tokens; must be one-dimensional.

    Returns:
        The `str()` of every token, joined by `domains.SEP_TOKEN`.

    Raises:
        NotImplementedError: If `structure` is not one-dimensional.
    """
    # NOTE(review): `domains` is not imported in the visible part of this
    # file; confirm it is in scope before calling this function.
    tokens = np.asarray(structure)
    dim = tokens.ndim
    if dim != 1:
        raise NotImplementedError(f'`structure` must be 1d but is {dim}d!')
    return domains.SEP_TOKEN.join(map(str, tokens))

In [7]:
def serialize_structures(structures, **kwargs):
    """Serialize every structure of an iterable, one string per structure.

    Args:
        structures: Iterable of structures.
        **kwargs: Named arguments forwarded to `serialize_structure`.

    Returns:
        A list with the serialized form of each structure, in input order.
    """
    serialized = []
    for structure in structures:
        serialized.append(serialize_structure(structure, **kwargs))
    return serialized

In [8]:
def serialize_population_frame(frame, inplace=False, domain=None):
    """Turns the structures of a population frame into plain-text strings.

    Args:
        frame: A `pd.DataFrame` produced by `Population.to_frame`; its
          `structure` column is read and overwritten.
        inplace: If true, `frame` itself is modified; otherwise a copy is
          modified and `frame` is left untouched.
        domain: An optional domain for decoding structures. If truthy, a
          `decoded_structure` column is added that holds the serialized
          result of `domain.decode(..., as_str=False)`.

    Returns:
        The `pd.DataFrame` that carries the serialized structures (`frame`
        itself when `inplace` is true, a copy otherwise).
    """
    target = frame if inplace else frame.copy()
    if domain:
        # Decode first: the raw `structure` column is overwritten below.
        decoded = domain.decode(target['structure'], as_str=False)
        target['decoded_structure'] = serialize_structures(decoded)
    target['structure'] = serialize_structures(target['structure'])
    return target

In [9]:
def population_frame_to_csv(frame, path_or_buf=None, domain=None, index=False, **kwargs):
    """Serializes a population `pd.DataFrame` and writes it as a csv table.

    Args:
        frame: A `pd.DataFrame` produced by `Population.to_frame`.
        path_or_buf: File path or object handed to `DataFrame.to_csv`. If
          `None`, the csv text is returned instead of being written.
        domain: An optional domain for decoding structures.
        index: Whether to store the index of `frame`.
        **kwargs: Named arguments passed to `frame.to_csv`.

    Returns:
        Whatever `DataFrame.to_csv` returns: the csv text when `path_or_buf`
        is `None`, otherwise `None`.

    Raises:
        ValueError: If `frame` is empty.
    """
    if frame.empty:
        raise ValueError('Cannot write empty population frame to CSV file!')
    # `frame` is not modified: serialization works on a copy by default.
    serialized = serialize_population_frame(frame, domain=domain)
    csv_text = serialized.to_csv(path_or_buf, index=index, **kwargs)
    return csv_text

In [10]:
def to_csv(self, path, domain=None):
    """Writes this population to a CSV file.

    NOTE(review): takes `self` but is defined outside of any visible class;
    presumably meant to be attached to a population class -- confirm.

    Args:
      path: The output file path.
      domain: An optional `domains.Domain`. If provided, will also store
        decoded structures in the CSV file.
    """
    frame = self.to_frame()
    population_frame_to_csv(frame, path, domain=domain)

In [11]:
# download location of the zipped electricity dataset (LD2011_2014.txt.zip) in the UCI repository

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip'

In [12]:
! ls -l /home/developer/gcp/cbidmltsf/datasets/electricity

total 2454480
-rw-rw-r-- 1 developer developer 1087589509 ago 23 07:37 hourly_electricity_complete.pkl
-rw-rw-r-- 1 developer developer  208129432 ago  9 10:38 hourly_electricity.csv
-rw-rw-r-- 1 developer developer  245317493 ago 20 11:37 hourly_electricity_filtered.pkl
-rw-rw-r-- 1 developer developer  710998915 ago  9 09:56 LD2011_2014.txt
-rw-rw-r-- 1 developer developer  261335609 ago  9 09:56 LD2011_2014.txt.zip


In [13]:
# local folder holding the downloaded archive, the extracted text file and the derived datasets
data_folder = '/home/developer/gcp/cbidmltsf/datasets/electricity'

In [14]:
# the archive sits next to the extracted text file, with '.zip' appended to its name
csv_path = os.path.join(data_folder, 'LD2011_2014.txt')
zip_path = f'{csv_path}.zip'

In [15]:
# un-comment the following line to pull the dataset from UCI repository
# download_and_unzip(url, zip_path, csv_path, data_folder)

In [16]:
# aggregating to hourly data

In [17]:
# the raw file is ';'-separated and uses ',' as decimal mark; its first column becomes the index
df = pd.read_csv(csv_path, index_col=0, sep=';', decimal=',')

In [18]:
# parse the index values into timestamps so the frame can be resampled by time
df.index = pd.to_datetime(df.index)

In [19]:
# sort the rows by timestamp, modifying `df` in place
df.sort_index(inplace=True)

In [20]:
# average the readings within each hour; zeros are turned into NaN so that the
# first and last valid value of each series can be used to determine its start and end dates
output = df.resample('1h').mean().replace(0., np.nan)

In [21]:
# preview of the hourly frame (one column per customer)
output

Unnamed: 0,MT_001,MT_002,MT_003,MT_004,MT_005,MT_006,MT_007,MT_008,MT_009,MT_010,...,MT_361,MT_362,MT_363,MT_364,MT_365,MT_366,MT_367,MT_368,MT_369,MT_370
2011-01-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,
2011-01-01 01:00:00,,,,,,,,,,,...,,,,,,,,,,
2011-01-01 02:00:00,,,,,,,,,,,...,,,,,,,,,,
2011-01-01 03:00:00,,,,,,,,,,,...,,,,,,,,,,
2011-01-01 04:00:00,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-12-31 20:00:00,2.220812,25.248933,1.737619,186.483740,92.073171,340.773810,11.305822,315.656566,91.783217,81.451613,...,333.511777,39700.0,1702.531646,2238.636364,74.967405,4.388531,375.768218,108.931553,688.416422,8405.405405
2014-12-31 21:00:00,2.538071,22.759602,1.737619,162.093496,86.280488,319.940476,11.588468,269.360269,76.486014,70.161290,...,327.266238,38575.0,1649.789030,1477.272727,74.967405,3.949678,465.539947,154.841402,662.023460,8283.783784
2014-12-31 22:00:00,1.903553,22.048364,1.737619,161.077236,86.890244,314.732143,11.305822,251.683502,71.678322,72.311828,...,306.209850,35475.0,1636.075949,1375.000000,64.211213,7.753072,655.179982,195.325543,679.252199,7594.594595
2014-12-31 23:00:00,2.220812,21.337127,1.737619,161.585366,83.841463,308.035714,10.740531,250.841751,64.685315,72.580645,...,271.948608,28075.0,1546.413502,1232.954545,28.357236,7.314219,676.031607,161.519199,659.274194,6932.432432


In [22]:
# first timestamp of the hourly index; reference point for the '*_from_start' features
earliest_time = output.index.min()
earliest_time

Timestamp('2011-01-01 00:00:00', freq='H')

### features of the dataset to be included in the time series

In [23]:
# one dataframe per customer (column of `output`), restricted to the
# customer's active range and extended with calendar/position features
df_list = []

for label in output:
    # print('Processing {}'.format(label))
    srs = output[label]

    # first and last hour holding a valid (non-NaN) reading; replaces the
    # deprecated `fillna(method='ffill'/'bfill')` + `dropna()` round trip
    start_date = srs.first_valid_index()
    end_date = srs.last_valid_index()
    if start_date is None:
        # no valid reading at all: there is no active range to keep
        continue

    # keep only the active range; gaps inside of it count as zero consumption
    active_range = (srs.index >= start_date) & (srs.index <= end_date)
    srs = srs[active_range].fillna(0.)

    # TARGET VARIABLE
    # active power consumption; it has to be Z-normalized by each customer
    tmp = pd.DataFrame({'power_usage': srs})
    date = tmp.index

    # customer identifier, just the integer token to be used in a learned embedding
    # (the last three characters of the column label, e.g. 'MT_001' -> 1)
    tmp['token_id'] = int(label[-3:])

    # removed (unused)
    # tmp['categorical_id'] = label
    # tmp['id'] = label

    # timestamp is required for predictions analysis;
    # it can be passed as a string variable to avoid clock inconsistencies across systems
    tmp['date'] = date

    # FEATURES FOR GLOBAL POSITION ENCODING
    # time elapsed since the first timestamp of the whole dataset, computed once
    elapsed = date - earliest_time
    tmp['hours_from_start'] = elapsed.seconds / 60 / 60 + elapsed.days * 24
    tmp['days_from_start'] = elapsed.days

    # FEATURES FOR POSITION ENCODING
    tmp['hour_of_day'] = date.hour
    tmp['day_of_week'] = date.dayofweek
    tmp['day_of_month'] = date.day
    # add day of year to comply with DeepAR categorical features
    tmp['day_of_year'] = date.dayofyear

    # DO NOT USE WEEK OF YEAR, UNLESS IT IS CYCLED WITH SINE-COSINE FUNCTIONS!
    # (JAN-01 MIGHT LIE IN WEEK 52, AND ALSO DEC-31 MIGHT LIE IN WEEK 01)
    # `DatetimeIndex.week` was removed in pandas 2.0; `isocalendar().week`
    # yields the same ISO week numbers, converted here to a plain int64 array
    tmp['week_of_year'] = date.isocalendar().week.to_numpy(dtype='int64')
    tmp['month_of_year'] = date.month

    df_list.append(tmp)

In [24]:
# number of per-customer dataframes collected in df_list
len(df_list)

370

In [25]:
# stack the per-customer frames on top of each other and replace the
# timestamp index by a fresh 0..n-1 index (timestamps stay in the 'date' column)
output = pd.concat(df_list, axis=0, join='outer', ignore_index=True)
output

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
0,3.172589,1,2012-01-01 00:00:00,8760.0,365,0,6,1,1,52,1
1,4.124365,1,2012-01-01 01:00:00,8761.0,365,1,6,1,1,52,1
2,4.758883,1,2012-01-01 02:00:00,8762.0,365,2,6,1,1,52,1
3,4.441624,1,2012-01-01 03:00:00,8763.0,365,3,6,1,1,52,1
4,4.758883,1,2012-01-01 04:00:00,8764.0,365,4,6,1,1,52,1
...,...,...,...,...,...,...,...,...,...,...,...
10464243,8405.405405,370,2014-12-31 20:00:00,35060.0,1460,20,2,31,365,1,12
10464244,8283.783784,370,2014-12-31 21:00:00,35061.0,1460,21,2,31,365,1,12
10464245,7594.594595,370,2014-12-31 22:00:00,35062.0,1460,22,2,31,365,1,12
10464246,6932.432432,370,2014-12-31 23:00:00,35063.0,1460,23,2,31,365,1,12


In [26]:
# removed unused column 't'
# output['hours_from_start'] = output['t'].copy()
# output            

In [27]:
# smallest and largest values of 'days_from_start' in the dataset;
# the vectorized Series methods avoid a Python-level pass over every row
output['days_from_start'].min(), output['days_from_start'].max()

(0, 1461)

In [29]:
# un-comment the following line to persist the complete dataset before filtering
# output.to_pickle('{}/hourly_electricity_complete.pkl'.format(data_folder))

### the final dataframe, prior to being filtered by interval

In [2]:
# local folder holding the persisted dataset files
data_folder = '/home/developer/gcp/cbidmltsf/datasets/electricity'

In [3]:
# reload the complete (unfiltered) dataset from its pickle file
# NOTE: unpickling can execute code stored in the file; only load pickles you created yourself
output = pd.read_pickle(f'{data_folder}/hourly_electricity_complete.pkl')

In [30]:
# columns available in the complete dataset
output.columns

Index(['power_usage', 'token_id', 'date', 'hours_from_start',
       'days_from_start', 'hour_of_day', 'day_of_week', 'day_of_month',
       'day_of_year', 'week_of_year', 'month_of_year'],
      dtype='object')

### filtering the dataset

In [31]:
# filter to match range used by other academic papers:
# keep days 1096 (included) to 1346 (excluded) counted from the start of the dataset
filtered_output = output.loc[
    output['days_from_start'].ge(1096) & output['days_from_start'].lt(1346)
].copy()

In [32]:
# preview of the dataset restricted to the selected interval
filtered_output

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
17544,2.538071,1,2014-01-01 00:00:00,26304.0,1096,0,2,1,1,1,1
17545,2.855330,1,2014-01-01 01:00:00,26305.0,1096,1,2,1,1,1,1
17546,2.855330,1,2014-01-01 02:00:00,26306.0,1096,2,2,1,1,1,1
17547,2.855330,1,2014-01-01 03:00:00,26307.0,1096,3,2,1,1,1,1
17548,2.538071,1,2014-01-01 04:00:00,26308.0,1096,4,2,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
10461482,20824.324324,370,2014-09-07 19:00:00,32299.0,1345,19,6,7,250,36,9
10461483,19527.027027,370,2014-09-07 20:00:00,32300.0,1345,20,6,7,250,36,9
10461484,20202.702703,370,2014-09-07 21:00:00,32301.0,1345,21,6,7,250,36,9
10461485,19851.351351,370,2014-09-07 22:00:00,32302.0,1345,22,6,7,250,36,9


In [9]:
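# un-comment the following line to serialize the dataset as CSV text (according to TFT process)
# NOTE(review): this writes the unfiltered `output`; use `filtered_output` instead if the
# filtered interval is the intended final dataset -- confirm
# output.to_csv('/home/developer/gcp/cbidmltsf/datasets/electricity/data.csv')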

### analysis of some of the time series

In [33]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [76]:
# NOTE(review): `output_file`, `save` and `d3` usage -- only `figure`, `show` and `d3`
# are used by the visible cells; `output_file` and `save` may be needed further below
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
from bokeh.palettes import d3
# direct Bokeh output to the notebook, so `show` renders the plots inline
output_notebook()

In [96]:
# raw rows of the filtered dataset, keyed by individual customer token_id
data = {}

In [97]:
# MinMax-scaled power usage (output of MinMaxScaler.fit_transform), keyed by token_id
min_max = {}
# standardized power usage (output of StandardScaler.fit_transform), keyed by token_id
standard = {}

In [98]:
# Bokeh figures, keyed by a short label that names the plotted variant
plots = {}

In [99]:
# customers to analyze: token ids 1 to 20 (as NumPy integers)
token_ids = list(np.arange(1, 21))

In [100]:
for token_id in token_ids:

    # rows of the filtered dataset that belong to this customer
    data[token_id] = filtered_output.loc[filtered_output['token_id'] == token_id]

    # power usage as a (?, 1) column vector, the 2-D layout passed to the scalers
    series_array = data[token_id]['power_usage'].to_numpy(copy=True).reshape(-1, 1)

    # fit a fresh MinMaxScaler on this customer only and keep the scaled values
    min_max_scaler = MinMaxScaler()
    min_max[token_id] = min_max_scaler.fit_transform(series_array)

    # fit a fresh StandardScaler on this customer only and keep the scaled values
    standard_scaler = StandardScaler()
    standard[token_id] = standard_scaler.fit_transform(series_array)


In [104]:
# plot original time series, one line per customer
label = 'original'

# `width`/`height` are used instead of `plot_width`/`plot_height`,
# which were removed in Bokeh 3.0
plots[label] = figure(
    x_axis_type='datetime',
    width=960,
    height=400,
    title='Original Time Series'
)

plots[label].grid.grid_line_alpha = 0.3

plots[label].xaxis.axis_label = 'Date'
plots[label].yaxis.axis_label = 'Active Power [KW]'

for index, token_id in enumerate(token_ids):
    plots[label].line(data[token_id].date,
                      data[token_id].power_usage,
                      # cycle the 10 values of the color palette
                      color=d3['Category10'][10][index % 10],
                      legend_label='MT_{:03d}'.format(token_id))

show(plots[label])

In [105]:
# plot MinMax normalized time series, one line per customer
label = 'min_max'

# `width`/`height` are used instead of `plot_width`/`plot_height`,
# which were removed in Bokeh 3.0
plots[label] = figure(
    x_axis_type='datetime',
    width=960,
    height=400,
    title='MinMax Normalized Time Series'
)

plots[label].grid.grid_line_alpha = 0.3

plots[label].xaxis.axis_label = 'Date'
plots[label].yaxis.axis_label = 'MinMax Normalized Active Power'

for index, token_id in enumerate(token_ids):
    plots[label].line(data[token_id].date,
                      # remove 1-valued dimension
                      np.squeeze(min_max[token_id]),
                      # cycle the 10 values of the color palette
                      color=d3['Category10'][10][index % 10],
                      legend_label='MT_{:03d}'.format(token_id))

show(plots[label])

In [106]:
# plot standardized time series, one line per customer
label = 'standard'

# `width`/`height` are used instead of `plot_width`/`plot_height`,
# which were removed in Bokeh 3.0
plots[label] = figure(
    x_axis_type='datetime',
    width=960,
    height=400,
    title='Standardized Time Series (Z-score)'
)

plots[label].grid.grid_line_alpha = 0.3

plots[label].xaxis.axis_label = 'Date'
plots[label].yaxis.axis_label = 'Standardized Active Power'

for index, token_id in enumerate(token_ids):
    plots[label].line(data[token_id].date,
                      # remove 1-valued dimension
                      np.squeeze(standard[token_id]),
                      # cycle the 10 values of the color palette
                      color=d3['Category10'][10][index % 10],
                      legend_label='MT_{:03d}'.format(token_id))

show(plots[label])