In [1]:
import numpy as np
import pandas as pd

In [2]:
from statistics import mean

In [3]:
# local folder that holds the electricity dataset files
# (absolute, machine-specific path: adjust it when running on another machine)
data_folder = '/home/developer/gcp/cbidmltsf/datasets/electricity'

In [4]:
! ls -l /home/developer/gcp/cbidmltsf/datasets/electricity

total 2258164
-rw-rw-r-- 1 developer developer 920855104 ago 23 16:54 hourly_electricity_complete.pkl
-rw-rw-r-- 1 developer developer 208129432 ago  9 10:38 hourly_electricity.csv
-rw-rw-r-- 1 developer developer 211016244 sep 29 13:27 hourly_electricity_filtered_academic_papers.pkl
-rw-rw-r-- 1 developer developer 710998915 ago  9 09:56 LD2011_2014.txt
-rw-rw-r-- 1 developer developer 261335609 ago  9 09:56 LD2011_2014.txt.zip


In [5]:
# load the dataframe that was already filtered to the date range
# used by other academic papers
filtered_output = pd.read_pickle(
    '{}/hourly_electricity_filtered_academic_papers.pkl'.format(data_folder)
)

In [6]:
# preview the filtered dataframe (a bare last expression is rendered by the notebook's rich display)
filtered_output

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
17544,2.538071,1,2014-01-01 00:00:00,26304.0,1096,0,2,1,1,1,1
17545,2.855330,1,2014-01-01 01:00:00,26305.0,1096,1,2,1,1,1,1
17546,2.855330,1,2014-01-01 02:00:00,26306.0,1096,2,2,1,1,1,1
17547,2.855330,1,2014-01-01 03:00:00,26307.0,1096,3,2,1,1,1,1
17548,2.538071,1,2014-01-01 04:00:00,26308.0,1096,4,2,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
10461482,20824.324324,370,2014-09-07 19:00:00,32299.0,1345,19,6,7,250,36,9
10461483,19527.027027,370,2014-09-07 20:00:00,32300.0,1345,20,6,7,250,36,9
10461484,20202.702703,370,2014-09-07 21:00:00,32301.0,1345,21,6,7,250,36,9
10461485,19851.351351,370,2014-09-07 22:00:00,32302.0,1345,22,6,7,250,36,9


### Analysis of the time series

In [7]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [8]:
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
from bokeh.layouts import row, gridplot, layout
from bokeh.palettes import d3
# configure bokeh so that show() renders the figures inline in the notebook
output_notebook()

In [9]:
# per-customer raw data, keyed by the customer identifier
data = {}

In [10]:
# per-customer power usage after MinMaxScaler (one scaled array per customer identifier)
min_max = {}
# per-customer power usage after StandardScaler (one scaled array per customer identifier)
standard = {}

In [11]:
# figures created along the notebook, keyed by a short label
plots = {}

In [12]:
# a handful of individual customers used to preview normalization and standardization
start, end = 320, 330
# inclusive range of customer ids: start, start + 1, ..., end
token_ids = list(np.arange(start, end + 1))

In [13]:
for token_id in token_ids:

    identifier = 'MT_{:03d}'.format(token_id)

    # keep the rows of this customer as its raw data
    data[identifier] = filtered_output.loc[filtered_output['token_id'] == token_id]

    # power usage as a single-column 2-D array, shape (n_samples, 1)
    series_array = np.array(data[identifier]['power_usage'])[:, np.newaxis]

    # a fresh MinMaxScaler fitted on this customer only; store the rescaled series
    min_max_scaler = MinMaxScaler()
    min_max[identifier] = min_max_scaler.fit(series_array).transform(series_array)

    # a fresh StandardScaler fitted on this customer only; store the standardized series
    standard_scaler = StandardScaler()
    standard[identifier] = standard_scaler.fit(series_array).transform(series_array)

In [14]:
# original (unscaled) time series of every selected customer, overlaid in one figure
label = 'original'

plots[label] = figure(
    title='Original Time Series',
    x_axis_type='datetime',
    plot_width=960,
    plot_height=400,
)

# axis captions and a faint grid
plots[label].xaxis.axis_label = 'Date'
plots[label].yaxis.axis_label = 'Active Power [KW]'
plots[label].grid.grid_line_alpha = 0.3

# the palette holds 10 colors, so the color index wraps around with the modulo
palette = d3['Category10'][10]

for index, token_id in enumerate(token_ids):
    identifier = 'MT_{:03d}'.format(token_id)
    plots[label].line(
        data[identifier].date,
        data[identifier].power_usage,
        color=palette[index % 10],
        legend_label=identifier,
    )

show(plots[label])

In [15]:
# MinMax normalized time series of every selected customer, overlaid in one figure
label = 'min_max'

plots[label] = figure(
    title='MinMax Normalized Time Series',
    x_axis_type='datetime',
    plot_width=960,
    plot_height=400,
)

# axis captions and a faint grid
plots[label].xaxis.axis_label = 'Date'
plots[label].yaxis.axis_label = 'MinMax Normalized Active Power'
plots[label].grid.grid_line_alpha = 0.3

# the palette holds 10 colors, so the color index wraps around with the modulo
palette = d3['Category10'][10]

for index, token_id in enumerate(token_ids):
    identifier = 'MT_{:03d}'.format(token_id)
    plots[label].line(
        data[identifier].date,
        # the scaled series is stored as (n, 1); drop the 1-valued dimension
        np.squeeze(min_max[identifier]),
        color=palette[index % 10],
        legend_label=identifier,
    )

show(plots[label])

In [16]:
# standardized (z-score) time series of every selected customer, overlaid in one figure
label = 'standard'

plots[label] = figure(
    title='Standardized Time Series (Z-score)',
    x_axis_type='datetime',
    plot_width=960,
    plot_height=400,
)

# axis captions and a faint grid
plots[label].xaxis.axis_label = 'Date'
plots[label].yaxis.axis_label = 'Standardized Active Power'
plots[label].grid.grid_line_alpha = 0.3

# the palette holds 10 colors, so the color index wraps around with the modulo
palette = d3['Category10'][10]

for index, token_id in enumerate(token_ids):
    identifier = 'MT_{:03d}'.format(token_id)
    plots[label].line(
        data[identifier].date,
        # the scaled series is stored as (n, 1); drop the 1-valued dimension
        np.squeeze(standard[identifier]),
        color=palette[index % 10],
        legend_label=identifier,
    )

show(plots[label])

### Line plot and histogram for individual customers

In [17]:
def plot_line(title, x, y, width=720, height=240):
    """Build a bokeh figure with a single red line of ``y`` against ``x``.

    The x-axis is a datetime axis labelled 'Date'; the y-axis is labelled
    'Active Power [KW]'.

    Args:
        title: title shown on top of the figure.
        x: values for the horizontal (datetime) axis.
        y: values for the vertical axis.
        width: figure width, passed as ``plot_width``.
        height: figure height, passed as ``plot_height``.

    Returns:
        The configured figure; it is not shown here.
    """
    line_figure = figure(
        x_axis_type='datetime',
        title=title,
        plot_width=width,
        plot_height=height,
    )

    # faint grid and axis captions
    line_figure.grid.grid_line_alpha = 0.3
    line_figure.xaxis.axis_label = 'Date'
    line_figure.yaxis.axis_label = 'Active Power [KW]'

    line_figure.line(x, y, color='red')

    return line_figure

In [18]:
def plot_histogram(title, histogram, edges):
    """Build a 240x240 bokeh figure that draws a histogram as a row of bars.

    Args:
        title: title shown on top of the figure.
        histogram: bar heights, one per bin.
        edges: bin edges; ``edges[:-1]`` are the left sides of the bars and
            ``edges[1:]`` the right sides.

    Returns:
        The configured figure; it is not shown here.
    """
    histogram_figure = figure(
        title=title,
        plot_width=240,
        plot_height=240,
        tools='',
        background_fill_color="#fafafa",
    )

    # one rectangle per bin, standing on the horizontal axis
    histogram_figure.quad(
        top=histogram,
        bottom=0,
        left=edges[:-1],
        right=edges[1:],
        fill_color="navy",
        line_color="white",
        alpha=0.5,
    )

    # the vertical range starts at zero so the bars stand on the axis
    histogram_figure.y_range.start = 0

    histogram_figure.xaxis.axis_label = 'x'
    histogram_figure.yaxis.axis_label = 'Pr(x)'
    histogram_figure.grid.grid_line_color = "white"

    return histogram_figure


In [19]:
# line figures per customer, keyed by the customer identifier
lines = {}

In [20]:
# histogram figures per customer, keyed by the customer identifier
histograms = {}

In [21]:
# one layout row per customer: [line figure, histogram figure]
rows_list = []

for token_id in token_ids:
    identifier = 'MT_{:03d}'.format(token_id)

    # time series of the customer as a line
    lines[identifier] = plot_line(
        identifier,
        data[identifier].date,
        data[identifier].power_usage,
    )

    # distribution of the power usage as a 50-bin density histogram
    histogram, edges = np.histogram(
        data[identifier].power_usage,
        bins=50,
        density=True,
    )
    histograms[identifier] = plot_histogram(identifier, histogram, edges)

    rows_list.append([lines[identifier], histograms[identifier]])

In [22]:
# gridplot_list = list()
# show(gridplot(gridplot_list, ncols=5, plot_width=180, plot_height=180, toolbar_location=None))

In [23]:
# arrange the per-customer [line, histogram] rows in a single layout and display it
show(layout(rows_list))