In [1]:
import numpy as np
import pandas as pd

In [2]:
from scipy.stats import zscore

In [3]:
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
from bokeh.layouts import row, gridplot, layout
from bokeh.palettes import d3
output_notebook()

In [4]:
def plot_line(title, x, y, width=720, height=240):
    """Build a red line chart of ``y`` against ``x`` on a datetime x-axis.

    Args:
        title: text used as the figure title.
        x: x-coordinates handed to the line glyph (the axis is datetime-typed).
        y: y-coordinates handed to the line glyph.
        width: forwarded to ``figure`` as ``plot_width``.
        height: forwarded to ``figure`` as ``plot_height``.

    Returns:
        The configured figure. Nothing is displayed here; the caller is
        expected to pass the result to ``show``.
    """
    # NOTE(review): newer Bokeh releases appear to name these options
    # `width`/`height` -- confirm against the installed version before upgrading.
    figure_options = dict(
        title=title,
        plot_width=width,
        plot_height=height,
        x_axis_type='datetime',
    )
    chart = figure(**figure_options)

    # keep the grid faint so that the series itself stands out
    chart.grid.grid_line_alpha = 0.3

    chart.xaxis.axis_label = 'Date'
    chart.yaxis.axis_label = 'Active Power [KW]'

    chart.line(x, y, color='red')

    return chart

In [5]:
pd.set_option('display.max_rows', 400)

In [6]:
PROJECT_ROOT = '/home/developer/gcp/cbidmltsf'

In [7]:
raw_ts_folder = '{}/datasets/electricity/separated_raw'.format(PROJECT_ROOT)

In [8]:
stats_columns = [
    'customer_id',
    'count',
    'mean',
    'std',
    'min',
    'q25',
    'q50',
    'q75',
    'max',
    'zeros',
    'out-q_005',
    'out-z_3_5'
]

In [9]:
# first and last customer number to analyze (both inclusive)
start = 1
end = 370

# identifiers are the customer number zero-padded to three digits, e.g. 'MT_001'
customer_ids = list(map('MT_{:03d}'.format, range(start, end + 1)))

In [10]:
# one row of statistics per customer is appended to this list by the loop
# over customer_ids
stats_list = list()

# tail probability of the quantile-based outlier band: the band is built from
# the alpha and (1 - alpha) quantiles of each series
alpha = 0.005
# the lower quantile is divided by this factor and the upper quantile is
# multiplied by it before counting the values that fall outside the band
factor = 1.25

# [low, high] z-score limits: values with a z-score below `low` or above
# `high` are counted as outliers
z_threshold = [-3, 5]

# raw time series (DataFrames read from pickle), keyed by customer_id
ts = dict()

In [11]:
# start from an empty list so that re-running this cell does not append
# duplicates of the rows collected by a previous run
stats_list = list()

for customer_id in customer_ids:

    # build a path to the persisted time series
    ts_path = '{}/{}.pkl'.format(raw_ts_folder, customer_id)
    # read the pickle file (NOTE: unpickling executes code, only load trusted files)
    ts[customer_id] = pd.read_pickle(ts_path)

    # the power column is the only one analyzed below, so look it up once
    power_usage = ts[customer_id]['power_usage']

    # the row starts with customer_id, followed by the stats description
    # of the time series (count, mean, std, min, q25, q50, q75, max)
    row_list = [customer_id] + list(power_usage.describe())

    # add count of zero-values
    row_list.append(int(np.sum(power_usage == 0)))

    # add outliers given lower and upper bounds based on the alpha quantile;
    # dividing/multiplying by `factor` widens the band as long as the
    # quantiles are positive
    lower_bound = power_usage.quantile(alpha) / factor
    upper_bound = power_usage.quantile(1 - alpha) * factor

    out_q = int(np.sum(power_usage < lower_bound) + np.sum(power_usage > upper_bound))
    row_list.append(out_q)

    # add outliers given a z-score threshold; the z-scores are computed once
    # and skipped for an empty series, where mean and standard deviation are
    # undefined and there is nothing to count anyway
    if len(power_usage) > 0:
        z_scores = zscore(power_usage)
        out_z = int(np.sum(z_scores < z_threshold[0]) + np.sum(z_scores > z_threshold[1]))
    else:
        out_z = 0
    row_list.append(out_z)

    stats_list.append(row_list)

# one row per customer, columns named after stats_columns
stats_df = pd.DataFrame(stats_list, columns=stats_columns)

  mns = a.mean(axis=axis, keepdims=True)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret, rcount, out=ret, casting='unsafe', subok=False)


In [12]:
stats_df = stats_df.set_index('customer_id')

In [13]:
stats_df

Unnamed: 0_level_0,count,mean,std,min,q25,q50,q75,max,zeros,out-q_005,out-z_3_5
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
MT_001,6000.0,4.656885,5.84207,0.0,1.269036,1.903553,3.489848,34.581218,28,29,1
MT_002,6000.0,27.472588,5.890997,0.0,23.648649,27.560455,32.00569,45.874822,1,23,30
MT_003,6000.0,1.715356,0.297864,0.0,1.737619,1.737619,1.737619,3.475239,71,0,84
MT_004,6000.0,120.573001,36.90281,0.0,96.036585,109.756098,136.686992,292.174797,1,1,1
MT_005,6000.0,50.958384,18.275798,0.0,36.585366,48.170732,61.585366,128.04878,1,1,0
MT_006,6000.0,183.387773,60.770527,0.0,137.64881,173.363095,215.77381,462.053571,1,1,1
MT_007,6000.0,6.647023,6.825371,0.0,3.109101,4.098361,6.500848,43.951385,1,4,14
MT_008,6000.0,248.884259,53.466133,0.0,207.070707,247.474747,283.670034,475.589226,1,2,1
MT_009,6000.0,48.503569,17.134291,0.0,37.587413,44.58042,57.255245,134.178322,1,3,1
MT_010,6000.0,51.048566,19.949703,0.0,37.365591,47.849462,61.290323,134.139785,1,6,0


In [14]:
# how many time series have fewer than the expected 6000 readings?
np.sum(stats_df['count'] < 6000)

20

In [15]:
# locate the 20 incomplete time series
incomplete_time_series = list(stats_df[stats_df['count'] < 6000].index)

In [16]:
# for index_incomplete in incomplete_time_series:
#     print('{}: missing {} hourly lectures.'.\
#           format(index_incomplete,
#                  6000 - stats_df.loc[index_incomplete]['count']))

In [17]:
# how many time series have more than 24 (one day) zero values?
np.sum(stats_df['zeros'] > 24)

10

In [18]:
# locate the 10 time series with more than 24 zero values
hollow_time_series = list(stats_df[stats_df['zeros'] > 24].index)

In [19]:
# for index_hollow in hollow_time_series:
#     print('{} has {} zero-value lectures.'.\
#           format(index_hollow,
#                  stats_df.loc[index_hollow]['zeros']))

In [20]:
# series that are incomplete, hollow, or both: each one listed once, in ascending order
ts_with_issues = sorted(set(incomplete_time_series) | set(hollow_time_series))

In [21]:
len(ts_with_issues)

29

In [22]:
# distribution of missing or zero readings over train, eval, and test datasets

In [23]:
# a dict to manage date ranges
date_range = dict()

In [24]:
date_range['train'] = pd.date_range(
    start=pd.to_datetime('2014-01-01 00:00:00'),
    end=pd.to_datetime('2014-08-07 15:00:00'),
    freq='H')

In [25]:
len(date_range['train'])

5248

In [26]:
date_range['eval'] = pd.date_range(
    start=pd.to_datetime('2014-08-07 16:00:00'),
    end=pd.to_datetime('2014-08-31 23:00:00'),
    freq='H')

In [27]:
len(date_range['eval'])

584

In [28]:
date_range['test'] = pd.date_range(
    start=pd.to_datetime('2014-09-01 00:00:00'),
    end=pd.to_datetime('2014-09-07 23:00:00'),
    freq='H')

In [29]:
len(date_range['test'])

168

In [30]:
dates_missing = dict()
dates_zero = dict()

In [31]:
# the timestamps of each stage do not depend on the customer, so turn them
# into sets once instead of once per customer
stage_dates = {stage: set(date_range[stage]) for stage in ['train', 'eval', 'test']}

for customer_id in ts_with_issues:
    customer_ts = ts[customer_id]

    # both of these are the same for every stage, so build them once per customer
    existing_dates = set(customer_ts['date'])
    dates_zero_values = customer_ts[customer_ts['power_usage'] == 0]['date']
    zero_dates = set(dates_zero_values)

    # make subdictionaries for this customer_id
    dates_missing[customer_id] = dict()
    dates_zero[customer_id] = dict()

    for stage in ['train', 'eval', 'test']:

        # set operation: missing values = all dates (difference) existing dates,
        # stored as a sorted list
        dates_missing[customer_id][stage] = sorted(stage_dates[stage] - existing_dates)

        # set operation: zero values = dates with zero-value (intersection) stage dates,
        # stored as a sorted list
        dates_zero[customer_id][stage] = sorted(zero_dates & stage_dates[stage])

In [32]:
issues_columns = [
    'customer_id',
    'missing_train', 'missing_eval', 'missing_test', 'missing_total',
    'zero_train', 'zero_eval', 'zero_test', 'zero_total',
    'issues_total'
]

In [33]:
# one row per problematic series, laid out in the order of issues_columns
buffer_list = list()

for customer_id in ts_with_issues:

    # number of missing timestamps in each stage
    missing_train, missing_eval, missing_test = (
        len(dates_missing[customer_id][stage]) for stage in ('train', 'eval', 'test')
    )
    missing_total = missing_train + missing_eval + missing_test

    # number of zero-valued timestamps in each stage
    zero_train, zero_eval, zero_test = (
        len(dates_zero[customer_id][stage]) for stage in ('train', 'eval', 'test')
    )
    zero_total = zero_train + zero_eval + zero_test

    row_list = [customer_id]
    row_list += [missing_train, missing_eval, missing_test, missing_total]
    row_list += [zero_train, zero_eval, zero_test, zero_total]
    # grand total of missing plus zero-valued timestamps
    row_list += [missing_total + zero_total]

    buffer_list.append(row_list)

In [34]:
issues_df = pd.DataFrame(buffer_list, columns=issues_columns)

In [35]:
issues_df = issues_df.set_index('customer_id')

In [36]:
customer_id = ts_with_issues[0]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_001,0,0,0,0,28,0,0,28,28


In [37]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [38]:
# use this time series with no preprocessing

In [39]:
customer_id = ts_with_issues[1]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_003,0,0,0,0,61,8,2,71,71


In [40]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [41]:
# use this time series with no preprocessing

In [42]:
customer_id = ts_with_issues[2]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_066,0,0,0,0,1787,0,0,1787,1787


In [43]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [44]:
# trim series starting at 2014-07-15 16:00:00, which gives approx. 3 weeks of training data

In [45]:
customer_id = ts_with_issues[3]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_106,312,0,0,312,1,0,0,1,313


In [46]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [47]:
# use time series as if trimmed starting at 2014-01-14 00:00:00

In [48]:
customer_id = ts_with_issues[4]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_107,312,0,0,312,1,0,0,1,313


In [49]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [50]:
# use time series as if trimmed starting at 2014-01-14 00:00:00

In [51]:
customer_id = ts_with_issues[5]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_108,312,0,0,312,1,0,0,1,313


In [52]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [53]:
# use time series as if trimmed starting at 2014-01-14 00:00:00

In [54]:
customer_id = ts_with_issues[6]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_109,1152,0,0,1152,1,0,0,1,1153


In [55]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [56]:
# use time series as if trimmed starting at 2014-02-18 00:00:00

In [57]:
customer_id = ts_with_issues[7]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_110,312,0,0,312,1,0,0,1,313


In [58]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [59]:
# use time series as if trimmed starting at 2014-01-14 00:00:00

In [60]:
customer_id = ts_with_issues[8]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_111,312,0,0,312,1,0,0,1,313


In [61]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [62]:
# use time series as if trimmed starting at 2014-01-14 00:00:00

In [63]:
customer_id = ts_with_issues[9]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_112,1008,0,0,1008,1,0,0,1,1009


In [64]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [65]:
# use time series as if trimmed starting at 2014-02-12 00:00:00

In [66]:
customer_id = ts_with_issues[10]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_113,312,0,0,312,1,0,0,1,313


In [67]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [68]:
# use time series as if trimmed starting at 2014-01-14 00:00:00

In [69]:
customer_id = ts_with_issues[11]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_115,312,0,0,312,4,0,0,4,316


In [70]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [71]:
# use time series as if trimmed starting at 2014-01-14 00:00:00

In [72]:
customer_id = ts_with_issues[12]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_116,1152,0,0,1152,1,0,0,1,1153


In [73]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [74]:
# use time series as if trimmed starting at 2014-02-18 00:00:00

In [75]:
customer_id = ts_with_issues[13]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_117,312,0,0,312,1,0,0,1,313


In [76]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [77]:
# use time series as if trimmed starting at 2014-01-14 00:00:00

In [78]:
customer_id = ts_with_issues[14]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_120,312,0,0,312,1,0,0,1,313


In [79]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [80]:
# use time series as if trimmed starting at 2014-01-14 00:00:00

In [81]:
customer_id = ts_with_issues[15]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_121,312,0,0,312,1,0,0,1,313


In [82]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [83]:
# use time series as if trimmed starting at 2014-01-14 00:00:00

In [84]:
customer_id = ts_with_issues[16]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_122,312,0,0,312,1,0,0,1,313


In [85]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [86]:
# use time series as if trimmed starting at 2014-01-14 00:00:00

In [87]:
customer_id = ts_with_issues[17]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_127,0,0,0,0,364,194,14,572,572


In [88]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [89]:
# use this time series with no preprocessing

In [90]:
customer_id = ts_with_issues[18]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_130,0,0,0,0,2875,300,94,3269,3269


In [91]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [92]:
# use this time series with no preprocessing

In [93]:
customer_id = ts_with_issues[19]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_131,0,0,0,0,2417,237,64,2718,2718


In [94]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [95]:
# use this time series with no preprocessing

In [96]:
customer_id = ts_with_issues[20]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_132,0,0,0,0,2449,251,74,2774,2774


In [97]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [98]:
# use this time series with no preprocessing

In [99]:
customer_id = ts_with_issues[21]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_133,1720,0,0,1720,1176,250,49,1475,3195


In [100]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [101]:
# use time series as if trimmed starting at 2014-03-13 16:00:00

In [102]:
customer_id = ts_with_issues[22]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_160,816,0,0,816,0,0,0,0,816


In [103]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [104]:
# use time series as if trimmed starting at 2014-02-04 00:00:00

In [105]:
customer_id = ts_with_issues[23]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_178,4752,0,0,4752,0,0,0,0,4752


In [106]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [107]:
# use time series as if trimmed starting at 2014-07-18 00:00:00

In [108]:
customer_id = ts_with_issues[24]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_181,1512,0,0,1512,0,0,0,0,1512


In [109]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [110]:
# use time series as if trimmed starting at 2014-03-05 00:00:00

In [111]:
customer_id = ts_with_issues[25]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_223,5248,584,168,6000,0,0,0,0,6000


In [112]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [113]:
# DO NOT USE THIS TIME SERIES

In [114]:
customer_id = ts_with_issues[26]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_337,384,0,0,384,0,0,0,0,384


In [115]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [116]:
# use time series as if trimmed starting at 2014-01-17 00:00:00

In [117]:
customer_id = ts_with_issues[27]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_347,0,0,0,0,1443,119,0,1562,1562


In [118]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [119]:
# use time series as if trimmed starting at 2014-02-28 00:00:00

In [120]:
customer_id = ts_with_issues[28]
issues_df.loc[[customer_id]]

Unnamed: 0_level_0,missing_train,missing_eval,missing_test,missing_total,zero_train,zero_eval,zero_test,zero_total,issues_total
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MT_348,0,0,0,0,2570,356,103,3029,3029


In [121]:
show(plot_line(customer_id, ts[customer_id]['date'], ts[customer_id]['power_usage']))

In [122]:
# use this time series with no preprocessing

In [142]:
# position in date_range['train'] of the earliest timestamp used as a window start
first_available_index = 4752

# every window covers 168 + 168 consecutive hourly timestamps
# (presumably two one-week segments -- TODO confirm what each half represents)
window_length = 168 + 168

# the last admissible start still leaves room for a complete window inside
# the training range
last_starting_point = len(date_range['train']) - window_length

counter = 0
for starting_point in range(first_available_index, last_starting_point + 1):
    counter += 1
    # first and last timestamp of the window that starts here
    print(date_range['train'][starting_point], date_range['train'][starting_point + window_length - 1])

# number of windows that fit between the first available index and the end
# of the training range
print(counter)

2014-07-18 00:00:00 2014-07-31 23:00:00
2014-07-18 01:00:00 2014-08-01 00:00:00
2014-07-18 02:00:00 2014-08-01 01:00:00
2014-07-18 03:00:00 2014-08-01 02:00:00
2014-07-18 04:00:00 2014-08-01 03:00:00
2014-07-18 05:00:00 2014-08-01 04:00:00
2014-07-18 06:00:00 2014-08-01 05:00:00
2014-07-18 07:00:00 2014-08-01 06:00:00
2014-07-18 08:00:00 2014-08-01 07:00:00
2014-07-18 09:00:00 2014-08-01 08:00:00
2014-07-18 10:00:00 2014-08-01 09:00:00
2014-07-18 11:00:00 2014-08-01 10:00:00
2014-07-18 12:00:00 2014-08-01 11:00:00
2014-07-18 13:00:00 2014-08-01 12:00:00
2014-07-18 14:00:00 2014-08-01 13:00:00
2014-07-18 15:00:00 2014-08-01 14:00:00
2014-07-18 16:00:00 2014-08-01 15:00:00
2014-07-18 17:00:00 2014-08-01 16:00:00
2014-07-18 18:00:00 2014-08-01 17:00:00
2014-07-18 19:00:00 2014-08-01 18:00:00
2014-07-18 20:00:00 2014-08-01 19:00:00
2014-07-18 21:00:00 2014-08-01 20:00:00
2014-07-18 22:00:00 2014-08-01 21:00:00
2014-07-18 23:00:00 2014-08-01 22:00:00
2014-07-19 00:00:00 2014-08-01 23:00:00
