In [1]:
import numpy as np
import pandas as pd

In [2]:
from statistics import mean

In [3]:
data_folder = '/home/developer/gcp/cbidmltsf/datasets/electricity'

In [4]:
! ls -l /home/developer/gcp/cbidmltsf/datasets/electricity

total 2258164
-rw-rw-r-- 1 developer developer 920855104 ago 23 16:54 hourly_electricity_complete.pkl
-rw-rw-r-- 1 developer developer 208129432 ago  9 10:38 hourly_electricity.csv
-rw-rw-r-- 1 developer developer 211016244 sep 29 13:27 hourly_electricity_filtered_academic_papers.pkl
-rw-rw-r-- 1 developer developer 710998915 ago  9 09:56 LD2011_2014.txt
-rw-rw-r-- 1 developer developer 261335609 ago  9 09:56 LD2011_2014.txt.zip


In [5]:
# get the filtered dataframe (to match the range used by other academic papers)
filtered_output = pd.read_pickle('{}/hourly_electricity_filtered_academic_papers.pkl'.format(data_folder))

### analysis of the time series

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [7]:
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
from bokeh.layouts import row, gridplot, layout
from bokeh.palettes import d3
output_notebook()

In [29]:
# a dictionary to manage data per individual customer_id
data = dict()
data['raw'] = dict()
data['preprocessed'] = dict()

In [21]:
# a dictionary to manage line plots per individual customer_id
lines = dict()
lines['raw'] = dict()
lines['preprocessed'] = dict()

### plot line and histogram for individual customers

In [8]:
def plot_line(title, x, y, width=720, height=240):
    p = figure(
        title=title,
        plot_width=width,
        plot_height=height,
        # tools='',
        x_axis_type='datetime'
    )
    p.grid.grid_line_alpha=0.3

    p.xaxis.axis_label = 'Date'
    p.yaxis.axis_label = 'Active Power [KW]'

    p.line(x,
           y,
           color='red',
           )

    return p

In [9]:
def plot_histogram(title, histogram, edges):

    p = figure(title=title,
               plot_width=240,
               plot_height=240,
               tools='',
               background_fill_color="#fafafa")
    
    p.quad(top=histogram, bottom=0, left=edges[:-1], right=edges[1:],
           fill_color="navy", line_color="white", alpha=0.5)
    
    p.y_range.start = 0
    # p.legend.location = "center_right"
    # p.legend.background_fill_color = "#fefefe"
    p.xaxis.axis_label = 'x'
    p.yaxis.axis_label = 'Pr(x)'
    p.grid.grid_line_color="white"
    return p


In [33]:
# build a function to adjust the outlier
def adjust_outlier (index_of_lecture):
    
    # given the index of the lecture to be adjusted,
    # get the average of the same hour-lecture the day before and the day after
    average = mean([data[customer_id]['power_usage'][index_of_lecture - 24],
                    data[customer_id]['power_usage'][index_of_lecture + 24]])

    # and adjust the outlier lecture to the average
    data[customer_id]['power_usage'][index_of_lecture] = average
    
    return True

### analyze and preprocess time series, one by one

In [30]:
token_id = 320
customer_id = 'MT_{:03d}'.format(token_id)

# pass raw data to dictionary
data['raw'][customer_id] = filtered_output[filtered_output['token_id'] == token_id]

# make a copy for pre-processed data
data['preprocessed'][customer_id] = filtered_output[filtered_output['token_id'] == token_id].copy()

In [32]:
lines['raw'][customer_id] = plot_line(title='Raw data for {}'.format(customer_id),
                                      x=data['raw'][customer_id].date,
                                      y=data['raw'][customer_id].power_usage,
                                      width=960,
                                      height=320)

show(lines['raw'][customer_id])

In [35]:
# visually define the outliers as 'too low' values
low_limit = 5.
data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit]

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
9101984,0.602007,320,2014-03-30 01:00:00,28417.0,1184,1,6,30,89,13,3
9101985,0.602007,320,2014-03-30 02:00:00,28418.0,1184,2,6,30,89,13,3


In [36]:
# manually adjust the outliers, one by one, by index
indexes_to_correct = [9101984, 9101985]

In [38]:
for index in indexes_to_correct:
    # set the lecture to the mean value of the lectures one day before and one day after
    data['preprocessed'][customer_id]['power_usage'][index] = mean(
        [data['raw'][customer_id]['power_usage'][index - 24],
         data['raw'][customer_id]['power_usage'][index + 24]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [51]:
# review preprocessed data
lines['preprocessed'][customer_id] = plot_line(
    title='Preprocessed data for {}'.format(customer_id),
    x=data['preprocessed'][customer_id].date,
    y=data['preprocessed'][customer_id].power_usage,
    width=960,
    height=320)

show(lines['preprocessed'][customer_id])

In [42]:
token_id = 321
customer_id = 'MT_{:03d}'.format(token_id)

# pass raw data to dictionary
data['raw'][customer_id] = filtered_output[filtered_output['token_id'] == token_id]

# make a copy for pre-processed data
data['preprocessed'][customer_id] = filtered_output[filtered_output['token_id'] == token_id].copy()

In [43]:
lines['raw'][customer_id] = plot_line(title='Raw data for {}'.format(customer_id),
                                      x=data['raw'][customer_id].date,
                                      y=data['raw'][customer_id].power_usage,
                                      width=960,
                                      height=320)

show(lines['raw'][customer_id])

In [64]:
# visually define the outliers as 'too low' values
low_limit = 20.
data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit]

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
9137049,0.954927,321,2014-03-30 01:00:00,28417.0,1184,1,6,30,89,13,3
9137050,0.954927,321,2014-03-30 02:00:00,28418.0,1184,2,6,30,89,13,3
9137217,0.954927,321,2014-04-06 01:00:00,28585.0,1191,1,6,6,96,14,4
9137218,0.954927,321,2014-04-06 02:00:00,28586.0,1191,2,6,6,96,14,4
9137385,0.954927,321,2014-04-13 01:00:00,28753.0,1198,1,6,13,103,15,4
9137386,0.954927,321,2014-04-13 02:00:00,28754.0,1198,2,6,13,103,15,4
9137553,0.954927,321,2014-04-20 01:00:00,28921.0,1205,1,6,20,110,16,4
9137554,0.954927,321,2014-04-20 02:00:00,28922.0,1205,2,6,20,110,16,4
9137648,15.479374,321,2014-04-24 00:00:00,29016.0,1209,0,3,24,114,17,4
9137649,0.954927,321,2014-04-24 01:00:00,29017.0,1209,1,3,24,114,17,4


In [65]:
# for MT_321 start removing outliers on 2014-04-24, as their day-before/day-after values are fine
indexes_to_correct = [9137648, 9137649, 9137650, 9137651, 9137652, 9137653, 9137654, 9137655]

In [66]:
for index in indexes_to_correct:
    # set the lecture to the mean value of the lectures one day before and one day after
    data['preprocessed'][customer_id]['power_usage'][index] = mean(
        [data['raw'][customer_id]['power_usage'][index - 24],
         data['raw'][customer_id]['power_usage'][index + 24]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [67]:
# now remove remaining outliers
indexes_to_correct = [9137049, 9137050, 9137217, 9137218, 9137385, 9137386, 9137553, 9137554]

In [68]:
for index in indexes_to_correct:
    # set the lecture to the mean value of the lectures one day before and one day after
    data['preprocessed'][customer_id]['power_usage'][index] = mean(
        [data['raw'][customer_id]['power_usage'][index - 24],
         data['raw'][customer_id]['power_usage'][index + 24]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [69]:
# review preprocessed data
lines['preprocessed'][customer_id] = plot_line(
    title='Preprocessed data for {}'.format(customer_id),
    x=data['preprocessed'][customer_id].date,
    y=data['preprocessed'][customer_id].power_usage,
    width=960,
    height=320)

show(lines['preprocessed'][customer_id])

In [70]:
token_id = 322
customer_id = 'MT_{:03d}'.format(token_id)

# pass raw data to dictionary
data['raw'][customer_id] = filtered_output[filtered_output['token_id'] == token_id]

# make a copy for pre-processed data
data['preprocessed'][customer_id] = filtered_output[filtered_output['token_id'] == token_id].copy()

In [71]:
lines['raw'][customer_id] = plot_line(title='Raw data for {}'.format(customer_id),
                                      x=data['raw'][customer_id].date,
                                      y=data['raw'][customer_id].power_usage,
                                      width=960,
                                      height=320)

show(lines['raw'][customer_id])

In [72]:
# visually define the outliers as 'too low' values
low_limit = 30.
data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit]

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
9148786,0.0,322,2014-03-30 01:00:00,28417.0,1184,1,6,30,89,13,3
9149541,3.815715,322,2014-04-30 12:00:00,29172.0,1215,12,2,30,120,18,4
9149542,0.0,322,2014-04-30 13:00:00,29173.0,1215,13,2,30,120,18,4
9149543,0.0,322,2014-04-30 14:00:00,29174.0,1215,14,2,30,120,18,4


In [73]:
indexes_to_correct = [9148786, 9149541, 9149542, 9149543]

In [74]:
for index in indexes_to_correct:
    # set the lecture to the mean value of the lectures one day before and one day after
    data['preprocessed'][customer_id]['power_usage'][index] = mean(
        [data['raw'][customer_id]['power_usage'][index - 24],
         data['raw'][customer_id]['power_usage'][index + 24]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [75]:
# review preprocessed data
lines['preprocessed'][customer_id] = plot_line(
    title='Preprocessed data for {}'.format(customer_id),
    x=data['preprocessed'][customer_id].date,
    y=data['preprocessed'][customer_id].power_usage,
    width=960,
    height=320)

show(lines['preprocessed'][customer_id])

In [76]:
token_id = 323
customer_id = 'MT_{:03d}'.format(token_id)

# pass raw data to dictionary
data['raw'][customer_id] = filtered_output[filtered_output['token_id'] == token_id]

# make a copy for pre-processed data
data['preprocessed'][customer_id] = filtered_output[filtered_output['token_id'] == token_id].copy()

In [77]:
lines['raw'][customer_id] = plot_line(title='Raw data for {}'.format(customer_id),
                                      x=data['raw'][customer_id].date,
                                      y=data['raw'][customer_id].power_usage,
                                      width=960,
                                      height=320)

show(lines['raw'][customer_id])

In [78]:
# visually define the outliers as 'too low' values
low_limit = 500.
data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit]

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
9183851,15.243902,323,2014-03-30 01:00:00,28417.0,1184,1,6,30,89,13,3


In [79]:
indexes_to_correct = [9183851]

In [80]:
for index in indexes_to_correct:
    # set the lecture to the mean value of the lectures one day before and one day after
    data['preprocessed'][customer_id]['power_usage'][index] = mean(
        [data['raw'][customer_id]['power_usage'][index - 24],
         data['raw'][customer_id]['power_usage'][index + 24]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [81]:
# review preprocessed data
lines['preprocessed'][customer_id] = plot_line(
    title='Preprocessed data for {}'.format(customer_id),
    x=data['preprocessed'][customer_id].date,
    y=data['preprocessed'][customer_id].power_usage,
    width=960,
    height=320)

show(lines['preprocessed'][customer_id])

In [82]:
token_id = 324
customer_id = 'MT_{:03d}'.format(token_id)

# pass raw data to dictionary
data['raw'][customer_id] = filtered_output[filtered_output['token_id'] == token_id]

# make a copy for pre-processed data
data['preprocessed'][customer_id] = filtered_output[filtered_output['token_id'] == token_id].copy()

lines['raw'][customer_id] = plot_line(title='Raw data for {}'.format(customer_id),
                                      x=data['raw'][customer_id].date,
                                      y=data['raw'][customer_id].power_usage,
                                      width=960,
                                      height=320)

show(lines['raw'][customer_id])

In [83]:
# visually define the outliers as 'too low' values
low_limit = 100.
data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit]

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
9218916,2.200704,324,2014-03-30 01:00:00,28417.0,1184,1,6,30,89,13,3


In [84]:
indexes_to_correct = [9218916]

In [85]:
for index in indexes_to_correct:
    # set the lecture to the mean value of the lectures one day before and one day after
    data['preprocessed'][customer_id]['power_usage'][index] = mean(
        [data['raw'][customer_id]['power_usage'][index - 24],
         data['raw'][customer_id]['power_usage'][index + 24]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [86]:
# review preprocessed data
lines['preprocessed'][customer_id] = plot_line(
    title='Preprocessed data for {}'.format(customer_id),
    x=data['preprocessed'][customer_id].date,
    y=data['preprocessed'][customer_id].power_usage,
    width=960,
    height=320)

show(lines['preprocessed'][customer_id])

In [88]:
token_id = 325
customer_id = 'MT_{:03d}'.format(token_id)

# pass raw data to dictionary
data['raw'][customer_id] = filtered_output[filtered_output['token_id'] == token_id]

# make a copy for pre-processed data
data['preprocessed'][customer_id] = filtered_output[filtered_output['token_id'] == token_id].copy()

lines['raw'][customer_id] = plot_line(title='Raw data for {}'.format(customer_id),
                                      x=data['raw'][customer_id].date,
                                      y=data['raw'][customer_id].power_usage,
                                      width=960,
                                      height=320)

show(lines['raw'][customer_id])

In [95]:
# visually define the outliers as 'too low' values
low_limit = 120.
data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit]

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
9253333,102.396514,325,2014-03-03 01:00:00,27769.0,1157,1,0,3,62,10,3
9253981,0.0,325,2014-03-30 01:00:00,28417.0,1184,1,6,30,89,13,3
9254991,95.315904,325,2014-05-11 03:00:00,29427.0,1226,3,6,11,131,19,5
9254992,83.877996,325,2014-05-11 04:00:00,29428.0,1226,4,6,11,131,19,5
9254993,113.28976,325,2014-05-11 05:00:00,29429.0,1226,5,6,11,131,19,5
9256404,0.0,325,2014-07-09 00:00:00,30840.0,1285,0,2,9,190,28,7


In [96]:
indexes_to_correct = data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit].index
indexes_to_correct

Int64Index([9253333, 9253981, 9254991, 9254992, 9254993, 9256404], dtype='int64')

In [97]:
for index in indexes_to_correct:
    # set the lecture to the mean value of the lectures one day before and one day after
    data['preprocessed'][customer_id]['power_usage'][index] = mean(
        [data['raw'][customer_id]['power_usage'][index - 24],
         data['raw'][customer_id]['power_usage'][index + 24]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [98]:
# review preprocessed data
lines['preprocessed'][customer_id] = plot_line(
    title='Preprocessed data for {}'.format(customer_id),
    x=data['preprocessed'][customer_id].date,
    y=data['preprocessed'][customer_id].power_usage,
    width=960,
    height=320)

show(lines['preprocessed'][customer_id])

In [99]:
token_id = 326
customer_id = 'MT_{:03d}'.format(token_id)

# pass raw data to dictionary
data['raw'][customer_id] = filtered_output[filtered_output['token_id'] == token_id]

# make a copy for pre-processed data
data['preprocessed'][customer_id] = filtered_output[filtered_output['token_id'] == token_id].copy()

lines['raw'][customer_id] = plot_line(title='Raw data for {}'.format(customer_id),
                                      x=data['raw'][customer_id].date,
                                      y=data['raw'][customer_id].power_usage,
                                      width=960,
                                      height=320)

show(lines['raw'][customer_id])

In [100]:
# visually define the outliers as 'too low' values
low_limit = 50.
data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit]

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
9289046,1.558603,326,2014-03-30 01:00:00,28417.0,1184,1,6,30,89,13,3


In [101]:
indexes_to_correct = data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit].index
indexes_to_correct

Int64Index([9289046], dtype='int64')

In [102]:
for index in indexes_to_correct:
    # set the lecture to the mean value of the lectures one day before and one day after
    data['preprocessed'][customer_id]['power_usage'][index] = mean(
        [data['raw'][customer_id]['power_usage'][index - 24],
         data['raw'][customer_id]['power_usage'][index + 24]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [103]:
# review preprocessed data
lines['preprocessed'][customer_id] = plot_line(
    title='Preprocessed data for {}'.format(customer_id),
    x=data['preprocessed'][customer_id].date,
    y=data['preprocessed'][customer_id].power_usage,
    width=960,
    height=320)

show(lines['preprocessed'][customer_id])

In [104]:
token_id = 327
customer_id = 'MT_{:03d}'.format(token_id)

# pass raw data to dictionary
data['raw'][customer_id] = filtered_output[filtered_output['token_id'] == token_id]

# make a copy for pre-processed data
data['preprocessed'][customer_id] = filtered_output[filtered_output['token_id'] == token_id].copy()

lines['raw'][customer_id] = plot_line(title='Raw data for {}'.format(customer_id),
                                      x=data['raw'][customer_id].date,
                                      y=data['raw'][customer_id].power_usage,
                                      width=960,
                                      height=320)

show(lines['raw'][customer_id])

In [105]:
# visually define the outliers as 'too low' values
low_limit = 50.
data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit]

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
9324111,3.472669,327,2014-03-30 01:00:00,28417.0,1184,1,6,30,89,13,3


In [106]:
indexes_to_correct = data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit].index
indexes_to_correct

Int64Index([9324111], dtype='int64')

In [107]:
for index in indexes_to_correct:
    # set the lecture to the mean value of the lectures one day before and one day after
    data['preprocessed'][customer_id]['power_usage'][index] = mean(
        [data['raw'][customer_id]['power_usage'][index - 24],
         data['raw'][customer_id]['power_usage'][index + 24]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [108]:
# review preprocessed data
lines['preprocessed'][customer_id] = plot_line(
    title='Preprocessed data for {}'.format(customer_id),
    x=data['preprocessed'][customer_id].date,
    y=data['preprocessed'][customer_id].power_usage,
    width=960,
    height=320)

show(lines['preprocessed'][customer_id])

In [109]:
token_id = 328
customer_id = 'MT_{:03d}'.format(token_id)

# pass raw data to dictionary
data['raw'][customer_id] = filtered_output[filtered_output['token_id'] == token_id]

# make a copy for pre-processed data
data['preprocessed'][customer_id] = filtered_output[filtered_output['token_id'] == token_id].copy()

lines['raw'][customer_id] = plot_line(title='Raw data for {}'.format(customer_id),
                                      x=data['raw'][customer_id].date,
                                      y=data['raw'][customer_id].power_usage,
                                      width=960,
                                      height=320)

show(lines['raw'][customer_id])

In [112]:
# visually define the outliers as 'too low' values
low_limit = 300
data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit]

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
9359176,0.0,328,2014-03-30 01:00:00,28417.0,1184,1,6,30,89,13,3
9361991,222.222222,328,2014-07-25 08:00:00,31232.0,1301,8,4,25,206,30,7


In [113]:
indexes_to_correct = data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit].index
indexes_to_correct

Int64Index([9359176, 9361991], dtype='int64')

In [114]:
for index in indexes_to_correct:
    # set the lecture to the mean value of the lectures one day before and one day after
    data['preprocessed'][customer_id]['power_usage'][index] = mean(
        [data['raw'][customer_id]['power_usage'][index - 24],
         data['raw'][customer_id]['power_usage'][index + 24]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [115]:
# review preprocessed data
lines['preprocessed'][customer_id] = plot_line(
    title='Preprocessed data for {}'.format(customer_id),
    x=data['preprocessed'][customer_id].date,
    y=data['preprocessed'][customer_id].power_usage,
    width=960,
    height=320)

show(lines['preprocessed'][customer_id])

In [116]:
token_id = 329
customer_id = 'MT_{:03d}'.format(token_id)

# pass raw data to dictionary
data['raw'][customer_id] = filtered_output[filtered_output['token_id'] == token_id]

# make a copy for pre-processed data
data['preprocessed'][customer_id] = filtered_output[filtered_output['token_id'] == token_id].copy()

lines['raw'][customer_id] = plot_line(title='Raw data for {}'.format(customer_id),
                                      x=data['raw'][customer_id].date,
                                      y=data['raw'][customer_id].power_usage,
                                      width=960,
                                      height=320)

show(lines['raw'][customer_id])

In [118]:
# visually define the outliers as 'too low' values
low_limit = 30
data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit]

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
9394241,1.588235,329,2014-03-30 01:00:00,28417.0,1184,1,6,30,89,13,3


In [119]:
indexes_to_correct = data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit].index
indexes_to_correct

Int64Index([9394241], dtype='int64')

In [120]:
for index in indexes_to_correct:
    # set the lecture to the mean value of the lectures one day before and one day after
    data['preprocessed'][customer_id]['power_usage'][index] = mean(
        [data['raw'][customer_id]['power_usage'][index - 24],
         data['raw'][customer_id]['power_usage'][index + 24]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [121]:
# review preprocessed data
lines['preprocessed'][customer_id] = plot_line(
    title='Preprocessed data for {}'.format(customer_id),
    x=data['preprocessed'][customer_id].date,
    y=data['preprocessed'][customer_id].power_usage,
    width=960,
    height=320)

show(lines['preprocessed'][customer_id])

In [122]:
token_id = 330
customer_id = 'MT_{:03d}'.format(token_id)

# pass raw data to dictionary
data['raw'][customer_id] = filtered_output[filtered_output['token_id'] == token_id]

# make a copy for pre-processed data
data['preprocessed'][customer_id] = filtered_output[filtered_output['token_id'] == token_id].copy()

lines['raw'][customer_id] = plot_line(title='Raw data for {}'.format(customer_id),
                                      x=data['raw'][customer_id].date,
                                      y=data['raw'][customer_id].power_usage,
                                      width=960,
                                      height=320)

show(lines['raw'][customer_id])

In [123]:
# visually define the outliers as 'too low' values
low_limit = 30
data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit]

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
9429306,0.675,330,2014-03-30 01:00:00,28417.0,1184,1,6,30,89,13,3


In [124]:
indexes_to_correct = data['raw'][customer_id][data['raw'][customer_id]['power_usage'] < low_limit].index
indexes_to_correct

Int64Index([9429306], dtype='int64')

In [125]:
for index in indexes_to_correct:
    # set the lecture to the mean value of the lectures one day before and one day after
    data['preprocessed'][customer_id]['power_usage'][index] = mean(
        [data['raw'][customer_id]['power_usage'][index - 24],
         data['raw'][customer_id]['power_usage'][index + 24]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [126]:
# review preprocessed data
lines['preprocessed'][customer_id] = plot_line(
    title='Preprocessed data for {}'.format(customer_id),
    x=data['preprocessed'][customer_id].date,
    y=data['preprocessed'][customer_id].power_usage,
    width=960,
    height=320)

show(lines['preprocessed'][customer_id])

In [140]:
# persist separated, raw and preprocessed, dataframes

In [141]:
start, end = 320, 330

customer_ids = ['MT_{:03d}'.format(token_id) for token_id in np.arange(start, end + 1)]

In [142]:
for state in ['raw', 'preprocessed']:
    for customer_id in customer_ids:
        data[state][customer_id].to_pickle('{}/separated_{}/{}.pkl'.format(data_folder, state, customer_id))