# Data preprocessing

## Preparations

### options

In [1]:
# (re)load IPython's autoreload extension and use mode 2, so that edits to
# imported modules are picked up without restarting the kernel
%reload_ext autoreload
%autoreload 2
# relative path of the folder that every file in this notebook is read from / written to
dir_data = '../data/power_consumption/'

### modules

In [2]:
import pandas as pd  # data mangling and transforming
import numpy as np  # handling vectors and matrices
from preproc_functions import fill_missing, split_dataset, to_supervised  # own preprocessing functions

### Load data & fill missing

In [3]:
# load all data; '?' is treated as a missing-value marker by the parser
# itself, so no in-place replace on the frame is needed afterwards
raw = pd.read_csv(dir_data+'household_power_consumption.txt',
                  sep=';', header=0, low_memory=False,
                  na_values=['?'])
# build the 'datetime' index from the first two columns (date and time).
# An explicit format replaces the deprecated `infer_datetime_format` and
# dict-style `parse_dates` options.
# NOTE(review): assumes day-first stamps ('dd/mm/yyyy hh:mm:ss') - confirm
# against the raw file.
stamps = pd.to_datetime(raw.iloc[:, 0] + ' ' + raw.iloc[:, 1],
                        format='%d/%m/%Y %H:%M:%S')
# make dataset numeric (all remaining columns are measurements)
df = raw.iloc[:, 2:].astype('float32')
df.index = pd.DatetimeIndex(stamps, name='datetime')
# fill missing: the return value of fill_missing was never used, so it is
# expected to modify the array it receives in place. Hand it an explicit
# writable copy and rebuild the frame from it, instead of relying on
# `df.values` being a writable view (it is read-only under Copy-on-Write).
filled = df.to_numpy(copy=True)
fill_missing(filled)
df = pd.DataFrame(filled, index=df.index, columns=df.columns)

### create additional variable

In [4]:
# add a column for the remainder of sub metering:
# column 0 (Global_active_power) scaled by 1000/60, minus the sum of
# columns 4-6 (Sub_metering_1 .. Sub_metering_3)
values = df.values
scaled_active = values[:, 0] * 1000 / 60
metered_sum = values[:, 4] + values[:, 5] + values[:, 6]
df['sub_metering_4'] = scaled_active - metered_sum

In [5]:
# write the cleaned minute-level frame (including the new column) next to the raw file
updated_csv = dir_data + 'household_power_consumption.csv'
df.to_csv(updated_csv)

### aggregate to days

In [6]:
# resample data to daily: bucket the rows by calendar day of the datetime index ...
daily_groups = df.resample('D')
# ... and add up every column within each day.
# NOTE(review): the sum is applied to all columns alike, including Voltage and
# Global_intensity - confirm a daily total is what downstream models expect there.
df_daily = daily_groups.sum()

In [7]:
# report the size of the daily frame as (rows, columns)
print(df_daily.shape)
# persist the daily aggregates as CSV
daily_csv = dir_data + 'household_power_consumption_by_day.csv'
df_daily.to_csv(daily_csv)

(1442, 8)


## Transformations

### define time windows

In [8]:
# number of 7-day blocks covered by the daily frame
print(f'{len(df_daily)/7} weeks available.')

206.0 weeks available.


In [9]:
# first day
df_daily.index[0].weekday()

5

The dataset starts on a Saturday (`weekday()` returns 5); however, we want our weeks to start on a Monday.

In [10]:
# 205 complete weeks remain once the partial weeks at both ends are trimmed
# (2 leading and 5 trailing days of the 1442 daily rows)
205*0.7  # 70% of data into training set

143.5

That gives 144 weeks in the training set and 61 weeks in the test set.

In [11]:
# row offsets into the daily frame, used as (start, stop) slices
first_monday = 2        # data starts on a Saturday (weekday 5), so skip two days
n_train_weeks = 144     # ~70% of the 205 complete weeks
split_day = first_monday + (n_train_weeks * 7)
training_window = (first_monday, split_day)
# -5 leaves out the trailing days that do not form a complete week
test_window = (split_day, -5)

In [12]:
# show both (start, stop) windows, one per line
print(training_window, test_window, sep='\n')

(2, 1010)
(1010, -5)


### split into training and test

In [13]:
# peek at the first daily rows to check the column order before splitting
df_daily.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,sub_metering_4
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006-12-16,1209.176025,34.922001,93552.53125,5180.799805,0.0,546.0,4926.0,14680.933594
2006-12-17,3390.459961,226.005997,345725.3125,14398.599609,2033.0,4187.0,13341.0,36946.667969
2006-12-18,2203.825928,161.792007,347373.625,9247.200195,1063.0,2621.0,14018.0,19028.433594
2006-12-19,1666.19397,150.942001,348479.0,7094.0,839.0,7602.0,6197.0,13131.900391
2006-12-20,2225.748047,160.998001,348923.625,9313.0,0.0,2648.0,14063.0,20384.800781


In [14]:
# cut the daily matrix into a training and a test part using the
# (start, stop) windows defined earlier
daily_matrix = df_daily.values
train, test = split_dataset(daily_matrix, trw=training_window, tew=test_window)

In [15]:
# train: 144 observations of matrices 7x8
# test: 61 observations of same dimension
print(train.shape, test.shape, sep='\n')

(144, 7, 8)
(61, 7, 8)


The dimensions are: week, weekday, variable!

### We always want to predict the next week based on the last 7 days

#### Only standard weeks

In [16]:
# delete last week from X + first week from y
train_X = train[:-1,:,:]
train_y = train[1:,:,0]

In [17]:
# same for test set
test_X = test[:-1,:,:]
test_y = test[1:,:,0]

In [18]:
# input shape first, target shape second
print(train_X.shape, train_y.shape, sep='\n')

(143, 7, 8)
(143, 7)


#### Uni-variate case

In [19]:
# build supervised pairs from the training weeks: 7 input steps -> 7 output steps.
# NOTE(review): the shapes printed below (994 samples, 1 feature) suggest a window
# sliding one day at a time over a single variable - confirm in preproc_functions.
train_Xu, train_yu = to_supervised(train, n_input=7, n_out=7)

In [20]:
# input shape first, target shape second
print(train_Xu.shape, train_yu.shape, sep='\n')

(994, 7, 1)
(994, 7)


In [21]:
# same supervised framing for the test weeks: 7 input steps -> 7 output steps
test_Xu, test_yu = to_supervised(test, n_input=7, n_out=7)

In [22]:
# input shape first, target shape second
print(test_Xu.shape, test_yu.shape, sep='\n')

(413, 7, 1)
(413, 7)


#### Multi-variate case

### save locally

#### standard week

In [23]:
# input
np.save(dir_data+'train_X', train_X)
np.save(dir_data+'test_X', test_X)

In [24]:
# output
np.save(dir_data+'train_y', train_y)
np.save(dir_data+'test_y', test_y)

#### uni-variate

In [25]:
# input
np.save(dir_data+'train_Xu', train_Xu)
np.save(dir_data+'test_Xu', test_Xu)

In [26]:
# output
np.save(dir_data+'train_yu', train_yu)
np.save(dir_data+'test_yu', test_yu)

#### multi-variate