# Data preprocessing

## Preparations

### options

In [46]:
# (Re)load IPython's autoreload extension; this must run before %autoreload.
%reload_ext autoreload
# Mode 2: reload imported modules before each cell runs, so edits to local
# helper modules are picked up without restarting the kernel.
%autoreload 2

### modules

In [50]:
import pandas as pd  # data mangling and transforming
import numpy as np  # handling vectors and matrices
from preproc_functions import fill_missing, split_dataset  # own preprocessing functions

### Load data & fill missing

In [2]:
# Load the raw semicolon-separated measurements; the first two columns
# (positions 0 and 1) are combined and parsed into a single 'datetime' index.
# NOTE(review): `infer_datetime_format` and dict-style `parse_dates` are
# deprecated in recent pandas releases -- revisit when upgrading pandas.
df = pd.read_csv('../data/household_power_consumption.txt',
                 sep=';', header=0, low_memory=False,
                 infer_datetime_format=True,
                 parse_dates={'datetime':[0,1]}, index_col=['datetime'])
# mark all missing values: '?' placeholders become NaN
df = df.replace('?', np.nan)
# make dataset numeric
df = df.astype('float32')
# fill missing
# `fill_missing` is called for its effect on the array it receives (its return
# value is not used). `df.values` is not guaranteed to be a writable view of
# the frame, so fill an explicit copy and rebuild the frame from it; the
# result no longer depends on pandas' internal memory layout.
filled = df.values.copy()
fill_missing(filled)
df = pd.DataFrame(filled, index=df.index, columns=df.columns)

### create additional variable

In [5]:
# Add a column for the remainder of sub metering: column 0 scaled by 1000 / 60,
# minus the combined readings of columns 4, 5 and 6.
values = df.values
metered_total = values[:, 4] + values[:, 5] + values[:, 6]
df['sub_metering_4'] = (values[:, 0] * 1000 / 60) - metered_total
# write the extended dataset to the current working directory
df.to_csv('household_power_consumption.csv')

### aggregate to days

In [6]:
# Downsample to calendar days ('D'): group the rows by day first ...
daily_groups = df.resample(rule='D')
# ... then add up every column within each day.
df_daily = daily_groups.sum()

In [10]:
# report the (rows, columns) shape of the daily frame
daily_shape = df_daily.shape
print(daily_shape)
# write the daily aggregates to the current working directory
df_daily.to_csv(path_or_buf='household_power_consumption_by_day.csv')

(1442, 8)


## Transformations

### define time windows

In [34]:
# number of 7-day weeks covered by the daily data (a float; may be fractional)
print('{} weeks available.'.format(len(df_daily)/7))

206.0 weeks available.


In [35]:
# first day
df_daily.index[0].weekday()

5

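The dataset starts on a Saturday (weekday 5), but we want our weeks to start on Monday. Skipping the first two days and the trailing incomplete week (the last five days) leaves 1435 days, i.e. 205 complete Monday-to-Sunday weeks.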

In [76]:
0.8 * 205  # 80% of the 205 weeks go into the training set

164.0

This puts 164 weeks in the training set and the remaining 41 weeks in the test set.

In [71]:
# Day positions in the daily data that delimit the two windows.
first_monday = 2     # position 0 is a Saturday, so position 2 is the first Monday
n_train_weeks = 164  # 80% of the 205 weeks
split_day = first_monday + n_train_weeks * 7
training_window = (first_monday, split_day)
# the end bound of -5 presumably trims the trailing incomplete week when the
# windows are used as slice bounds -- confirm against split_dataset
test_window = (split_day, -5)

In [72]:
# show both windows, one per line
print(training_window, test_window, sep='\n')

(2, 1150)
(1150, -5)


### split into training and test

In [73]:
# Cut the daily values into a training and a test part using the two windows.
split_kwargs = dict(trw=training_window, tew=test_window)
train, test = split_dataset(df_daily.values, **split_kwargs)

In [74]:
# expected: train holds 164 observations of 7x8 matrices,
# test holds 41 observations of the same dimension
for subset in (train, test):
    print(subset.shape)

(164, 7, 8)
(41, 7, 8)


### save locally

In [84]:
# Persist both arrays in NumPy's binary format; np.save appends the '.npy'
# extension because the given paths do not carry it.
for target, subset in (('../data/training_set', train),
                       ('../data/test_set', test)):
    np.save(target, subset)