# Data preprocessing

## Preparations

### options

In [1]:
# (Re)load IPython's autoreload extension and switch it to mode 2, so that
# imported modules are reloaded before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Directory prefix used for the raw input file and for every file this notebook writes.
# It is concatenated directly with file names, so the trailing slash is required.
dir_data = '../data/power_consumption/'

### modules

In [2]:
import pandas as pd  # data mangling and transforming
import numpy as np  # handling vectors and matrices
from preproc_functions import fill_missing, split_dataset_by_weeks  # own preprocessing functions

### Load data & fill missing

In [None]:
# Read the raw semicolon-separated file. Columns 0 and 1 are merged and parsed
# into a single 'datetime' column, which is then used as the index.
# NOTE(review): recent pandas releases deprecate `infer_datetime_format` and
# column-combining `parse_dates` dicts - confirm against the installed version.
df = (
    pd.read_csv(
        f'{dir_data}household_power_consumption.txt',
        sep=';',
        header=0,
        low_memory=False,
        infer_datetime_format=True,
        parse_dates={'datetime': [0, 1]},
        index_col=['datetime'],
    )
    # '?' marks a missing reading in the file; turn it into NaN ...
    .replace('?', np.nan)
    # ... so that every column can be cast to 32-bit floats
    .astype('float32')
)
# Impute the missing readings. The return value is not used, so fill_missing
# presumably modifies the array it is given in place - TODO confirm in
# preproc_functions (this only reaches `df` if `df.values` is a view).
fill_missing(df.values)

### create additional variable

In [None]:
# Add a column holding the remainder that the three sub-meters do not cover:
# column 0 scaled by 1000/60, minus the sum of columns 4, 5 and 6.
# NOTE(review): the 1000/60 factor presumably converts a per-minute kW reading
# into watt-hours - confirm against the dataset description.
values = df.values
metered_total = values[:, 4] + values[:, 5] + values[:, 6]
df['Sub_metering_4'] = (values[:, 0] * 1000 / 60) - metered_total

In [None]:
# Write the cleaned frame (including the added Sub_metering_4 column) to CSV.
df.to_csv(f'{dir_data}household_power_consumption.csv')

### aggregate to daily level

In [None]:
# Group the records into calendar-day bins ('D') on the datetime index ...
daily_groups = df.resample(rule='D')
# ... and add up every column within each day.
df_daily = daily_groups.sum()

In [None]:
# Quick size check of the daily aggregate: (rows, columns).
print(df_daily.shape)
# Store the daily aggregate alongside the other data files.
df_daily.to_csv(f'{dir_data}household_power_consumption_by_day.csv')

## Transformations

### define time windows

In [None]:
# Number of weeks covered by the daily data (true division, so it may be fractional).
print(f'{len(df_daily) / 7} weeks available.')

In [None]:
# Weekday of the first daily record, shown as the cell output.
# pandas numbers weekdays from Monday=0 to Sunday=6.
df_daily.index[0].weekday()

The dataset starts on a Saturday; however, we want our weeks to start on a Monday.

We will therefore drop one week's worth of days in total (2 days at the beginning, 5 at the end).

In [None]:
# 205 is the number of whole weeks used (144 training + 61 test, per the note below).
# The product is 143.5; the following cells use 144 training weeks.
205*0.7  # 70% of data into training set

144 weeks in training set. 61 in the test set.

In [None]:
# Day offsets that delimit the two windows handed to split_dataset_by_weeks:
#   2       -> skip the first two days so that weeks start on a Monday
#   144 * 7 -> length of the training period in days (144 weeks)
#   -5      -> leave out the last five days
split_day = 144 * 7 + 2
training_window = 2, split_day
test_window = split_day, -5

In [None]:
# Show both (start, end) pairs, one per line.
print(training_window, test_window, sep='\n')

### split into training and test

In [None]:
# Preview the first rows of the daily aggregate before it is split.
df_daily.head()

In [None]:
# Split the daily values into training and test arrays using the
# (start, end) windows defined above. The windowing and weekly grouping
# are implemented in preproc_functions.split_dataset_by_weeks.
train, test = split_dataset_by_weeks(
    df_daily.values,
    trw=training_window,
    tew=test_window,
)

In [None]:
# Expected: train -> 144 observations of 7x8 matrices,
#           test  -> 61 observations of the same dimension.
print(train.shape, test.shape, sep='\n')

The dimensions are: week, weekday, variable!

## Save

In [None]:
# Persist both splits in NumPy's binary format; np.save appends the '.npy'
# extension because the given names do not carry one.
np.save(f'{dir_data}train', train)
np.save(f'{dir_data}test', test)