# Data preprocessing

## Preparations

### options

In [19]:
# IPython magic: load the autoreload extension (or reload it if already loaded)
%reload_ext autoreload
# autoreload mode 2: per the IPython docs, modules are re-imported before each
# cell executes, so edits to local helper modules take effect without a restart
%autoreload 2
# relative path of the folder the data files are read from and written to
dir_data = '../data/power_consumption/'

### modules

In [20]:
import pandas as pd  # data mangling and transforming
import numpy as np  # handling vectors and matrices
from preproc_functions import fill_missing, split_dataset  # own preprocessing functions

### Load data & fill missing

In [21]:
# read the raw semicolon-separated file; CSV columns 0 and 1 are parsed
# together into a single 'datetime' column, which becomes the index
raw_path = dir_data + 'household_power_consumption.txt'
df = pd.read_csv(
    raw_path,
    sep=';',
    header=0,
    low_memory=False,
    infer_datetime_format=True,
    parse_dates={'datetime': [0, 1]},
    index_col=['datetime'],
)
# turn the '?' placeholders into NaN, then cast every column to 32-bit floats
df = df.replace('?', np.nan).astype('float32')
# hand the underlying array to the project helper; its return value is unused,
# so it presumably patches the NaNs in place -- TODO confirm in preproc_functions
fill_missing(df.values)

### create additional variable

In [22]:
# add a column for the remainder of sub metering:
# column 0 scaled by 1000/60, minus the sum of columns 4, 5 and 6
values = df.values
scaled_first = values[:, 0] * 1000 / 60
metered_sum = values[:, 4] + values[:, 5] + values[:, 6]
df['sub_metering_4'] = scaled_first - metered_sum

In [23]:
# write the cleaned frame (including the new column) back to the data folder
cleaned_csv = dir_data + 'household_power_consumption.csv'
df.to_csv(cleaned_csv)

### aggregate to days

In [24]:
# bucket the rows of df into calendar-day bins
daily_groups = df.resample(rule='D')
# total every column within each daily bin
df_daily = daily_groups.sum()

In [25]:
# report the dimensions of the daily table
print(df_daily.shape)
# persist the daily table alongside the other data files
daily_csv = dir_data + 'household_power_consumption_by_day.csv'
df_daily.to_csv(daily_csv)

(1442, 8)
