# Introduction
The objective of this notebook is to read in, explore and set up data for time series forecasting using neural networks. Key steps are:
1. Load the data
2. Inspect the data
3. Adjust data to fix any problems (e.g. missing data points)
4. Aggregate data
5. Save as a new data set

# Step 1: Load the data

In [2]:
# Load the data
from pandas import read_csv

# Read the semicolon-separated raw file. Columns 0 and 1 are combined and parsed
# into a single 'datetime' column, which is then used as the index.
# NOTE(review): `infer_datetime_format` and the dict form of `parse_dates` are
# deprecated in recent pandas releases - confirm against the installed version.
dataset = read_csv(
    '~/data/household_power_consumption.txt',
    sep=';',
    header=0,
    low_memory=False,
    infer_datetime_format=True,
    parse_dates={'datetime': [0, 1]},
    index_col=['datetime'],
)

# Step 2: Inspect the data 

In [5]:
# preview the first rows to check the parsed datetime index and the column layout
print(dataset.head())

                    Global_active_power Global_reactive_power  Voltage  \
datetime                                                                 
2006-12-16 17:24:00               4.216                 0.418  234.840   
2006-12-16 17:25:00               5.360                 0.436  233.630   
2006-12-16 17:26:00               5.374                 0.498  233.290   
2006-12-16 17:27:00               5.388                 0.502  233.740   
2006-12-16 17:28:00               3.666                 0.528  235.680   

                    Global_intensity Sub_metering_1 Sub_metering_2  \
datetime                                                             
2006-12-16 17:24:00           18.400          0.000          1.000   
2006-12-16 17:25:00           23.000          0.000          1.000   
2006-12-16 17:26:00           23.000          0.000          2.000   
2006-12-16 17:27:00           23.000          0.000          1.000   
2006-12-16 17:28:00           15.800          0.000          

In [6]:
# preview the last rows to see where the recorded period ends
print(dataset.tail())

                    Global_active_power Global_reactive_power  Voltage  \
datetime                                                                 
2010-11-26 20:58:00               0.946                 0.000  240.430   
2010-11-26 20:59:00               0.944                 0.000  240.000   
2010-11-26 21:00:00               0.938                 0.000  239.820   
2010-11-26 21:01:00               0.934                 0.000  239.700   
2010-11-26 21:02:00               0.932                 0.000  239.550   

                    Global_intensity Sub_metering_1 Sub_metering_2  \
datetime                                                             
2010-11-26 20:58:00            4.000          0.000          0.000   
2010-11-26 20:59:00            4.000          0.000          0.000   
2010-11-26 21:00:00            3.800          0.000          0.000   
2010-11-26 21:01:00            3.800          0.000          0.000   
2010-11-26 21:02:00            3.800          0.000          

In [12]:
# Check the data type of 'dataset' (read_csv is expected to hand back a DataFrame)
type(dataset)


pandas.core.frame.DataFrame

In [11]:
# summary statistics; columns that were not parsed as numeric are left out of the result
dataset.describe()

Unnamed: 0,Sub_metering_3
count,2049280.0
mean,6.458447
std,8.437154
min,0.0
25%,0.0
50%,1.0
75%,17.0
max,31.0


The results show only one column, suggesting that only one column in the dataframe is numeric (i.e. float type). Check the types of the other columns:

In [13]:
# list the dtype of every column to find the ones that were read as 'object'
dataset.dtypes

Global_active_power       object
Global_reactive_power     object
Voltage                   object
Global_intensity          object
Sub_metering_1            object
Sub_metering_2            object
Sub_metering_3           float64
dtype: object

All of the columns should be of type float. Convert the object-typed columns to float.

# Step 3: Adjust the data

In [25]:
# coerce the data to numeric; anything that cannot be parsed as a number becomes NaN
import pandas as pd
headings = [
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
    'Sub_metering_1',
    'Sub_metering_2',
]
# convert all listed columns in one column-wise pass instead of an explicit loop
dataset[headings] = dataset[headings].apply(pd.to_numeric, errors='coerce')

In [26]:
# confirm that the coerced columns are now numeric
dataset.dtypes

Global_active_power      float64
Global_reactive_power    float64
Voltage                  float64
Global_intensity         float64
Sub_metering_1           float64
Sub_metering_2           float64
Sub_metering_3           float64
dtype: object

In [27]:
# inspect the data with describe; after the conversion every column should be summarised
dataset.describe()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
count,2049280.0,2049280.0,2049280.0,2049280.0,2049280.0,2049280.0,2049280.0
mean,1.091615,0.1237145,240.8399,4.627759,1.121923,1.29852,6.458447
std,1.057294,0.112722,3.239987,4.444396,6.153031,5.822026,8.437154
min,0.076,0.0,223.2,0.2,0.0,0.0,0.0
25%,0.308,0.048,238.99,1.4,0.0,0.0,0.0
50%,0.602,0.1,241.01,2.6,0.0,0.0,1.0
75%,1.528,0.194,242.89,6.4,0.0,1.0,17.0
max,11.122,1.39,254.15,48.4,88.0,80.0,31.0


In [31]:
# number of missing (NaN) entries in each column
count_nan = dataset.isna().sum()
print(count_nan)

Global_active_power      25979
Global_reactive_power    25979
Voltage                  25979
Global_intensity         25979
Sub_metering_1           25979
Sub_metering_2           25979
Sub_metering_3           25979
dtype: int64


In [32]:
# number of non-missing observations in Global_active_power
dataset['Global_active_power'].count()

2049280

In [36]:
# create an additional column called 'Remainder': the scaled active power minus
# the total of the three sub-meter readings
# NOTE(review): the * 1000 / 60 factor presumably converts kW minute averages to
# watt-hours per minute - confirm against the data set documentation
scaled_active_power = dataset['Global_active_power'] * 1000 / 60
sub_metering_total = dataset['Sub_metering_1'] + dataset['Sub_metering_2'] + dataset['Sub_metering_3']
dataset['Remainder'] = scaled_active_power - sub_metering_total
dataset.describe()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,remainder,Remainder
count,2049280.0,2049280.0,2049280.0,2049280.0,2049280.0,2049280.0,2049280.0,2049280.0,2049280.0
mean,1.091615,0.1237145,240.8399,4.627759,1.121923,1.29852,6.458447,9.314693,9.314693
std,1.057294,0.112722,3.239987,4.444396,6.153031,5.822026,8.437154,9.585916,9.585916
min,0.076,0.0,223.2,0.2,0.0,0.0,0.0,-2.4,-2.4
25%,0.308,0.048,238.99,1.4,0.0,0.0,0.0,3.8,3.8
50%,0.602,0.1,241.01,2.6,0.0,0.0,1.0,5.5,5.5
75%,1.528,0.194,242.89,6.4,0.0,1.0,17.0,10.36667,10.36667
max,11.122,1.39,254.15,48.4,88.0,80.0,31.0,124.8333,124.8333


In [41]:
# create a function to fill missing values with a value at the same time one day ago
def fill_missing(values, one_day=60 * 24):
    """Fill NaN cells of a 2-D array in place with the value from one day earlier.

    Parameters
    ----------
    values : numpy.ndarray
        Two-dimensional float array addressed as ``values[row, col]``. It is
        modified in place.
    one_day : int, optional
        Number of rows that make up one day. Defaults to ``60 * 24``, i.e. one
        row per minute.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``one_day`` is not a positive integer step.

    Notes
    -----
    Rows are visited in ascending order, so the donor cell one day back has
    already been filled when it is read; gaps longer than one day are therefore
    filled from the last day that had data.

    For rows inside the first day there is no earlier donor. A negative index
    would silently wrap around to the end of the array, so those cells are
    filled from the first non-NaN value found at the same time on a later day.
    If no such value exists the cell is left as NaN.
    """
    from numpy import isnan

    if one_day <= 0:
        raise ValueError("one_day must be a positive number of rows")

    n_rows, n_cols = values.shape
    for row in range(n_rows):
        for col in range(n_cols):
            if not isnan(values[row, col]):
                continue
            donor = row - one_day
            if donor >= 0:
                # same time on the previous day (already filled if it was missing)
                values[row, col] = values[donor, col]
                continue
            # first day: search forward, one day at a time, for a usable donor
            donor = row + one_day
            while donor < n_rows and isnan(values[donor, col]):
                donor += one_day
            if donor < n_rows:
                values[row, col] = values[donor, col]

In [42]:
# use the function created above to fill in missing values
# `dataset.values` is not guaranteed to be a writable view of the frame (it can be
# a copy or a read-only array), so an in-place fill on it may be lost or fail.
# Fill an explicit copy and write the result back into the frame instead.
filled_values = dataset.values.copy()
fill_missing(filled_values)
dataset.iloc[:, :] = filled_values

# Step 4: Aggregate data
Convert from one-minute to daily intervals

In [43]:
# resample minute data to daily totals
# NOTE(review): every column is summed, including Voltage and Global_intensity -
# confirm that a daily sum is the intended aggregate for those columns
daily_groups = dataset.resample('D')
daily_data = daily_groups.sum()

In [47]:
# summarise the daily data: its (rows, columns) shape followed by the first rows
print(daily_data.shape, daily_data.head(), sep='\n')

(1442, 9)
            Global_active_power  Global_reactive_power    Voltage  \
datetime                                                            
2006-12-16             1209.176                 34.922   93552.53   
2006-12-17             3390.460                226.006  345725.32   
2006-12-18             2203.826                161.792  347373.64   
2006-12-19             1666.194                150.942  348479.01   
2006-12-20             2225.748                160.998  348923.61   

            Global_intensity  Sub_metering_1  Sub_metering_2  Sub_metering_3  \
datetime                                                                       
2006-12-16            5180.8             0.0           546.0          4926.0   
2006-12-17           14398.6          2033.0          4187.0         13341.0   
2006-12-18            9247.2          1063.0          2621.0         14018.0   
2006-12-19            7094.0           839.0          7602.0          6197.0   
2006-12-20            9313

# Step 5: Save as a new data set

In [48]:
# save the daily data as CSV (relative path, so it is written to the working directory)
output_path = 'household_power_consumption_days.csv'
daily_data.to_csv(output_path)

End of notebook