In [None]:
## Imports
import math as math
import json
import pandas as pd
import numpy as np
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Dropout
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.python.client import device_lib

# Register matplotlib converters
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [None]:
## Fill missing values with a value at the same time one day ago
def fill_missing(values, one_day=60 * 24):
    """Fill NaN entries of a 2-D array in place, using the same column one day away.

    Each NaN at ``(row, col)`` is replaced by ``values[row - one_day, col]``.
    NaN cells are visited in row-major order, so a gap that was filled earlier
    can itself serve as the source for a later gap.

    For rows in the first ``one_day`` rows there is no earlier observation.
    Indexing with ``row - one_day`` there would be negative and wrap around to
    the END of the array, so those cells are instead filled from the first
    non-NaN value found at ``row + one_day``, ``row + 2 * one_day``, ... in the
    same column. If no such value exists the cell is left as NaN.

    Parameters
    ----------
    values : numpy.ndarray
        2-D floating-point array; modified in place.
    one_day : int, optional
        Number of rows that make up one day. Defaults to ``60 * 24``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``one_day`` is smaller than 1.
    """
    if one_day < 1:
        raise ValueError("one_day must be a positive number of rows")

    n_rows = values.shape[0]
    # argwhere returns positions sorted row-major, which preserves the
    # top-to-bottom fill order of a plain nested loop while skipping the
    # cells that are not missing.
    for row, col in np.argwhere(np.isnan(values)):
        if row >= one_day:
            values[row, col] = values[row - one_day, col]
            continue
        # No earlier day available: look forward in whole-day steps instead of
        # letting a negative index wrap to the end of the array.
        for later in range(row + one_day, n_rows, one_day):
            candidate = values[later, col]
            if not math.isnan(candidate):
                values[row, col] = candidate
                break

In [None]:
# Read the raw ';'-separated file. Columns 0 and 1 are combined and parsed
# into a single 'datetime' column, which becomes the index.
dataset = pd.read_csv(
    'data/household_power_consumption.txt',
    sep=';',
    header=0,
    low_memory=False,
    infer_datetime_format=True,
    parse_dates={'datetime': [0, 1]},
    index_col=['datetime'],
)
# Quick look: dimensions first, then the leading rows
print(dataset.shape, dataset.head(), sep='\n')

In [None]:
# Mark all missing values ('?' placeholders become NaN)
dataset.replace('?', np.nan, inplace=True)
# Make dataset numeric
dataset = dataset.astype('float32')

# Fill missing values.
# fill_missing() mutates the array it receives. Whether DataFrame.values hands
# back a writable view of the frame or a copy is not guaranteed, so fill an
# explicit copy and rebuild the frame from it rather than relying on the
# in-place writes reaching the DataFrame.
values = dataset.values.copy()
fill_missing(values)
dataset = pd.DataFrame(values, index=dataset.index, columns=dataset.columns, copy=True)

# Add a column for the remainder of sub metering:
# column 0 scaled by 1000/60, minus the sum of columns 4, 5 and 6.
# NOTE(review): presumably this converts kW to watt-hours per minute and
# subtracts the three sub-metering columns - confirm against the column order.
dataset['sub_metering_4'] = (values[:,0] * 1000 / 60) - (values[:,4] + values[:,5] + values[:,6])

# Save updated dataset
dataset.to_csv('household_power_consumption.csv')

# Load the new file and summarize
dataset = pd.read_csv('household_power_consumption.csv', header=0, infer_datetime_format=True, parse_dates=['datetime'], index_col=['datetime'])
print(dataset.head())

In [None]:
## Aggregate the observations into one row per calendar day
daily_groups = dataset.resample('D')
# Each daily value is the sum of that day's observations
daily_data = daily_groups.sum()
# Report the dimensions followed by the first rows
print(daily_data.shape, daily_data.head(), sep='\n')
# Persist the daily totals
daily_data.to_csv('household_power_consumption_days.csv')

In [None]:
## Line plots -----------------------------------------------------------------
# One stacked subplot per column of `dataset`, which must already be loaded
# by an earlier cell.
n_columns = len(dataset.columns)
pyplot.figure(figsize=(8,8))
for i, name in enumerate(dataset.columns):
    pyplot.subplot(n_columns, 1, i+1)
    pyplot.plot(dataset[name])
    # y=0 places the title at the bottom edge of its subplot
    pyplot.title(name, y=0)
# Compute the layout once, after every subplot exists. Calling this inside
# the loop recomputed the whole figure layout on each iteration.
pyplot.tight_layout()
pyplot.show()

In [None]:
## Yearly line plots

# Plot active power for each year, one subplot per year
years = ['2007', '2008', '2009', '2010']
pyplot.figure(figsize=(8,8))
for i, year in enumerate(years):
    # prepare subplot
    ax = pyplot.subplot(len(years), 1, i+1)
    # Get all observations for the year. Use .loc for partial-string row
    # selection: dataset['2007'] (row lookup through column-style indexing)
    # is deprecated and no longer works in recent pandas releases.
    result = dataset.loc[year]
    # plot the active power for the year
    pyplot.plot(result['Global_active_power'])
    # add a title to the subplot
    pyplot.title(year, y=0, loc='left')
# Lay the figure out once, after all subplots have been created
pyplot.tight_layout()
pyplot.show()

In [None]:
## Monthly plots
# Plot active power for each month of 2007, one subplot per month
months = list(range(1, 13))
pyplot.figure(figsize=(8,8))
for i, month_number in enumerate(months):
    # prepare subplot
    ax = pyplot.subplot(len(months), 1, i+1)
    # determine the month to plot
    month = '2007-' + str(month_number)
    # Get all observations for the month. Use .loc for partial-string row
    # selection: dataset[month] is deprecated and no longer works in recent
    # pandas releases.
    result = dataset.loc[month]
    # plot the active power for the month
    pyplot.plot(result['Global_active_power'])
    # add a title to the subplot
    pyplot.title(month, y=0, loc='left')
pyplot.show()

In [None]:
## Daily plots
# Plot active power for each of the first 19 days of January 2007
days = list(range(1, 20))
# constrained_layout handles subplot spacing, so tight_layout is not needed
pyplot.figure(figsize=(10,20),constrained_layout=True)
for i, day_number in enumerate(days):
    # prepare subplot
    ax = pyplot.subplot(len(days), 1, i+1)
    # determine the day to plot
    day = '2007-01-' + str(day_number)
    # Get all observations for the day. Use .loc for partial-string row
    # selection: dataset[day] is deprecated and no longer works in recent
    # pandas releases.
    result = dataset.loc[day]
    # plot the active power for the day
    pyplot.plot(result['Global_active_power'])
    # add a title to the subplot
    pyplot.title(day, y=0, loc='left')
pyplot.show()

In [None]:
# Histogram plot for each variable, one stacked subplot per column
n_columns = len(dataset.columns)
pyplot.figure(figsize=(10,15))
for i, name in enumerate(dataset.columns):
    ax = pyplot.subplot(n_columns, 1, i+1)
    # Draw on the subplot just created instead of relying on the implicit
    # "current axes" state
    dataset[name].hist(bins=100, ax=ax)
    pyplot.title(name, y=0)
# Compute the layout once, after every subplot exists. Calling this inside
# the loop recomputed the whole figure layout on each iteration.
pyplot.tight_layout()
pyplot.show()

In [None]:
## Active Power Consumption Plots ##
# Histogram of active power for each year, one subplot per year
years = ['2007', '2008', '2009', '2010']
pyplot.figure(figsize=(10,10))
for i, year in enumerate(years):
    # prepare subplot
    ax = pyplot.subplot(len(years), 1, i+1)
    # Get all observations for the year. Use .loc for partial-string row
    # selection: dataset['2007'] is deprecated and no longer works in recent
    # pandas releases.
    result = dataset.loc[year]
    # histogram of the active power for the year, drawn on this subplot
    result['Global_active_power'].hist(bins=100, ax=ax)
    # zoom in on the distribution
    ax.set_xlim(0, 5)
    # add a title to the subplot
    pyplot.title(year, y=0, loc='right')
pyplot.show()

In [None]:
# Histogram of active power for each month of 2007, one subplot per month
months = list(range(1, 13))
pyplot.figure(figsize=(10,10))
for i, month_number in enumerate(months):
    # prepare subplot
    ax = pyplot.subplot(len(months), 1, i+1)
    # determine the month to plot
    month = '2007-' + str(month_number)
    # Get all observations for the month. Use .loc for partial-string row
    # selection: dataset[month] is deprecated and no longer works in recent
    # pandas releases.
    result = dataset.loc[month]
    # histogram of the active power for the month, drawn on this subplot
    result['Global_active_power'].hist(bins=100, ax=ax)
    # zoom in on the distribution
    ax.set_xlim(0, 5)
    # add a title to the subplot
    pyplot.title(month, y=0, loc='right')
pyplot.show()