Time-series forecasting from IoT Home Automation Data.

Tuomas Eerola - 2019

Data source: https://github.com/eerolat/home-automation-data-logger

# Run either of the following cells.

This will connect you to one of the following data sources:


1.   Some test data from the Internet; or
2.   the actual sensor data.


In [0]:
import pandas as pd

from urllib.request import urlopen

# Public sample dataset used as a stand-in for the real sensor log.
log_url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/daily-min-temperatures.csv"

# read_csv downloads the URL itself, so no separate urlopen() call is made:
# opening a second connection that is never read or closed only leaks a socket.
# The first CSV line is the header and the first column becomes the index.
series_example = pd.read_csv(log_url, header=0, index_col=0)

# `series` is the name the later cells operate on.
series = series_example

print("Data loading ready.")

In [0]:
import pandas as pd

from urllib.request import urlopen

log_url = "http://eerola.dy.fi/temp/temperature.log"

# The log is space-separated; its first two fields are merged and parsed into
# a single timestamp column.
# NOTE(review): read_csv treats the first line as a header by default and the
# column names are replaced right below -- if the log has no header line, its
# first reading is consumed here. Confirm against the log format.
series_own = pd.read_csv(log_url, sep=" ", parse_dates=[[0, 1]])
series_own.columns = ['Date Time', 'SourceInfo1', 'SourceInfo2', 'MeasurementInfo1', 'Temp', 'MeasurementInfo2', 'Measurement2']

# Only the timestamp and the temperature reading are kept.
dropcolumns = ['SourceInfo1', 'SourceInfo2', 'MeasurementInfo1', 'MeasurementInfo2', 'Measurement2']
series_own.drop(dropcolumns, inplace=True, axis=1)
series_own.set_index('Date Time', inplace=True)

# Work on the 1000 most recent raw readings. No hourly resampling is applied:
# a resampled frame would be discarded by this assignment anyway, and the
# forecasting cell assumes 48 samples per day.
series = series_own.tail(1000)

print("Data loading ready.")

# Visualize the data to see what we've got.

In [0]:
from matplotlib import pyplot


# Plot the data loaded into `series` and display the resulting figure.
series.plot()
pyplot.show()

# Split the data into training and validation datasets.

In [0]:
# Fraction (0-1, not a percentage) of the source rows that will be held out
# as validation data; it is multiplied by the row count when splitting.

validation_part = 0.2

In [0]:
# Hold out the trailing `validation_part` share of the rows for validation;
# everything before `split_point` is the training dataset.
total_rows = len(series)
holdout_rows = round(total_rows * validation_part)
split_point = total_rows - holdout_rows

dataset = series[:split_point]
validation = series[split_point:]
print('Dataset %d, Validation %d' % (len(dataset), len(validation)))

# Persist both parts without the index column.
# NOTE(review): to_csv still writes a header line by default, while the
# forecasting cell reads dataset.csv with header=None -- confirm that the
# header text ending up as the first data row is intended.
for frame, target in ((dataset, 'dataset.csv'), (validation, 'validation.csv')):
    frame.to_csv(target, index=False)

# Let's define some functions that we will use later.

In [0]:
import numpy


def difference(dataset, interval=1):
    """Return the lag-`interval` differences of `dataset` as a numpy array.

    Each output value is ``float(dataset[i]) - float(dataset[i - interval])``
    for ``i`` running from ``interval`` to ``len(dataset) - 1``.

    The pair whose earlier element sits at index 0 (``i == interval``) is
    deliberately left out, so element 0 of `dataset` is never converted to
    a float.
    NOTE(review): presumably this is because the forecasting cell passes
    rows read from a CSV whose first row is the header text -- confirm
    before removing the skip.
    """
    deltas = [
        float(dataset[pos]) - float(dataset[pos - interval])
        for pos in range(interval, len(dataset))
        if pos != 0 and pos != interval
    ]
    return numpy.array(deltas)


print("Ready.")

In [0]:
def inverse_difference(history, yhat, interval=1):
    """Undo a lag-`interval` difference for one predicted value.

    Adds `yhat` to the observation located `interval` positions from the
    end of `history` (converted with ``float``) and returns the sum.
    """
    baseline = float(history[-interval])
    return yhat + baseline


print("Ready.")

# Let's run a forecast for the following 6 hours (12 time steps).

In [0]:
from pandas import read_csv

# statsmodels 0.13 removed the legacy ``arima_model.ARIMA`` (the module still
# imports, but constructing the class raises NotImplementedError). Prefer the
# replacement class and fall back to the legacy one only on old installations
# that do not ship the new module.
try:
    from statsmodels.tsa.arima.model import ARIMA
    fit_options = {}
except ImportError:
    from statsmodels.tsa.arima_model import ARIMA
    fit_options = {'disp': 0}  # silence optimizer output in the legacy API

# load dataset
# NOTE(review): dataset.csv is written with a header line but read here with
# header=None, so row 0 of X holds the column name as text; difference()
# never converts element 0 to float, which is what keeps this working.
series = read_csv('dataset.csv', header=None)
X = series.values

# seasonal difference over one day
# assumes two samples per hour (12 time steps == 6 hours) -- TODO confirm
samples_per_day = 24*2
differenced = difference(X, samples_per_day)

# fit model
model = ARIMA(differenced, order=(12,0,1))
model_fit = model.fit(**fit_options)

# multi-step out-of-sample forecast; both predict() bounds are inclusive
forecast_steps = 12
start_index = len(differenced)
end_index = start_index + forecast_steps - 1
forecast = model_fit.predict(start=start_index, end=end_index)

# invert the differenced forecast to something usable: each prediction is
# added back onto the observation one day (samples_per_day steps) earlier
history = list(X)
for measurement, yhat in enumerate(forecast, start=1):
    inverted = inverse_difference(history, yhat, samples_per_day)
    print('Measurement %d: %f' % (measurement, inverted))
    history.append(inverted)

# We can compare the forecast to current data.

In [0]:
# Show the last row of the training part, i.e. the most recent value the
# model was fitted on.
print(dataset.tail(1))

In [0]:
# Show the first 12 held-out rows -- the same number of steps as the forecast
# printed above -- for a side-by-side comparison.
print(validation.head(12))