# Time series prediction of one pollutant at one station, one hour into the future

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import sys, os, pickle
import pandas as pd

# Put the parent directory on the import path so that the project's `src`
# package can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src import plotting, modelling

%matplotlib inline

In [None]:
# Read the processed SO2 table and keep only the rows of station 101.

df = (
    pd.read_pickle('../data/processed/SO2_0.pkl')
    .loc[lambda frame: frame['station'] == 101]
)


In [None]:
# Create data for model: the 'value' column indexed by 'datetime'.
# set_index returns a new object, so the column held by `df` is not
# re-indexed in place (assigning `.index` on the Series taken from `df`
# would mutate an object that `df` may still reference).

df_data = df.set_index('datetime')['value']
df_data.plot(subplots=True)


In [None]:
# Drop the datetime index: from here on `df_data` is only the underlying
# array of readings, no longer a pandas Series.

df_data = df_data.values


In [None]:
# Standardize the series. Mean and standard deviation are computed on the
# first `train_split` samples only, then applied to the whole array.

train_split = 24 * 365 * 2  # presumably two years of hourly samples - confirm

df_train_mean = np.mean(df_data[:train_split])
df_train_std = np.std(df_data[:train_split])

df_data = (df_data - df_train_mean) / df_train_std


In [None]:
# Split the time series into train and validation arrays.
# Training is built from the index range 0..train_split, validation from
# train_split to the end of the series (None).
# NOTE(review): the window/target layout is defined by modelling.chop_data;
# confirm that future_size = 0 gives the intended one-step-ahead target.

history_size = 20
future_size = 0
window = (history_size, future_size)

x_train, y_train = modelling.chop_data(df_data, 0, train_split, *window)
x_val, y_val = modelling.chop_data(df_data, train_split, None, *window)

## Begin model definition in TensorFlow

In [None]:
# Wrap the train and validation arrays in tf.data pipelines.
# Both pipelines end in repeat(), so they never run out of batches.

BATCH_SIZE = 256
BUFFER_SIZE = 10000

# Training: cache, shuffle, then batch.
train = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .repeat()
)

# Validation: batched in the original order, no shuffling.
val = (
    tf.data.Dataset.from_tensor_slices((x_val, y_val))
    .batch(BATCH_SIZE)
    .repeat()
)

### Model Definition

In [None]:
# Sequential tf.keras model: one LSTM layer followed by a single-unit Dense
# output layer.

units = 1

model = tf.keras.models.Sequential()
# The LSTM input shape is taken from the last two axes of x_train.
model.add(tf.keras.layers.LSTM(units, input_shape=x_train.shape[-2:]))
model.add(tf.keras.layers.Dense(1))

In [None]:
# Compile with the Adam optimizer and mean-absolute-error loss; MAE is also
# reported as a metric.
OPT = 'adam'
LOSS = 'mae'

model.compile(
    loss=LOSS,
    optimizer=OPT,
    metrics=['mae'],
)


In [None]:
# Sanity check: push one validation batch through the model and print the
# shape of the prediction it returns.

for x, y in val.take(1):
    predicted = model.predict(x)
    print(predicted.shape)
    

In [None]:
# Train the model. `train` and `val` repeat indefinitely, so the length of
# an epoch and of each validation pass is set by explicit step counts.
EPOCHS = 2
EVALUATION_INTERVAL = 200

model.fit(
    train,
    validation_data=val,
    epochs=EPOCHS,
    steps_per_epoch=EVALUATION_INTERVAL,
    validation_steps=50,
)

In [None]:
# Print the Keras summary of the model.
model.summary()

In [None]:
# Save the model and its training history.

model_name = 'LSTM{}'.format(units)
subd = 'SO2/'
model_path = '../src/models/{}'.format(subd)

# exist_ok=True makes re-running this cell harmless when the directory
# already exists.
os.makedirs(model_path, exist_ok=True)

# No save_format is passed: the '.h5' suffix selects the HDF5 format, and an
# explicit save_format='tf' would contradict the file name.
model.save(model_path+'{}.h5'.format(model_name))

# History dict recorded by the most recent fit() call.
modelling.save_history(model.history.history, model_name, subd)


### Plot loss history

In [None]:
# Plot the loss history recorded by the most recent fit() call.
loss_plot_title = 'Single Step Training and validation loss'
plotting.plot_train_history(model.history.history, loss_plot_title)

In [None]:
# Visualize predictions against actual values, using the first sample of
# each of three validation batches.

for x, y in val.take(3):
    history_window = x[0].numpy()
    actual = y[0].numpy()
    predicted = model.predict(x)[0]
    plot = plotting.y_vs_yhat([history_window, actual, predicted],
                              0,
                              'Simple LSTM model')
    plot.show()