Air Pollution Forecasting - Many To One - RNN LSTM

We can use this data to frame a forecasting problem where, given the
pollution and the other recorded variables for the prior hour, we forecast the pollution at the next hour.

In [None]:
import warnings
# Suppress every warning for the rest of the session. This also hides
# library deprecation notices, so disable it when debugging.
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# force_remount=True is passed so an existing mount is replaced.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

In [None]:
# Change the working directory so the relative CSV paths used below resolve.
# NOTE(review): the path ends in '/../', i.e. the parent of 'My Drive' - confirm this is the intended data folder.
%cd '/content/gdrive/My Drive/../'

In [None]:
# LSTM for air pollution problem with regression framing
from datetime import datetime
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Read the pollution CSV, using its first column as the row index,
# and preview the first rows.
dataset = pd.read_csv('pollution_new_1.csv', index_col= 0)
dataset.head()

In [None]:
# Column dtypes and non-null counts.
dataset.info()

In [None]:
# Summary statistics of the numeric columns.
dataset.describe()

In [None]:
# Columns to visualise; column 4 is skipped because it is a string column.
groups = [0, 1, 2, 3, 5, 6, 7]
values = dataset.values

# Draw one stacked subplot per selected column, titled with the column name.
plt.figure(figsize=(20,20))
for i, group in enumerate(groups, start=1):
  plt.subplot(len(groups), 1, i)
  plt.plot(values[:, group])
  plt.title(dataset.columns[group], y=0.5, loc='right')
plt.show()

In [None]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
  """Frame a time series as a supervised-learning table.

  Every output row holds the observations of the ``n_in`` previous steps
  followed by the observations of the current step and of the next
  ``n_out - 1`` steps.

  Args:
    data: the observations, ordered in time. Either a flat list / 1-D array
      (a single variable) or a 2-D array / list of rows (one column per
      variable).
    n_in: number of lagged steps (t-n_in ... t-1) placed first in each row.
    n_out: number of steps (t ... t+n_out-1) placed after the lagged ones.
    dropnan: when True, rows containing NaN (including those made
      incomplete by the shifting) are removed.

  Returns:
    A pandas DataFrame whose columns are named ``var<j>(t-<i>)``,
    ``var<j>(t)`` and ``var<j>(t+<i>)``.
  """
  df = pd.DataFrame(data)
  # Take the variable count from the frame itself: this is correct for flat
  # lists, lists of rows, and 1-D or 2-D arrays alike.
  n_vars = df.shape[1]
  cols, names = [], []
  # input sequence (t-n, ... t-1)
  for i in range(n_in, 0, -1):
    cols.append(df.shift(i))
    names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
  # forecast sequence (t, t+1, ... t+n)
  for i in range(n_out):
    cols.append(df.shift(-i))
    if i == 0:
      names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
    else:
      names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
  # put it all together
  agg = pd.concat(cols, axis=1)
  agg.columns = names
  # drop rows with NaN values
  if dropnan:
    agg = agg.dropna()
  return agg

In [None]:
# # load dataset
# NOTE(review): the following cell begins with this same assignment, so this cell is redundant.
values = dataset.values

In [None]:
# Pull the raw observations out of the frame.
values = dataset.values
# Integer-encode the direction column (column 4 holds strings).
encoder = LabelEncoder()
values[:, 4] = encoder.fit_transform(values[:, 4])
# Cast every column to float32.
values = values.astype('float32')
# Rescale each feature into the [0, 1] range.
# NOTE(review): the scaler is fit on all rows, including the ones later held out as the test split.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)
print("Frame as Series:")
print(scaled[:5])

In [None]:
# Pair each row with its predecessor: one lagged step in, one step out.
reframed = series_to_supervised(scaled, 1, 1)
print("Frame as supervised learning:")
print(reframed.head())
# Only var1(t) is predicted, so the other current-step columns
# (positions 9 through 15) are removed.
not_predicted = reframed.columns[9:16]
reframed.drop(columns=not_predicted, inplace=True)
print("Frame will use:")
print(reframed.head())

In [None]:
# The data is hourly: the first four years of rows are used for training,
# everything after that for testing.
values = reframed.values
n_train_hours = 365 * 24 * 4
train, test = values[:n_train_hours, :], values[n_train_hours:, :]
# The last column is the target; all preceding columns are inputs.
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]
print("Before reshape:")
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)
# Insert a length-1 timestep axis so inputs are 3D [samples, timesteps, features].
train_X = train_X[:, np.newaxis, :]
test_X = test_X[:, np.newaxis, :]
print("After reshape:")
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

In [None]:
import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,LSTM

In [None]:
# Network: one LSTM layer with 32 units reading (timesteps, features) inputs,
# followed by a single-output Dense layer. Trained with MAE loss and Adam.
model = Sequential([
  LSTM(32, input_shape=(train_X.shape[1], train_X.shape[2])),
  Dense(1),
])
model.compile(loss='mae', optimizer='adam')

# Train the network.
# NOTE(review): the test split is also used as validation data here, and no
# random seed is set in this cell, so results may differ between runs.
history = model.fit(
  train_X,
  train_y,
  epochs=50,
  batch_size=72,
  validation_data=(test_X, test_y),
  verbose=2,
)

In [None]:
# Layer-by-layer overview of the model, including parameter counts.
model.summary()

How to calculate Params

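LSTM parameters = 4 × [(num_units + input_dim + 1) × num_units] = 4 × [(32 + 8 + 1) × 32] = 5248 (four gates, each with input weights, recurrent weights and one bias per unit)

Dense parameters = (units in this layer × units in the previous layer) + biases = 1 × 32 + 1 = 33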

In [None]:
# Training loss and validation loss per epoch, drawn on the same axes.
for key, label in (('loss', 'train'), ('val_loss', 'test')):
  plt.plot(history.history[key], label=label)
plt.legend()
plt.show()

In [None]:
# Shape of the test inputs.
test_X.shape

In [None]:
# Shapes of the train/test inputs and targets.
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

In [None]:
def invert_scaling(y, X, s, fitted_scaler=None):
  """Map scaled target values back to their original scale.

  The values in ``y`` take the place of the first column of ``X``. The
  resulting rows are passed through the scaler's ``inverse_transform`` and
  only the first column of the result is returned.

  Args:
    y: scaled target values, either 1-D with n entries or 2-D with shape (n, 1).
    X: 2-D array with n rows; its first column is discarded and replaced by ``y``.
    s: label printed in front of the shape messages.
    fitted_scaler: object providing ``inverse_transform``. When omitted, the
      notebook-level ``scaler`` is used.

  Returns:
    1-D array holding the inverse-transformed first column.
  """
  if fitted_scaler is None:
    fitted_scaler = scaler  # fall back to the scaler defined at notebook level
  y = np.asarray(y)
  if y.ndim == 1:
    # A flat target vector must become a column before it can be joined
    # with the 2-D feature matrix.
    y = y.reshape(-1, 1)
  # invert scaling for forecast
  inv_y = np.concatenate((y, X[:, 1:]), axis=1)
  print(s, "shape:", inv_y.shape)
  inv_y = fitted_scaler.inverse_transform(inv_y)
  print(s, inv_y.shape)
  # keep only the first column, giving a 1-D result
  inv_y = inv_y[:, 0]
  return inv_y

In [None]:
# Predict on the test inputs, then drop the length-1 timestep axis so the
# inputs are 2D [samples, features] again.
yhat = model.predict(test_X)
print("Test_x_shape:", test_X.shape)
test_X_now = np.squeeze(test_X, axis=1)
print("Test_x_now_shape:", test_X_now.shape)

In [None]:
# invert scaling for forecast
# invert_scaling joins the predictions with the remaining input columns,
# applies the scaler's inverse transform and returns the first column. The
# hand-written copy of those steps that used to precede this call was removed
# because its result was immediately overwritten.
inv_yhat = invert_scaling(yhat, test_X_now, "inv_yhat")

In [None]:
#invert scaling for actual
# test_y is 1-D after the train/test split. It has to become a column before
# it can be concatenated with the 2-D feature matrix, so the reshape comes first.
test_y = test_y.reshape((len(test_y), 1))
inv_y = invert_scaling(test_y, test_X_now, "inv_y")

In [None]:
# Error metrics computed on the inverse-transformed values.
mse = mean_squared_error(inv_y, inv_yhat)
rmse = math.sqrt(mse)
mae = mean_absolute_error(inv_y, inv_yhat)
print('Test RMSE: %.3f' % rmse)
print('Test MAE: %.3f' % mae)

In [None]:
# Difference between actual and predicted test values, per sample.
residuals = inv_y - inv_yhat
plt.figure(figsize=(15,10))
plt.plot(residuals, label='Diff between y_test and y_test_hat')
plt.legend(title="Notes")
plt.show()

In [None]:
# Predict on the training inputs and drop the length-1 timestep axis
# from the inputs.
y_train_hat = model.predict(train_X)
train_X_now = np.squeeze(train_X, axis=1)

In [None]:
# Invert scaling for the training forecast: put the prediction in place of the
# first input column, inverse-transform the rows, keep the first column.
# NOTE(review): the next cell recomputes the same value through invert_scaling.
train_rows = np.concatenate((y_train_hat, train_X_now[:, 1:]), axis=1)
inv_y_train_hat = scaler.inverse_transform(train_rows)[:, 0]

In [None]:
# Inverse-transform the training predictions via the shared helper.
inv_y_train_hat = invert_scaling(y_train_hat, train_X_now, "inv_y_train_hat")

In [None]:
# Overlay the test targets and the model's predictions. Both test_y and yhat
# are the scaled values, not the inverse-transformed ones.
plt.figure(figsize=(15,10))
for series, label, color in (
    (test_y, 'Test Real Data', 'red'),
    (yhat, 'Test Prediction', 'green'),
):
  plt.plot(series, label=label, color=color)
plt.legend(title="Notes")
plt.show()

In [None]:
# plot baseline and predictions
# NOTE(review): this cell is an exact copy of the previous plotting cell.
# Possibly a training-data plot was intended here - confirm or remove.
plt.figure(figsize=(15,10))
plt.plot(test_y, label='Test Real Data', color='red')
plt.plot(yhat, label='Test Prediction', color='green')
plt.legend(title="Notes")
plt.show()

In [None]:
# Evaluate the model on the training split. The printed label says
# 'Val_loss', but the data passed here is the training data.
print('Val_loss train')
model.evaluate(train_X, train_y)

In [None]:
# Evaluate the model on the test split.
print('Val_loss test')
model.evaluate(test_X, test_y)

Make new prediction

In [None]:
# Read the CSV with the new observations to predict on, using its first
# column as the row index, and preview the first rows.
dataset_new = pd.read_csv('pollution_new_predict.csv', index_col= 0)
dataset_new.head()

In [None]:
# Raw array of the new observations; the bare name displays it in full.
values_new = dataset_new.values
values_new

In [None]:
# Encode column 4 with the encoder that was already fitted earlier (transform
# only, no refit), so labels map to the same integer codes as before.
values_new[:,4] = encoder.transform(values_new[:,4])
# ensure all data is float
values_new = values_new.astype('float32')
# Total number of elements in the array.
values_new.size

In [None]:
# Show the encoded float32 array.
print(values_new)

In [None]:
# Scale with the scaler fitted earlier (transform only, no refit).
scaled_new = scaler.transform(values_new)
print(scaled_new)

In [None]:
# Frame the new data with one lagged step in and one step out, then remove
# the current-step columns at positions 9 through 15.
reframed_new = series_to_supervised(scaled_new, 1, 1)
not_predicted_new = reframed_new.columns[9:16]
reframed_new.drop(columns=not_predicted_new, inplace=True)
print(reframed_new.head())

In [None]:
# Rows and columns remaining after the reframing.
reframed_new.shape

In [None]:
# Note: values_new is rebound here to the reframed array.
values_new = reframed_new.values
# Every column except the last is an input; a length-1 timestep axis is
# inserted so the array is 3D [samples, timesteps, features].
new_pre = values_new[:, :-1][:, np.newaxis, :]
print(new_pre.shape)

In [None]:
# Predict on the new inputs; the bare name displays the scaled predictions.
yhat_new_pre = model.predict(new_pre)
yhat_new_pre

In [None]:
# Prepare for inverse scaling: predictions as a single column, inputs without
# the length-1 timestep axis.
n_new_rows = len(yhat_new_pre)
yhat_new_pre = yhat_new_pre.reshape((n_new_rows, 1))
new_pre_now = np.squeeze(new_pre, axis=1)

In [None]:
# Invert the scaling exactly once. The earlier version inverted the values by
# hand, reduced them to 1-D and then passed them to invert_scaling again,
# which fails when the 1-D values are concatenated with the 2-D inputs.
yhat_new_pre = invert_scaling(yhat_new_pre, new_pre_now, "yhat_new_pre")
yhat_new_pre