<a href="https://colab.research.google.com/github/gagandt/modelling-air-pollution/blob/master/multivariate/GRID_MLP_on_PM2_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; a later cell changes into
# '/content/drive/My Drive/Colab Notebooks' to read and write its files.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from pandas import DataFrame
import pandas
from pandas import concat

# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	"""Frame a time series as a supervised-learning table of lagged columns.

	Args:
		data: Observations, one row per time step. Anything ``DataFrame``
			accepts: a flat list, a list of rows, a 1-D or 2-D array, a
			Series or a DataFrame.
		n_in: Number of lag steps (t-n_in ... t-1) used as input columns.
		n_out: Number of forecast steps (t ... t+n_out-1) appended as
			output columns.
		dropnan: When true, drop the rows that contain NaN values
			introduced by shifting.

	Returns:
		DataFrame whose columns are named ``var<j>(t-<i>)``, ``var<j>(t)``
		and ``var<j>(t+<i>)``, with ``j`` numbering the variables from 1.
	"""
	df = DataFrame(data)
	# Count the variables on the frame itself rather than guessing from the
	# input type: a list of rows has several columns and a 1-D array has no
	# shape[1], both of which the type-based guess got wrong.
	n_vars = df.shape[1]
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
	# put it all together
	agg = concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg = agg.dropna()
	return agg

In [None]:
import os
from datetime import datetime

from pandas import DatetimeIndex, read_csv, to_datetime

# load data
os.chdir('/content/drive/My Drive/Colab Notebooks')
def parse(x):
	"""Parse a 'YYYY MM DD HH' string into a datetime.

	No longer used by the loader below; kept so any other cell that calls
	it keeps working.
	"""
	return datetime.strptime(x, '%Y %m %d %H')

# Read the raw file and build the timestamp from its year/month/day/hour
# columns. This replaces read_csv(parse_dates=[[...]], date_parser=...),
# which relies on pandas options that are deprecated in pandas 2.x.
raw = read_csv('./data/beijing.csv')
timestamps = to_datetime(raw[['year', 'month', 'day', 'hour']])
# Drop the row counter and the date parts that are now in the index.
dataset = raw.drop(columns=['No', 'year', 'month', 'day', 'hour'])
dataset.index = DatetimeIndex(timestamps)
# manually specify column names
dataset.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']
dataset.index.name = 'date'
# mark all NA values with 0 (assigned back instead of a chained in-place
# fillna, which may operate on a temporary copy)
dataset['pollution'] = dataset['pollution'].fillna(0)
# drop the first 24 hours
dataset = dataset[24:]
# summarize first 5 rows
print(dataset.head(5))
# save to file
dataset.to_csv('pollution.csv')


                     pollution  dew  temp   press wnd_dir  wnd_spd  snow  rain
date                                                                          
2010-01-02 00:00:00      129.0  -16  -4.0  1020.0      SE     1.79     0     0
2010-01-02 01:00:00      148.0  -15  -4.0  1020.0      SE     2.68     0     0
2010-01-02 02:00:00      159.0  -11  -5.0  1021.0      SE     3.57     0     0
2010-01-02 03:00:00      181.0   -7  -5.0  1022.0      SE     5.36     1     0
2010-01-02 04:00:00      138.0   -7  -5.0  1022.0      SE     6.25     2     0


In [None]:
import copy
import keras
from math import sqrt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.metrics import mean_squared_error
import numpy as np

def grid_search(dataset, lookback, nodes, layers, batch_sizes, split_percentage, epochs=None):
  """Train and score a Dense network for every hyper-parameter combination.

  For each lookback period the data is framed with series_to_supervised,
  split chronologically into train and test parts, and a Sequential model of
  Dense layers is fitted for every (nodes, layers, batch size, epochs)
  combination. Each run's history, test targets and predictions are written
  under 'LSTM_runtime/', and the running score list is written to
  'LSTM_scores' after every run.

  Args:
    dataset: 2-D table of observations, one row per time step, passed
      unchanged to series_to_supervised. The first column is the target.
    lookback: Iterable of lag counts to try.
    nodes: Iterable of hidden-layer widths to try.
    layers: Iterable of hidden-layer counts to try.
    batch_sizes: Iterable of batch sizes to try.
    split_percentage: Fraction of the rows used for training; the rest is
      the test set.
    epochs: Iterable of epoch counts to try. When omitted, the module-level
      name `epochs` is used, which is how the function was configured
      before this parameter existed.

  Returns:
    List of [rmse, lookback_period, number_of_nodes, number_of_layers,
    batch_size, number_of_epochs], one entry per trained model.

  Raises:
    NameError: If `epochs` is not passed and no module-level `epochs` exists.
  """
  import os  # function-scope import so the block does not rely on another cell

  if epochs is None:
    # Backward compatibility: existing callers set a global `epochs` instead
    # of passing it in.
    if 'epochs' not in globals():
      raise NameError("name 'epochs' is not defined; pass epochs=[...] to grid_search")
    epochs = globals()['epochs']

  # np.save does not create missing directories, so make them up front.
  for output_directory in ('LSTM_runtime/history', 'LSTM_runtime/test_y', 'LSTM_runtime/yhat'):
    os.makedirs(output_directory, exist_ok=True)

  scores = list()

  for lookback_period in lookback:
    values = series_to_supervised(dataset, lookback_period, 1).values

    # Chronological split: the first part trains, the remainder tests.
    n_train_hours = int(split_percentage * len(values))
    train = values[:n_train_hours, :]
    test = values[n_train_hours:, :]

    # The supervised frame holds n_features columns for each of the
    # lookback_period lags plus one more group for time t.
    n_features = values.shape[1] // (lookback_period + 1)
    n_obs = lookback_period * n_features
    # Inputs are all lagged columns; the target is the first variable at t.
    train_X, train_y = train[:, :n_obs], train[:, -n_features]
    test_X, test_y = test[:, :n_obs], test[:, -n_features]

    # keep the input 2D [samples, lags * features], as Dense layers expect
    train_X = train_X.reshape((train_X.shape[0], n_obs))
    test_X = test_X.reshape((test_X.shape[0], n_obs))
    print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

    for number_of_nodes in nodes:

      for number_of_layers in layers:
        # NOTE(review): no activation is passed to Dense, so the hidden
        # layers use the Keras default - confirm this is intended.
        model = Sequential()
        for i in range(number_of_layers):
          if i == 0:
            model.add(Dense(number_of_nodes, input_shape=(n_obs,)))
          else:
            model.add(Dense(number_of_nodes))

        model.add(Dense(1))
        model.compile(loss='mae', optimizer='adam')

        # Every batch-size/epoch run below starts from a copy of this model.
        ref_model = copy.deepcopy(model)

        for batch_size in batch_sizes:
          for number_of_epochs in epochs:
            model = copy.deepcopy(ref_model)

            name = str(lookback_period)+'_'+str(number_of_nodes)+'_'+str(number_of_layers)+'_'+str(batch_size)+'_'+str(number_of_epochs)
            print("Starting training for : ", name)

            history = model.fit(train_X, train_y, epochs=number_of_epochs, batch_size=batch_size, validation_data=(test_X, test_y), verbose=2, shuffle=False)
            yhat = model.predict(test_X)

            # NOTE(review): `history` is the object returned by fit, not an
            # array, so np.save has to pickle it; consider saving
            # history.history instead.
            np.save('LSTM_runtime/history/' + name, history)
            np.save('LSTM_runtime/test_y/' + name, test_y)
            np.save('LSTM_runtime/yhat/' + name, yhat)

            # mean_squared_error returns the MSE; take the square root so the
            # value matches its "RMSE" label.
            rmse = sqrt(mean_squared_error(test_y, yhat))
            print("RMSE for ", name, " = " , rmse)

            scores.append([rmse, lookback_period, number_of_nodes, number_of_layers, batch_size, number_of_epochs])
            # Save after every run so partial results survive an interruption.
            np.save('LSTM_scores', np.array(scores))
            keras.backend.clear_session()
  return scores
          
  

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Load the cleaned table written by the data-loading cell and prepare two
# numeric arrays from it: `values` (float32) and `scaled` (each column
# mapped to the range 0-1).

# load dataset
dataset = read_csv('pollution.csv', header=0, index_col=0)
values = dataset.values

# integer encode direction
# (column 4 is 'wnd_dir', the only text column in the saved table)
encoder = LabelEncoder()
values[:,4] = encoder.fit_transform(values[:,4])

# ensure all data is float
values = values.astype('float32')

# normalize features
# NOTE(review): `scaled` is not used by any cell visible here; the grid
# search is called with `values` - confirm which one is intended.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

In [None]:
# Hyper-parameter grid for the Dense-network search. Each list holds the
# values tried for that setting; grid_search trains one model per combination.
layers = [2,4,16,32]
lookback = [1,3,5,7]
nodes = [25,50,75]
# batch_size = [50, 75, 100]
batch_size = [50, 75, 100]
# Fuller setting, disabled in favour of a single epoch:
# epochs = [50, 100]
# `epochs` is not passed in the call below; grid_search picks up this
# module-level name, so it must be defined before the call runs.
epochs = [1]
split_percentage = 0.8

# NOTE(review): `values` is the unscaled float32 array; the MinMax-scaled
# `scaled` array built in the previous cell is not used - confirm intent.
scores = grid_search(values, lookback, nodes, layers, batch_size, split_percentage)
print(scores)
# grid_search already writes 'LSTM_scores' after every run; this repeats the
# save with the final list.
np.save('LSTM_scores', np.array(scores))

(35039, 8) (35039,) (8760, 8) (8760,)
8
Starting training for :  1_25_2_50_1
Train on 35039 samples, validate on 8760 samples
Epoch 1/1
 - 1s - loss: 26.9670 - val_loss: 23.7717
RMSE for  1_25_2_50_1  =  1088.9563
Starting training for :  1_25_2_75_1
Train on 35039 samples, validate on 8760 samples
Epoch 1/1
 - 1s - loss: 30.3775 - val_loss: 19.5419
RMSE for  1_25_2_75_1  =  832.6146
Starting training for :  1_25_2_100_1
Train on 35039 samples, validate on 8760 samples
Epoch 1/1
 - 0s - loss: 36.5360 - val_loss: 23.2347
RMSE for  1_25_2_100_1  =  1052.7308
8
Starting training for :  1_25_4_50_1
Train on 35039 samples, validate on 8760 samples
Epoch 1/1
 - 1s - loss: 25.8252 - val_loss: 15.1252
RMSE for  1_25_4_50_1  =  661.5668
Starting training for :  1_25_4_75_1
Train on 35039 samples, validate on 8760 samples
Epoch 1/1
 - 1s - loss: 32.4545 - val_loss: 16.1284
RMSE for  1_25_4_75_1  =  702.9793
Starting training for :  1_25_4_100_1
Train on 35039 samples, validate on 8760 samples
Ep

KeyboardInterrupt: ignored