In [22]:
%matplotlib inline
import pandas
import matplotlib
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# ---------------------------------------------------------------------------
# Configuration: every tunable used by the preprocessing and the model.
# ---------------------------------------------------------------------------

# Number of future (windowed) samples averaged to form the prediction target
prediction_window = 24

# Number of consecutive raw rows averaged together to form one windowed row
window_stride = 12

# Number of windowed rows in each input sequence
sequence_length = 24 * 7

# Input features, in the column order used for the unprocessed dataset
columns = ['hour', 'temp', 'windspd', 'winddir', 'no', 'no2', 'nox', 'o3']

# Number of features we take from the data. Derived from `columns` so the
# two can never drift apart when a feature is added or removed.
input_features = len(columns)

# When True, window_stride FFT bins per input feature are appended to each row
fft_features = True

# Number of total features per windowed row: the per-feature means plus,
# optionally, window_stride frequency bins for every feature
num_inputs = input_features + window_stride * input_features if fft_features else input_features

# Number of things we are doing regression to predict
num_outputs = 4

# Load the CSV and discard the columns that are not used as features
df = (
    pandas.read_csv('../data-sample/ready/d00_single.csv')
    .drop(['AQS_Code', 'Latitude', 'Longitude', 'epoch', 'day'], axis=1)
)

# Unprocessed dataset: the selected feature columns as a plain array,
# one array column per entry of `columns` (same order)
nd = df.loc[:, columns].values

# Windowed dataset: one output row per complete group of window_stride raw
# rows. A trailing group with fewer than window_stride rows is dropped.
num_windows = nd.shape[0] // window_stride
nd_window = np.zeros((num_windows, num_inputs))

# Iterate over exactly the windows that exist instead of running past the end
# and relying on an IndexError to stop; a genuine indexing mistake (for
# example input_features exceeding the number of columns in nd) now surfaces
# as an error rather than silently leaving zero-filled features.
for window_idx in range(num_windows):
    row = window_idx * window_stride
    chunk = nd[row:row + window_stride]

    for i in range(input_features):
        # Mean features: columns [0, input_features)
        nd_window[window_idx, i] = np.mean(chunk[:, i])

        if fft_features:
            # Frequency features: feature i owns window_stride consecutive
            # columns starting at input_features + window_stride * i.
            # NOTE(review): only the real part of the FFT is kept. For real
            # input, bins k and window_stride - k have the same real part, so
            # roughly half of these columns duplicate each other.
            first_bin = input_features + window_stride * i
            nd_window[window_idx, first_bin:first_bin + window_stride] = np.real(np.fft.fft(chunk[:, i]))

# Min-max scale every column of the windowed dataset; the fitted scaler is
# kept in `scaler`.
# NOTE(review): the scaler is fitted on all rows, including those that end up
# in the validation split passed to model.fit later on, so validation
# statistics leak into the scaling. Confirm whether that is acceptable.
scaler = MinMaxScaler()
nd_window = scaler.fit_transform(nd_window)

# Create sequences
#
# Each sample pairs the sequence_length windowed rows ending at row `last`
# with the mean of the prediction_window rows that immediately follow it.
# `last` therefore starts at the first row with a full history behind it and
# stops while a full prediction window still fits after it.
history = sequence_length - 1
sequence_ends = range(history, nd_window.shape[0] - prediction_window)

data = np.array([
    nd_window[last - history : last + 1]
    for last in sequence_ends
])

# We are predicting the future mean values of no, no2, nox and o3, which sit
# at positions 4..7 of `columns` (and of the mean features in nd_window).
target_columns = (4, 5, 6, 7)

labels = np.array([
    [
        np.mean(nd_window[last + 1 : last + 1 + prediction_window, col])
        for col in target_columns
    ]
    for last in sequence_ends
])



In [20]:
import os

# Tell the OpenMP runtime to tolerate a second copy of itself being loaded.
# This has to be in the environment before keras (and the native libraries it
# pulls in) is imported, otherwise the setting may be read too late.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

from keras import backend as K
from keras.layers import Dense, LSTM
from keras.models import Sequential

def r2(y_true, y_pred):
    """Coefficient of determination (R^2) as a Keras metric.

    Computes ``1 - SS_res / (SS_tot + epsilon)``, where ``SS_res`` is the sum
    of squared residuals and ``SS_tot`` is the sum of squared deviations of
    ``y_true`` from its mean. Both sums and the mean are taken over every
    element of the tensors (no axis is given), so all outputs are pooled into
    a single score. ``K.epsilon()`` guards against division by zero when
    ``y_true`` is constant.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predictions, same shape as ``y_true``.

    Returns:
        Scalar tensor holding the pooled R^2 value.
    """
    residual_ss = K.sum(K.square(y_true - y_pred))
    deviation = y_true - K.mean(y_true)
    total_ss = K.sum(K.square(deviation))
    return 1 - residual_ss / (total_ss + K.epsilon())

# Network: a per-timestep linear Dense layer, an LSTM that reduces the
# sequence to a single vector, a ReLU hidden layer and a linear regression
# head with one unit per predicted quantity.
model = Sequential([
    Dense(num_inputs, input_shape=(sequence_length, num_inputs)),
    LSTM(256, return_sequences=False, dropout=0.2),
    Dense(512, activation='relu'),
    Dense(num_outputs),
])

# Train with mean squared error and report r2 alongside the loss
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[r2],
)
model.summary()

# validation_split holds out a fraction of the samples for validation
model.fit(
    x=data,
    y=labels,
    batch_size=128,
    epochs=20,
    validation_split=0.33,
    verbose=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 168, 104)          10920     
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               369664    
_________________________________________________________________
dense_5 (Dense)              (None, 512)               131584    
_________________________________________________________________
dense_6 (Dense)              (None, 4)                 2052      
Total params: 514,220
Trainable params: 514,220
Non-trainable params: 0
_________________________________________________________________
Train on 5455 samples, validate on 2687 samples
Epoch 1/20
Epoch 2/20
 896/5455 [===>..........................] - ETA: 1:22 - loss: 0.0023 - r2_keras: 0.7664

KeyboardInterrupt: 