In [1]:
%matplotlib inline
import pandas
import matplotlib
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Input features: the raw columns taken from the CSV, in the order they
# appear in every windowed row.
columns = ['hour', 'temp', 'windspd', 'winddir', 'no', 'no2', 'nox', 'o3']

# Number of features we take from the data. Derived from `columns` so the
# count and the column list cannot drift apart.
input_features = len(columns)

# Number of future windowed rows that are averaged to form the prediction target
prediction_window = 24

# Average window_stride raw samples together to form a single windowed row
window_stride = 12

# Length (in windowed rows) of each input sequence
sequence_length = 24 * 7

# When True, each windowed row also carries window_stride FFT bins per feature
fft_features = True

# Number of total features per windowed row: means + generated FFT bins
if fft_features:
    num_inputs = input_features + window_stride * input_features
else:
    num_inputs = input_features

# Number of things we are doing regression to predict
num_outputs = 4

# Load the sample CSV and discard the columns that are not used as features.
df = pandas.read_csv('../data-sample/ready/d00_single.csv').drop(
    columns=['AQS_Code', 'Latitude', 'Longitude', 'epoch', 'day'],
)

# Unprocessed dataset: one row per raw sample, one column per entry of
# `columns` (in that order), as a plain NumPy array.
nd = df.loc[:, columns].values

# Windowed dataset: one row per complete, non-overlapping window of
# window_stride raw samples. A trailing partial window (when the number of
# raw samples is not a multiple of window_stride) is dropped explicitly
# rather than by catching IndexError, so genuine indexing mistakes are no
# longer silently swallowed.
if nd.shape[1] < input_features:
    raise ValueError(
        "expected at least %d feature columns, got %d"
        % (input_features, nd.shape[1]))

num_windows = nd.shape[0] // window_stride
nd_window = np.zeros((num_windows, num_inputs))

# View the complete windows as (num_windows, window_stride, input_features)
windows = nd[:num_windows * window_stride, :input_features].reshape(
    (num_windows, window_stride, input_features))

# Mean features: columns [0, input_features)
nd_window[:, :input_features] = windows.mean(axis=1)

if fft_features:
    # Frequency features: the real part of the FFT taken along each window
    # (the imaginary part is discarded). Shape (num_windows, window_stride,
    # input_features).
    bins = np.real(np.fft.fft(windows, axis=1))

    # Lay the bins out feature-major: feature i occupies columns
    # [input_features + window_stride*i, input_features + window_stride*(i+1))
    nd_window[:, input_features:input_features + window_stride * input_features] = (
        bins.transpose(0, 2, 1).reshape(
            (num_windows, window_stride * input_features)))

# Min-max scale every column of the windowed dataset (MinMaxScaler defaults).
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split further down, so test-set minima/maxima influence the training
# features. Confirm whether that is acceptable for this experiment.
scaler = MinMaxScaler()
nd_window = scaler.fit_transform(nd_window)

# Create sequences
#
# Each sample is the flattened block of `sequence_length` consecutive windowed
# rows ending at row `idx`; its label is the mean of the pollutant columns
# over the following `prediction_window` rows.

# Columns we are predicting, looked up by name instead of magic indices
label_columns = [columns.index(c) for c in ('no', 'no2', 'nox', 'o3')]

data = []
labels = []

# We need the entire sequence filled to make a prediction about the future mean
first_idx = sequence_length - 1
# Since we are predicting the mean, make sure we do not go out of bounds in the future
last_idx = nd_window.shape[0] - prediction_window - 1

for idx in range(first_idx, last_idx + 1):
    # Rows idx-sequence_length+1 .. idx, flattened row-major
    data.append(nd_window[idx + 1 - sequence_length:idx + 1].reshape(-1))

    # We are predicting the future mean values
    future = nd_window[idx + 1:idx + 1 + prediction_window]
    labels.append([np.mean(future[:, c]) for c in label_columns])

# reshape (unlike the in-place ndarray.resize) raises if the element count
# does not match instead of silently truncating or zero-padding, and it also
# yields a well-formed (0, sequence_length*num_inputs) array when no sequence
# could be built.
data = np.array(data).reshape((len(data), sequence_length * num_inputs))
labels = np.array(labels)

from sklearn.model_selection import train_test_split

# Chronological split: the first 67% of the sequences train the model and the
# last 33% test it. Consecutive sequences share all but one of their rows and
# their labels are overlapping future means, so a shuffled split would place
# near-duplicates of the training samples in the test set and inflate the
# score. NOTE(review): sequences right at the boundary still overlap; leave a
# gap of sequence_length + prediction_window samples if a strict separation
# is required.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.33, shuffle=False)

In [2]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest on the training split using all available
# cores (n_jobs=-1); the seed is fixed so the forest is reproducible.
regr = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=0,
).fit(X_train, y_train)

# Score on the held-out split; left as the cell's last expression so the
# notebook displays it.
regr.score(X_test, y_test)

KeyboardInterrupt: 

In [None]:
# Map every flattened feature index to a readable name.
#
# The flattened sample is `sequence_length` windowed rows laid end to end.
# Each row holds the per-column means first and then, when fft_features is
# enabled, window_stride FFT bins for each column in turn. The names are
# generated from the same configuration values that built the data so the
# labels stay aligned if the configuration changes.
name = {}

f = 0
for s in range(0, sequence_length):
    # Offset of this row relative to the last row of the sequence (0 = most
    # recent, negative = further in the past)
    key = s - (sequence_length - 1)

    # Mean features, e.g. "-3_temp"
    for column in columns[:input_features]:
        name[f] = "%d_%s" % (key, column)
        f += 1

    # Frequency features, e.g. "-3_fft_1_0" (feature index, bin index)
    if fft_features:
        for i in range(0, input_features):
            for r in range(0, window_stride):
                name[f] = "%d_fft_%d_%d" % (key, i, r)
                f += 1

# Rank the flattened input features by the forest's importance score and
# print them from most to least important.
importances = regr.feature_importances_
print(len(importances))

# [importance, feature name] pairs; sorting descending orders by importance
# first and by name on ties.
pairs = [[imp, name[idx]] for idx, imp in enumerate(importances)]
pairs.sort(reverse=True)

for value, key in pairs:
    print("%s:\t\t\t%f" % (key, value))