In [1]:
# prepare data for lstm
import gc
import time
from datetime import datetime

import pandas as pd
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [None]:
# load data
# Read the raw table as-is. The timestamp is spread over the year/month/day/hour
# columns and is assembled explicitly below, because read_csv's `date_parser=`
# argument and the nested-list form of `parse_dates=` are deprecated in pandas 2.x.
raw = read_csv('../../data/raw.csv')
date_parts = ['year', 'month', 'day', 'hour']
timestamps = pd.to_datetime(raw[date_parts])
# drop the 'No' column and the now-redundant date parts, then index by timestamp
dataset = raw.drop(columns=['No'] + date_parts)
dataset.index = pd.DatetimeIndex(timestamps, name='date')
# manually specify column names
dataset.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']
# replace missing pollution readings with 0 (only this column is filled);
# assign the result back instead of calling fillna(inplace=True) on a column
# selection, which does not update the frame under pandas Copy-on-Write
dataset['pollution'] = dataset['pollution'].fillna(0)
# drop the first 24 hours (the first 24 rows)
dataset = dataset.iloc[24:]
# summarize first 5 rows
print(dataset.head(5))
# save next to the raw file, which is where the later cell reads pollution.csv from
dataset.to_csv('../../data/pollution.csv')

In [7]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True, verbose=True):
    """Frame a time series as a supervised-learning table of shifted columns.

    The input is converted to a DataFrame and concatenated column-wise with
    shifted copies of itself: ``n_in`` lagged copies (t-n_in ... t-1) followed
    by ``n_out`` forecast copies (t, t+1, ... t+n_out-1).

    Args:
        data: Anything ``DataFrame(data)`` accepts (list, list of rows,
            1-D or 2-D array, DataFrame). Each resulting column is one variable.
        n_in: Number of lagged steps to include.
        n_out: Number of forecast steps to include, starting at ``t``.
        dropnan: If true, rows containing NaN (introduced by the shifts at the
            edges of the series) are removed.
        verbose: If true (default), print the elapsed time of each of the four
            stages (frame creation, lag columns, forecast columns, concat).

    Returns:
        DataFrame whose columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and
        ``var<j>(t+<i>)``, with ``j`` the 1-based variable number.

    Raises:
        ValueError: If neither ``n_in`` nor ``n_out`` requests any column.
    """
    if n_in < 1 and n_out < 1:
        raise ValueError("n_in and n_out cannot both be less than 1")

    def _report(started):
        # Emit the per-stage timing line in the original format.
        if verbose:
            print("--- %s seconds ---" % (time.time() - started))

    # Operation 1: build the frame
    started = time.time()
    df = DataFrame(data)
    # Count variables from the frame itself so lists of rows and 1-D arrays
    # get the right number of column names.
    n_vars = df.shape[1]
    cols, names = [], []
    _report(started)

    # Operation 2: input sequence (t-n, ... t-1)
    started = time.time()
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names.extend('var%d(t-%d)' % (j + 1, lag) for j in range(n_vars))
    _report(started)

    # Operation 3: forecast sequence (t, t+1, ... t+n)
    started = time.time()
    for step in range(n_out):
        cols.append(df.shift(-step))
        suffix = 't' if step == 0 else 't+%d' % step
        names.extend('var%d(%s)' % (j + 1, suffix) for j in range(n_vars))
    _report(started)

    # Operation 4: put it all together
    started = time.time()
    agg = concat(cols, axis=1)
    agg.columns = names
    _report(started)

    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()

    return agg

In [18]:
# load dataset
# Re-read the cleaned pollution table; the first file column becomes the index.
dataset = read_csv('../../data/pollution.csv', index_col=0, header=0)
values = dataset.values
# Column 4 is replaced by integer class codes (the wind-direction column,
# 'wnd_dir', going by the column list assigned when the file was written).
encoder = LabelEncoder()
values[:, 4] = encoder.fit_transform(values[:, 4])
# Cast every column to float32 now that no non-numeric values remain.
values = values.astype('float32')
# Frame as supervised learning with 2 lagged steps and 2 forecast steps.
# Note: MinMaxScaler normalisation is not applied here, and no columns are
# dropped from the framed result.
reframed = series_to_supervised(values, n_in=2, n_out=2)
# preview the first rows
print(reframed.head())

--- 0.0001068115234375 seconds ---
--- 0.0005869865417480469 seconds ---
--- 0.0005867481231689453 seconds ---
--- 0.0039119720458984375 seconds ---
   var1(t-2)  var2(t-2)  var3(t-2)  var4(t-2)  var5(t-2)  var6(t-2)  \
2      129.0      -16.0       -4.0     1020.0        2.0       1.79   
3      148.0      -15.0       -4.0     1020.0        2.0       2.68   
4      159.0      -11.0       -5.0     1021.0        2.0       3.57   
5      181.0       -7.0       -5.0     1022.0        2.0       5.36   
6      138.0       -7.0       -5.0     1022.0        2.0       6.25   

   var7(t-2)  var8(t-2)  var1(t-1)  var2(t-1)  ...  var7(t)  var8(t)  \
2        0.0        0.0      148.0      -15.0  ...      0.0      0.0   
3        0.0        0.0      159.0      -11.0  ...      1.0      0.0   
4        0.0        0.0      181.0       -7.0  ...      2.0      0.0   
5        1.0        0.0      138.0       -7.0  ...      3.0      0.0   
6        2.0        0.0      109.0       -7.0  ...      4.0     

## Our dataset

In [3]:
# load dataset
# read the object stored under key 'df' in the HDF5 file
completed_data_30days = pd.read_hdf('../../data/completed_data_30days_v1.h5', key='df')
# report the shape; the value is interpolated into the f-string
print(f'30 days dataset shape: {completed_data_30days.shape}')

30 days dataset shape: (49740, 8705)


In [19]:
# show the shape of the loaded frame as the cell output
completed_data_30days.shape

(49740, 8705)

In [36]:
# completed_data_30days.columns[3:100]

In [4]:
# keep only the first 10,000 rows (positional slice) as a smaller working subset
df_test = completed_data_30days.iloc[:10000]

In [19]:
# display the subset; pandas truncates the rendered rows and columns
df_test

Unnamed: 0,1 leg bridge (left)_reps_1,1 leg bridge (left)_reps_10,1 leg bridge (left)_reps_11,1 leg bridge (left)_reps_12,1 leg bridge (left)_reps_13,1 leg bridge (left)_reps_14,1 leg bridge (left)_reps_15,1 leg bridge (left)_reps_16,1 leg bridge (left)_reps_17,1 leg bridge (left)_reps_2,...,Plank balance_reps_10,Plank balance_reps_11,Plank balance_reps_12,Plank balance_reps_13,Plank balance_reps_14,Plank balance_reps_15,Plank balance_reps_16,Plank balance_reps_17,Plank balance_reps_18,Plank balance_reps_19
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:

# Frame the raw test subset as supervised learning: 7 lagged steps
# (t-7 ... t-1) followed by 3 forecast steps (t, t+1, t+2).
# Note: no MinMaxScaler normalisation is applied and no columns are dropped
# from the framed result.
reframed = series_to_supervised(df_test.values, n_in=7, n_out=3)
# preview the first rows of the framed table
print(reframed.head())

--- 2.2960500717163086 seconds ---
--- 5.801149368286133 seconds ---
--- 2.46720814704895 seconds ---
--- 8.169317722320557 seconds ---
   var1(t-7)  var2(t-7)                   var3(t-7) var4(t-7) var5(t-7)  \
7        172 2021-11-13  2021-11-13 10:11:42.357218       0.0       0.0   
8        172 2021-11-14  2021-11-14 12:05:10.670652       0.0       0.0   
9        172 2021-11-15  2021-11-15 12:53:48.363087       0.0       0.0   
10       172 2021-11-16                           0       0.0       0.0   
11       172 2021-11-17                           0       0.0       0.0   

   var6(t-7) var7(t-7) var8(t-7) var9(t-7) var10(t-7)  ... var8696(t+2)  \
7        0.0       0.0       0.0       0.0        0.0  ...            0   
8        0.0       0.0       0.0       0.0        0.0  ...            0   
9        0.0       0.0       0.0       0.0        0.0  ...            0   
10       0.0       0.0       0.0       0.0        0.0  ...            0   
11       0.0       0.0       0.0      

In [9]:
# run a garbage-collection pass; the cell output is the value gc.collect() returns
# (gc is imported in the notebook's top import cell)
gc.collect()

0

In [10]:
# display the framed table produced by series_to_supervised
reframed

Unnamed: 0,var1(t-7),var2(t-7),var3(t-7),var4(t-7),var5(t-7),var6(t-7),var7(t-7),var8(t-7),var9(t-7),var10(t-7),...,var8696(t+2),var8697(t+2),var8698(t+2),var8699(t+2),var8700(t+2),var8701(t+2),var8702(t+2),var8703(t+2),var8704(t+2),var8705(t+2)
7,172,2021-11-13,2021-11-13 10:11:42.357218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,79075.0,3.0,0.0,1960-12-31 00:00:00,179.0,92.0,45.0,215.0,2021-11-22 20:01:41.072564
8,172,2021-11-14,2021-11-14 12:05:10.670652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0
9,172,2021-11-15,2021-11-15 12:53:48.363087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0
10,172,2021-11-16,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0
11,172,2021-11-17,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,79075.0,3.0,0.0,1960-12-31 00:00:00,179.0,92.0,45.0,158.0,2021-11-26 06:37:33.113317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9993,3770,2022-01-18,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0
9994,3770,2022-01-19,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0
9995,3770,2022-01-20,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0
9996,3770,2022-01-21,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0
