In [9]:
# prepare data for lstm
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

In [10]:
# load data
def parse(x):
    return datetime.strptime(x, '%Y %m %d %H')
dataset = read_csv('../../data/raw.csv',  parse_dates = [['year', 'month', 'day', 'hour']], index_col=0, date_parser=parse)
dataset.drop('No', axis=1, inplace=True)
# manually specify column names
dataset.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']
dataset.index.name = 'date'
# mark all NA values with 0
dataset['pollution'].fillna(0, inplace=True)
# drop the first 24 hours
dataset = dataset[24:]
# summarize first 5 rows
print(dataset.head(5))
# save to file
dataset.to_csv('pollution.csv')

  dataset = read_csv('../../data/raw.csv',  parse_dates = [['year', 'month', 'day', 'hour']], index_col=0, date_parser=parse)
  dataset = read_csv('../../data/raw.csv',  parse_dates = [['year', 'month', 'day', 'hour']], index_col=0, date_parser=parse)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['pollution'].fillna(0, inplace=True)


                     pollution  dew  temp   press wnd_dir  wnd_spd  snow  rain
date                                                                          
2010-01-02 00:00:00      129.0  -16  -4.0  1020.0      SE     1.79     0     0
2010-01-02 01:00:00      148.0  -15  -4.0  1020.0      SE     2.68     0     0
2010-01-02 02:00:00      159.0  -11  -5.0  1021.0      SE     3.57     0     0
2010-01-02 03:00:00      181.0   -7  -5.0  1022.0      SE     5.36     1     0
2010-01-02 04:00:00      138.0   -7  -5.0  1022.0      SE     6.25     2     0


In [11]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()

    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]

    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
        # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg 

In [54]:
# load dataset
dataset = read_csv('../../data/pollution.csv', header=0, index_col=0)
values = dataset.values
# integer encode direction
encoder = LabelEncoder()
values[:,4] = encoder.fit_transform(values[:,4])
# ensure all data is float
values = values.astype('float32')
# normalize features
#scaler = MinMaxScaler(feature_range=(0, 1))
#scaled = scaler.fit_transform(values)
# frame as supervised learning
reframed = series_to_supervised(values, 2, 2)
# drop columns we don't want to predict
#reframed.drop(reframed.columns[[17,18,19,20,21,22,23]], axis=1, inplace=True)
print(reframed.head())

   var1(t-2)  var2(t-2)  var3(t-2)  var4(t-2)  var5(t-2)  var6(t-2)  \
2      129.0      -16.0       -4.0     1020.0        2.0       1.79   
3      148.0      -15.0       -4.0     1020.0        2.0       2.68   
4      159.0      -11.0       -5.0     1021.0        2.0       3.57   
5      181.0       -7.0       -5.0     1022.0        2.0       5.36   
6      138.0       -7.0       -5.0     1022.0        2.0       6.25   

   var7(t-2)  var8(t-2)  var1(t-1)  var2(t-1)  var3(t-1)  var4(t-1)  \
2        0.0        0.0      148.0      -15.0       -4.0     1020.0   
3        0.0        0.0      159.0      -11.0       -5.0     1021.0   
4        0.0        0.0      181.0       -7.0       -5.0     1022.0   
5        1.0        0.0      138.0       -7.0       -5.0     1022.0   
6        2.0        0.0      109.0       -7.0       -6.0     1022.0   

   var5(t-1)  var6(t-1)  var7(t-1)  var8(t-1)  var1(t)  var2(t)  var3(t)  \
2        2.0       2.68        0.0        0.0    159.0    -11.0     -5

In [50]:
dataset

Unnamed: 0_level_0,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0
2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0
2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0
2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0
2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0
...,...,...,...,...,...,...,...,...
2014-12-31 19:00:00,8.0,-23,-2.0,1034.0,NW,231.97,0,0
2014-12-31 20:00:00,10.0,-22,-3.0,1034.0,NW,237.78,0,0
2014-12-31 21:00:00,10.0,-22,-3.0,1034.0,NW,242.70,0,0
2014-12-31 22:00:00,8.0,-22,-4.0,1034.0,NW,246.72,0,0


In [55]:
import pandas as pd

# Assuming 'df' is your pandas DataFrame
pd.set_option('display.max_columns', None)  # Ensure all columns are displayed
reframed


Unnamed: 0,var1(t-2),var2(t-2),var3(t-2),var4(t-2),var5(t-2),var6(t-2),var7(t-2),var8(t-2),var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var6(t-1),var7(t-1),var8(t-1),var1(t),var2(t),var3(t),var4(t),var5(t),var6(t),var7(t),var8(t),var1(t+1),var2(t+1),var3(t+1),var4(t+1),var5(t+1),var6(t+1),var7(t+1),var8(t+1)
2,129.0,-16.0,-4.0,1020.0,2.0,1.790000,0.0,0.0,148.0,-15.0,-4.0,1020.0,2.0,2.680000,0.0,0.0,159.0,-11.0,-5.0,1021.0,2.0,3.570000,0.0,0.0,181.0,-7.0,-5.0,1022.0,2.0,5.360000,1.0,0.0
3,148.0,-15.0,-4.0,1020.0,2.0,2.680000,0.0,0.0,159.0,-11.0,-5.0,1021.0,2.0,3.570000,0.0,0.0,181.0,-7.0,-5.0,1022.0,2.0,5.360000,1.0,0.0,138.0,-7.0,-5.0,1022.0,2.0,6.250000,2.0,0.0
4,159.0,-11.0,-5.0,1021.0,2.0,3.570000,0.0,0.0,181.0,-7.0,-5.0,1022.0,2.0,5.360000,1.0,0.0,138.0,-7.0,-5.0,1022.0,2.0,6.250000,2.0,0.0,109.0,-7.0,-6.0,1022.0,2.0,7.140000,3.0,0.0
5,181.0,-7.0,-5.0,1022.0,2.0,5.360000,1.0,0.0,138.0,-7.0,-5.0,1022.0,2.0,6.250000,2.0,0.0,109.0,-7.0,-6.0,1022.0,2.0,7.140000,3.0,0.0,105.0,-7.0,-6.0,1023.0,2.0,8.930000,4.0,0.0
6,138.0,-7.0,-5.0,1022.0,2.0,6.250000,2.0,0.0,109.0,-7.0,-6.0,1022.0,2.0,7.140000,3.0,0.0,105.0,-7.0,-6.0,1023.0,2.0,8.930000,4.0,0.0,124.0,-7.0,-5.0,1024.0,2.0,10.720000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43794,8.0,-23.0,0.0,1032.0,1.0,214.089996,0.0,0.0,9.0,-22.0,-1.0,1033.0,1.0,221.240005,0.0,0.0,10.0,-22.0,-2.0,1033.0,1.0,226.160004,0.0,0.0,8.0,-23.0,-2.0,1034.0,1.0,231.970001,0.0,0.0
43795,9.0,-22.0,-1.0,1033.0,1.0,221.240005,0.0,0.0,10.0,-22.0,-2.0,1033.0,1.0,226.160004,0.0,0.0,8.0,-23.0,-2.0,1034.0,1.0,231.970001,0.0,0.0,10.0,-22.0,-3.0,1034.0,1.0,237.779999,0.0,0.0
43796,10.0,-22.0,-2.0,1033.0,1.0,226.160004,0.0,0.0,8.0,-23.0,-2.0,1034.0,1.0,231.970001,0.0,0.0,10.0,-22.0,-3.0,1034.0,1.0,237.779999,0.0,0.0,10.0,-22.0,-3.0,1034.0,1.0,242.699997,0.0,0.0
43797,8.0,-23.0,-2.0,1034.0,1.0,231.970001,0.0,0.0,10.0,-22.0,-3.0,1034.0,1.0,237.779999,0.0,0.0,10.0,-22.0,-3.0,1034.0,1.0,242.699997,0.0,0.0,8.0,-22.0,-4.0,1034.0,1.0,246.720001,0.0,0.0
