# Data Preprocessing for LSTM  
Because our dataset is stationary, we need to transform the time series into supervised data. This can be done by shifting the time
series data and then comparing the shifted series with the original one. In our case, we will use 7 different lags.

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [2]:
# Load the daily weather records. The first 17 lines of the file are skipped,
# no header row is read (column names are supplied explicitly), and '-999'
# entries are treated as missing values.
data = pd.read_csv(
    'datasets.csv',
    header=None,
    sep=',',
    skiprows=17,
    na_values='-999',
    names=['Year', 'Month', 'Day', 'Sky_clearness', 'Temperature', 'Tem_max',
           'Tem_min', 'Humidity', 'Precipitation', 'Pressure', 'Wind Speed'],
)

# Not every Year/Month/Day triple is guaranteed to form a valid date.
# errors='coerce' turns unparseable rows into NaT so they can be dropped
# explicitly. errors='ignore' is deprecated in pandas and, on any failure,
# returns the unparsed input, which would leave a non-datetime index and
# break the date-based slicing used further down.
data['Date'] = pd.to_datetime(data[['Year', 'Month', 'Day']], errors='coerce')
data = data.drop(columns=['Year', 'Month', 'Day'])
# Drop rows with a missing measurement or an invalid (NaT) date *before*
# moving 'Date' into the index, because dropna() does not inspect the index.
data = data.dropna()
data = data.set_index('Date')
data.tail()

Unnamed: 0_level_0,Sky_clearness,Temperature,Tem_max,Tem_min,Humidity,Precipitation,Pressure,Wind Speed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-11-27,0.61,21.82,26.18,16.95,12.76,0.0,100.83,1.37
2023-11-28,0.63,20.19,25.62,14.91,11.23,0.0,100.75,2.01
2023-11-29,0.52,20.86,26.1,15.41,12.27,0.0,100.66,1.26
2023-11-30,0.32,21.31,25.77,17.84,14.16,2.02,100.85,2.52
2023-12-01,0.13,16.73,18.66,14.86,9.95,7.22,101.17,3.74


In [3]:
# Keep only the temperature series; it is the single variable being modelled.
data_temp = data['Temperature']

In [4]:
def prepare_data_lstm(df, look_back, target_col='Temperature'):
    """Turn a time series into a supervised-learning table of lagged values.

    For each lag ``i`` in ``1..look_back`` a column ``lag_i`` is added that
    holds ``target_col`` shifted down by ``i`` rows. Rows containing any
    missing value (including the first ``look_back`` rows, whose lags are
    undefined) are dropped.

    Args:
        df: DataFrame that contains ``target_col``. It is not modified.
        look_back: Number of lag columns to create.
        target_col: Name of the column to lag. Defaults to ``'Temperature'``.

    Returns:
        A new DataFrame with the original columns followed by
        ``lag_1 .. lag_<look_back>``.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    # Work on a copy so the caller's frame is not silently extended/truncated.
    lagged = df.copy()
    for lag in range(1, look_back + 1):
        lagged['lag_{}'.format(lag)] = lagged[target_col].shift(lag)
    return lagged.dropna()

In [5]:
# Number of previous days used as input features for each prediction.
look_back = 7

# Series -> single-column DataFrame, then append lag_1 .. lag_<look_back>.
data_lstm = prepare_data_lstm(df=data_temp.to_frame(), look_back=look_back)
data_lstm.head(7)

Unnamed: 0_level_0,Temperature,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1990-01-08,21.87,20.6,19.73,21.01,17.86,17.85,17.09,18.62
1990-01-09,23.03,21.87,20.6,19.73,21.01,17.86,17.85,17.09
1990-01-10,23.58,23.03,21.87,20.6,19.73,21.01,17.86,17.85
1990-01-11,23.13,23.58,23.03,21.87,20.6,19.73,21.01,17.86
1990-01-12,20.3,23.13,23.58,23.03,21.87,20.6,19.73,21.01
1990-01-13,20.53,20.3,23.13,23.58,23.03,21.87,20.6,19.73
1990-01-14,23.42,20.53,20.3,23.13,23.58,23.03,21.87,20.6


In [6]:
# The scaler and the LSTM work on plain arrays, so drop the pandas wrapper.
# Column 0 is the target (Temperature); columns 1.. are lag_1 .. lag_7.
data_lstm_np = data_lstm.to_numpy()

data_lstm_np

array([[21.87, 20.6 , 19.73, ..., 17.85, 17.09, 18.62],
       [23.03, 21.87, 20.6 , ..., 17.86, 17.85, 17.09],
       [23.58, 23.03, 21.87, ..., 21.01, 17.86, 17.85],
       ...,
       [20.86, 20.19, 21.82, ..., 21.77, 21.64, 20.39],
       [21.31, 20.86, 20.19, ..., 21.43, 21.77, 21.64],
       [16.73, 21.31, 20.86, ..., 21.53, 21.43, 21.77]])

Next, we split our supervised data into training and test sets. In this case, the last month is used as test data. But before that, we need to normalize the values of our data.

In [7]:
# Scale every column to the range [-1, 1]. This step is required, not
# optional: the same fitted scaler is used later to map predictions back to
# the original temperature scale.
#
# The scaler is fitted on the training rows only (everything up to the end of
# 2023-10, the same cutoff used for the train/test split) so that statistics
# of the held-out test period do not leak into preprocessing. Test values
# outside the training min/max will therefore fall slightly outside [-1, 1].
scaler = MinMaxScaler(feature_range=(-1, 1))
n_train_rows = len(data_lstm[:'2023-10'])
scaler.fit(data_lstm_np[:n_train_rows])
data_lstm_scaled = scaler.transform(data_lstm_np)

Note that it is essential to flip the lag columns horizontally (so that they run lag_7 -> lag_1), because the LSTM processes each sequence step by step and should see the oldest value first and the value closest to the current time last.

In [8]:
# Separate the target from the features.
# Column 0 holds the value to predict; the remaining columns are the lags.
y_lstm = data_lstm_scaled[:, 0]

# Reverse the lag columns so each row reads oldest -> newest (lag_7 ... lag_1).
lag_columns = data_lstm_scaled[:, 1:]
X_lstm = lag_columns[:, ::-1]

In [9]:
# Chronological train/test split: every row dated up to the end of 2023-10 is
# used for training, and the rows from 2023-11-01 onwards form the test set.
split_index = len(data_lstm[:'2023-10'])

X_lstm_train, X_lstm_test = X_lstm[:split_index], X_lstm[split_index:]
y_lstm_train, y_lstm_test = y_lstm[:split_index], y_lstm[split_index:]

# Show the resulting shapes as a sanity check.
tuple(part.shape for part in (X_lstm_train, X_lstm_test, y_lstm_train, y_lstm_test))

((12350, 7), (31, 7), (12350,), (31,))

In [10]:
# Append a trailing axis of length 1: the model below is declared with
# input_shape=(look_back, 1), i.e. one feature per time step, and the targets
# become column vectors that line up with the single-unit output layer.
X_lstm_train = X_lstm_train[..., np.newaxis]
X_lstm_test = X_lstm_test[..., np.newaxis]

y_lstm_train = y_lstm_train[..., np.newaxis]
y_lstm_test = y_lstm_test[..., np.newaxis]

# Show the resulting shapes as a sanity check.
tuple(part.shape for part in (X_lstm_train, X_lstm_test, y_lstm_train, y_lstm_test))

((12350, 7, 1), (31, 7, 1), (12350, 1), (31, 1))

In [11]:
# Inspect the scaled test windows; each window lists its 7 lags oldest first.
X_lstm_test

array([[[ 0.32131049],
        [ 0.33273219],
        [ 0.33754133],
        [ 0.26059513],
        [ 0.22813345],
        [ 0.1746318 ],
        [ 0.19206492]],

       [[ 0.33273219],
        [ 0.33754133],
        [ 0.26059513],
        [ 0.22813345],
        [ 0.1746318 ],
        [ 0.19206492],
        [ 0.15840096]],

       [[ 0.33754133],
        [ 0.26059513],
        [ 0.22813345],
        [ 0.1746318 ],
        [ 0.19206492],
        [ 0.15840096],
        [ 0.16561467]],

       [[ 0.26059513],
        [ 0.22813345],
        [ 0.1746318 ],
        [ 0.19206492],
        [ 0.15840096],
        [ 0.16561467],
        [ 0.23715059]],

       [[ 0.22813345],
        [ 0.1746318 ],
        [ 0.19206492],
        [ 0.15840096],
        [ 0.16561467],
        [ 0.23715059],
        [ 0.29546138]],

       [[ 0.1746318 ],
        [ 0.19206492],
        [ 0.15840096],
        [ 0.16561467],
        [ 0.23715059],
        [ 0.29546138],
        [ 0.31469793]],

       [[ 0.19206492],

In [None]:
# Build the LSTM regressor: one recurrent layer followed by a single linear
# output unit, trained with Adam on mean squared error.
# NOTE(review): Sequential, LSTM and Dense are not imported in the visible
# cells - presumably they come from Keras; confirm the import exists.
n_neurons = 64
model_lstm = Sequential([
    LSTM(n_neurons, input_shape=(look_back, 1)),  # look_back steps, 1 feature
    Dense(1),
])
model_lstm.compile(optimizer='adam', loss='mse')

model_lstm.summary()

In [None]:
# Train on the scaled training windows without printing per-epoch progress.
model_lstm.fit(
    X_lstm_train,
    y_lstm_train,
    epochs=300,
    batch_size=10,
    verbose=0,
)

In [None]:
# Predict the test period, then undo the normalization so that predictions can
# be compared with the original temperature series.
pred_temp_lstm = model_lstm.predict(X_lstm_test)

# The scaler was fitted on (1 + look_back) columns, so a target column has to
# be padded with the lag columns before it can be inverse-transformed. Only
# column 0 of the result is kept; MinMaxScaler scales each column
# independently, so the order of the padding columns does not affect it.
lag_padding = X_lstm_test.reshape(-1, look_back)


def _to_original_scale(target_column):
    """Inverse-scale a (n, 1) target column and return it as a 1-D array."""
    padded = np.concatenate((target_column, lag_padding), axis=1)
    return scaler.inverse_transform(padded)[:, 0]


pred_data = _to_original_scale(pred_temp_lstm)
true_data = _to_original_scale(y_lstm_test)
df_lstm = pd.DataFrame(
    {'True': true_data, 'Predicted': pred_data},
    index=data_lstm['2023-11-01':].index,
)