In [207]:
# Imports and Libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import holidays
import plotly.express as px
import plotly.graph_objects as go

In [208]:
# Data and Preprocessing
# Load your time series data
data = pd.read_csv('00_MRT_2023_LSTM.csv')

In [209]:
# Ensure 'Date' column is in datetime format
data['Date'] = pd.to_datetime(data['Date'])

# Add a column to determine if the date is a holiday in the Philippines
ph_holidays = holidays.PH()
data['Holiday'] = data['Date'].apply(lambda x: 1 if x in ph_holidays else 0)

# One-hot encode the 'rain_desc' column
encoder = OneHotEncoder(sparse_output=False)
rain_desc_encoded = encoder.fit_transform(data[['rain_desc']])
rain_desc_encoded_df = pd.DataFrame(rain_desc_encoded, columns=encoder.get_feature_names_out(['rain_desc']))


# Combine the encoded 'rain_desc' with the original data
data = pd.concat([data, rain_desc_encoded_df], axis=1)

# Ensure 'is_weekend' and 'Holiday' are numerical
data['is_weekend'] = data['is_weekend'].astype(int)
data['Holiday'] = data['Holiday'].astype(int)

In [210]:
# Extract year, month, and day
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day

# Aggregate data by year
yearly_data = data.groupby('Year')['Entry'].sum().reset_index()

# Aggregate data by month
monthly_data = data.groupby(['Year', 'Month'])['Entry'].sum().reset_index()
monthly_data['Date'] = pd.to_datetime(monthly_data[['Year', 'Month']].assign(DAY=1)) + pd.offsets.MonthEnd(1)

# Aggregate data by day
daily_data = data.groupby(['Year', 'Month', 'Day'])['Entry'].sum().reset_index()
daily_data['Date'] = pd.to_datetime(daily_data[['Year', 'Month', 'Day']])

In [211]:
yearly_data.head()

Unnamed: 0,Year,Entry
0,2023,18357202


In [212]:
daily_data.head()


Unnamed: 0,Year,Month,Day,Entry,Date
0,2023,1,1,30264,2023-01-01
1,2023,1,2,37952,2023-01-02
2,2023,1,3,46134,2023-01-03
3,2023,1,4,48791,2023-01-04
4,2023,1,5,42915,2023-01-05


In [213]:
monthly_data.head(12)

Unnamed: 0,Year,Month,Entry,Date
0,2023,1,1466919,2023-01-31
1,2023,2,1406923,2023-02-28
2,2023,3,1525850,2023-03-31
3,2023,4,1241611,2023-04-30
4,2023,5,1520365,2023-05-31
5,2023,6,1494518,2023-06-30
6,2023,7,1561701,2023-07-31
7,2023,8,1659979,2023-08-31
8,2023,9,1630212,2023-09-30
9,2023,10,1618931,2023-10-31


In [214]:
data.head()

Unnamed: 0,Day,Date,Time,Entry,is_weekend,rain_desc,Holiday,rain_desc_Heavy Rain,rain_desc_Light Rain,rain_desc_Moderate Rain,rain_desc_No Rain,rain_desc_Trace Rain,Year,Month
0,1,2023-01-01,4:00,0,1,Light Rain,1,0.0,1.0,0.0,0.0,0.0,2023,1
1,1,2023-01-01,5:00,0,1,Light Rain,1,0.0,1.0,0.0,0.0,0.0,2023,1
2,1,2023-01-01,6:00,956,1,Light Rain,1,0.0,1.0,0.0,0.0,0.0,2023,1
3,1,2023-01-01,7:00,1076,1,Light Rain,1,0.0,1.0,0.0,0.0,0.0,2023,1
4,1,2023-01-01,8:00,1081,1,Light Rain,1,0.0,1.0,0.0,0.0,0.0,2023,1


In [215]:
# Extract the 'Entry' column for prediction
entry_data = data['Entry'].values

# Combine features
features = data[['is_weekend', 'Holiday'] + list(rain_desc_encoded_df.columns)].values

# Normalize the 'Entry' column
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_entry_data = scaler.fit_transform(entry_data.reshape(-1, 1))

# Combine the scaled 'Entry' data with the other features
combined_data = np.hstack((scaled_entry_data, features))

In [216]:
# Plot hourly data by year
fig_year = px.line(data, x='Date', y='Entry', color='Year',
                   title='Hourly Data by Year', labels={'color': 'Year'})
fig_year.update_xaxes(title_text='Date')
fig_year.update_yaxes(title_text='Entry')
fig_year.show()

In [217]:
# Plot the total sum of 'Entry' by month
fig_month = px.line(monthly_data, x='Date', y='Entry',
                    title='Total Sum of Entry by Month', labels={'Entry': 'Total Entry'})
fig_month.update_xaxes(title_text='Date')
fig_month.update_yaxes(title_text='Total Entry')
fig_month.show()

In [218]:
# Plot the total sum of 'Entry' by day
fig_day = px.line(daily_data, x='Date', y='Entry',
                  title='Total Sum of Entry by Day', labels={'Entry': 'Total Entry'})
fig_day.update_xaxes(title_text='Date')
fig_day.update_yaxes(title_text='Total Entry')
fig_day.show()

In [219]:
# Extract the 'Entry' column for prediction
entry_data = data['Entry'].values

# Combine features
features = data[['is_weekend', 'Holiday'] + list(rain_desc_encoded_df.columns)].values

# Normalize the 'Entry' column
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_entry_data = scaler.fit_transform(entry_data.reshape(-1, 1))

# Combine the scaled 'Entry' data with the other features
combined_data = np.hstack((scaled_entry_data, features))

In [220]:
# Prepare the data for LSTM
def create_sequences(data, seq_length):
    sequences = []
    labels = []
    for i in range(len(data) - seq_length):
        sequences.append(data[i:i + seq_length])
        labels.append(data[i + seq_length])
    return np.array(sequences), np.array(labels)

seq_length = 100  # Example sequence length
X, y = create_sequences(combined_data, seq_length)

# Extract the corresponding dates for the sequences
date_data = data['Date'].values[seq_length:]

print(f'Shape of X: {X.shape}')
print(f'Shape of y: {y.shape}')
print(f'Shape of date_data: {date_data.shape}')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test, date_train, date_test = train_test_split(X, y, date_data, test_size=0.2, random_state=42)

print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of y_test: {y_test.shape}')
print(f'Shape of date_train: {date_train.shape}')
print(f'Shape of date_test: {date_test.shape}')


Shape of X: (7200, 100, 8)
Shape of y: (7200, 8)
Shape of date_data: (7200,)
Shape of X_train: (5760, 100, 8)
Shape of X_test: (1440, 100, 8)
Shape of y_train: (5760, 8)
Shape of y_test: (1440, 8)
Shape of date_train: (5760,)
Shape of date_test: (1440,)


In [221]:
# Build the LSTM model with dropout layers
model = Sequential()
model.add(Input(shape=(seq_length, X_train.shape[2])))
model.add(LSTM(100, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Use early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping])



Epoch 1/100
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 148ms/step - loss: 0.1474 - val_loss: 0.1420
Epoch 2/100
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 137ms/step - loss: 0.1441 - val_loss: 0.1419
Epoch 3/100
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 137ms/step - loss: 0.1440 - val_loss: 0.1419
Epoch 4/100
[1m141/144[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 126ms/step - loss: 0.1430

KeyboardInterrupt: 

In [198]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Make predictions
predictions = model.predict(X_test)
predictions = scaler.inverse_transform(predictions)

# Clip the predictions to ensure non-negative values
predictions = np.clip(predictions, 0, None)

# Extract the first column of y_test (the 'Entry' column)
y_test_entry = y_test[:, 0].reshape(-1, 1)

# Inverse transform the test data for comparison
y_test_entry = scaler.inverse_transform(y_test_entry)


print(f'Shape of predictions: {predictions.shape}')
print(f'Shape of y_test: {y_test.shape}')


# Calculate evaluation metrics
mse = mean_squared_error(y_test_entry, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_entry, predictions)
r2 = r2_score(y_test_entry, predictions)

# Print the evaluation metrics
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R²: {r2}')

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step
Shape of predictions: (1440, 1)
Shape of y_test: (1440, 8)
MSE: 3262683.3394637755
RMSE: 1806.289937818338
MAE: 1544.3484647962782
R²: -0.7251646980592199


In [204]:
# Prepare the data for plotting
plot_data = pd.DataFrame({
    'Date': date_test,
    'True Values': y_test_entry.flatten(),
    'Predictions': predictions.flatten()
})

# Sort the data by Date
plot_data = plot_data.sort_values(by='Date')

# Melt the DataFrame to have a long format suitable for bar plotting
plot_data_melted = plot_data.melt(id_vars=['Date'], value_vars=['True Values', 'Predictions'], 
                                  var_name='Type', value_name='Entry')

# Plot the results using Plotly Express as a bar graph
fig = px.line(plot_data_melted, x='Date', y='Entry', color='Type',
             title='True Values vs Predictions', labels={'Entry': 'Entry', 'Type': 'Legend'})
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Entry')
fig.show()