# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Read the hourly meter data; the "timestamp" column becomes a DatetimeIndex.
data = pd.read_csv("/kaggle/input/building-data-genome-project-v1/Office_Garman.csv", index_col="timestamp", parse_dates=True)

# Remove duplicate entries.
# De-duplicate on the timestamp index, not on the row values: two different
# hours may legitimately report the same reading, and dropping those rows
# would only turn them into gaps that are later forward-filled with the
# wrong value. A unique, sorted index is also what asfreq() needs to reindex.
data = data[~data.index.duplicated(keep="first")].sort_index()

# Set the frequency to hourly (missing hours are inserted as NaN rows).
# An offset object is used instead of the 'H' string alias, which newer
# pandas releases deprecate.
data = data.asfreq(pd.offsets.Hour())

# Fill missing values using the forward-fill method.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
data = data.ffill()

# Define window size and prediction horizon
# (the parameters that determine the input and output sequences used for
# training the LSTM model).
#
# window_size: number of consecutive rows the LSTM receives as one input
# sequence (24 rows per sample here).
# prediction_horizon: number of steps of column 0 predicted after each input
# window (1 here, i.e. the next row only).
window_size = 24
prediction_horizon = 1

# Number of (input window, target) pairs the series can provide.
n_samples = len(data) - window_size - prediction_horizon + 1
if n_samples < 2:
    raise ValueError(
        "Not enough rows to build train/test windows: need at least "
        f"{window_size + prediction_horizon + 1}, got {len(data)}"
    )

# Chronological 80/20 split over the samples (no shuffling, so the test
# samples come after the training samples).
train_size = int(0.8 * n_samples)

# Perform Z-score normalization (zero mean, unit standard deviation per
# column) using StandardScaler.
# The scaler statistics are estimated only from the rows that the training
# samples touch (their inputs and targets); fitting on the whole series would
# leak test-period statistics into training. The fitted transform is then
# applied to every row.
values = data.values
n_train_rows = train_size + window_size + prediction_horizon - 1
scaler = StandardScaler()
scaler.fit(values[:n_train_rows])
normalized_data = scaler.transform(values)

# Prepare the data for LSTM.
# X: windows of window_size consecutive normalized rows (all columns).
# y: the prediction_horizon values of column 0 that follow each window
#    (adjust the column index if the target is a different column).
X = np.array([normalized_data[i:i + window_size] for i in range(n_samples)])
y = np.array([
    normalized_data[i + window_size:i + window_size + prediction_horizon, 0]
    for i in range(n_samples)
])

# Split the data into train and test sets
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Build and compile the LSTM model.
#
# Architecture (a Keras Sequential stack):
#   1. LSTM layer, 64 units, ReLU activation, fed sequences of shape
#      (window_size, number_of_features).
#   2. Dense output layer with prediction_horizon units.
n_features = X.shape[2]
model = Sequential([
    LSTM(64, activation='relu', input_shape=(window_size, n_features)),
    Dense(prediction_horizon),
])

# Train with the Adam optimizer against mean squared error (MSE) loss.
model.compile(loss='mean_squared_error', optimizer='adam')

# Define early stopping
# Training stops once the validation loss has not improved for 3 epochs, and
# the weights from the best epoch are restored.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
# The model is trained on the training data for at most 50 epochs with a
# batch size of 32.
# The validation set is taken from the tail of the training data via
# validation_split (Keras takes the last samples, before shuffling) rather
# than from X_test/y_test. Using the test set here would let early stopping
# and restore_best_weights pick the model on the same data the final metrics
# are reported on, making those metrics optimistic.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping],
)

# Make predictions (still in normalized units, shape (n_samples, prediction_horizon))
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

# Rescale the predictions back to the original units.
# The model outputs only the target (column 0), so the predictions have
# prediction_horizon columns, not one column per input feature.
# scaler.inverse_transform() would therefore raise whenever the scaler was
# fit on more than one feature. Undo the z-score with the target column's
# own statistics instead; for a single-feature series this is the same
# computation inverse_transform performs.
target_mean = scaler.mean_[0]
target_scale = scaler.scale_[0]
train_pred = train_pred * target_scale + target_mean
test_pred = test_pred * target_scale + target_mean

# Calculate evaluation metrics
# `data` is a DataFrame, so `data[a:b, 0]` is not valid indexing; take the
# target column (column 0) positionally as a NumPy array and slice that.
target = data.iloc[:, 0].to_numpy()

# Sample i predicts the row at position i + window_size, so the actual values
# for the training predictions start at window_size and those for the test
# predictions start at train_size + window_size.
test_start = train_size + window_size
train_actual = target[window_size:test_start]
test_actual = target[test_start:test_start + len(test_pred)]

# Only the first predicted step of each sample is evaluated.
train_forecast = train_pred[:, 0]
test_forecast = test_pred[:, 0]

train_rmse = np.sqrt(mean_squared_error(train_actual, train_forecast))
test_rmse = np.sqrt(mean_squared_error(test_actual, test_forecast))

train_mae = mean_absolute_error(train_actual, train_forecast)
test_mae = mean_absolute_error(test_actual, test_forecast)

train_mape = mean_absolute_percentage_error(train_actual, train_forecast)
test_mape = mean_absolute_percentage_error(test_actual, test_forecast)

# Calculate NMBE (normalized mean bias error, in percent).
# The mean bias is normalized by the mean of the actual values; dividing each
# error by its own actual value would instead give a mean percentage error
# and break on zero readings. Positive values mean over-prediction.
train_nmbe = np.mean(train_forecast - train_actual) / np.mean(train_actual) * 100
test_nmbe = np.mean(test_forecast - test_actual) / np.mean(test_actual) * 100

# Calculate CVRMSE (RMSE as a percentage of the mean actual value)
train_cvrmsme = train_rmse / np.mean(train_actual) * 100
test_cvrmsme = test_rmse / np.mean(test_actual) * 100

# Plot the prediction waveform
# `data` is a DataFrame, so the target column (column 0) is taken
# positionally as a NumPy array before slicing.
target = data.iloc[:, 0].to_numpy()

# Row positions of the values being predicted: sample i targets row
# i + window_size. All four curves are drawn against these positions so the
# actual and predicted series line up on the x axis.
train_idx = np.arange(window_size, window_size + train_size)
test_idx = np.arange(train_size + window_size, train_size + window_size + len(test_pred))

plt.figure(figsize=(10, 6))
plt.plot(train_idx, target[train_idx], label='Actual (Train)')
plt.plot(test_idx, target[test_idx], label='Actual (Test)')
plt.plot(train_idx, train_pred[:, 0], label='Predicted (Train)')
plt.plot(test_idx, test_pred[:, 0], label='Predicted (Test)')
plt.xlabel('Time')
plt.ylabel('Amplitude')
plt.title('Prediction Waveform')
plt.legend()
plt.show()

# Print evaluation metrics, one "<label> <value>" line per metric, with the
# train figure followed by the test figure for each.
for metric_label, metric_value in (
    ('Train RMSE:', train_rmse),
    ('Test RMSE:', test_rmse),
    ('Train MAE:', train_mae),
    ('Test MAE:', test_mae),
    ('Train MAPE:', train_mape),
    ('Test MAPE:', test_mape),
    ('Train NMBE:', train_nmbe),
    ('Test NMBE:', test_nmbe),
    ('Train CVRMSE:', train_cvrmsme),
    ('Test CVRMSE:', test_cvrmsme),
):
    print(metric_label, metric_value)