In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its
# full path, one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed

# Read the data from the CSV file, using the "timestamp" column as a DatetimeIndex
data = pd.read_csv(
    "/kaggle/input/building-data-genome-project-v1/Office_Garman.csv",
    index_col="timestamp",
    parse_dates=True,
)

# Remove duplicate *timestamps*, not duplicate values. DataFrame.drop_duplicates()
# compares row values only, so it would throw away legitimate repeated readings
# while leaving repeated timestamps in place, and asfreq() cannot reindex an
# index that contains duplicate labels.
data = data[~data.index.duplicated(keep="first")].sort_index()

# Set the frequency to hourly; hours without a reading become NaN rows
data = data.asfreq('H')

# Forward-fill the gaps, then drop leading rows that have no earlier value to
# fill from (otherwise NaNs would flow into the scaler and the network)
data = data.ffill().dropna()

# Define window size and prediction horizon
window_size = 24
prediction_horizon = 1

# Number of (input window, target) pairs that fit in the series
n_windows = len(data) - window_size - prediction_horizon + 1
if n_windows < 2:
    raise ValueError(
        f"Need at least {window_size + prediction_horizon + 1} hourly rows "
        f"after cleaning to build a train and a test window, got {len(data)}"
    )

# Chronological 80/20 split, counted in windows
train_size = int(0.8 * n_windows)

# Perform Z-score normalization. The statistics are fitted only on the rows
# that training windows can touch, so the test period does not leak into the
# mean/std used to scale the inputs.
train_rows = train_size + window_size + prediction_horizon - 1
scaler = StandardScaler()
scaler.fit(data.values[:train_rows])
normalized_data = scaler.transform(data.values)

# Prepare the data for LSTM:
#   X[i] holds `window_size` consecutive rows (all columns),
#   y[i] holds the next `prediction_horizon` values of column 0.
X = np.array([normalized_data[i:i + window_size] for i in range(n_windows)])
y = np.array([
    normalized_data[i + window_size:i + window_size + prediction_horizon, 0]  # Adjust the column index if needed
    for i in range(n_windows)
])

# Split the data into train and test sets (no shuffling: order is time)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across Restart & Run All
set_random_seed(42)

# Build the LSTM model: one recurrent layer followed by a linear read-out with
# one unit per forecast step
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(window_size, X.shape[2])))
model.add(Dense(prediction_horizon))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Define early stopping on the validation loss, keeping the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model. Validation uses the last 10% of the *training* windows
# (Keras takes validation_split from the end of the arrays, before shuffling),
# so the test set is never used to pick the stopping epoch or the restored
# weights and remains an unbiased hold-out.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping],
)

# Make predictions (still in normalized units), shape (n_samples, prediction_horizon)
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

# Rescale the predictions back to the original units of the target column.
# scaler.inverse_transform() expects as many columns as the scaler was fitted
# on, which the prediction array does not have in general, so undo the
# Z-score for column 0 by hand.
target_mean, target_scale = scaler.mean_[0], scaler.scale_[0]
train_pred = train_pred * target_scale + target_mean
test_pred = test_pred * target_scale + target_mean

# Ground truth for the first forecast step of every window: window i predicts
# row i + window_size. `data` is a DataFrame, so positional access goes through
# .iloc (data[a:b, 0] is not valid DataFrame indexing).
target_values = data.iloc[:, 0].to_numpy()
train_actual = target_values[window_size:train_size + window_size]
test_actual = target_values[train_size + window_size:train_size + window_size + len(test_pred)]

# Calculate RMSE
train_rmse = np.sqrt(mean_squared_error(train_actual, train_pred[:, 0]))
test_rmse = np.sqrt(mean_squared_error(test_actual, test_pred[:, 0]))

# Calculate MAE
train_mae = mean_absolute_error(train_actual, train_pred[:, 0])
test_mae = mean_absolute_error(test_actual, test_pred[:, 0])

# Calculate MAPE
def calculate_mape(actual, predicted):
    """Return the mean absolute percentage error, in percent.

    Parameters
    ----------
    actual : array-like
        Observed values.
    predicted : array-like
        Predicted values, with the same number of elements as ``actual``.

    Returns
    -------
    float
        ``mean(|actual - predicted| / |actual|) * 100`` computed over the
        entries where ``actual`` is non-zero, or ``nan`` when every actual
        value is zero (the percentage error is undefined there).
    """
    # Flatten both inputs so (n,) and (n, 1) arrays pair up element by element
    # instead of broadcasting into an (n, n) matrix.
    actual = np.asarray(actual, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()

    # Skip zero actuals: dividing by them would turn the whole mean into inf/nan.
    nonzero = actual != 0
    if not nonzero.any():
        return float('nan')

    return np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100

# Positions (row numbers in `data`) that the train and test predictions refer
# to: window i predicts row i + window_size.
train_span = range(window_size, train_size + window_size)
test_span = range(train_size + window_size, train_size + window_size + len(test_pred))

# Ground truth for those positions. `data` is a DataFrame, so take column 0
# positionally through .iloc (data[a:b, 0] is not valid DataFrame indexing).
target_series = data.iloc[:, 0].to_numpy()
train_truth = target_series[train_span.start:train_span.stop]
test_truth = target_series[test_span.start:test_span.stop]

train_mape = calculate_mape(train_truth, train_pred[:, 0])
test_mape = calculate_mape(test_truth, test_pred[:, 0])

# Plot the prediction waveform. Every curve is drawn against its own row
# positions so actual and predicted values line up on the x axis.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_span, train_truth, label='Actual (Train)')
ax.plot(test_span, test_truth, label='Actual (Test)')
ax.plot(train_span, train_pred[:, 0], label='Predicted (Train)')
ax.plot(test_span, test_pred[:, 0], label='Predicted (Test)')
ax.set_xlabel('Time')
ax.set_ylabel('Amplitude')
ax.set_title('Prediction Waveform')
ax.legend()
plt.show()

# Print evaluation metrics
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train MAE:', train_mae)
print('Test MAE:', test_mae)
print('Train MAPE:', train_mape)
print('Test MAPE:', test_mape)
