In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.metrics import r2_score

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [None]:
# Load every NYISO day-ahead price CSV found under `folder_path` (recursively)
# and merge them into a single DataFrame `df`.
# The location can be overridden with the NYISO_DATA_DIR environment variable so
# the notebook is not tied to one machine's directory layout.
folder_path = os.environ.get(
    "NYISO_DATA_DIR",
    r"D:\OneDrive - The Pennsylvania State University\Research DATA\Dr. Habib & Dr. Reza Data\Energy Price Market Data\Day Ahead Price Data_Processed\USA\NYISO",
)

# os.walk yields entries in arbitrary (filesystem-dependent) order; sorting the
# paths makes the row order of the merged frame reproducible between runs.
csv_paths = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(folder_path)
    for file in files
    if file.endswith('.csv')
)

# Collect the frames first and concatenate once: calling pd.concat inside the
# loop re-copies all previously loaded rows on every iteration.
frames = []
for file_path in csv_paths:
    try:
        frames.append(pd.read_csv(file_path))
    except Exception as e:
        # Best effort: report the unreadable file and keep loading the others.
        print(f"Error reading file {file_path}: {e}")

if not frames:
    # Fail here with a clear message instead of producing an empty frame that
    # only breaks later when a column is accessed.
    raise FileNotFoundError(f"No readable CSV files found under {folder_path}")

all_nyiso_data = pd.concat(frames, ignore_index=True)

df = all_nyiso_data.copy()  # Working copy; all_nyiso_data keeps the raw merge
print(f"Loaded {len(frames)} of {len(csv_paths)} CSV files: {df.shape[0]} rows x {df.shape[1]} columns")

In [None]:
# Preview the first rows of the merged frame to check that the CSVs loaded as expected.
df.head()

In [None]:
# Summarize the merged frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Data Preprocessing
LOOKBACK = 60  # number of past timesteps that make up one input window

# Guarded so the cell can be re-run: after the first run 'Timestamp' is already
# the index and no longer exists as a column.
if 'Timestamp' in df.columns:
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df.set_index('Timestamp', inplace=True)

# The rows come from many concatenated files; the sliding windows built below
# are only meaningful if consecutive rows are consecutive in time.
# mergesort is stable, so rows sharing a timestamp keep their relative order.
df.sort_index(kind='mergesort', inplace=True)

# Splitting the data into training and testing sets based on the year
train_df = df[df.index < "2023-01-01"]
test_df = df[df.index >= "2023-01-01"]

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Train years:", train_df.index.year.unique())
print("Test years:", test_df.index.year.unique())

if train_df.empty:
    raise ValueError("No rows before 2023-01-01: cannot fit the scaler on an empty training set")

# Fit the scaler on the training period ONLY. Fitting on the full series would
# leak the min/max of the 2023 test period into training. As a consequence the
# scaled test values may fall slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train_df['CAPITL'].values.reshape(-1, 1))

train_scaled = scaler.transform(train_df['CAPITL'].values.reshape(-1,1))
test_scaled = scaler.transform(test_df['CAPITL'].values.reshape(-1,1))

# Full scaled series and its windows (60 timesteps -> 1 output), kept under the
# original names `scaled_data`, `X`, `y` for any cell that still relies on them.
scaled_data = scaler.transform(df['CAPITL'].values.reshape(-1,1))

# Row k of `window_idx` holds the positions k .. k+LOOKBACK-1, i.e. the inputs
# that precede target position k+LOOKBACK.
n_windows = max(len(scaled_data) - LOOKBACK, 0)
window_idx = np.arange(LOOKBACK)[None, :] + np.arange(n_windows)[:, None]
X = scaled_data[:, 0][window_idx]
y = scaled_data[LOOKBACK:, 0]

In [None]:


LOOKBACK = 60              # timesteps per input window
VALIDATION_FRACTION = 0.1  # tail share of the training windows held out for validation


def make_windows(series, lookback=LOOKBACK):
    """Turn a 1-D series into supervised sliding-window samples.

    Parameters
    ----------
    series : array-like
        Values in chronological order; any shape that flattens to 1-D
        (e.g. the (n, 1) output of a scaler).
    lookback : int
        Number of consecutive past values used as input for one sample.

    Returns
    -------
    X : np.ndarray, shape (n - lookback, lookback, 1)
        Sample k contains series[k : k + lookback].
    y : np.ndarray, shape (n - lookback,)
        Target k is series[k + lookback], the value right after window k.
        Both arrays are empty when the series has at most `lookback` values.
    """
    values = np.asarray(series, dtype=float).reshape(-1)
    n_samples = max(len(values) - lookback, 0)
    idx = np.arange(lookback)[None, :] + np.arange(n_samples)[:, None]
    return values[idx].reshape(n_samples, lookback, 1), values[lookback:]


# Windows are built separately per split, so the first LOOKBACK test rows only
# serve as history and have no prediction of their own.
X_train, y_train = make_windows(train_scaled)
X_test, y_test = make_windows(test_scaled)

if len(X_train) == 0 or len(X_test) == 0:
    raise ValueError(
        f"Need more than {LOOKBACK} rows in both splits "
        f"(train={len(train_scaled)}, test={len(test_scaled)})"
    )

# RNN Network
model_rnn = Sequential([
    SimpleRNN(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    SimpleRNN(units=50),
    Dropout(0.2),
    Dense(units=1)
])
model_rnn.compile(optimizer='adam', loss='mean_squared_error')

# Fit the model
# Validation uses the LAST part of the training windows (Keras takes the
# validation_split slice from the end of the data, before shuffling). The test
# set must not drive early stopping / best-weight selection, otherwise the test
# metrics reported below are optimistically biased.
# NOTE: with epochs=2 a patience of 5 can never trigger; raise `epochs` for a
# real training run.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history_rnn = model_rnn.fit(
    X_train,
    y_train,
    epochs=2,
    batch_size=32,
    validation_split=VALIDATION_FRACTION,
    callbacks=[early_stopping],
)

# Predictions (mapped back from the scaled range to price units)
predicted_prices_rnn = model_rnn.predict(X_test)
predicted_prices_rnn = scaler.inverse_transform(predicted_prices_rnn)

# Evaluation
test_actual_rnn = scaler.inverse_transform(y_test.reshape(-1, 1))
mse_rnn = mean_squared_error(test_actual_rnn, predicted_prices_rnn)
rmse_rnn = np.sqrt(mse_rnn)
mae_rnn = mean_absolute_error(test_actual_rnn, predicted_prices_rnn)

# MAPE is undefined where the actual price is zero; those points are excluded
# so a single zero price does not turn the metric into inf/NaN.
nonzero_actual = np.abs(test_actual_rnn) > 1e-8
if nonzero_actual.any():
    mape_rnn = np.mean(
        np.abs(
            (test_actual_rnn[nonzero_actual] - predicted_prices_rnn[nonzero_actual])
            / test_actual_rnn[nonzero_actual]
        )
    ) * 100
else:
    mape_rnn = np.nan

# Calculate R² value
r2_rnn = r2_score(test_actual_rnn, predicted_prices_rnn)

print(f"MAPE: {mape_rnn}%, R² Value: {r2_rnn}")
print(f"RNN MAE: {mae_rnn}, RMSE: {rmse_rnn}")

# Actuals already have one row per prediction; the name is kept for later cells.
test_actual_truncated_rnn = test_actual_rnn

# Prediction k belongs to test row k + LOOKBACK, so drop the first LOOKBACK timestamps.
test_df_index_truncated_rnn = test_df.index[LOOKBACK:]

# Visualization
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(test_df_index_truncated_rnn, test_actual_truncated_rnn, label='Actual Price')
ax.plot(test_df_index_truncated_rnn, predicted_prices_rnn, label='Predicted Price')
ax.set_title('Price Prediction for 2023 using RNN')
ax.set_xlabel('Time')
ax.set_ylabel('Price')

# Improve date formatting on x-axis
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d %H:%M'))
ax.xaxis.set_major_locator(mdates.DayLocator(interval=30))  # Adjust interval as needed
fig.autofmt_xdate()  # Rotate and align the date labels

ax.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Learning curves: the per-epoch losses recorded in the fit() history,
# drawn in the same order as listed here.
loss_curves = {
    'Training Loss': history_rnn.history['loss'],
    'Validation Loss': history_rnn.history['val_loss'],
}

fig, ax = plt.subplots(figsize=(10, 6))
for curve_label, curve_values in loss_curves.items():
    ax.plot(curve_values, label=curve_label)
ax.set_title('Training vs. Validation Loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend()
plt.show()


In [None]:
# Run the model on the test windows again and bring both the targets and the
# predictions back from the scaled range to price units.
test_actual = scaler.inverse_transform(y_test.reshape(-1, 1))
predicted_prices = scaler.inverse_transform(model_rnn.predict(X_test))

# Residual = actual - predicted (positive means the model under-predicted)
residuals = test_actual - predicted_prices

# Histogram: how the prediction errors are distributed
fig_hist, ax_hist = plt.subplots(figsize=(10, 6))
ax_hist.hist(residuals, bins=25, alpha=0.75, edgecolor='black')
ax_hist.set_xlabel('Residual')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Residuals')
plt.show()

# Scatter: residual against the actual price, with a zero-error reference line
fig_scatter, ax_scatter = plt.subplots(figsize=(10, 6))
ax_scatter.scatter(x=test_actual, y=residuals, alpha=0.75)
ax_scatter.axhline(y=0, color='r', linestyle='-')
ax_scatter.set_xlabel('Actual Prices')
ax_scatter.set_ylabel('Residuals')
ax_scatter.set_title('Residuals vs. Actual Prices')
plt.show()
