In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

<h1> Reading the Dataset </h1>

In [None]:
# Load every CSV found under `folder_path` (searched recursively) and stack the
# files into one DataFrame named `df`.
# NOTE(review): this is an absolute, machine-specific path — consider moving it
# to a configurable data directory.
folder_path = r"D:\OneDrive - The Pennsylvania State University\Research DATA\Dr. Habib & Dr. Reza Data\Energy Price Market Data\Day Ahead Price Data_Processed\USA\NYISO"

# Collect the per-file frames and concatenate once at the end: calling pd.concat
# inside the loop re-copies all previously accumulated rows on every iteration.
nyiso_frames = []

for root, dirs, files in os.walk(folder_path):
    dirs.sort()  # os.walk honours in-place edits of `dirs`; gives a reproducible visit order
    for file in sorted(files):
        if file.endswith('.csv'):
            file_path = os.path.join(root, file)
            try:
                nyiso_frames.append(pd.read_csv(file_path))
            except Exception as e:
                # Best-effort load: report the unreadable file and keep going.
                print(f"Error reading file {file_path}: {e}")

# pd.concat raises on an empty list, so fall back to an empty frame when no file was read.
if nyiso_frames:
    all_nyiso_data = pd.concat(nyiso_frames, ignore_index=True)
else:
    all_nyiso_data = pd.DataFrame()

df = all_nyiso_data.copy()  # working copy used by the rest of the notebook
print(df)

In [None]:
# Preview the first rows of the merged frame
df.head()

In [None]:
# Column names, dtypes and non-null counts of the merged frame
df.info()

In [None]:
# Hourly timestamps from 2016-01-01 00:00 through 2023-12-31 23:00 (both ends included).
# 'h' is the current hourly alias; the upper-case 'H' spelling is deprecated in recent pandas.
date_range = pd.date_range(start='2016-01-01', end='2023-12-31 23:00:00', freq='h')

# Number of hourly timestamps in that span
len(date_range)


In [None]:
# Same summary as the earlier df.info() cell; no cell in between modifies df
df.info()

<h1>Plotting the DATA</h1>

In [None]:
import matplotlib.pyplot as plt

# Plot the full CAPITL series against time.
# 'Timestamp' is parsed to datetime on a temporary index only, so the x-axis is a
# real time axis while `df` itself stays unmodified (later cells do their own
# conversion). NOTE(review): assumes 'Timestamp' arrives from read_csv unparsed;
# if it is already datetime the conversion is a no-op.
capitl_by_time = df.set_index(pd.to_datetime(df['Timestamp']))['CAPITL']
ax = capitl_by_time.plot(figsize=(15, 6))
ax.set_xlabel('Timestamp')
ax.set_ylabel('CAPITL')
plt.show()


In [None]:
# Independent copy for the seasonal plots below, which add 'Year'/'Month' columns to it
df1 = df.copy()

In [None]:
# Preview the original frame
df.head()

In [None]:
# Preview the copy; at this point it has the same content as df
df1.head()

<h1> Energy Price Data by YEAR </h1>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Seasonal view of CAPITL: one line per year, calendar month on the x-axis.
# Works on df1 in place: 'Timestamp' is converted to datetime and the
# 'Year' / 'Month' columns are added (or overwritten).
df1['Timestamp'] = pd.to_datetime(df1['Timestamp'])
df1['Month'] = df1['Timestamp'].dt.month
df1['Year'] = df1['Timestamp'].dt.year

# Month x Year table; pivot_table aggregates with the mean by default
df1_pivot = pd.pivot_table(df1, index='Month', columns='Year', values='CAPITL')

# Wide-form lineplot: every column (year) of the pivot becomes its own line
plt.figure(figsize=(15, 6))
sns.lineplot(data=df1_pivot)
plt.xticks(range(1, 13))  # one tick per month
plt.xlabel('Month')
plt.ylabel('Price')
plt.title('Seasonal Plot by Year')
plt.legend(title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()



In [None]:
# df is untouched by the seasonal plot above, which works on df1
df.head()

In [None]:
# df1 now also carries the 'Year' and 'Month' columns added by the seasonal plot cell
df1.head()

<h1> Price By Individual Year </h1>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Seasonal sub-series plot: one panel per year, CAPITL against month.
# Works on df1 in place and overwrites its 'Year' / 'Month' columns.
df1['Timestamp'] = pd.to_datetime(df1['Timestamp'])

# Extracting year and month
df1['Year'] = df1['Timestamp'].dt.year
df1['Month'] = df1['Timestamp'].dt.strftime('%b')  # %b gives the month abbreviation

# Creating a FacetGrid to plot each year's data in a separate subplot.
# The y column is 'CAPITL', the series every other cell in this notebook uses;
# the previous 'Price' looks like a leftover template placeholder and is not a
# column any cell here creates.
g = sns.FacetGrid(df1, col='Year', col_wrap=4, height=3, aspect=1.5)
g = g.map(sns.lineplot, 'Month', 'CAPITL')

# Rotate the month labels for readability. tick_params changes only the rotation;
# re-setting the labels by hand can freeze stale tick text.
for ax in g.axes.flat:
    ax.tick_params(axis='x', labelrotation=45)

plt.subplots_adjust(top=0.9)  # leave room for the figure title
g.fig.suptitle('Seasonal Subseries Plot')
plt.show()


<h1> Energy price by Individual Month </h1>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Seasonal sub-series plot: one panel per calendar month, CAPITL across the years.
# Works on df1 in place and overwrites its 'Year' / 'Month' columns.
df1['Timestamp'] = pd.to_datetime(df1['Timestamp'])

df1['Month'] = df1['Timestamp'].dt.month
df1['Year'] = df1['Timestamp'].dt.year

# Swap the month numbers for their abbreviated names (panel titles read better)
import calendar
df1['Month'] = [calendar.month_abbr[month_no] for month_no in df1['Month']]

# One facet per month name, four panels per row
g = sns.FacetGrid(df1, col='Month', col_wrap=4, height=3, aspect=1.5)
g = g.map_dataframe(sns.lineplot, x='Year', y='CAPITL')

g.fig.suptitle('Seasonal Subseries Plot by Month')
plt.subplots_adjust(top=0.9)  # leave room for the figure title
plt.show()


<h1> Time Series Decomposition </h1>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

# Give df a DatetimeIndex built from 'Timestamp'. Every later cell relies on this index.
# The guard makes the cell re-runnable: after the first run 'Timestamp' is the
# index rather than a column, and an unconditional conversion would raise KeyError.
if 'Timestamp' in df.columns:
    df = df.assign(Timestamp=pd.to_datetime(df['Timestamp'])).set_index('Timestamp')

# The CSV files are concatenated in directory-walk order, so put the rows in
# chronological order before any time-series work (stable sort keeps the
# relative order of rows that share a timestamp).
df = df.sort_index(kind='mergesort')

# Additive decomposition with a 24-observation period (one day of hourly data)
decomposition = seasonal_decompose(df['CAPITL'], model='additive', period=24)  # adjust period if different seasonality is suspected

# Plot the decomposed components.
# Note: this changes the default figure size for every later figure as well.
plt.rcParams.update({'figure.figsize': (10,10)})
decomposition.plot()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

# Same additive decomposition of CAPITL as the previous cell, but with a weekly
# period (24 * 7 observations). df must already carry the 'Timestamp' index
# that the previous decomposition cell sets.
weekly_period = 24 * 7
decomposition = seasonal_decompose(df['CAPITL'], period=weekly_period, model='additive')

# Show the components on a 10 x 10 figure (the size is set globally via rcParams)
plt.rcParams['figure.figsize'] = (10, 10)
decomposition.plot()
plt.show()


<h1> Model 1: XGBOOST </h1>

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt

# One-step-ahead CAPITL forecast with gradient-boosted trees.
# Expects df to be indexed by 'Timestamp' as a DatetimeIndex (set in the
# decomposition cell).

# Feature Engineering
for lag in range(1, 25):  # 24-hour lags to capture one full day's cycle
    df[f'lag_{lag}'] = df['CAPITL'].shift(lag)

# Drop every row that contains a NaN — at minimum the first 24 rows, which have
# incomplete lag history. This rebinds df, so later cells see the trimmed frame
# including the lag columns.
df = df.dropna()

# Chronological split: everything up to the end of 2022 trains, 2023 tests.
# Row selection by partial date string has to go through .loc; a bare
# df['2023'] is a column lookup in pandas 2.x and raises KeyError.
train = df.loc[:'2022']
test = df.loc['2023']

# Separate features and target.
# NOTE(review): every column other than CAPITL becomes a feature, not only the
# lag_* columns. If df holds other same-hour series, they are used as
# contemporaneous inputs — confirm this is intended for a forecasting setup.
X_train = train.drop('CAPITL', axis=1)
y_train = train['CAPITL']
X_test = test.drop('CAPITL', axis=1)
y_test = test['CAPITL']

# Model Training
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=5)

# Fit the model
model.fit(X_train, y_train)

# Prediction
y_pred = model.predict(X_test)

# Evaluation
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}, RMSE: {rmse}")

# Visualization: actual vs. predicted over the test period
plt.figure(figsize=(15, 5))
plt.plot(y_test.index, y_test, label='Actual')
plt.plot(y_test.index, y_pred, label='Predicted')
plt.legend()
plt.show()


In [None]:
# Actual vs. XGBoost-predicted CAPITL for the test period, side by side.
# y_test supplies the DatetimeIndex; y_pred is attached to it by position.
comparison_df = y_test.to_frame('Actual').assign(Predicted=y_pred)

# Pull out a single day through partial-string indexing on the DatetimeIndex
# (here: January 1st, 2023)
date_to_compare = '2023-01-01'
one_day_comparison = comparison_df.loc[date_to_compare]

# Show that day's rows as the cell output
one_day_comparison


<h1> Model 2: Random Forest </h1>

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest baseline on the same X_train / y_train / X_test / y_test that
# the XGBoost cell builds (that cell must have run first).

# Train a 100-tree forest with a fixed seed; fit() returns the fitted estimator
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model = rf_model.fit(X_train, y_train)

# Predict the test period
rf_y_pred = rf_model.predict(X_test)

# Error metrics on the test period
rf_mae = mean_absolute_error(y_test, rf_y_pred)
rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_rmse = np.sqrt(rf_mse)
print(f"Random Forest MAE: {rf_mae}, RMSE: {rf_rmse}")

# Actual vs. predicted over the test period
plt.figure(figsize=(15, 5))
for series, series_label in ((y_test, 'Actual'), (rf_y_pred, 'Predicted by Random Forest')):
    plt.plot(y_test.index, series, label=series_label)
plt.legend()
plt.show()


In [None]:
# Actual vs. Random-Forest-predicted CAPITL for the test period, side by side.
# y_test supplies the DatetimeIndex; rf_y_pred is attached to it by position.
rf_comparison_df = pd.DataFrame({'Actual': y_test}).assign(Predicted=rf_y_pred)

# Pull out a single day through partial-string indexing on the DatetimeIndex
# (here: January 1st, 2023)
date_to_compare = '2023-01-01'
one_day_rf_comparison = rf_comparison_df.loc[date_to_compare]

# Show that day's rows as the cell output
one_day_rf_comparison


<h1> Model 3: FBPROPHET </h1>

In [None]:
from prophet import Prophet


# Prophet baseline for CAPITL. Expects df to be indexed by 'Timestamp'.

# Prophet requires the data to be in a DataFrame with two columns: 'ds' and 'y'
prophet_df = df.reset_index().rename(columns={'Timestamp': 'ds', 'CAPITL': 'y'})

# Split the data into training and testing sets (2023 is held out)
train_prophet_df = prophet_df[prophet_df['ds'] < '2023-01-01']
test_prophet_df = prophet_df[prophet_df['ds'] >= '2023-01-01']

# Initialize the Prophet model
prophet_model = Prophet()

# Fit the model on the training data
prophet_model.fit(train_prophet_df)

# Forecast exactly the timestamps present in the test set.
# Building a regular hourly grid with make_future_dataframe and taking its tail
# only lines up with the test rows when the test timestamps have no gaps or
# duplicates, and it also re-predicts the entire training history.
future = test_prophet_df[['ds']].reset_index(drop=True)

# Predict
forecast = prophet_model.predict(future)

# One prediction per test row, in test order
predicted_prophet = forecast['yhat']

# Evaluation - calculating metrics
prophet_mse = mean_squared_error(test_prophet_df['y'], predicted_prophet)
prophet_rmse = np.sqrt(prophet_mse)
prophet_mae = mean_absolute_error(test_prophet_df['y'], predicted_prophet)
print(f"Prophet MAE: {prophet_mae}, RMSE: {prophet_rmse}")

# Visualization
plt.figure(figsize=(15, 5))
plt.plot(test_prophet_df['ds'], test_prophet_df['y'], label='Actual')
plt.plot(test_prophet_df['ds'], predicted_prophet.values, label='Predicted by Prophet')
plt.legend()
plt.show()


<h1> Model 4: LSTM </h1>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

# Next-step CAPITL forecast from the previous `window` values with a stacked LSTM.
# Note: this cell rebinds X_train / X_test / y_train / y_test / model, replacing
# the tabular versions created for the tree models.

window = 60  # number of past observations fed to the network per sample
prices = df['CAPITL'].values.reshape(-1, 1)

# Chronological 80/20 split, counted in samples (windows) rather than raw rows
n_samples = max(len(prices) - window, 0)
train_size = int(n_samples * 0.8)

# Data Preprocessing
# Fit the scaler only on rows the training samples can see: sample k reads rows
# k .. k+window, so training covers rows [0, train_size + window). Fitting on
# the whole series would leak the test range into the scaling.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(prices[:train_size + window])
scaled_data = scaler.transform(prices)

# Creating a data structure with `window` timesteps and 1 output
X = []
y = []
for i in range(window, len(scaled_data)):
    X.append(scaled_data[i - window:i, 0])
    y.append(scaled_data[i, 0])

X, y = np.array(X), np.array(y)

# Splitting the data into training and testing sets
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Reshaping for LSTM: (samples, timesteps, features)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# LSTM Network
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=1))

model.compile(optimizer='adam', loss='mean_squared_error')

# Fit the model
# NOTE(review): the test set doubles as the validation set, so early stopping /
# best-weight restoration is tuned on the data used for the final metrics;
# with epochs=2 and patience=10 the callback cannot trigger anyway.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model.fit(X_train, y_train, epochs=2, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Predictions, mapped back to the original price scale
predicted_prices = model.predict(X_test)
predicted_prices = scaler.inverse_transform(predicted_prices)

# Evaluation
test_actual = scaler.inverse_transform(y_test.reshape(-1, 1))
mse = mean_squared_error(test_actual, predicted_prices)
rmse = np.sqrt(mse)
mae = mean_absolute_error(test_actual, predicted_prices)
print(f"LSTM MAE: {mae}, RMSE: {rmse}")

# Visualization (x-axis is the sample position within the test set)
plt.figure(figsize=(15, 5))
plt.plot(test_actual, label='Actual Price')
plt.plot(predicted_prices, label='Predicted Price')
plt.title('Price Prediction')
plt.xlabel('Time')
plt.ylabel('Price')
plt.legend()
plt.show()


In [None]:
import pandas as pd

# Attach timestamps to the LSTM test results and show one day of them.
# The sliding windows run up to the last row of df and the test samples are the
# tail of those windows, so they correspond to the final len(predicted_prices)
# entries of df's index.
n_test = len(predicted_prices)
test_dates = df.index[-n_test:]

# Actual and predicted values as single-column frames sharing the test dates
actual_df = pd.DataFrame(test_actual, index=test_dates, columns=['Actual'])
predicted_df = pd.DataFrame(predicted_prices, index=test_dates, columns=['Predicted'])

# Side-by-side table: 'Actual' first, 'Predicted' second
comparison_df = pd.concat([actual_df, predicted_df], axis=1)

# Rows of one day, selected by partial-string indexing
specific_day = '2023-01-01'  # replace with the date you are interested in
one_day_comparison = comparison_df.loc[specific_day]

# Display the comparison table for that day
print(one_day_comparison)


In [None]:
import pandas as pd

# Export the 2023 part of the actual-vs-predicted table to Excel.
# Uses predicted_prices / test_actual as left by the most recently run
# sequence-model cell.
#
# The predictions cover the tail of df (its last len(predicted_prices) rows),
# which is not the same set of rows as calendar year 2023. They are therefore
# dated with the tail of the index first and only then narrowed to 2023.
# Indexing them directly with the 2023 dates fails on a length mismatch, and a
# bare df['2023'] is a column lookup in pandas 2.x.
test_dates = df.index[-len(predicted_prices):]

# Create a DataFrame for the predicted values with correct dates
predicted_df = pd.DataFrame(predicted_prices, index=test_dates, columns=['Predicted'])

# Create a DataFrame for the actual values with the same dates
actual_df = pd.DataFrame(test_actual, index=test_dates, columns=['Actual'])

# Combine actual and predicted values, then keep only the rows dated 2023
comparison_df = pd.concat([actual_df, predicted_df], axis=1).loc['2023']

# Save the DataFrame to an Excel file
comparison_df.to_excel('predicted_vs_actual_2023.xlsx')


<h1> Model 5: TCN </h1>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Conv1D, GlobalAveragePooling1D

# Next-step CAPITL forecast from the previous `window` values with a stack of
# causal 1-D convolutions (no dilation is configured on the Conv1D layers).
# Note: this cell rebinds X_train / X_test / y_train / y_test / model /
# predicted_prices / test_actual from the earlier model cells.

window = 60  # number of past observations per sample (you can adjust this)
prices = df['CAPITL'].values.reshape(-1, 1)

# Chronological 80/20 split, counted in samples (windows)
n_samples = max(len(prices) - window, 0)
train_size = int(n_samples * 0.8)

# Data Preprocessing
# Fit the scaler only on rows the training samples can see: sample k reads rows
# k .. k+window, so training covers rows [0, train_size + window). Fitting on
# the whole series would leak the test range into the scaling.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(prices[:train_size + window])
scaled_data = scaler.transform(prices)

# Create dataset with `window` timesteps
X = []
y = []
for i in range(window, len(scaled_data)):
    X.append(scaled_data[i - window:i, 0])
    y.append(scaled_data[i, 0])

X, y = np.array(X), np.array(y)

# Split the data
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Reshape for TCN: add a trailing feature axis -> (samples, timesteps, 1)
X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)

# TCN Model
model = Sequential([
    Conv1D(filters=64, kernel_size=6, padding='causal', activation='relu', input_shape=(X_train.shape[1], 1)),
    Conv1D(filters=64, kernel_size=6, padding='causal', activation='relu'),
    GlobalAveragePooling1D(),
    Dense(1)
])

# `learning_rate` is the supported argument name; the old `lr` alias is rejected
# by current Keras optimizers.
model.compile(optimizer=Adam(learning_rate=1e-3), loss='mean_squared_error')

# Model Training
# NOTE(review): the test set doubles as the validation set, so early stopping /
# best-weight restoration is tuned on the data used for the final metrics;
# with epochs=2 and patience=10 the callback cannot trigger anyway.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model.fit(X_train, y_train, epochs=2, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Predictions, mapped back to the original price scale
predicted_prices = model.predict(X_test)
predicted_prices = scaler.inverse_transform(predicted_prices)

# Evaluation
test_actual = scaler.inverse_transform(y_test.reshape(-1, 1))
mse = mean_squared_error(test_actual, predicted_prices)
rmse = np.sqrt(mse)
mae = mean_absolute_error(test_actual, predicted_prices)
print(f"TCN MAE: {mae}, RMSE: {rmse}")

# Visualization (x-axis is the sample position within the test set)
plt.figure(figsize=(15, 5))
plt.plot(test_actual, label='Actual Price')
plt.plot(predicted_prices, label='Predicted Price')
plt.title('Price Prediction')
plt.xlabel('Time')
plt.ylabel('Price')
plt.legend()
plt.show()


In [None]:
import pandas as pd

# Attach timestamps to the TCN test results.
# The sliding windows run up to the last row of df and the test samples are the
# tail of those windows, so they correspond to the final len(predicted_prices)
# entries of df's index. (The previous `your_specific_criteria` placeholder was
# never defined and raised NameError.)
test_dates = df.index[-len(predicted_prices):]

# Ensure predictions, actuals and dates all have the same length
if len(predicted_prices) != len(test_dates):
    raise ValueError("Length of predicted prices does not match the length of test dates")

# Create a DataFrame for the predicted values with correct dates
predicted_df = pd.DataFrame(predicted_prices, index=test_dates, columns=['Predicted'])

# test_actual is a NumPy array; give it the same date index
actual_df = pd.DataFrame(test_actual, index=test_dates, columns=['Actual'])

# Combine actual and predicted values into a single DataFrame for comparison
comparison_df = pd.concat([actual_df, predicted_df], axis=1)

# Display the comparison for the entire test set (pandas truncates long frames when printing)
print(comparison_df)
