In [459]:
import pandas as pd
import numpy as np
import os
import re
import random
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import glob
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from datetime import datetime

from dateutil.relativedelta import relativedelta

pio.renderers.default = "notebook_connected"

# Tree-Based method CRR prediction
---

## Data loading
---

In [460]:
def read_lmp_hourly(folder_path):
    """
    Reads all CSV files in the specified folder and combines them into a single DataFrame.
    
    Parameters:
    folder_path (str): Path to the folder containing the CSV files.
    
    Returns:
    pandas.DataFrame: Combined DataFrame of all CSV files.
    """
    # Use glob to find all CSV files in the folder
    csv_files = glob.glob(folder_path + '*.csv')
    
    # Read each CSV file and concatenate them into a single DataFrame
    df_list = [pd.read_csv(file) for file in csv_files]
    combined_df = pd.concat(df_list, ignore_index=True)
    combined_df['date'] = pd.to_datetime(combined_df['DeliveryDate'].apply(str)).dt.to_period('D').dt.to_timestamp()
    combined_df['month'] = combined_df['date'].dt.to_period('M').dt.to_timestamp()
    
    def extract_hour(time_str): # extract hour for dam price , LMPhourly
        return int(time_str.split(':')[0])
    combined_df['hour'] = combined_df['HourEnding'].apply(extract_hour)
    
    return combined_df

In [481]:
top_df = pd.read_csv("tree_method_top.csv")
tail_df = pd.read_csv("tree_method_tail.csv")
LMP_hourly = read_lmp_hourly("Dayahead_Price/")
top_df = top_df[top_df['ShadowPricePerMWH']>0]

## Feature Handling
---

In [487]:
top_df['StartDate'] = pd.to_datetime(top_df['StartDate'])
top_df['EndDate'] = pd.to_datetime(top_df['EndDate'])
top_df['auction_date'] = pd.to_datetime(top_df['auction_date'])
tail_df['StartDate'] = pd.to_datetime(tail_df['StartDate'])
tail_df['EndDate'] = pd.to_datetime(tail_df['EndDate'])
tail_df['auction_date'] = pd.to_datetime(tail_df['auction_date'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [488]:
LMP_hourly = LMP_hourly.sort_values(by = ['date', 'hour'], ascending = True)

In [490]:
top_df['dist_days'] = (top_df['StartDate'] - top_df['auction_date']).dt.days
top_df['predict_result'] = top_df['CRRPerMWH'] - top_df['ShadowPricePerMWH']
tail_df['dist_days'] = (tail_df['StartDate'] - tail_df['auction_date']).dt.days
tail_df['predict_result'] = tail_df['CRRPerMWH'] - tail_df['ShadowPricePerMWH']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [491]:
top_df['tree_y'] = [1 if x > 0 else 0 for x in top_df['predict_result']]
tail_df['tree_y'] = [1 if x > 0 else 0 for x in tail_df['predict_result']]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [492]:
tail_df['auction_date'] = pd.to_datetime(tail_df['auction_date'])

In [493]:
import pandas as pd

# Ensure the columns are in datetime format
top_df['auction_date'] = pd.to_datetime(top_df['auction_date'])

# Function to create hourly prices for source and sink
def get_hourly_prices(row, lmp_data):
    auction_date = row['auction_date']
    source = row['Source']  # Assumes column name 'Source' for SettlementPoint
    sink = row['Sink']      # Assumes column name 'Sink' for SettlementPoint

    start_time = auction_date - pd.Timedelta(days=2)  # Start 2 days before the auction date
    end_time = auction_date + pd.Timedelta(hours=17)  # End at 5:00 PM on the auction date

    # Ensure 'date' is datetime and 'hour' is numeric
    lmp_data['date'] = pd.to_datetime(lmp_data['date'])  # Convert 'date' to datetime
    lmp_data['hour'] = lmp_data['hour'].astype(int)      # Ensure 'hour' is integer

    # Add a datetime column for filtering
    lmp_data['datetime'] = lmp_data['date'] + pd.to_timedelta(lmp_data['hour'], unit='h')

    # Filter LMP_hourly data for the time window and SettlementPoints
    source_data = lmp_data[
        (lmp_data['datetime'] >= start_time) &
        (lmp_data['datetime'] < end_time) &
        (lmp_data['SettlementPoint'] == source)
    ].copy()  # Use .copy() to avoid SettingWithCopyWarning

    sink_data = lmp_data[
        (lmp_data['datetime'] >= start_time) &
        (lmp_data['datetime'] < end_time) &
        (lmp_data['SettlementPoint'] == sink)
    ].copy()

    # Calculate lagged column names for source and sink
    source_data['col'] = source_data.apply(
        lambda r: f"source_lag_{int(((auction_date + pd.Timedelta(hours=17)) - r['datetime']).total_seconds() / 3600)}",
        axis=1
    )

    sink_data['col'] = sink_data.apply(
        lambda r: f"sink_lag_{int(((auction_date + pd.Timedelta(hours=17)) - r['datetime']).total_seconds() / 3600)}",
        axis=1
    )

    # Prepare dataframes for pivoting
    source_data = source_data[['col', 'SettlementPointPrice']].set_index('col').T
    sink_data = sink_data[['col', 'SettlementPointPrice']].set_index('col').T

    # Merge source and sink data
    lookback_data = pd.concat([sink_data, source_data], axis=1)

    # Return as a single row with flattened data
    return pd.Series(lookback_data.iloc[0])

# Apply the function to top_df
price_data = top_df.apply(
    lambda row: get_hourly_prices(row, LMP_hourly), axis=1
)

# Merge the generated price data back into top_df
top_df = pd.concat([top_df, price_data], axis=1)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Correlation Matrix for selected data
---

In [494]:
# examine correlation between variables
vars = top_df.drop(columns = ['ShadowPricePerMWH', 'StartDate', 'EndDate', 'date', 'auction_info', 'auction_date', 'agg_MW', 'CRRPerMWH', 'predict_result', 'Sink', 'Source', 'TimeOfUse'])
vars_corr = vars.corr(method='pearson')
mask = np.triu(np.ones_like(vars_corr, dtype=bool))
vars_corr = vars_corr.mask(mask)

fig = px.imshow(vars_corr, text_auto = True, color_continuous_scale="Tropic")
fig.update_layout(
    title=f'Correlation heatmap for my data',
    width=1000, height=1000,
    title_font=dict(size=16, color='black'),  # Customize title font
    title_x=0.5  # Center the title
)
fig.show()

## Tree Method
---

In [530]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import itertools

# Step 1: Split the data into training, validation, and testing
test_df = top_df[top_df['auction_date'] > '2023-04-30'].drop(columns = ['ShadowPricePerMWH','StartDate', 'EndDate', 'date', 'auction_info', 'auction_date', 'agg_MW', 'CRRPerMWH', 'predict_result']).dropna()
valid_df = top_df[(top_df['auction_date'] <= '2023-04-30')&(top_df['auction_date'] > '2022-11-30')].drop(columns = ['ShadowPricePerMWH', 'StartDate', 'EndDate', 'date', 'auction_info', 'auction_date', 'agg_MW', 'CRRPerMWH', 'predict_result']).dropna()
train_df = top_df[(top_df['auction_date'] <= '2022-11-30')].drop(columns = ['ShadowPricePerMWH','StartDate', 'EndDate', 'date', 'auction_info', 'auction_date', 'agg_MW', 'CRRPerMWH', 'predict_result']).dropna()

y_test = test_df['tree_y']
y_val = valid_df['tree_y']
y_train = train_df['tree_y']

X_test= test_df.drop(columns = [ 'tree_y'])
X_val = valid_df.drop(columns = ['tree_y'])
X_train = train_df.drop(columns = ['tree_y'])


categorical_columns = X_test.select_dtypes(include=['object']).columns
for col in categorical_columns:
    X_test[col] = X_test[col].astype('category')
    X_val[col] = X_val[col].astype('category')
    X_train[col] = X_train[col].astype('category')


# Step 3: Define the parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.2],
    'max_depth': [3, 5],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

# Step 4: Manual grid search
best_model = None
best_params = None
highest_val_accuracy = 0

# Iterate over all combinations of parameters
for params in itertools.product(*param_grid.values()):
    params_dict = dict(zip(param_grid.keys(), params))
    
    # Initialize XGBoost model with categorical support
    model = xgb.XGBClassifier(
        objective='binary:logistic',  # For binary classification
        enable_categorical=True,
        random_state=42,
        **params_dict
    )
    # Train on the training set
    model.fit(X_train, y_train)
    
    # Predict on the validation set
    y_val_pred = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    
    # Update best model if validation accuracy is higher
    if val_accuracy > highest_val_accuracy:
        best_model = model
        best_params = params_dict
        highest_val_accuracy = val_accuracy

print("Best Parameters:", best_params)
print(f"Best Validation Accuracy: {highest_val_accuracy}")

# Step 5: Evaluate the best model on the test set
y_test_pred = best_model.predict(X_test)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Additional Metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy}")
print(f"Test Precision: {test_precision}")
print(f"Test Recall: {test_recall}")
print(f"Test F1 Score: {test_f1}")

Best Parameters: {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 5, 'subsample': 1, 'colsample_bytree': 0.8}
Best Validation Accuracy: 0.8308351177730193
Confusion Matrix:
[[340  15]
 [114  41]]
Test Accuracy: 0.7470588235294118
Test Precision: 0.7321428571428571
Test Recall: 0.2645161290322581
Test F1 Score: 0.3886255924170616


In [497]:
# import xgboost as xgb
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# import itertools

# # Step 1: Split the data into training, validation, and testing
# # Step 1: Split the data into training, validation, and testing
# test_df = tail_df[tail_df['auction_date'] > '2023-04-30'].drop(columns = ['StartDate', 'EndDate', 'date', 'auction_info', 'auction_date', 'agg_MW', 'CRRPerMWH', 'predict_result']).dropna()
# valid_df = tail_df[(tail_df['auction_date'] <= '2023-04-30')&(tail_df['auction_date'] > '2022-11-30')].drop(columns = ['StartDate', 'EndDate', 'date', 'auction_info', 'auction_date', 'agg_MW', 'CRRPerMWH', 'predict_result']).dropna()
# train_df = tail_df[(tail_df['auction_date'] <= '2022-11-30')].drop(columns = ['StartDate', 'EndDate', 'date', 'auction_info', 'auction_date', 'agg_MW', 'CRRPerMWH', 'predict_result']).dropna()

# y_test = test_df['tree_y']
# y_val = valid_df['tree_y']
# y_train = train_df['tree_y']

# X_test= test_df.drop(columns = [ 'tree_y','ShadowPricePerMWH'])
# X_val = valid_df.drop(columns = ['tree_y', 'ShadowPricePerMWH'])
# X_train = train_df.drop(columns = ['tree_y', 'ShadowPricePerMWH'])


# categorical_columns = X_test.select_dtypes(include=['object']).columns
# for col in categorical_columns:
#     X_test[col] = X_test[col].astype('category')
#     X_val[col] = X_val[col].astype('category')
#     X_train[col] = X_train[col].astype('category')


# # Step 3: Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200],
#     'learning_rate': [0.1, 0.2],
#     'max_depth': [3, 5],
#     'subsample': [0.8, 1],
#     'colsample_bytree': [0.8, 1]
# }

# # Step 4: Manual grid search
# best_model = None
# best_params = None
# highest_val_accuracy = 0

# # Iterate over all combinations of parameters
# for params in itertools.product(*param_grid.values()):
#     params_dict = dict(zip(param_grid.keys(), params))
    
#     # Initialize XGBoost model with categorical support
#     model = xgb.XGBClassifier(
#         objective='binary:logistic',  # For binary classification
#         enable_categorical=True,
#         random_state=42,
#         **params_dict
#     )
#     # Train on the training set
#     model.fit(X_train, y_train)
    
#     # Predict on the validation set
#     y_val_pred = model.predict(X_val)
#     val_accuracy = accuracy_score(y_val, y_val_pred)
    
#     # Update best model if validation accuracy is higher
#     if val_accuracy > highest_val_accuracy:
#         best_model = model
#         best_params = params_dict
#         highest_val_accuracy = val_accuracy

# print("Best Parameters:", best_params)
# print(f"Best Validation Accuracy: {highest_val_accuracy}")

# # Step 5: Evaluate the best model on the test set
# y_test_pred = best_model.predict(X_test)

# # Confusion Matrix
# conf_matrix = confusion_matrix(y_test, y_test_pred)
# print("Confusion Matrix:")
# print(conf_matrix)

# # Additional Metrics
# test_accuracy = accuracy_score(y_test, y_test_pred)
# test_precision = precision_score(y_test, y_test_pred)
# test_recall = recall_score(y_test, y_test_pred)
# test_f1 = f1_score(y_test, y_test_pred)

# print(f"Test Accuracy: {test_accuracy}")
# print(f"Test Precision: {test_precision}")
# print(f"Test Recall: {test_recall}")
# print(f"Test F1 Score: {test_f1}")

## MLP Price Prediction
---

By using Linear regression, we use currently available data to predict auction price, and then use this price to bid for it. If the predicted price is higher than the auction price, CRR will be rewarded, otherwise, we will lose this CRR.

In [531]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

categorical_cols = ['Source', 'Sink', 'TimeOfUse']

# Apply One-Hot Encoding to small-cardinality categorical columns
auction_top_df = pd.get_dummies(top_df, columns=categorical_cols, drop_first=True)

test_df = auction_top_df[auction_top_df['auction_date'] > '2023-04-30'].drop(columns = ['StartDate', 'EndDate', 'date', 'auction_info', 'auction_date', 'agg_MW', 'predict_result']).dropna()
valid_df = auction_top_df[(auction_top_df['auction_date'] <= '2023-04-30')&(auction_top_df['auction_date'] > '2022-11-30')].drop(columns = [ 'StartDate', 'EndDate', 'date', 'auction_info', 'auction_date', 'agg_MW',  'predict_result']).dropna()
train_df = auction_top_df[(auction_top_df['auction_date'] <= '2022-11-30')].drop(columns = ['StartDate', 'EndDate', 'date', 'auction_info', 'auction_date', 'agg_MW', 'predict_result']).dropna()

X_test= test_df.drop(columns = [ 'tree_y','ShadowPricePerMWH','CRRPerMWH'])
X_val = valid_df.drop(columns = ['tree_y','ShadowPricePerMWH','CRRPerMWH'])
X_train = train_df.drop(columns = ['tree_y','ShadowPricePerMWH','CRRPerMWH'])

# Display the transformed DataFrame
y_test_auction = test_df['ShadowPricePerMWH']
y_val_auction = valid_df['ShadowPricePerMWH']
y_train_auction = train_df['ShadowPricePerMWH']

y_test_CRR = test_df['CRRPerMWH']
y_val_CRR = valid_df['CRRPerMWH']
y_train_CRR = train_df['CRRPerMWH']

In [532]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
random.seed(42)
# Scaling numerical features
scaler = StandardScaler()

# Scale training, validation, and test sets
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Neural Network Model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),  # Input layer with 64 neurons
    Dropout(0.2),  # Dropout layer to prevent overfitting
    Dense(32, activation='relu'),  # Hidden layer with 32 neurons
    Dropout(0.2),
    Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

# Train the model
history = model.fit(
    X_train_scaled, y_train_auction,
    validation_data=(X_val_scaled, y_val_auction),
    epochs=50,
    batch_size=32,
    verbose=1
)

# Evaluate the model on the test set
test_loss, test_mae = model.evaluate(X_test_scaled, y_test_auction, verbose=0)
y_test_auction_pred = model.predict(X_test_scaled, verbose = 0)
# Print evaluation metrics
print(f"Test Loss (MSE): {test_loss}, Test MAE: {test_mae}")



Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



Epoch 1/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 12.6244 - mean_absolute_error: 2.5426 - val_loss: 8.1206 - val_mean_absolute_error: 1.9000
Epoch 2/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 6.6656 - mean_absolute_error: 1.8560 - val_loss: 7.2356 - val_mean_absolute_error: 1.7827
Epoch 3/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 4.7202 - mean_absolute_error: 1.5012 - val_loss: 6.8384 - val_mean_absolute_error: 1.7610
Epoch 4/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 4.6588 - mean_absolute_error: 1.4216 - val_loss: 7.4120 - val_mean_absolute_error: 1.8254
Epoch 5/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 3.8831 - mean_absolute_error: 1.2816 - val_loss: 6.6921 - val_mean_absolute_error: 1.6808
Epoch 6/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - los

In [533]:
import plotly.graph_objects as go

# Extract loss values from training history (Ensure history exists)
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# Create a Plotly figure
fig = go.Figure()

# Add training loss trace
fig.add_trace(go.Scatter(
    y=train_loss,
    mode='lines',
    name='Training Loss',
    line=dict(width=2)
))

# Add validation loss trace
fig.add_trace(go.Scatter(
    y=val_loss,
    mode='lines',
    name='Validation Loss',
    line=dict(width=2, dash='dash')
))

# Customize layout
fig.update_layout(
    title='Loss Change Over Epochs',
    xaxis_title='Epochs',
    yaxis_title='Loss (MSE)',
    legend=dict(font=dict(size=12)),
    template='plotly_white'
)

# Show the figure
fig.show()


## Trading Strategy
---

### Profit
---

In [535]:
profit = 0
success = 0
y_test_auction = y_test_auction.reset_index(drop = True)
y_test_CRR = y_test_CRR.reset_index(drop = True)
for i in range(len(y_test_pred)):
    if y_test_pred[i] == 1:
        if y_test_auction_pred[i][0] >= y_test_auction[i]:
            success += 1
            profit +=  max(0,y_test_CRR[i] - y_test_auction[i])

profit

np.float64(35.85698180916414)

In [536]:
n_trails = 10000
larger_cnt = 0
for i in range(n_trails):
    # Initialize an array of 610 elements, all set to 0
    profit_compare = 0
    random_compare = np.zeros(510, dtype=int)

    # Randomly select 212 unique indices from the array
    random_indices = np.random.choice(len(random_compare), 56, replace=False)
    random_compare[random_indices] = 1
    profit_compare = 0
    for i in range(len(random_compare)):
        if random_compare[i] == 1:
            if y_test_auction_pred[i][0] >= y_test_auction[i]:
                profit_compare += max(0, y_test_CRR[i] - y_test_auction[i])

    if profit_compare >= profit:
        larger_cnt += 1
        
larger_cnt/n_trails

0.0377

### Profit and Loss
---

In [537]:
pnl = 0
y_test_auction = y_test_auction.reset_index(drop = True)
y_test_CRR = y_test_CRR.reset_index(drop = True)
for i in range(len(y_test_pred)):
    if y_test_pred[i] == 1:
        if y_test_auction_pred[i][0] >= y_test_auction[i]:
            pnl +=  max(0,y_test_CRR[i] - y_test_auction[i]) - y_test_auction[i]

pnl

np.float64(26.17756680916414)

In [538]:
n_trails = 10000
larger_cnt = 0
for i in range(n_trails):
    # Initialize an array of 610 elements, all set to 0
    pnl_compare = 0
    random_compare = np.zeros(510, dtype=int)

    # Randomly select 212 unique indices from the array
    random_indices = np.random.choice(len(random_compare), 56, replace=False)
    random_compare[random_indices] = 1
    profit_compare = 0
    for i in range(len(random_compare)):
        if random_compare[i] == 1:
            if y_test_auction_pred[i][0] >= y_test_auction[i]:
                pnl_compare += max(0, y_test_CRR[i] - y_test_auction[i]) - y_test_auction[i]

    if pnl_compare >= pnl:
        larger_cnt += 1
        
larger_cnt/n_trails

0.0027

1. 验证Tree methods
    1. 随机找n个CRR repeat the algorithm
    2. P&L
2. forcast model 是否有意义
    1. compare: directly use historical data