In [19]:
from pathlib import Path

import pandas as pd

#read in data needed 
#from https://www.misoenergy.org/markets-and-operations/real-time--market-data/market-reports/#nt=%2FMarketReportType%3AHistorical%20LMP%2FMarketReportName%3ADay-Ahead%20EPNode%20LMP%20(zip)&t=10&p=0&s=MarketReportPublished&sd=desc
DA_LMP = pd.read_csv('data/cleaned/DA/miso_da_combined_clean.csv', index_col = 0)

#converting datetime to datetime on loadtempwind so it can be merged with LMP data
DA_LMP['datetime'] = pd.to_datetime(DA_LMP['datetime'])

In [20]:
# from https://www.eia.gov/electricity/wholesalemarkets/data.php?rto=miso
Load_Temp_Wind = pd.read_csv('data/cleaned/Features/miso_load-temp-wind_hr_2025.csv',  index_col = 0)

#converting datetime to datetime on loadtempwind so it can be merged with LMP data
Load_Temp_Wind['datetime'] = pd.to_datetime(Load_Temp_Wind['datetime'])


In [21]:
# This will only keep rows where dates match in both DataFrames
DA_LMP = pd.merge(DA_LMP, Load_Temp_Wind, on='datetime', how='outer')

# For any remaining NAs use forward and backward fill
DA_LMP = DA_LMP.ffill()

In [22]:
#load in day ahead constraint data
constraint_pivot = pd.read_csv('data/cleaned/DA_Constraints/combined_DA_constraints.csv')

# Preprocess constraint data 
constraint_pivot['datetime'] = pd.to_datetime(constraint_pivot['datetime'])
constraint_pivot = constraint_pivot.set_index('datetime')

# Filter for constraints with sufficient data (24+ hours)
min_obs = 24
valid_constraints = constraint_pivot.columns[constraint_pivot.notna().sum() > min_obs]
constraint_pivot = constraint_pivot[valid_constraints]

In [23]:
# Lagged correlation with temp
def calc_corr_lagged_temp(group, lag=1):
    # Shift wind by 1 hour (lag), so wind[t-1] lines up with congestion[t]
    lagged_temp = group['Temperature'].shift(lag)
    
    # Drop rows where lag introduces NaN to avoid invalid correlation
    valid = group[['congestion']].copy()
    valid['lagged_temp'] = lagged_temp
    
    valid = valid.dropna()

    return valid['congestion'].corr(valid['lagged_temp'])
    
# Lagged correlation with load
def calc_corr_lagged_load(group, lag=1):
    # Shift wind by 1 hour (lag), so wind[t-1] lines up with congestion[t]
    lagged_load = group['Total Forecast Load'].shift(lag)
    
    # Drop rows where lag introduces NaN to avoid invalid correlation
    valid = group[['congestion']].copy()
    valid['lagged_load'] = lagged_load
    
    valid = valid.dropna()

    return valid['congestion'].corr(valid['lagged_load'])
    
# Lagged correlation with wind
def calc_corr_lagged_wind(group, lag=1):
    # Shift wind by 1 hour (lag), so wind[t-1] lines up with congestion[t]
    lagged_wind = group['Total Wind Gen'].shift(lag)
    
    # Drop rows where lag introduces NaN to avoid invalid correlation
    valid = group[['congestion']].copy()
    valid['lagged_wind'] = lagged_wind
    
    valid = valid.dropna()

    return valid['congestion'].corr(valid['lagged_wind'])

In [24]:
# congestion correlation temp
temp_corr_by_node_lagged_18 = DA_LMP.groupby('node').apply(calc_corr_lagged_temp, lag = 18)

# congestion correlation temp
temp_corr_by_node_lagged_12 = DA_LMP.groupby('node').apply(calc_corr_lagged_temp, lag = 12)

# congestion correlation temp
temp_corr_by_node_lagged_6 = DA_LMP.groupby('node').apply(calc_corr_lagged_temp, lag = 6)

# congestion correlation temp
temp_corr_by_node_lagged_2 = DA_LMP.groupby('node').apply(calc_corr_lagged_temp, lag = 2)

# congestion correlation temp
temp_corr_by_node = DA_LMP.groupby('node').apply(calc_corr_lagged_temp, lag = 0)

# congestion correlation load
load_corr_by_node_lagged_18 = DA_LMP.groupby('node').apply(calc_corr_lagged_load, lag = 18)

# congestion correlation load
load_corr_by_node_lagged_12 = DA_LMP.groupby('node').apply(calc_corr_lagged_load, lag = 12)

# congestion correlation load
load_corr_by_node_lagged_6 = DA_LMP.groupby('node').apply(calc_corr_lagged_load, lag = 6)

# congestion correlation load
load_corr_by_node_lagged_2 = DA_LMP.groupby('node').apply(calc_corr_lagged_load, lag = 2)

# congestion correlation load
load_corr_by_node = DA_LMP.groupby('node').apply(calc_corr_lagged_load, lag = 0)

# congestion correlation wind
wind_corr_by_node_lagged_18 = DA_LMP.groupby('node').apply(calc_corr_lagged_wind, lag = 18)

# congestion correlation wind
wind_corr_by_node_lagged_12 = DA_LMP.groupby('node').apply(calc_corr_lagged_wind, lag = 12)

# congestion correlation wind
wind_corr_by_node_lagged_6 = DA_LMP.groupby('node').apply(calc_corr_lagged_wind, lag = 6)

# congestion correlation wind
wind_corr_by_node_lagged_2 = DA_LMP.groupby('node').apply(calc_corr_lagged_wind, lag = 2)

# congestion correlation wind
wind_corr_by_node = DA_LMP.groupby('node').apply(calc_corr_lagged_wind, lag = 0)

#combining metrics together in one dataframe
CorrCombined = pd.concat([temp_corr_by_node_lagged_18, temp_corr_by_node_lagged_12, temp_corr_by_node_lagged_6, temp_corr_by_node_lagged_2, temp_corr_by_node, load_corr_by_node_lagged_18, load_corr_by_node_lagged_12, load_corr_by_node_lagged_6, load_corr_by_node_lagged_2, load_corr_by_node, wind_corr_by_node_lagged_18, wind_corr_by_node_lagged_12, wind_corr_by_node_lagged_6, wind_corr_by_node_lagged_2, wind_corr_by_node], axis = 1, ignore_index=False)

#Renaming columns for clarity
CorrCombined.columns = ['temp correlation lagged 18 hours', 'temp correlation lagged 12 hours', 'temp correlation lagged 6 hours', 'temp correlation lagged 2 hours', 'temp correlation 0 lag', 'load correlation lagged 18 hours', 'load correlation lagged 12 hours', 'load correlation lagged 6 hours', 'load correlation lagged 2 hours', 'load correlation 0 lag', 'wind correlation lagged 18 hours', 'wind correlation lagged 12 hours', 'wind correlation lagged 6 hours', 'wind correlation lagged 2 hours', 'wind correlation 0 lag']


In [25]:
# Pivot LMP data to wide format: datetime x node
DA_LMP['datetime'] = pd.to_datetime(DA_LMP['datetime'])
lmp_wide = DA_LMP.pivot(index='datetime', columns='node', values='congestion')

# Join once on datetime index
combined = constraint_pivot.join(lmp_wide, how='inner')

# Compute correlation matrix (constraints x nodes) 
constraint_cols = constraint_pivot.columns
node_cols = lmp_wide.columns

# Create an empty DataFrame to store correlations
correlation_df = pd.DataFrame(index=constraint_cols, columns=node_cols)

# Calculate correlations between each constraint and each node
for constraint in constraint_cols:
    for node in node_cols:
        correlation_df.loc[constraint, node] = combined[constraint].corr(combined[node])

# Get top 5 correlated constraints for each node 
top_corr_list = []
for node in correlation_df.columns:
    node_corr = correlation_df[node].dropna()
    top5 = node_corr.abs().sort_values(ascending=False).head(5)
    actual_corrs = node_corr.loc[top5.index]
    
    result_row = {'node': node}
    for i, (constraint, corr_val) in enumerate(actual_corrs.items(), 1):
        result_row[f'constraint_{i}'] = constraint
        result_row[f'constraint_{i}_corr'] = corr_val

    top_corr_list.append(result_row)

# Final output
top_corr_df = pd.DataFrame(top_corr_list)

In [26]:
#setting index for top constraint correlation
top_corr_df = top_corr_df.set_index('node')

#merging top constraint based congestion drivers with top feature congestion drivers
CorrCombined = pd.merge(top_corr_df, CorrCombined, on=CorrCombined.index, how='inner')

# Rename for consistency
CorrCombined = CorrCombined.rename(columns={'key_0': 'LMP node'})

In [27]:
#save node correlations to a .csv file
CorrCombined.to_csv('Outputs/Top_LMP_Node_Correlations.csv')