In [1]:
import pandas as pd
import xarray as xr
import numpy as np
from comp_utils import *
from datetime import datetime, timedelta

In [2]:
# The train_energy_data_20200920_20240519.h5 has been created in the RemitData notebook
train_energy_data = pd.read_hdf('data/combined/train_energy_data_20200920_20240519.h5', 'df')
train_energy_data["dtm"] = pd.to_datetime(train_energy_data["dtm"])
train_energy_data.rename(columns={"dtm": "valid_time"}, inplace=True)
train_energy_data["Wind_MWh_credit"] = 0.5*train_energy_data["Wind_MW"] - train_energy_data["boa_MWh"]
train_energy_data["Solar_MWh_credit"] = 0.5*train_energy_data["Solar_MW"]
train_energy_data["total_generation_MWh"] = train_energy_data["Wind_MWh_credit"] + train_energy_data["Solar_MWh_credit"]

In [3]:
cutoff_reference_time = pd.Timestamp('2024-02-19 23:00:00+00:00')

# Wind
dwd_wind = pd.read_hdf('data/combined/dwd_wind_20200920_20240519.h5', 'df')
dwd_wind = dwd_wind[dwd_wind["valid_time"] - dwd_wind["reference_time"] < np.timedelta64(70, "h")]
dwd_wind = dwd_wind.set_index("valid_time").groupby("reference_time").resample("30min").interpolate("linear")
dwd_wind = dwd_wind.drop(columns="reference_time",axis=1).reset_index()
dwd_wind = dwd_wind[dwd_wind['valid_time'] < cutoff_reference_time]
print(dwd_wind)

ncep_wind = pd.read_hdf('data/combined/ncep_wind_20200920_20240519.h5', 'df')
ncep_wind = ncep_wind[ncep_wind["valid_time"] - ncep_wind["reference_time"] < np.timedelta64(70, "h")]
ncep_wind = ncep_wind.set_index("valid_time").groupby("reference_time").resample("30min").interpolate("linear")
ncep_wind = ncep_wind.drop(columns="reference_time",axis=1).reset_index()
ncep_wind = ncep_wind[ncep_wind['valid_time'] < cutoff_reference_time]
print(ncep_wind)

                  reference_time                valid_time  \
0      2020-09-20 00:00:00+00:00 2020-09-20 00:00:00+00:00   
1      2020-09-20 00:00:00+00:00 2020-09-20 00:30:00+00:00   
2      2020-09-20 00:00:00+00:00 2020-09-20 01:00:00+00:00   
3      2020-09-20 00:00:00+00:00 2020-09-20 01:30:00+00:00   
4      2020-09-20 00:00:00+00:00 2020-09-20 02:00:00+00:00   
...                          ...                       ...   
684835 2024-02-19 18:00:00+00:00 2024-02-19 20:30:00+00:00   
684836 2024-02-19 18:00:00+00:00 2024-02-19 21:00:00+00:00   
684837 2024-02-19 18:00:00+00:00 2024-02-19 21:30:00+00:00   
684838 2024-02-19 18:00:00+00:00 2024-02-19 22:00:00+00:00   
684839 2024-02-19 18:00:00+00:00 2024-02-19 22:30:00+00:00   

        DWD_WS_W_53.77_1.702  DWD_WS_W_53.77_1.767  DWD_WS_W_53.77_1.832  \
0                  10.064903             10.121807             10.165877   
1                   9.910870              9.957970              9.993410   
2                   9.75683

In [4]:
# Print the column names
dwd_wind_columns = dwd_wind.columns.tolist()
ncep_wind_columns = ncep_wind.columns.tolist()
train_energy_data_columns = train_energy_data.columns.tolist()

print("DWD Wind Columns:", dwd_wind_columns)
print("NCEP Wind Columns:", ncep_wind_columns)
print("Train Energy Data Columns:", train_energy_data_columns)

DWD Wind Columns: ['reference_time', 'valid_time', 'DWD_WS_W_53.77_1.702', 'DWD_WS_W_53.77_1.767', 'DWD_WS_W_53.77_1.832', 'DWD_WS_W_53.77_1.897', 'DWD_WS_W_53.77_1.962', 'DWD_WS_W_53.77_2.027', 'DWD_WS_W_53.84_1.702', 'DWD_WS_W_53.84_1.767', 'DWD_WS_W_53.84_1.832', 'DWD_WS_W_53.84_1.897', 'DWD_WS_W_53.84_1.962', 'DWD_WS_W_53.84_2.027', 'DWD_WS_W_53.97_1.702', 'DWD_WS_W_53.97_1.767', 'DWD_WS_W_53.97_1.832', 'DWD_WS_W_53.97_1.897', 'DWD_WS_W_53.97_1.962', 'DWD_WS_W_53.97_2.027', 'DWD_WS_W_53.9_1.702', 'DWD_WS_W_53.9_1.767', 'DWD_WS_W_53.9_1.832', 'DWD_WS_W_53.9_1.897', 'DWD_WS_W_53.9_1.962', 'DWD_WS_W_53.9_2.027', 'DWD_WS_W_54.03_1.702', 'DWD_WS_W_54.03_1.767', 'DWD_WS_W_54.03_1.832', 'DWD_WS_W_54.03_1.897', 'DWD_WS_W_54.03_1.962', 'DWD_WS_W_54.03_2.027', 'DWD_WS_W_54.1_1.702', 'DWD_WS_W_54.1_1.767', 'DWD_WS_W_54.1_1.832', 'DWD_WS_W_54.1_1.897', 'DWD_WS_W_54.1_1.962', 'DWD_WS_W_54.1_2.027', 'DWD_WS100_W_53.77_1.702', 'DWD_WS100_W_53.77_1.767', 'DWD_WS100_W_53.77_1.832', 'DWD_WS100_W_53.

In [5]:
# Define function to create lag 0 features
def create_lag0_features(df, columns, prefix):
    df = df.sort_values(by=['reference_time', 'valid_time'])
    df[f'{prefix}_avg_lag0'] = df[columns].mean(axis=1)
    df[f'{prefix}_var_lag0'] = df[columns].var(axis=1)
    return df

# Define function to create lagged features with a 30-minute shift
def create_lagged_features(df, columns, prefix):
    df = df.sort_values(by=['reference_time', 'valid_time'])
    for lag in range(-2, 3):  # Including lag +2
        df[f'{prefix}_avg_lag{lag}'] = df[columns].mean(axis=1).shift(lag)
        df[f'{prefix}_var_lag{lag}'] = df[columns].var(axis=1).shift(lag)
    return df

# Create lag 0 features for DWD wind data
dwd_columns_ws = [col for col in dwd_wind.columns if col.startswith('DWD_WS_W_')]
dwd_wind = create_lagged_features(dwd_wind, dwd_columns_ws, 'DWD_WS_W')
dwd_wind = dwd_wind.drop(columns=dwd_columns_ws)

dwd_columns_ws100 = [col for col in dwd_wind.columns if col.startswith('DWD_WS100_W_')]
dwd_wind = create_lagged_features(dwd_wind, dwd_columns_ws100, 'DWD_WS100_W')
dwd_wind = dwd_wind.drop(columns=dwd_columns_ws100)

dwd_columns_wd = [col for col in dwd_wind.columns if col.startswith('DWD_WD_W_')]
dwd_wind = create_lagged_features(dwd_wind, dwd_columns_wd, 'DWD_WD_W')
dwd_wind = dwd_wind.drop(columns=dwd_columns_wd)

dwd_columns_wd100 = [col for col in dwd_wind.columns if col.startswith('DWD_WD100_W_')]
dwd_wind = create_lagged_features(dwd_wind, dwd_columns_wd100, 'DWD_WD100_W')
dwd_wind = dwd_wind.drop(columns=dwd_columns_wd100)

dwd_columns_rh = [col for col in dwd_wind.columns if col.startswith('DWD_RH_W_')]
dwd_wind = create_lag0_features(dwd_wind, dwd_columns_rh, 'DWD_RH_W')
dwd_wind = dwd_wind.drop(columns=dwd_columns_rh)

# Create lag 0 features for NCEP wind data
ncep_columns_ws = [col for col in ncep_wind.columns if col.startswith('NCEP_WS_W_')]
ncep_wind = create_lagged_features(ncep_wind, ncep_columns_ws, 'NCEP_WS_W')
ncep_wind = ncep_wind.drop(columns=ncep_columns_ws)

ncep_columns_ws100 = [col for col in ncep_wind.columns if col.startswith('NCEP_WS100_W_')]
ncep_wind = create_lagged_features(ncep_wind, ncep_columns_ws100, 'NCEP_WS100_W')
ncep_wind = ncep_wind.drop(columns=ncep_columns_ws100)

ncep_columns_wd = [col for col in ncep_wind.columns if col.startswith('NCEP_WD_W_')]
ncep_wind = create_lagged_features(ncep_wind, ncep_columns_wd, 'NCEP_WD_W')
ncep_wind = ncep_wind.drop(columns=ncep_columns_wd)

ncep_columns_wd100 = [col for col in ncep_wind.columns if col.startswith('NCEP_WD100_W_')]
ncep_wind = create_lagged_features(ncep_wind, ncep_columns_wd100, 'NCEP_WD100_W')
ncep_wind = ncep_wind.drop(columns=ncep_columns_wd100)

ncep_columns_rh = [col for col in ncep_wind.columns if col.startswith('NCEP_RH_W_')]
ncep_wind = create_lag0_features(ncep_wind, ncep_columns_rh, 'NCEP_RH_W')
ncep_wind = ncep_wind.drop(columns=ncep_columns_rh)

# Create lagged features for remaining DWD wind data
dwd_columns_remaining = [col for col in dwd_wind.columns if col.startswith('DWD_T_W_')]
dwd_wind = create_lag0_features(dwd_wind, dwd_columns_remaining, 'DWD_T_W')
dwd_wind = dwd_wind.drop(columns=dwd_columns_remaining)

# Create lagged features for remaining NCEP wind data
ncep_columns_remaining = [col for col in ncep_wind.columns if col.startswith('NCEP_T_W_')]
ncep_wind = create_lag0_features(ncep_wind, ncep_columns_remaining, 'NCEP_T_W')
ncep_wind = ncep_wind.drop(columns=ncep_columns_remaining)

# Reset index after modifications
dwd_wind.reset_index(drop=True, inplace=True)
ncep_wind.reset_index(drop=True, inplace=True)

print('DWD Wind:', dwd_wind.head())
print('NCEP Wind:', ncep_wind.head())

DWD Wind:              reference_time                valid_time  DWD_WS_W_avg_lag-2  \
0 2020-09-20 00:00:00+00:00 2020-09-20 00:00:00+00:00            9.767447   
1 2020-09-20 00:00:00+00:00 2020-09-20 00:30:00+00:00            9.631038   
2 2020-09-20 00:00:00+00:00 2020-09-20 01:00:00+00:00            9.494630   
3 2020-09-20 00:00:00+00:00 2020-09-20 01:30:00+00:00            9.474915   
4 2020-09-20 00:00:00+00:00 2020-09-20 02:00:00+00:00            9.455198   

   DWD_WS_W_var_lag-2  DWD_WS_W_avg_lag-1  DWD_WS_W_var_lag-1  \
0            0.019203            9.905537            0.024825   
1            0.021607            9.767447            0.019203   
2            0.027443            9.631038            0.021607   
3            0.020213            9.494630            0.027443   
4            0.020990            9.474915            0.020213   

   DWD_WS_W_avg_lag0  DWD_WS_W_var_lag0  DWD_WS_W_avg_lag1  DWD_WS_W_var_lag1  \
0          10.043627           0.033348                

In [6]:
def find_missing_six_am_times(dataset):
    start_date = datetime(2020, 9, 21, 6, 0, 0)
    end_date = datetime(2024, 2, 19, 6, 0, 0)
    
    # Generate the complete set of 06:00 times
    all_six_am_times = {start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1)}
    
    # Filter dataset to get only 06:00 reference times
    dataset = dataset[dataset['reference_time'].dt.hour == 6]
    dataset_times = set(dataset['reference_time'])
    
    # Find missing times
    missing_times = all_six_am_times - dataset_times
    
    return sorted(missing_times)

def process_dataset(dataset, missing_times):
    # Keep only 06:00 reference times
    dataset = dataset[dataset['reference_time'].dt.hour == 6]
    
    # Drop rows with missing reference times
    dataset = dataset[~dataset['reference_time'].isin(missing_times)]
    
    return dataset

In [7]:
# Get missing 06:00 reference times for each dataset

missing_times_dwd_wind = find_missing_six_am_times(dwd_wind)
missing_times_ncep_wind = find_missing_six_am_times(ncep_wind)

# Combine all missing times
all_missing_times = set(missing_times_dwd_wind) | set(missing_times_ncep_wind)

dwd_wind = process_dataset(dwd_wind, all_missing_times)
print(dwd_wind.shape)

ncep_wind = process_dataset(ncep_wind, all_missing_times)
print(ncep_wind.shape)

(171080, 46)
(173017, 46)


In [8]:
def filter_relevant_data(dataset, best_ref_time):
    DA_Market_times = day_ahead_market_times(best_ref_time).tz_convert('UTC')
    relevant_data = dataset[(dataset['reference_time'] == best_ref_time) &
                            (dataset['valid_time'].isin(DA_Market_times))]
    return relevant_data

In [9]:
dwd_wind = pd.concat([filter_relevant_data(dwd_wind, ref_time) for ref_time in dwd_wind['reference_time'].unique()])
dwd_wind = dwd_wind.drop(columns=['reference_time'])
print(dwd_wind.shape)

ncep_wind = pd.concat([filter_relevant_data(ncep_wind, ref_time) for ref_time in ncep_wind['reference_time'].unique()])
ncep_wind = ncep_wind.drop(columns=['reference_time'])
print(ncep_wind.shape)

(59097, 45)
(59762, 45)


In [10]:
# Generate half-hour intervals
def generate_half_hour_intervals(start_date, end_date):
    return pd.date_range(start=start_date, end=end_date, freq='30min')

start_date = '2020-09-20 22:00:00+00:00'
end_date = '2024-02-19 23:00:00+00:00'
half_hour_intervals = generate_half_hour_intervals(start_date, end_date)
half_hour_df = pd.DataFrame({'valid_time': half_hour_intervals})

# Merge the filtered datasets with the half-hour intervals DataFrame
datasets = [dwd_wind, ncep_wind, train_energy_data] 
merged_df = half_hour_df

for dataset in datasets:
    merged_df = merged_df.merge(dataset, on='valid_time', how='left')

# Display the shape of the merged DataFrame
print(merged_df.shape)
print(merged_df.head())  

(59859, 103)
                 valid_time  DWD_WS_W_avg_lag-2  DWD_WS_W_var_lag-2  \
0 2020-09-20 22:00:00+00:00            5.019281            0.080340   
1 2020-09-20 22:30:00+00:00            4.980490            0.051245   
2 2020-09-20 23:00:00+00:00            4.941698            0.032833   
3 2020-09-20 23:30:00+00:00            4.602533            0.087054   
4 2020-09-21 00:00:00+00:00            4.263368            0.186512   

   DWD_WS_W_avg_lag-1  DWD_WS_W_var_lag-1  DWD_WS_W_avg_lag0  \
0            5.206201            0.108340           5.393120   
1            5.019281            0.080340           5.206201   
2            4.980490            0.051245           5.019281   
3            4.941698            0.032833           4.980490   
4            4.602533            0.087054           4.941698   

   DWD_WS_W_var_lag0  DWD_WS_W_avg_lag1  DWD_WS_W_var_lag1  DWD_WS_W_avg_lag2  \
0           0.142880           5.722253           0.143480           6.051386   
1           0

In [11]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Drop specified columns from merged_df
merged_df = merged_df.drop(['MIP', 'Solar_MW', 'Solar_capacity_mwp', 'Solar_installedcapacity_mwp', 'Wind_MW', 'SS_Price', 'boa_MWh', 'DA_Price', 'Solar_MWh_credit', 'total_generation_MWh'], axis=1)

In [12]:
print(merged_df.describe())
merged_df.dropna(inplace=True)
print(merged_df.describe())
merged_df.to_hdf('data/reference_time_06/WindTrainTable.h5', key='df', mode='w')

       DWD_WS_W_avg_lag-2  DWD_WS_W_var_lag-2  DWD_WS_W_avg_lag-1  \
count        59097.000000        59097.000000        59097.000000   
mean             7.658817            0.168614            7.658674   
std              3.920252            0.539040            3.921174   
min              0.223858            0.000294            0.223858   
25%              4.562828            0.029507            4.561595   
50%              7.334394            0.067876            7.333732   
75%             10.393638            0.154132           10.391252   
max             26.820341           18.736843           26.820341   

       DWD_WS_W_var_lag-1  DWD_WS_W_avg_lag0  DWD_WS_W_var_lag0  \
count        59097.000000       59097.000000       59097.000000   
mean             0.167580           7.658735           0.166642   
std              0.527421           3.922040           0.517261   
min              0.000294           0.223858           0.000294   
25%              0.029440           4.56062