In [1]:
import pandas as pd
import xarray as xr
import numpy as np
from comp_utils import *

In [2]:
# Energy table for the test period; the HDF5 file is produced by the RemitData notebook.
# The timestamp column "dtm" is parsed and exposed as "valid_time", then the
# credit columns are derived (0.5 * MW, presumably MW -> MWh per half hour -- TODO confirm).
test_energy_data = (
    pd.read_hdf('data/combined/test_energy_data_20200920_20240519.h5', 'df')
    .assign(dtm=lambda frame: pd.to_datetime(frame["dtm"]))
    .rename(columns={"dtm": "valid_time"})
    .assign(
        Wind_MWh_credit=lambda frame: 0.5 * frame["Wind_MW"] - frame["boa_MWh"],
        Solar_MWh_credit=lambda frame: 0.5 * frame["Solar_MW"],
        total_generation_MWh=lambda frame: frame["Wind_MWh_credit"] + frame["Solar_MWh_credit"],
    )
)

In [3]:
# Rows are kept only when their valid_time is at or after this instant.
# NOTE(review): despite the name, this cutoff is compared against valid_time,
# not reference_time; the name is kept so other cells keep working.
cutoff_reference_time = pd.Timestamp('2024-02-18 23:00:00+00:00')


def _load_solar_forecast(path, first_valid_time, max_lead_time=np.timedelta64(70, "h")):
    """Load one solar forecast table and upsample it to 30-minute steps.

    Parameters
    ----------
    path : str
        HDF5 file holding the forecast under key ``'df'``; it must provide
        ``valid_time`` and ``reference_time`` columns.
    first_valid_time : pd.Timestamp
        Rows with ``valid_time`` earlier than this are discarded at the end.
    max_lead_time : np.timedelta64, default 70 hours
        Rows whose ``valid_time - reference_time`` is not strictly below this
        value are discarded before resampling.

    Returns
    -------
    pd.DataFrame
        One row per (reference_time, 30-minute valid_time), with values
        linearly interpolated within each reference_time group.
    """
    forecast = pd.read_hdf(path, 'df')
    lead_time = forecast["valid_time"] - forecast["reference_time"]
    forecast = forecast[lead_time < max_lead_time]

    # Interpolate inside each forecast run so values never blend across runs.
    forecast = (
        forecast.set_index("valid_time")
        .groupby("reference_time")
        .resample("30min")
        .interpolate("linear")
    )

    # The grouping key comes back as an index level; the column copy (when the
    # installed pandas still includes it in the resampled result) is dropped
    # first so reset_index does not collide. errors="ignore" keeps this working
    # on pandas releases that already exclude the grouping column.
    forecast = forecast.drop(columns="reference_time", errors="ignore").reset_index()
    return forecast[forecast['valid_time'] >= first_valid_time]


# Solar
dwd_solar = _load_solar_forecast('data/combined/dwd_solar_20200920_20240519.h5', cutoff_reference_time)

ncep_solar = _load_solar_forecast('data/combined/ncep_solar_20200920_20240519.h5', cutoff_reference_time)

In [4]:
import pandas as pd

# Define the function to create lagged features with a 30-minute shift
def create_lagged_features(df, columns, prefix):
    """Add lagged row-wise mean/variance features for a set of member columns.

    The frame is sorted by ``reference_time`` then ``valid_time`` and, for each
    lag in -2..+2 rows, two columns are appended:
    ``{prefix}_avg_lag{lag}`` and ``{prefix}_var_lag{lag}``.

    Lags are applied *within* each ``reference_time`` group, so the first/last
    rows of one forecast run never receive values from a neighbouring run;
    those positions are NaN instead.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``reference_time``, ``valid_time`` and every name in ``columns``.
    columns : list
        Columns whose row-wise mean and (sample) variance are computed.
    prefix : str
        Prefix used for the generated column names.

    Returns
    -------
    pd.DataFrame
        A sorted copy of ``df`` with the ten feature columns appended; the
        input frame is not modified.
    """
    df = df.sort_values(by=['reference_time', 'valid_time'])

    # The row statistics do not depend on the lag, so compute them once.
    row_mean = df[columns].mean(axis=1)
    row_var = df[columns].var(axis=1)
    run_key = df['reference_time']

    for lag in range(-2, 3):  # -2, -1, 0, +1, +2 rows
        # Shifting per run prevents leakage across reference_time boundaries.
        df[f'{prefix}_avg_lag{lag}'] = row_mean.groupby(run_key).shift(lag)
        df[f'{prefix}_var_lag{lag}'] = row_var.groupby(run_key).shift(lag)
    return df

# Define the function to create lag 0 features only
def create_lag0_features(df, columns, prefix):
    """Append the un-lagged row-wise mean and variance of ``columns``.

    Returns a copy of ``df`` sorted by ``reference_time`` then ``valid_time``
    with two extra columns, ``{prefix}_avg_lag0`` and ``{prefix}_var_lag0``.
    The input frame itself is left as it was.
    """
    ordered = df.sort_values(by=['reference_time', 'valid_time'])
    members = ordered[columns]
    ordered[f'{prefix}_avg_lag0'] = members.mean(axis=1)
    ordered[f'{prefix}_var_lag0'] = members.var(axis=1)
    return ordered

# Collapse the 20 per-member columns of each variable into summary features,
# then discard the member columns. dwd_solar and ncep_solar come from the
# loading cell above.

# *_T_* members: same-step mean/variance only.
dwd_columns = [f'DWD_T_S_{member}' for member in range(20)]
dwd_solar = create_lag0_features(dwd_solar, dwd_columns, 'DWD_T_S').drop(columns=dwd_columns)

ncep_columns = [f'NCEP_T_Solar_{member}' for member in range(20)]
ncep_solar = create_lag0_features(ncep_solar, ncep_columns, 'NCEP_T_Solar').drop(columns=ncep_columns)

# *_SDR_* members: mean/variance at lags -2..+2.
dwd_columns = [f'DWD_SDR_S_{member}' for member in range(20)]
dwd_solar = create_lagged_features(dwd_solar, dwd_columns, 'DWD_SDR_S').drop(columns=dwd_columns)

ncep_columns = [f'NCEP_SDR_Solar_{member}' for member in range(20)]
ncep_solar = create_lagged_features(ncep_solar, ncep_columns, 'NCEP_SDR_Solar').drop(columns=ncep_columns)

# The feature helpers sort the rows, so renumber them from zero.
dwd_solar = dwd_solar.reset_index(drop=True)
ncep_solar = ncep_solar.reset_index(drop=True)

print('DWD Solar:', dwd_solar.shape)
print('NCEP Solar:', ncep_solar.shape)

DWD Solar: (50382, 34)
NCEP Solar: (46907, 34)


In [5]:
from datetime import datetime, timedelta
import pandas as pd
import pytz

def find_missing_reference_times(dataset, start_date=None, end_date=None):
    """Return the expected 6-hourly reference times that ``dataset`` lacks.

    Expected times are ``start_date``, ``start_date + 6h``, ... up to and
    including ``end_date``. With the defaults this is every 00:00, 06:00,
    12:00 and 18:00 UTC from 2024-02-18 through 2024-05-20.

    Parameters
    ----------
    dataset : pd.DataFrame
        Must contain a ``reference_time`` column. It is only read: the column
        is parsed into a temporary series and the caller's frame is not
        modified. Timezone-naive values are treated as UTC.
    start_date, end_date : datetime, optional
        Timezone-aware bounds of the expected window (both inclusive).

    Returns
    -------
    list of datetime
        Missing reference times as UTC-aware ``datetime`` objects, ascending.
    """
    # Local import: the cell-level import only brings in datetime/timedelta.
    from datetime import timezone

    if start_date is None:
        start_date = datetime(2024, 2, 18, 0, 0, 0, tzinfo=timezone.utc)
    if end_date is None:
        end_date = datetime(2024, 5, 20, 18, 0, 0, tzinfo=timezone.utc)

    # Build the complete set of expected reference times.
    step = timedelta(hours=6)
    expected_times = set()
    current = start_date
    while current <= end_date:
        expected_times.add(current)
        current += step

    # utc=True makes naive and aware inputs comparable with the expected set.
    dataset_times = set(pd.to_datetime(dataset['reference_time'], utc=True))

    return sorted(expected_times - dataset_times)



In [6]:
# Work out which expected reference times each forecast source lacks, then show both lists.
missing_days_dwd_solar = find_missing_reference_times(dwd_solar)
missing_days_ncep_solar = find_missing_reference_times(ncep_solar)

print(missing_days_dwd_solar)
print(missing_days_ncep_solar)

[datetime.datetime(2024, 5, 3, 0, 0, tzinfo=<UTC>), datetime.datetime(2024, 5, 11, 12, 0, tzinfo=<UTC>), datetime.datetime(2024, 5, 11, 18, 0, tzinfo=<UTC>), datetime.datetime(2024, 5, 12, 0, 0, tzinfo=<UTC>), datetime.datetime(2024, 5, 19, 6, 0, tzinfo=<UTC>), datetime.datetime(2024, 5, 19, 12, 0, tzinfo=<UTC>), datetime.datetime(2024, 5, 19, 18, 0, tzinfo=<UTC>), datetime.datetime(2024, 5, 20, 0, 0, tzinfo=<UTC>), datetime.datetime(2024, 5, 20, 6, 0, tzinfo=<UTC>), datetime.datetime(2024, 5, 20, 12, 0, tzinfo=<UTC>), datetime.datetime(2024, 5, 20, 18, 0, tzinfo=<UTC>)]
[datetime.datetime(2024, 3, 20, 6, 0, tzinfo=<UTC>), datetime.datetime(2024, 3, 22, 0, 0, tzinfo=<UTC>), datetime.datetime(2024, 4, 4, 12, 0, tzinfo=<UTC>), datetime.datetime(2024, 4, 15, 12, 0, tzinfo=<UTC>), datetime.datetime(2024, 4, 15, 18, 0, tzinfo=<UTC>), datetime.datetime(2024, 4, 16, 0, 0, tzinfo=<UTC>), datetime.datetime(2024, 4, 16, 18, 0, tzinfo=<UTC>), datetime.datetime(2024, 4, 17, 18, 0, tzinfo=<UTC>), d

In [7]:
import pandas as pd
from datetime import datetime, timedelta

def get_best_reference_time(date, ref_time_list):
    """Pick the preferred available reference time for ``date``.

    Candidates are tried in this order and the first one found in
    ``ref_time_list`` is returned as a UTC ``pd.Timestamp``:
    06:00 and 00:00 on ``date``, then 18:00, 12:00 and 06:00 on the day before.
    Returns ``None`` when none of them is available.
    """
    day_before = date - pd.Timedelta(days=1)
    preference_order = [
        (date, 6),
        (date, 0),
        (day_before, 18),
        (day_before, 12),
        (day_before, 6),
    ]
    for day, hour in preference_order:
        candidate = pd.Timestamp(day.year, day.month, day.day, hour, tz='UTC')
        if candidate in ref_time_list:
            return candidate
    return None

def process_dataset(dataset):
    """Keep, per calendar date, the rows of the preferred forecast run.

    For every distinct date found in ``valid_time``, the best available
    reference time is chosen with ``get_best_reference_time``. Rows of
    ``dataset`` are kept when they belong to that reference time and their
    ``valid_time`` is one of ``day_ahead_market_times(best_ref_time)``
    (converted to UTC).

    Parameters
    ----------
    dataset : pd.DataFrame
        Must contain ``reference_time`` and ``valid_time`` columns.

    Returns
    -------
    pd.DataFrame
        The selected rows with a fresh 0..n-1 index. If nothing is selected,
        an empty frame with the same columns as ``dataset`` is returned.
        When two dates resolve to the same reference time (through the
        previous-day fallback) its rows appear once per date.
    """
    ref_time_list = set(dataset['reference_time'])

    # Distinct calendar dates covered by the forecasts.
    unique_dates = pd.to_datetime(dataset['valid_time']).dt.normalize().unique()

    # Collect the pieces and concatenate once: growing a DataFrame inside the
    # loop is quadratic and concatenating onto an empty frame is deprecated.
    selected = []
    for date in unique_dates:
        best_ref_time = get_best_reference_time(date, ref_time_list)
        if best_ref_time is None:
            continue

        # Market periods associated with this forecast run, expressed in UTC.
        DA_Market_times = day_ahead_market_times(best_ref_time).tz_convert('UTC')

        in_window = (dataset['reference_time'] == best_ref_time) & \
                    (dataset['valid_time'].isin(DA_Market_times))
        selected.append(dataset[in_window])

    if not selected:
        # Preserve the column layout so downstream column drops keep working.
        return dataset.iloc[0:0].reset_index(drop=True)

    return pd.concat(selected).reset_index(drop=True)

In [8]:
# Reduce each forecast table to the rows process_dataset selects: one chosen
# reference_time per date, restricted to that run's day-ahead market times.
dwd_solar = process_dataset(dwd_solar)
ncep_solar = process_dataset(ncep_solar)

In [9]:
# reference_time has served its purpose for run selection; remove it from both tables.
dwd_solar = dwd_solar.drop('reference_time', axis=1)
ncep_solar = ncep_solar.drop('reference_time', axis=1)

In [10]:

# Remove a leftover 'index' column if one exists and renumber the rows.
dwd_solar = dwd_solar.drop('index', axis=1, errors='ignore').reset_index(drop=True)
ncep_solar = ncep_solar.drop('index', axis=1, errors='ignore').reset_index(drop=True)

# Shapes before de-duplication.
for _table in (dwd_solar, ncep_solar, test_energy_data):
    print(_table.shape)

# Keep a single row (the first encountered) per valid_time in every table.
dwd_solar = dwd_solar.drop_duplicates(subset='valid_time')
ncep_solar = ncep_solar.drop_duplicates(subset='valid_time')
test_energy_data = test_energy_data.drop_duplicates(subset='valid_time')

(4414, 33)
(4414, 33)
(64224, 15)


In [11]:
# Inclusive valid_time window retained in the merged table.
cutoff_start = pd.Timestamp('2024-02-19 23:00:00+00:00')
cutoff_end = pd.Timestamp('2024-05-19 21:30:00+00:00')

# Left-join NCEP features and the energy table onto the DWD rows by valid_time.
merged_df = (
    dwd_solar
    .merge(ncep_solar, on='valid_time', how='left')
    .merge(test_energy_data, on='valid_time', how='left')
)
merged_df = merged_df[merged_df['valid_time'].between(cutoff_start, cutoff_end)]

print(merged_df)

                    valid_time  DWD_CC_S_0  DWD_CC_S_1  DWD_CC_S_10  \
48   2024-02-19 23:00:00+00:00    0.999922    1.000000          1.0   
49   2024-02-19 23:30:00+00:00    0.989746    1.000000          1.0   
50   2024-02-20 00:00:00+00:00    0.979570    1.000000          1.0   
51   2024-02-20 00:30:00+00:00    0.989785    1.000000          1.0   
52   2024-02-20 01:00:00+00:00    1.000000    1.000000          1.0   
...                        ...         ...         ...          ...   
4361 2024-05-19 19:30:00+00:00    0.000000    0.076455          0.0   
4362 2024-05-19 20:00:00+00:00    0.000000    0.152910          0.0   
4363 2024-05-19 20:30:00+00:00    0.000000    0.193848          0.0   
4364 2024-05-19 21:00:00+00:00    0.000000    0.234785          0.0   
4365 2024-05-19 21:30:00+00:00    0.000576    0.285166          0.0   

      DWD_CC_S_11  DWD_CC_S_12  DWD_CC_S_13  DWD_CC_S_14  DWD_CC_S_15  \
48            1.0     1.000000     0.985723          1.0     1.000000   
4

In [12]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Remove these energy-table columns from merged_df.
merged_df = merged_df.drop(
    columns=[
        'Availability1',
        'Availability2',
        'Availability3',
        'total_generation_MWh',
        'boa_MWh',
        'Wind_MWh_credit',
        'Wind_MW',
        'Solar_MW',
        'MIP',
    ]
)

In [13]:
import pandas as pd
import pvlib
# Latitude and longitude points (20 entries each); lats[i] pairs with lons[i].
# NOTE(review): the latitude 52.8776682 appears twice (positions 1 and 18) with
# different longitudes -- confirm this is intentional and not a copy/paste slip.
lats = [52.4872562, 52.8776682, 52.1354277, 52.4880497, 51.9563696, 52.2499177, 52.6416477, 52.2700912, 52.1960768, 52.7082618, 52.4043468, 52.0679429, 52.024023, 52.7681276, 51.8750506, 52.5582373, 52.4478922, 52.5214863, 52.8776682, 52.0780721]
lons = [0.4012455, 0.7906532, -0.2640343, -0.1267052, 0.6588173, 1.3894081, 1.3509559, 0.7082557, 0.1534462, 0.7302284, 1.0762977, 1.1751747, 0.2962684, 0.1699257, 0.9115028, 0.7137489, 0.1204872, 1.5706825, 1.1916542, -0.0113488]
# Arithmetic mean of the points; used as the single representative location
# passed to add_solar_features.
avg_lat = sum(lats) / len(lats)
avg_lon = sum(lons) / len(lons)

# Define the function to calculate solar position and radiance for a single location
def add_solar_features(df, lat, lon, time_column):
    """Append solar-position and clear-sky columns for a single location.

    Parameters
    ----------
    df : pd.DataFrame
        Table holding the timestamps in ``time_column``. It is not modified;
        a copy with the extra columns is returned.
    lat, lon : float
        Coordinates passed to ``pvlib.location.Location``.
    time_column : hashable
        Name of the timestamp column; its values are parsed with
        ``pd.to_datetime``.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``solar_zenith``, ``solar_azimuth``, ``DNI`` and
        ``GHI`` added, left-joined on ``time_column`` so every input row is kept.
    """
    # Work on a copy: assigning into the caller's frame mutates their data and
    # raises SettingWithCopyWarning when that frame is a filtered slice.
    df = df.copy()
    df[time_column] = pd.to_datetime(df[time_column])

    # Evaluate pvlib once per distinct timestamp.
    times = pd.DatetimeIndex(df[time_column].unique())

    location = pvlib.location.Location(lat, lon)
    solar_position = location.get_solarposition(times)
    clear_sky = location.get_clearsky(times)

    # Positional values are used so the result is keyed purely by `times`.
    solar_features = pd.DataFrame(
        {
            'solar_zenith': solar_position['zenith'].to_numpy(),
            'solar_azimuth': solar_position['azimuth'].to_numpy(),
            'DNI': clear_sky['dni'].to_numpy(),
            'GHI': clear_sky['ghi'].to_numpy(),
        },
        index=times,
    )

    # Attach the per-timestamp features back onto every row.
    return df.merge(solar_features, left_on=time_column, right_index=True, how='left')


# Add solar features to train and test sets using the average location
# Add solar-position and clear-sky features to the merged test table using the average location.
merged_df = add_solar_features(merged_df, avg_lat, avg_lon, 'valid_time')


# merged_df is built from the test-period data, so the label says "Test" rather than "Train".
print('Test set shape after adding solar features:', merged_df.shape)

Train set shape after adding solar features: (4318, 74)


In [14]:
# Summary statistics before and after discarding rows that contain missing values.
print(merged_df.describe())
merged_df = merged_df.dropna()
print(merged_df.describe())

# Persist the cleaned table (overwrites any existing file).
merged_df.to_hdf('data/reference_time_06/SolarTestTable_Trading.h5', key='df', mode='w')

        DWD_CC_S_0   DWD_CC_S_1  DWD_CC_S_10  DWD_CC_S_11  DWD_CC_S_12  \
count  4318.000000  4318.000000  4318.000000  4318.000000  4318.000000   
mean      0.803544     0.799562     0.801331     0.802382     0.827227   
std       0.269739     0.274851     0.277530     0.280251     0.248394   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.661885     0.667976     0.680264     0.670195     0.711162   
50%       0.954517     0.951021     0.953857     0.961113     0.966616   
75%       1.000000     1.000000     1.000000     1.000000     1.000000   
max       1.000000     1.000000     1.000000     1.000000     1.000000   

       DWD_CC_S_13  DWD_CC_S_14  DWD_CC_S_15  DWD_CC_S_16  DWD_CC_S_17  ...  \
count  4318.000000  4318.000000  4318.000000  4318.000000  4318.000000  ...   
mean      0.795041     0.810249     0.800918     0.803769     0.787594  ...   
std       0.273085     0.274309     0.271160     0.273025     0.295836  ...   
min       0.00000