In [1]:
import pandas as pd
import xarray as xr
import numpy as np
from comp_utils import *
from datetime import datetime, timedelta

In [2]:
# The train_energy_data_20200920_20240519.h5 has been created in the RemitData notebook
train_energy_data = pd.read_hdf('data/combined/train_energy_data_20200920_20240519.h5', 'df')
train_energy_data["dtm"] = pd.to_datetime(train_energy_data["dtm"])
train_energy_data.rename(columns={"dtm": "valid_time"}, inplace=True)
train_energy_data["Wind_MWh_credit"] = 0.5*train_energy_data["Wind_MW"] - train_energy_data["boa_MWh"]
train_energy_data["Solar_MWh_credit"] = 0.5*train_energy_data["Solar_MW"]
train_energy_data["total_generation_MWh"] = train_energy_data["Wind_MWh_credit"] + train_energy_data["Solar_MWh_credit"]

In [3]:
cutoff_reference_time = pd.Timestamp('2024-02-19 23:00:00+00:00')
# Solar
dwd_solar = pd.read_hdf('data/combined/dwd_solar_20200920_20240519.h5', 'df')
dwd_solar = dwd_solar[dwd_solar["valid_time"] - dwd_solar["reference_time"] < np.timedelta64(40, "h")]
dwd_solar = dwd_solar.set_index("valid_time").groupby("reference_time").resample("30min").interpolate("linear")
dwd_solar = dwd_solar.drop(columns="reference_time",axis=1).reset_index()
dwd_solar = dwd_solar[dwd_solar['valid_time'] < cutoff_reference_time]

ncep_solar = pd.read_hdf('data/combined/ncep_solar_20200920_20240519.h5', 'df')
ncep_solar = ncep_solar[ncep_solar["valid_time"] - ncep_solar["reference_time"] < np.timedelta64(40, "h")]
ncep_solar = ncep_solar.set_index("valid_time").groupby("reference_time").resample("30min").interpolate("linear")
ncep_solar = ncep_solar.drop(columns="reference_time",axis=1).reset_index()
ncep_solar = ncep_solar[ncep_solar['valid_time'] < cutoff_reference_time]

In [4]:
import pandas as pd

# Define the function to create lagged features with a 30-minute shift
def create_lagged_features(df, columns, prefix):
    df = df.sort_values(by=['reference_time', 'valid_time'])  # Sort by reference_time and then by valid_time
    for lag in range(-2, 3):  # Including lag +2
        df[f'{prefix}_avg_lag{lag}'] = df[columns].mean(axis=1).shift(lag)
        df[f'{prefix}_var_lag{lag}'] = df[columns].var(axis=1).shift(lag)
    return df

# Define the function to create lag 0 features only
def create_lag0_features(df, columns, prefix):
    df = df.sort_values(by=['reference_time', 'valid_time'])  # Sort by reference_time and then by valid_time
    df[f'{prefix}_avg_lag0'] = df[columns].mean(axis=1)
    df[f'{prefix}_var_lag0'] = df[columns].var(axis=1)
    return df

# Example DataFrame setup (dwd_solar, ncep_solar)
# Assuming these DataFrames are already defined and loaded

# DWD_T_S columns with lag 0 features only
dwd_columns = [f'DWD_T_S_{i}' for i in range(20)]
dwd_solar = create_lag0_features(dwd_solar, dwd_columns, 'DWD_T_S')
dwd_solar = dwd_solar.drop(columns=dwd_columns)

# NCEP_T_Solar columns with lag 0 features only
ncep_columns = [f'NCEP_T_Solar_{i}' for i in range(20)]
ncep_solar = create_lag0_features(ncep_solar, ncep_columns, 'NCEP_T_Solar')
ncep_solar = ncep_solar.drop(columns=ncep_columns)

# DWD_SDR_S columns with lagged features
dwd_columns = [f'DWD_SDR_S_{i}' for i in range(20)]
dwd_solar = create_lagged_features(dwd_solar, dwd_columns, 'DWD_SDR_S')
dwd_solar = dwd_solar.drop(columns=dwd_columns)

# NCEP_SDR_Solar columns with lagged features
ncep_columns = [f'NCEP_SDR_Solar_{i}' for i in range(20)]
ncep_solar = create_lagged_features(ncep_solar, ncep_columns, 'NCEP_SDR_Solar')
ncep_solar = ncep_solar.drop(columns=ncep_columns)

# Resetting index after modifications
dwd_solar.reset_index(drop=True, inplace=True)
ncep_solar.reset_index(drop=True, inplace=True)

print('DWD Solar:', dwd_solar.head())
print('NCEP Solar:', ncep_solar.head())


DWD Solar:              reference_time                valid_time  DWD_CC_S_0  DWD_CC_S_1  \
0 2020-09-20 00:00:00+00:00 2020-09-20 00:00:00+00:00    0.426270    0.647109   
1 2020-09-20 00:00:00+00:00 2020-09-20 00:30:00+00:00    0.435937    0.637988   
2 2020-09-20 00:00:00+00:00 2020-09-20 01:00:00+00:00    0.445605    0.628867   
3 2020-09-20 00:00:00+00:00 2020-09-20 01:30:00+00:00    0.486719    0.649297   
4 2020-09-20 00:00:00+00:00 2020-09-20 02:00:00+00:00    0.527832    0.669727   

   DWD_CC_S_10  DWD_CC_S_11  DWD_CC_S_12  DWD_CC_S_13  DWD_CC_S_14  \
0     0.370059     0.167988     0.411250     0.511484     0.255020   
1     0.364805     0.164033     0.440850     0.521631     0.262324   
2     0.359551     0.160078     0.470449     0.531777     0.269629   
3     0.358066     0.168105     0.474639     0.525674     0.263389   
4     0.356582     0.176133     0.478828     0.519570     0.257148   

   DWD_CC_S_15  ...  DWD_SDR_S_avg_lag-2  DWD_SDR_S_var_lag-2  \
0     0.465625  

In [5]:
def find_missing_six_am_times(dataset):
    start_date = datetime(2020, 9, 21, 6, 0, 0)
    end_date = datetime(2024, 2, 19, 6, 0, 0)
    
    # Generate the complete set of 06:00 times
    all_six_am_times = {start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1)}
    
    # Filter dataset to get only 06:00 reference times
    dataset = dataset[dataset['reference_time'].dt.hour == 6]
    dataset_times = set(dataset['reference_time'])
    
    # Find missing times
    missing_times = all_six_am_times - dataset_times
    
    return sorted(missing_times)

def process_dataset(dataset, missing_times):
    # Keep only 06:00 reference times
    dataset = dataset[dataset['reference_time'].dt.hour == 6]
    
    # Drop rows with missing reference times
    dataset = dataset[~dataset['reference_time'].isin(missing_times)]
    
    return dataset

In [6]:
# Get missing 06:00 reference times for each dataset

missing_times_dwd_solar = find_missing_six_am_times(dwd_solar)
missing_times_ncep_solar = find_missing_six_am_times(ncep_solar)

# Combine all missing times
all_missing_times = set(missing_times_dwd_solar) | set(missing_times_ncep_solar)

dwd_solar = process_dataset(dwd_solar, all_missing_times)
print(dwd_solar.shape)

ncep_solar = process_dataset(ncep_solar, all_missing_times)
print(ncep_solar.shape)

(97162, 34)
(98152, 34)


In [7]:
def filter_relevant_data(dataset, best_ref_time):
    DA_Market_times = day_ahead_market_times(best_ref_time).tz_convert('UTC')
    relevant_data = dataset[(dataset['reference_time'] == best_ref_time) &
                            (dataset['valid_time'].isin(DA_Market_times))]
    return relevant_data

In [8]:
dwd_solar = pd.concat([filter_relevant_data(dwd_solar, ref_time) for ref_time in dwd_solar['reference_time'].unique()])
dwd_solar = dwd_solar.drop(columns=['reference_time'])
print(dwd_solar.shape)

ncep_solar = pd.concat([filter_relevant_data(ncep_solar, ref_time) for ref_time in ncep_solar['reference_time'].unique()])
ncep_solar = ncep_solar.drop(columns=['reference_time'])
print(ncep_solar.shape)

(56678, 33)
(57260, 33)


In [9]:
# Generate half-hour intervals
def generate_half_hour_intervals(start_date, end_date):
    return pd.date_range(start=start_date, end=end_date, freq='30min')

start_date = '2020-09-20 22:00:00+00:00'
end_date = '2024-02-19 23:00:00+00:00'
half_hour_intervals = generate_half_hour_intervals(start_date, end_date)
half_hour_df = pd.DataFrame({'valid_time': half_hour_intervals})

# Merge the filtered datasets with the half-hour intervals DataFrame
datasets = [dwd_solar, ncep_solar, train_energy_data] 
merged_df = half_hour_df

for dataset in datasets:
    merged_df = merged_df.merge(dataset, on='valid_time', how='left')

# Display the shape of the merged DataFrame
print(merged_df.shape)
print(merged_df.head())  

(59859, 79)
                 valid_time  DWD_CC_S_0  DWD_CC_S_1  DWD_CC_S_10  DWD_CC_S_11  \
0 2020-09-20 22:00:00+00:00    0.639941    0.524395     0.486660     0.261543   
1 2020-09-20 22:30:00+00:00    0.614365    0.578506     0.449414     0.305391   
2 2020-09-20 23:00:00+00:00    0.588789    0.632617     0.412168     0.349238   
3 2020-09-20 23:30:00+00:00    0.608701    0.626680     0.441406     0.331182   
4 2020-09-21 00:00:00+00:00    0.628613    0.620742     0.470645     0.313125   

   DWD_CC_S_12  DWD_CC_S_13  DWD_CC_S_14  DWD_CC_S_15  DWD_CC_S_16  ...  \
0     0.340078     0.622539     0.227148     0.509082     0.646523  ...   
1     0.285479     0.646113     0.254004     0.458555     0.646064  ...   
2     0.230879     0.669688     0.280859     0.408027     0.645605  ...   
3     0.215723     0.653096     0.290117     0.482500     0.640889  ...   
4     0.200566     0.636504     0.299375     0.556973     0.636172  ...   

   Wind_MW  SS_Price  boa_MWh  DA_Price  Availabil

In [10]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Drop specified columns from merged_df
merged_df = merged_df.drop(['Availability1','Availability2','Availability3','total_generation_MWh','boa_MWh','Wind_MWh_credit', 'Wind_MW', 'Solar_MW', 'MIP'], axis=1)

In [11]:
import pandas as pd
import pvlib
# Latitude and Longitude points
lats = [52.4872562, 52.8776682, 52.1354277, 52.4880497, 51.9563696, 52.2499177, 52.6416477, 52.2700912, 52.1960768, 52.7082618, 52.4043468, 52.0679429, 52.024023, 52.7681276, 51.8750506, 52.5582373, 52.4478922, 52.5214863, 52.8776682, 52.0780721]
lons = [0.4012455, 0.7906532, -0.2640343, -0.1267052, 0.6588173, 1.3894081, 1.3509559, 0.7082557, 0.1534462, 0.7302284, 1.0762977, 1.1751747, 0.2962684, 0.1699257, 0.9115028, 0.7137489, 0.1204872, 1.5706825, 1.1916542, -0.0113488]
# Calculate the average latitude and longitude
avg_lat = sum(lats) / len(lats)
avg_lon = sum(lons) / len(lons)

# Define the function to calculate solar position and radiance for a single location
def add_solar_features(df, lat, lon, time_column):
    # Ensure the time column is in datetime format
    df[time_column] = pd.to_datetime(df[time_column])
    
    # Initialize an empty DataFrame to store results
    solar_features = pd.DataFrame(index=df[time_column].unique())
    
    # Calculate solar position and radiance for the average location
    location = pvlib.location.Location(lat, lon)
    solar_position = location.get_solarposition(solar_features.index)
    clear_sky = location.get_clearsky(solar_features.index)

    # Add the solar position and radiance data to the solar_features DataFrame
    solar_features[f'solar_zenith'] = solar_position['zenith'].values
    solar_features[f'solar_azimuth'] = solar_position['azimuth'].values
    solar_features[f'DNI'] = clear_sky['dni'].values
    solar_features[f'GHI'] = clear_sky['ghi'].values

    # Merge the solar features with the original DataFrame based on the time column
    df = df.merge(solar_features, left_on=time_column, right_index=True, how='left')
    
    return df


# Add solar features to train and test sets using the average location
merged_df = add_solar_features(merged_df, avg_lat, avg_lon, 'valid_time')


print('Train set shape after adding solar features:', merged_df.shape)

Train set shape after adding solar features: (59859, 74)


In [12]:
print(merged_df.describe())
merged_df.dropna(inplace=True)
print(merged_df.describe())
merged_df.to_hdf('data/reference_time_06/SolarTrainTable_Trading.h5', key='df', mode='w')

         DWD_CC_S_0    DWD_CC_S_1   DWD_CC_S_10   DWD_CC_S_11   DWD_CC_S_12  \
count  56678.000000  56678.000000  56678.000000  56678.000000  56678.000000   
mean       0.732168      0.729177      0.726552      0.714080      0.724030   
std        0.321175      0.314814      0.318201      0.328473      0.321080   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.536199      0.536216      0.530552      0.501526      0.530298   
50%        0.883335      0.869146      0.866748      0.857227      0.864834   
75%        0.999863      0.998691      0.999395      0.999668      0.998257   
max        1.000000      1.000000      1.000000      1.000000      1.000000   

        DWD_CC_S_13   DWD_CC_S_14   DWD_CC_S_15   DWD_CC_S_16   DWD_CC_S_17  \
count  56678.000000  56678.000000  56678.000000  56678.000000  56678.000000   
mean       0.726532      0.709142      0.734100      0.728488      0.723424   
std        0.326174      0.332415      0.312906    