# Hybrid Renewable Energy Forecasting and Trading Competition
Author: George Panagiotou

CID: 02527334

## Transform Data from 5D NetCDF File to a 2D HDF5 DataFrame
The weather datasets provided by the HEFT competition are distributed as high-dimensional NetCDF files with the following dimensions:
1. Space
2. Latitude
3. Longitude
4. Forecast Reference Time
5. Valid Time
   
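The main objective of this notebook is to flatten each file into a 2D DataFrame with one row per (forecast reference time, valid time) pair and one column per variable and location. For example, the temperature at a given latitude/longitude will now have its own column.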

In [2]:
import pandas as pd
import xarray as xr
import numpy as np
import comp_utils
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import MaxNLocator
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pickle as pkl
from comp_utils import day_ahead_market_times

## dwd_icon WIND

In [2]:
# Flatten the DWD ICON-EU forecast for the Hornsea 1 grid into one wide table:
# one row per (reference_time, valid_time), one column per variable and grid cell.
dwd_Hornsea1 = xr.open_dataset("data/dwd_icon_eu_hornsea_1_20240129_20240519.nc")

# Columns that identify a single forecast value in the long layout.
dwd_wind_keys = ["reference_time", "valid_time", "latitude", "longitude"]

# NetCDF variable -> (value column in the long frame, prefix of the wide columns).
dwd_wind_vars = {
    "WindSpeed": ("WindSpeed_dwd", "DWD_WS_W"),
    "WindSpeed:100": ("WindSpeed_dwd_100", "DWD_WS100_W"),
    "WindDirection": ("WindDirection_dwd", "DWD_WD_W"),
    "WindDirection:100": ("WindDirection_dwd_100", "DWD_WD100_W"),
    "RelativeHumidity": ("RelativeHumidity_dwd", "DWD_RH_W"),
    "Temperature": ("Temperature_dwd", "DWD_T_W"),
}

# One long DataFrame per variable, with the value column renamed.
dwd_wind_frames = [
    dwd_Hornsea1[var].to_dataframe().reset_index().rename(columns={var: value_col})
    for var, (value_col, _) in dwd_wind_vars.items()
]
# Keep the individual frames reachable under their original names.
ws_dwd_df, ws100_dwd_df, wd_dwd_df, wd100_dwd_df, rh_dwd_df, t_dwd_df = dwd_wind_frames

# Inner-join all variables on the shared keys.
merged_wind_dwd_df = dwd_wind_frames[0]
for frame in dwd_wind_frames[1:]:
    merged_wind_dwd_df = pd.merge(merged_wind_dwd_df, frame, on=dwd_wind_keys)

# Localize reference_time to UTC, then turn valid_time into an absolute timestamp by
# treating it as a number of hours after reference_time.
# "h" replaces the deprecated "H" unit alias.
merged_wind_dwd_df["reference_time"] = merged_wind_dwd_df["reference_time"].dt.tz_localize("UTC")
merged_wind_dwd_df["valid_time"] = merged_wind_dwd_df["reference_time"] + pd.to_timedelta(
    merged_wind_dwd_df["valid_time"], unit="h"
)

# Build the "<latitude>_<longitude>" part of the column names once, column-wise,
# instead of formatting it row by row for every variable.
merged_wind_dwd_df["grid_id"] = (
    merged_wind_dwd_df["latitude"].map(str) + "_" + merged_wind_dwd_df["longitude"].map(str)
)

# Pivot each variable to wide form; columns are named "<prefix>_<latitude>_<longitude>".
# pivot_table keeps its defaults (mean aggregation of duplicates, all-NaN columns dropped).
dwd_wind_pivots = [
    merged_wind_dwd_df
    .pivot_table(index=["reference_time", "valid_time"], columns="grid_id", values=value_col)
    .add_prefix(f"{prefix}_")
    .reset_index()
    for value_col, prefix in dwd_wind_vars.values()
]
pivot_ws, pivot_ws100, pivot_wd, pivot_wd100, pivot_rh, pivot_t = dwd_wind_pivots

# Join the wide tables side by side; the prefixes keep column names unique, so no
# merge suffixes are needed.
final_wind_dwd_df = dwd_wind_pivots[0]
for pivot in dwd_wind_pivots[1:]:
    final_wind_dwd_df = pd.merge(final_wind_dwd_df, pivot, on=["reference_time", "valid_time"])

final_wind_dwd_df.to_hdf('dwd_wind_20240129_20240519.h5', key='df', mode='w')

# Last expression of the cell so the resulting shape is actually displayed.
final_wind_dwd_df.shape

## dwd_icon SOLAR

In [3]:
# Flatten the DWD ICON-EU forecast for the pes10 points into one wide table:
# one row per (reference_time, valid_time), one column per variable and point.
dwd_pes10 = xr.open_dataset("data/dwd_icon_eu_pes10_20240129_20240519.nc")

# Columns that identify a single forecast value in the long layout. "point" is part
# of the key so the merges do not leave duplicated point_x / point_y columns behind.
dwd_solar_keys = ["reference_time", "valid_time", "point", "latitude", "longitude"]

# NetCDF variable -> (value column in the long frame, prefix of the wide columns).
dwd_solar_vars = {
    "CloudCover": ("CloudCover_pes10_dwd", "DWD_CC_S"),
    "SolarDownwardRadiation": ("SolarDownwardRadiation_pes10_dwd", "DWD_SDR_S"),
    "Temperature": ("Temperature_pes10_dwd", "DWD_T_S"),
}

# One long DataFrame per variable, with the value column renamed.
dwd_solar_frames = [
    dwd_pes10[var].to_dataframe().reset_index().rename(columns={var: value_col})
    for var, (value_col, _) in dwd_solar_vars.items()
]
# Keep the individual frames reachable under their original names.
cc_pes10_dwd_df, sdr_pes10_dwd_df, t_pes10_dwd_df = dwd_solar_frames

# Inner-join all variables on the shared keys.
merged_solar_dwd_df = dwd_solar_frames[0]
for frame in dwd_solar_frames[1:]:
    merged_solar_dwd_df = pd.merge(merged_solar_dwd_df, frame, on=dwd_solar_keys)

# Localize reference_time to UTC, then turn valid_time into an absolute timestamp by
# treating it as a number of hours after reference_time.
# "h" replaces the deprecated "H" unit alias.
merged_solar_dwd_df["reference_time"] = merged_solar_dwd_df["reference_time"].dt.tz_localize("UTC")
merged_solar_dwd_df["valid_time"] = merged_solar_dwd_df["reference_time"] + pd.to_timedelta(
    merged_solar_dwd_df["valid_time"], unit="h"
)

# Text form of the point identifier, built column-wise instead of row by row.
merged_solar_dwd_df["point_id"] = merged_solar_dwd_df["point"].map(str)

# Pivot each variable to wide form; columns are named "<prefix>_<point>".
# pivot_table keeps its defaults (mean aggregation of duplicates, all-NaN columns dropped).
dwd_solar_pivots = [
    merged_solar_dwd_df
    .pivot_table(index=["reference_time", "valid_time"], columns="point_id", values=value_col)
    .add_prefix(f"{prefix}_")
    .reset_index()
    for value_col, prefix in dwd_solar_vars.values()
]
pivot_cc, pivot_srd, pivot_temp = dwd_solar_pivots

# Join the wide tables side by side on the two time columns.
final_solar_dwd_df = dwd_solar_pivots[0]
for pivot in dwd_solar_pivots[1:]:
    final_solar_dwd_df = pd.merge(final_solar_dwd_df, pivot, on=["reference_time", "valid_time"])

final_solar_dwd_df.to_hdf('dwd_solar_20240129_20240519.h5', key='df', mode='w')

# Last expression of the cell so the resulting shape is actually displayed.
final_solar_dwd_df.shape

## ncep_gfs WIND

In [4]:
# Flatten the NCEP GFS forecast for the Hornsea 1 grid into one wide table:
# one row per (reference_time, valid_time), one column per variable and grid cell.
ncep_Hornsea1 = xr.open_dataset("data/ncep_gfs_hornsea_1_20240129_20240519.nc")

# Columns that identify a single forecast value in the long layout.
ncep_wind_keys = ["reference_time", "valid_time", "latitude", "longitude"]

# NetCDF variable -> (value column in the long frame, prefix of the wide columns).
ncep_wind_vars = {
    "WindSpeed": ("WindSpeed_ncep", "NCEP_WS_W"),
    "WindSpeed:100": ("WindSpeed_ncep_100", "NCEP_WS100_W"),
    "WindDirection": ("WindDirection_ncep", "NCEP_WD_W"),
    "WindDirection:100": ("WindDirection_ncep_100", "NCEP_WD100_W"),
    "RelativeHumidity": ("RelativeHumidity_ncep", "NCEP_RH_W"),
    "Temperature": ("Temperature_ncep", "NCEP_T_W"),
}

# One long DataFrame per variable, with the value column renamed.
ncep_wind_frames = [
    ncep_Hornsea1[var].to_dataframe().reset_index().rename(columns={var: value_col})
    for var, (value_col, _) in ncep_wind_vars.items()
]
# Keep the individual frames reachable under their original names.
ws_ncep_df, ws100_ncep_df, wd_ncep_df, wd100_ncep_df, rh_ncep_df, t_ncep_df = ncep_wind_frames

# Inner-join all variables on the shared keys.
merged_wind_ncep_df = ncep_wind_frames[0]
for frame in ncep_wind_frames[1:]:
    merged_wind_ncep_df = pd.merge(merged_wind_ncep_df, frame, on=ncep_wind_keys)

# Localize reference_time to UTC, then turn valid_time into an absolute timestamp by
# treating it as a number of hours after reference_time.
# "h" replaces the deprecated "H" unit alias.
merged_wind_ncep_df["reference_time"] = merged_wind_ncep_df["reference_time"].dt.tz_localize("UTC")
merged_wind_ncep_df["valid_time"] = merged_wind_ncep_df["reference_time"] + pd.to_timedelta(
    merged_wind_ncep_df["valid_time"], unit="h"
)

# Build the "<latitude>_<longitude>" part of the column names once, column-wise,
# instead of formatting it row by row for every variable.
merged_wind_ncep_df["grid_id"] = (
    merged_wind_ncep_df["latitude"].map(str) + "_" + merged_wind_ncep_df["longitude"].map(str)
)

# Pivot each variable to wide form; columns are named "<prefix>_<latitude>_<longitude>".
# pivot_table keeps its defaults (mean aggregation of duplicates, all-NaN columns dropped).
ncep_wind_pivots = [
    merged_wind_ncep_df
    .pivot_table(index=["reference_time", "valid_time"], columns="grid_id", values=value_col)
    .add_prefix(f"{prefix}_")
    .reset_index()
    for value_col, prefix in ncep_wind_vars.values()
]
(pivot_ncep_ws, pivot_ncep_ws100, pivot_ncep_wd,
 pivot_ncep_wd100, pivot_ncep_rh, pivot_ncep_t) = ncep_wind_pivots

# Join the wide tables side by side; the prefixes keep column names unique, so no
# merge suffixes are needed.
final_wind_ncep_df = ncep_wind_pivots[0]
for pivot in ncep_wind_pivots[1:]:
    final_wind_ncep_df = pd.merge(final_wind_ncep_df, pivot, on=["reference_time", "valid_time"])

final_wind_ncep_df.to_hdf('ncep_wind_20240129_20240519.h5', key='df', mode='w')

# Show the resulting shape as the cell output.
final_wind_ncep_df.shape

## ncep_gfs SOLAR

In [5]:
# Load the solar radiation dataset
# Flatten the NCEP GFS forecast for the pes10 points into one wide table:
# one row per (reference_time, valid_time), one column per variable and point.
ncep_solar = xr.open_dataset("data/ncep_gfs_pes10_20240129_20240519.nc")

# Columns that identify a single forecast value in the long layout. "point" is part
# of the key so the merges do not leave duplicated point_x / point_y columns behind.
ncep_solar_keys = ["reference_time", "valid_time", "point", "latitude", "longitude"]

# NetCDF variable -> (value column in the long frame, prefix of the wide columns).
ncep_solar_vars = {
    "CloudCover": ("CloudCover_pes10_ncep", "NCEP_CC_Solar"),
    "SolarDownwardRadiation": ("SolarDownwardRadiation_pes10_ncep", "NCEP_SDR_Solar"),
    "Temperature": ("Temperature_pes10_ncep", "NCEP_T_Solar"),
}

# One long DataFrame per variable, with the value column renamed.
ncep_solar_frames = [
    ncep_solar[var].to_dataframe().reset_index().rename(columns={var: value_col})
    for var, (value_col, _) in ncep_solar_vars.items()
]
# Keep the individual frames reachable under their original names.
cc_pes10_ncep_df, sdr_pes10_ncep_df, t_pes10_ncep_df = ncep_solar_frames

# Inner-join all variables on the shared keys.
merged_solar_ncep_df = ncep_solar_frames[0]
for frame in ncep_solar_frames[1:]:
    merged_solar_ncep_df = pd.merge(merged_solar_ncep_df, frame, on=ncep_solar_keys)

# Localize reference_time to UTC, then turn valid_time into an absolute timestamp by
# treating it as a number of hours after reference_time.
# "h" replaces the deprecated "H" unit alias.
merged_solar_ncep_df["reference_time"] = merged_solar_ncep_df["reference_time"].dt.tz_localize("UTC")
merged_solar_ncep_df["valid_time"] = merged_solar_ncep_df["reference_time"] + pd.to_timedelta(
    merged_solar_ncep_df["valid_time"], unit="h"
)

# Text form of the point identifier, built column-wise instead of row by row.
merged_solar_ncep_df["point_id"] = merged_solar_ncep_df["point"].map(str)

# Pivot each variable to wide form; columns are named "<prefix>_<point>".
# pivot_table keeps its defaults (mean aggregation of duplicates, all-NaN columns dropped).
ncep_solar_pivots = [
    merged_solar_ncep_df
    .pivot_table(index=["reference_time", "valid_time"], columns="point_id", values=value_col)
    .add_prefix(f"{prefix}_")
    .reset_index()
    for value_col, prefix in ncep_solar_vars.values()
]
pivot_ncep_cc, pivot_ncep_srd, pivot_ncep_temp = ncep_solar_pivots

# Join the wide tables side by side on the two time columns. The long frame keeps
# its own name instead of being overwritten by a partial wide merge.
final_solar_ncep_df = ncep_solar_pivots[0]
for pivot in ncep_solar_pivots[1:]:
    final_solar_ncep_df = pd.merge(final_solar_ncep_df, pivot, on=["reference_time", "valid_time"])

final_solar_ncep_df.to_hdf('ncep_solar_20240129_20240519.h5', key='df', mode='w')

# Last expression of the cell so the resulting shape is actually displayed.
final_solar_ncep_df.shape