# Imports

In [1]:
import pandas as pd
import datetime as dt
# from datetime import datetime, timezone
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import xgboost as xg
from entsoe import EntsoePandasClient
from statsmodels.graphics import tsaplots

C:\Users\Elena\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\Elena\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


# Import data

## ENTSO-E data (day-ahead prices, load, wind and solar forecasts)

In [2]:
# Day-ahead prices from a local CSV export; the timestamp strings (with UTC offset)
# arrive in the 'Unnamed: 0' column and are converted to an index further below.
day_ahead_prices = pd.read_csv('day_ahead_prices.csv')
day_ahead_prices.tail()

Unnamed: 0.1,Unnamed: 0,Day-ahead prices
43796,2022-12-30 20:00:00+01:00,19.88
43797,2022-12-30 21:00:00+01:00,1.45
43798,2022-12-30 22:00:00+01:00,0.61
43799,2022-12-30 23:00:00+01:00,0.01
43800,2022-12-31 00:00:00+01:00,2.0


In [3]:
# Forecasted and actual load from a local CSV export; timestamps are in 'Unnamed: 0'.
load_and_forecast = pd.read_csv('load_and_forecast.csv')
load_and_forecast.tail()

Unnamed: 0.1,Unnamed: 0,Forecasted Load,Actual Load
43793,2022-12-30 19:00:00+01:00,4344.0,4375.0
43794,2022-12-30 20:00:00+01:00,4179.0,4187.0
43795,2022-12-30 21:00:00+01:00,4053.0,4103.0
43796,2022-12-30 22:00:00+01:00,3889.0,3965.0
43797,2022-12-30 23:00:00+01:00,3656.0,3750.0


In [4]:
# Solar / offshore wind / onshore wind forecasts from a local CSV export; timestamps are in 'Unnamed: 0'.
wind_solar_forecast = pd.read_csv('wind_solar_forecast.csv')
wind_solar_forecast.tail()

Unnamed: 0.1,Unnamed: 0,Solar,Wind Offshore,Wind Onshore
43699,2022-12-30 19:00:00+01:00,0.0,1445.0,2973.0
43700,2022-12-30 20:00:00+01:00,0.0,1436.0,3033.0
43701,2022-12-30 21:00:00+01:00,0.0,1406.0,3205.0
43702,2022-12-30 22:00:00+01:00,0.0,1417.0,3272.0
43703,2022-12-30 23:00:00+01:00,0.0,1422.0,3275.0


## Investing (daily TTF natural gas and CO2 prices)

In [5]:
# Natural Gas - TTF daily prices (in €)
ttf_prices = pd.read_csv('TTF_prices.csv', decimal = '.')
# the 'Date' strings are month/day/year; turn them into datetime values
ttf_prices['Date'] = pd.to_datetime(ttf_prices['Date'], format = '%m/%d/%Y')

# only the date and the price are kept; every other column is discarded
cols_drop = [col for col in ttf_prices.columns if col not in ('Date', 'Price')]
ttf_prices = (
    ttf_prices
    .drop(columns = cols_drop)
    .sort_values(by = ['Date'], ascending = True)  # oldest quote first
    .reset_index(drop = True)                      # fresh 0..n-1 index after sorting
)

In [6]:
# sanity check: first rows after cleaning and chronological sorting
ttf_prices.head()

Unnamed: 0,Date,Price
0,2018-01-02,19.32
1,2018-01-03,19.325
2,2018-01-04,19.2
3,2018-01-05,18.915
4,2018-01-08,19.05


In [7]:
# CO2 emissions daily prices (in €)
co2_prices = pd.read_csv('CO2_prices.csv', decimal = '.')
# the 'Date' strings are month/day/year; turn them into datetime values
co2_prices['Date'] = pd.to_datetime(co2_prices['Date'], format = '%m/%d/%Y')

# only the date and the price are kept; every other column is discarded
cols_drop = [col for col in co2_prices.columns if col not in ('Date', 'Price')]
co2_prices = (
    co2_prices
    .drop(columns = cols_drop)
    .sort_values(by = ['Date'], ascending = True)  # oldest quote first
    .reset_index(drop = True)                      # fresh 0..n-1 index after sorting
)

In [8]:
# sanity check: first rows after cleaning and chronological sorting
co2_prices.head()

Unnamed: 0,Date,Price
0,2018-01-02,8.22
1,2018-01-03,8.24
2,2018-01-04,8.18
3,2018-01-05,8.19
4,2018-01-08,8.07


# Pre-processing
## Adjust timeseries

In [9]:
def remove_utc(col_name, # str: name of the column that contains the object to convert to timestamp
               tz_offset, # int: timezone offset. E.g., CET = +1
               df # dataframe: contains all the info
              ):
    """Replace a column of offset-aware timestamps by a naive, fixed-offset index.

    Every value in ``df[col_name]`` is parsed, converted to UTC, shifted by
    ``tz_offset`` hours and stripped of its timezone information. The result
    becomes the index (named 'Timestamp') and the source column is removed.

    The shift is a constant number of hours for every row, regardless of the
    offset the original string carried (e.g. '+02:00' rows are also expressed
    at UTC + ``tz_offset``).

    Parameters
    ----------
    col_name : str
        Column holding the timestamps (strings such as
        '2022-12-30 20:00:00+01:00').
    tz_offset : int
        Hours added to the UTC time before dropping the timezone.
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, indexed by the naive 'Timestamp'.
    """
    # No explicit `format`: the previous '%Y %m %d %H:%M:%S' pattern did not match
    # the dash-separated, offset-suffixed strings and is rejected by strict parsing.
    # Letting pandas parse the ISO-8601 strings handles the offsets correctly.
    utc_times = pd.to_datetime(df[col_name], utc = True)
    # shift from UTC to the requested fixed offset, then make the values timezone-naive
    df['Timestamp'] = (utc_times + dt.timedelta(hours = tz_offset)).dt.tz_localize(None)
    df.drop([col_name], axis = 1, inplace = True) # drop the column
    df.set_index('Timestamp', inplace = True) # set column 'Timestamp' as index
    return df

In [10]:
# Index the prices by naive timestamps expressed at a fixed UTC+1 offset (tz_offset = 1)
day_ahead_prices = remove_utc('Unnamed: 0', 1, day_ahead_prices)
day_ahead_prices.tail()

Unnamed: 0_level_0,Day-ahead prices
Timestamp,Unnamed: 1_level_1
2022-12-30 20:00:00,19.88
2022-12-30 21:00:00,1.45
2022-12-30 22:00:00,0.61
2022-12-30 23:00:00,0.01
2022-12-31 00:00:00,2.0


In [11]:
# Index the load data by naive timestamps expressed at a fixed UTC+1 offset (tz_offset = 1)
load_and_forecast  = remove_utc('Unnamed: 0', 1, load_and_forecast)
load_and_forecast.tail()

Unnamed: 0_level_0,Forecasted Load,Actual Load
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-12-30 19:00:00,4344.0,4375.0
2022-12-30 20:00:00,4179.0,4187.0
2022-12-30 21:00:00,4053.0,4103.0
2022-12-30 22:00:00,3889.0,3965.0
2022-12-30 23:00:00,3656.0,3750.0


In [12]:
# Index the wind/solar forecasts by naive timestamps expressed at a fixed UTC+1 offset (tz_offset = 1)
wind_solar_forecast = remove_utc('Unnamed: 0', 1, wind_solar_forecast)
wind_solar_forecast.tail()

Unnamed: 0_level_0,Solar,Wind Offshore,Wind Onshore
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-12-30 19:00:00,0.0,1445.0,2973.0
2022-12-30 20:00:00,0.0,1436.0,3033.0
2022-12-30 21:00:00,0.0,1406.0,3205.0
2022-12-30 22:00:00,0.0,1417.0,3272.0
2022-12-30 23:00:00,0.0,1422.0,3275.0


## Join datasets

In [13]:
# hourly skeleton for 2018-2022 into which all the data sources are later joined
start = pd.Timestamp('2018-01-01 00:00:00')
end = pd.Timestamp('2022-12-31 23:59:00')
df = pd.DataFrame({'Timestamp': pd.date_range(start = start, end = end, freq = 'H')})
# 'Date' holds the calendar day of each hour; 'Timestamp' becomes the index
df = df.assign(Date = df['Timestamp'].dt.date).set_index('Timestamp')

In [14]:
# sanity check: hourly index with the matching calendar date
df.head()

Unnamed: 0_level_0,Date
Timestamp,Unnamed: 1_level_1
2018-01-01 00:00:00,2018-01-01
2018-01-01 01:00:00,2018-01-01
2018-01-01 02:00:00,2018-01-01
2018-01-01 03:00:00,2018-01-01
2018-01-01 04:00:00,2018-01-01


In [15]:
# broadcast the daily TTF and CO2 quotes onto every hour of the matching calendar day
df = df.assign(
    TTF = df['Date'].map(ttf_prices.set_index('Date')['Price']),
    CO2 = df['Date'].map(co2_prices.set_index('Date')['Price']),
)

In [16]:
# TTF/CO2 are NaN for days without a quote in the price files
# (e.g. 2018-01-01: both price series start on 2018-01-02)
df.head()

Unnamed: 0_level_0,Date,TTF,CO2
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01 00:00:00,2018-01-01,,
2018-01-01 01:00:00,2018-01-01,,
2018-01-01 02:00:00,2018-01-01,,
2018-01-01 03:00:00,2018-01-01,,
2018-01-01 04:00:00,2018-01-01,,


In [17]:
# outer-join every hourly source onto the skeleton via the shared 'Timestamp' index,
# then turn 'Timestamp' back into a regular column
df_merged = (
    df
    .merge(day_ahead_prices, on = 'Timestamp', how = 'outer')
    .merge(load_and_forecast, on = 'Timestamp', how = 'outer')
    .merge(wind_solar_forecast, on = 'Timestamp', how = 'outer')
    .reset_index()
)
# rows 25-34: first hours for which the daily prices are available
df_merged.iloc[25:35]

Unnamed: 0,Timestamp,Date,TTF,CO2,Day-ahead prices,Forecasted Load,Actual Load,Solar,Wind Offshore,Wind Onshore
25,2018-01-02 01:00:00,2018-01-02,19.32,8.22,14.99,2817.0,2825.0,0.0,706.0,1066.0
26,2018-01-02 02:00:00,2018-01-02,19.32,8.22,17.79,2798.0,2780.0,0.0,657.0,1000.0
27,2018-01-02 03:00:00,2018-01-02,19.32,8.22,24.51,2836.0,2793.0,0.0,549.0,906.0
28,2018-01-02 04:00:00,2018-01-02,19.32,8.22,17.82,2921.0,2908.0,0.0,523.0,843.0
29,2018-01-02 05:00:00,2018-01-02,19.32,8.22,26.84,3076.0,3101.0,0.0,443.0,829.0
30,2018-01-02 06:00:00,2018-01-02,19.32,8.22,27.8,3575.0,3648.0,0.0,397.0,844.0
31,2018-01-02 07:00:00,2018-01-02,19.32,8.22,41.92,4276.0,4226.0,0.0,476.0,859.0
32,2018-01-02 08:00:00,2018-01-02,19.32,8.22,47.95,4601.0,4596.0,1.0,482.0,851.0
33,2018-01-02 09:00:00,2018-01-02,19.32,8.22,38.02,4678.0,4694.0,21.0,465.0,830.0
34,2018-01-02 10:00:00,2018-01-02,19.32,8.22,35.04,4775.0,4773.0,72.0,460.0,809.0


In [18]:
# total wind = offshore + onshore (stays NaN when either component is missing)
df_merged['Wind Total'] = df_merged['Wind Offshore'].add(df_merged['Wind Onshore'])

## Saving the resulting dataset

In [21]:
# Write the merged dataset to disk; index = False skips the integer row index
# NOTE(review): execution counts jump from In [18] to In [21] — confirm that no removed
# cell changed df_merged before this save (Restart & Run All should reproduce the file).
df_merged.to_csv('DK_2.csv', index = False)

In [22]:
# final look at the frame that was written to 'DK_2.csv'
df_merged.head()

Unnamed: 0,Timestamp,Date,TTF,CO2,Day-ahead prices,Forecasted Load,Actual Load,Solar,Wind Offshore,Wind Onshore,Wind Total
0,2018-01-01 00:00:00,2018-01-01,,,26.33,3422.0,3421.0,0.0,783.0,1493.0,2276.0
1,2018-01-01 01:00:00,2018-01-01,,,26.43,3289.0,3308.0,0.0,893.0,1481.0,2374.0
2,2018-01-01 02:00:00,2018-01-01,,,26.1,3157.0,3118.0,0.0,755.0,1430.0,2185.0
3,2018-01-01 03:00:00,2018-01-01,,,24.7,3025.0,3018.0,0.0,747.0,1458.0,2205.0
4,2018-01-01 04:00:00,2018-01-01,,,24.74,2939.0,2916.0,0.0,886.0,1472.0,2358.0
