In [18]:
# standard modules
import sys
import os

# data and vizualisation models
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

# machine learning
import joblib
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import cross_validate, learning_curve, train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor


# Get the current working directory
current_dir = os.getcwd()

# add 'main' to the path
main_dir = os.path.dirname(current_dir)
sys.path.append(main_dir)

# data preprocessing and scaling
from feature_engin.feature_import import combine_dataframes


In [19]:
# fetching data
raw_data = combine_dataframes()


Loaded coal_price.csv successfully.
Loaded ttf_price.csv successfully.
Loaded oil_price.csv successfully.
Loaded germany_electricity_generation_2018-2023.csv successfully.
Loaded holidays.csv successfully.
Loaded PMI_germany.csv successfully.
Loaded weather_north_hourly.csv successfully.
Loaded weather_south_hourly.csv successfully.
Loaded weather_brocken_hourly.csv successfully.


In [20]:
# copying
data = raw_data.copy()
data.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 199389 entries, 2018-01-02 00:00:00+00:00 to 2023-11-22 00:00:00+00:00
Data columns (total 45 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   coal_adj_close          199389 non-null  float64
 1   ttf_adj_close           199389 non-null  float64
 2   ttf_volume              199389 non-null  float64
 3   oil_adj_close           199389 non-null  float64
 4   oil_volume              199389 non-null  float64
 5   fractional_hour         199389 non-null  float64
 6   day_of_week             199389 non-null  float64
 7   week_of_year            199389 non-null  float64
 8   month                   199389 non-null  float64
 9   year                    199389 non-null  float64
 10  hydro_storage_in        199389 non-null  float64
 11  cross_border            199389 non-null  float64
 12  nuclear                 199389 non-null  float64
 13  hydro                   1993

In [4]:
# checking if feature is unscaled
data["day_ahead_price"]


2018-01-02 00:00:00+00:00     14.99
2018-01-02 00:15:00+00:00     14.99
2018-01-02 00:30:00+00:00     14.99
2018-01-02 00:45:00+00:00     14.99
2018-01-02 01:00:00+00:00     13.63
                              ...  
2023-11-29 09:45:00+00:00    133.72
2023-11-29 10:00:00+00:00    130.00
2023-11-29 10:15:00+00:00    130.00
2023-11-29 10:30:00+00:00    130.00
2023-11-29 10:45:00+00:00    130.00
Name: day_ahead_price, Length: 207164, dtype: float64

In [5]:
# scale data with StandardScaler
scaler = StandardScaler()

# Reshape data["day_ahead_price"] to a 2D array as scaler expects 2D inputs
day_ahead_price_scaled = scaler.fit_transform(data["day_ahead_price"].values.reshape(-1, 1))

# Replace the original target variable in the dataframe with the scaled values
data["day_ahead_price_scaled"] = day_ahead_price_scaled.flatten()

# save the unscaled data in an df for later use
unscaled_target = data[['day_ahead_price']].copy()

# drop unscaled target from data
data.drop('day_ahead_price', axis=1, inplace=True)

data["day_ahead_price_scaled"]


2018-01-02 00:00:00+00:00   -0.759007
2018-01-02 00:15:00+00:00   -0.759007
2018-01-02 00:30:00+00:00   -0.759007
2018-01-02 00:45:00+00:00   -0.759007
2018-01-02 01:00:00+00:00   -0.772712
                               ...   
2023-11-29 09:45:00+00:00    0.437455
2023-11-29 10:00:00+00:00    0.399968
2023-11-29 10:15:00+00:00    0.399968
2023-11-29 10:30:00+00:00    0.399968
2023-11-29 10:45:00+00:00    0.399968
Name: day_ahead_price_scaled, Length: 207164, dtype: float64

In [6]:
# Calculate the correlation matrix
corr_matrix = data.corr()

# check cross-correlation with target
corr_matrix["day_ahead_price_scaled"].abs().sort_values()

# Calculate the absolute correlation with the target variable
target_corr = corr_matrix["day_ahead_price_scaled"].abs()

# Filter out features with correlation less than 0.1
selected_features = target_corr[target_corr >= 0.1].index.tolist()

# Create a new dataset with the selected features
data = data[selected_features]

# before 43 features -> now 28
data.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 207164 entries, 2018-01-02 00:00:00+00:00 to 2023-11-29 10:45:00+00:00
Data columns (total 30 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   day_of_week             207164 non-null  float64
 1   week_of_year            207164 non-null  float64
 2   month                   207164 non-null  float64
 3   year                    207164 non-null  float64
 4   hydro_storage_in        207164 non-null  float64
 5   cross_border            207164 non-null  float64
 6   nuclear                 207164 non-null  float64
 7   hydro                   207164 non-null  float64
 8   biomass                 207164 non-null  float64
 9   lignite                 207164 non-null  float64
 10  hard_coal               207164 non-null  float64
 11  oil                     207164 non-null  float64
 12  coal_gas                207164 non-null  float64
 13  nat_gas                 2071

In [7]:
# sortby the datetime index
data = data.sort_index()

# Identify the Split Point
split_date = data.index.max() - pd.Timedelta(weeks=2)

data_mod = data[data.index <= split_date]
last_two_weeks_data = data[data.index > split_date]

# Check the number of entries in the split datasets
print("Data up to the last two weeks:", data_mod.shape[0])
print("Last two weeks data:", last_two_weeks_data.shape[0])


Data up to the last two weeks: 205820
Last two weeks data: 1344


In [8]:
# Prepare your feature set and target variable
X = data_mod.drop(columns=['day_ahead_price_scaled'])
y = data_mod['day_ahead_price_scaled']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Elastic Net Regression

In [9]:
from sklearn.linear_model import ElasticNet

elastic_net_model = ElasticNet(random_state=42)

param_grid = {
    'alpha': [0.1, 0.5, 1, 2, 5, 10],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=elastic_net_model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=2)


In [10]:
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END ............................alpha=0.1, l1_ratio=0.3; total time=   3.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   3.9s
[CV] END ............................alpha=0.1, l1_ratio=0.3; total time=   3.6s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   4.1s
[CV] END ............................alpha=0.1, l1_ratio=0.3; total time=   3.9s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   4.9s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   5.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   5.2s
[CV] END ............................alpha=0.1, l1_ratio=0.3; total time=   3.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   2.9s
[CV] END ............................alpha=0.1, l1_ratio=0.3; total time=   3.0s
[CV] END ............................alpha=0.1,

In [11]:
best_elastic_net = grid_search.best_estimator_
y_pred = best_elastic_net.predict(X_test)

# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error: ", mse)
print("R-squared: ", r2)


Mean Squared Error:  0.4178797452987884
R-squared:  0.5961372387130774


# Support Vector Machines (SVM) for Regression (SVR)

In [13]:
from sklearn.svm import SVR


In [14]:
svr_model = SVR()


In [15]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'linear', 'poly']
}


In [16]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=svr_model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=2)


In [17]:
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 

In [None]:
best_svr = grid_search.best_estimator_
y_pred = best_svr.predict(X_test)

# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error: ", mse)
print("R-squared: ", r2)
