# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

import xgboost as xg
# from xgboost import *

In [2]:
from helper import *

# Models

In [3]:
def short_term(model,x_train,y_train,x_test,y_test,x_val=False,y_val=False):
    """Fit the requested regressor on the training data and predict the test set.

    Parameters
    ----------
    model : str
        ``'rft'`` - random forest whose hyper-parameters are chosen by a
        randomized cross-validated search;
        ``'xgb'`` - XGBoost regressor with default hyper-parameters.
    x_train, y_train
        Training features and target, passed straight to ``fit``.
    x_test
        Features to predict on.
    y_test, x_val, y_val
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    The fitted model's predictions for ``x_test``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    supported = ('rft', 'xgb')
    # Fail early with a clear message instead of an UnboundLocalError further down.
    if model not in supported:
        raise ValueError(f"Unknown model {model!r}; expected one of {supported}")

    if model == 'rft':
        # Search space for the randomized search. The base estimator needs no
        # min_samples_split of its own: the search always sets it from this grid.
        rf_grid = {
            "n_estimators": np.arange(10, 100, 10),
            "max_depth": [None, 3, 5, 10],
            "min_samples_split": np.arange(2, 20, 2),
            "min_samples_leaf": np.arange(1, 20, 2),
            # int 1 = a single feature per split; float 1.0 = all features,
            # which is what the removed "auto" option meant for regressors.
            "max_features": [0.5, 1, "sqrt", 1.0],
            # Rows drawn per tree; must not exceed the rows in a CV training split.
            "max_samples": [10000],
        }
        search = RandomizedSearchCV(
            estimator=RandomForestRegressor(random_state=42),
            param_distributions=rf_grid,
            n_iter=30,
            cv=3,
            random_state=42,
            n_jobs=-1,
        )
        search.fit(x_train, y_train)
        # With the default refit=True the search object predicts with the best
        # estimator, refitted on the full training data.
        model_ = search
    else:
        # model == 'xgb'
        model_ = xg.XGBRegressor(random_state=42)
        model_.fit(x_train, y_train)

    ypred = model_.predict(x_test)

    return ypred

In [4]:
# Load the dataset; read_our_data is not defined in this notebook (presumably it comes from `helper` - confirm).
df = read_our_data('DK_2_filled.csv')

In [5]:
# Summarize column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43776 entries, 0 to 43775
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Timestamp         43776 non-null  datetime64[ns]
 1   Date              43776 non-null  datetime64[ns]
 2   TTF               43776 non-null  float64       
 3   CO2               43776 non-null  float64       
 4   Day-ahead prices  43776 non-null  float64       
 5   Forecasted Load   43774 non-null  float64       
 6   Actual Load       43774 non-null  float64       
 7   Solar             43344 non-null  float64       
 8   Wind Offshore     43680 non-null  float64       
 9   Wind Onshore      43632 non-null  float64       
 10  Wind Total        43632 non-null  float64       
 11  Year              43776 non-null  int64         
 12  Quarter           43776 non-null  int64         
 13  Month             43776 non-null  int64         
 14  Day               4377

In [6]:
# lagged columns to add
lags = [24, 25, 26, 27, 28, 48, 72]
col_to_lag = 'Day-ahead prices'
df_lagged = lag_df(df, col_to_lag, lags)
df_lagged.dropna(how = 'any', axis = 'index', inplace = True)

In [7]:
# List the columns to confirm the lag features were appended.
df_lagged.columns

Index(['Timestamp', 'Date', 'TTF', 'CO2', 'Day-ahead prices',
       'Forecasted Load', 'Actual Load', 'Solar', 'Wind Offshore',
       'Wind Onshore', 'Wind Total', 'Year', 'Quarter', 'Month', 'Day', 'Hour',
       'Week', 'business', 'Day-ahead prices-lag24', 'Day-ahead prices-lag25',
       'Day-ahead prices-lag26', 'Day-ahead prices-lag27',
       'Day-ahead prices-lag28', 'Day-ahead prices-lag48',
       'Day-ahead prices-lag72'],
      dtype='object')

In [8]:
# Lag feature names are derived from `lags` / `col_to_lag` (set in the lag cell)
# so this list cannot drift from the columns actually present in df_lagged.
lag_cols = [f'{col_to_lag}-lag{lag}' for lag in lags]

# Continuous columns that get standardized.
# NOTE(review): 'Actual Load' would not be known at forecast time - confirm it
# is intended as a model input.
cols_std = ['TTF', 'CO2', 'Forecasted Load', 'Actual Load',
            'Solar', 'Wind Offshore', 'Wind Onshore', 'Wind Total',
            'Day', 'Hour', 'Week'] + lag_cols
# Full model input: standardized columns plus the calendar/flag columns left as-is.
features = cols_std + ['Quarter', 'Month', 'business']
target = ['Day-ahead prices']
# Dates handed to split_timeseries (presumably one training start per fold - confirm).
train_start = pd.to_datetime(['2018-01-01', '2021-01-01', '2021-07-01'], format = '%Y-%m-%d')

In [9]:
# Evaluate the random-forest model on each train/test split.
k_folds = 3
for k in range(k_folds):
    # split in train and test set
    train_set, test_set = split_timeseries(df_lagged, train_start, k, method = 0)

    # get features and target
    X_train, y_train = get_feature_target(train_set, features, target)
    X_test, y_test = get_feature_target(test_set, features, target)
    # Flatten to 1-D so the truth has the same shape as the predictions; an
    # (n, 1) column against (n,) predictions silently broadcasts to (n, n) in
    # any element-wise metric.
    y_test = np.asarray(y_test[target]).ravel()

    # standardize
    X_train_std, X_test_std = standardize(X_train, X_test, cols_std)

    # short term
    ypred = short_term('rft', X_train_std, y_train, X_test_std, y_test)

    print('Iteration ', k)
    model_evaluation(ypred, y_test)

Iteration  0
Mean Absolute Error (MAE): 11.4789760794581
Mean Squared Error (MSE): 283.3823917328713
Root Mean Squared Error (RMSE): 16.83396541914208
Mean Absolute Percentage Error (MAPE): 62.07
Accuracy: 37.93
Iteration  1
Mean Absolute Error (MAE): 11.803883173704907
Mean Squared Error (MSE): 227.08910309873337
Root Mean Squared Error (RMSE): 15.069475873391662
Mean Absolute Percentage Error (MAPE): 41.22
Accuracy: 58.78
Iteration  2
Mean Absolute Error (MAE): 48.49318527315472
Mean Squared Error (MSE): 4190.129930798026
Root Mean Squared Error (RMSE): 64.73121295633217
Mean Absolute Percentage Error (MAPE): 140.49
Accuracy: -40.49


In [16]:
# Evaluate the XGBoost model on each train/test split.
k_folds = 3
for k in range(k_folds):
    # split in train and test set
    train_set, test_set = split_timeseries(df_lagged, train_start, k, method = 0)

    # get features and target
    X_train, y_train = get_feature_target(train_set, features, target)
    X_test, y_test = get_feature_target(test_set, features, target)
    # Flatten to 1-D so the truth has the same shape as the predictions; an
    # (n, 1) column against (n,) predictions silently broadcasts to (n, n) in
    # any element-wise metric.
    y_test = np.asarray(y_test[target]).ravel()

    # standardize
    X_train_std, X_test_std = standardize(X_train, X_test, cols_std)

    # short term
    ypred = short_term('xgb', X_train_std, y_train, X_test_std, y_test)

    print('Iteration ', k)
    model_evaluation(ypred, y_test)

Iteration  0
Mean Absolute Error (MAE): 12.14288946864896
Mean Squared Error (MSE): 310.2397764917463
Root Mean Squared Error (RMSE): 17.613624740289726
Mean Absolute Percentage Error (MAPE): 76.67
Accuracy: 23.33
Iteration  1
Mean Absolute Error (MAE): 11.72984522205677
Mean Squared Error (MSE): 223.76419342546785
Root Mean Squared Error (RMSE): 14.958749728017642
Mean Absolute Percentage Error (MAPE): 49.8
Accuracy: 50.2
Iteration  2
Mean Absolute Error (MAE): 60.43647361880846
Mean Squared Error (MSE): 6501.957700464761
Root Mean Squared Error (RMSE): 80.63471771181915
Mean Absolute Percentage Error (MAPE): 504.96
Accuracy: -404.96


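**TODO:** Also report the correlation between `y_true` and `y_pred`; RMSE alone can be hard to interpret without a reference point.<br>
**TODO:** Try a power transform of the target instead of a log transform (how? e.g. scikit-learn's `PowerTransformer` with the Yeo-Johnson method, which also handles zero and negative prices).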

In [None]:
# rolling mean as a baseline
def baseline():
    