# Baseline model

## For each wind farm (zone), the baseline predicts the mean electricity production observed in that farm's training data; the resulting errors are then aggregated over all wind farms.

In [15]:
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

sys.path.append("..")

In [16]:
# Load the raw wind data (with features) and index it by the parsed
# TIMESTAMP column.
data = pd.read_csv(
    '../data/GEFCom2014Data/Wind/raw_data_incl_features.csv',
    parse_dates=['TIMESTAMP'],
    index_col='TIMESTAMP',
)

# Fill missing values by linear interpolation, then one-hot encode the
# WD100CARD and WD10CARD columns (the first level of each is dropped).
data = data.interpolate(method='linear')
data = pd.get_dummies(data, columns=['WD100CARD', 'WD10CARD'], drop_first=True)

# Seed value; not referenced in this cell.
RSEED = 42

In [17]:
# Split by timestamp label (both slice bounds are inclusive): rows up to and
# including 2013-07-01 00:00 go to the training set, rows from
# 2013-07-01 01:00 onwards go to the test set.
last_train_timestamp = '2013-07-01 00:00:00'
first_test_timestamp = '2013-07-01 01:00:00'
train = data.loc[:last_train_timestamp]
test = data.loc[first_test_timestamp:]


In [23]:
# zones to loop over
zones = np.sort(train.ZONEID.unique())

# Results table: one row per zone plus a 'TOTAL' row holding the RMSE over the
# observations of all zones. Only MODEL, TRAINSCORE and TESTSCORE are filled
# in here; the remaining columns stay empty (NaN).
df_results = pd.DataFrame(index=[f'ZONE{zone}' for zone in zones] + ['TOTAL'],
                          columns=['BEST_PARAMS', 'CV', 'MODEL', 'FC', 'TESTSCORE', 'TRAINSCORE'])
df_results['MODEL'] = 'Baseline'

# Running sums of the per-zone MSEs, each weighted by the zone's share of the
# samples. Kept in local variables rather than in the table: chained
# assignment such as `df_results.loc['TOTAL'].TRAINSCORE += ...` writes to a
# temporary Series and does not update the frame under pandas Copy-on-Write.
total_train_mse = 0.0
total_test_mse = 0.0

# loop over all zones
for zone in zones:

    # get train and test targets of the individual zone
    ytrain = train.loc[train.ZONEID == zone, 'TARGETVAR']
    ytest = test.loc[test.ZONEID == zone, 'TARGETVAR']

    # baseline prediction for the individual zone: the zone's mean target on
    # the training data, used for both the train and the test period
    zone_mean = np.mean(ytrain)
    pred_train = np.full(len(ytrain), zone_mean)
    pred_test = np.full(len(ytest), zone_mean)

    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6,
    # so compute the MSE and take the square root explicitly.
    train_mse = mean_squared_error(ytrain, pred_train)
    test_mse = mean_squared_error(ytest, pred_test)

    # single-step .loc[row, column] assignment writes into df_results itself
    df_results.loc[f'ZONE{zone}', 'TRAINSCORE'] = np.sqrt(train_mse)
    df_results.loc[f'ZONE{zone}', 'TESTSCORE'] = np.sqrt(test_mse)

    total_train_mse += train_mse * len(ytrain) / len(train)
    total_test_mse += test_mse * len(ytest) / len(test)

# overall RMSE = square root of the sample-weighted mean of the zone MSEs
df_results.loc['TOTAL', 'TRAINSCORE'] = np.sqrt(total_train_mse)
df_results.loc['TOTAL', 'TESTSCORE'] = np.sqrt(total_test_mse)

df_results.index.set_names(['ZONE'], inplace=True)


In [24]:
# Display the baseline results table (train/test scores per zone and in total).
df_results

Unnamed: 0_level_0,BEST_PARAMS,CV,MODEL,FC,TESTSCORE,TRAINSCORE
ZONE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ZONE1,,,Baseline,,0.330221,0.274462
ZONE2,,,Baseline,,0.290033,0.256417
ZONE3,,,Baseline,,0.312627,0.296552
ZONE4,,,Baseline,,0.366647,0.31975
ZONE5,,,Baseline,,0.361371,0.326282
ZONE6,,,Baseline,,0.355799,0.331287
ZONE7,,,Baseline,,0.307106,0.251734
ZONE8,,,Baseline,,0.316515,0.262741
ZONE9,,,Baseline,,0.312423,0.276838
ZONE10,,,Baseline,,0.344026,0.339246
