# Baseline model and first simple models - by Jerome, 26 November 2021 #

---

In [43]:
## load modules and run mlflow_logging.ipynb to get function to track model information on MLFLow
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns

# seed passed as random_state to the train-test-split so the split is reproducible
RSEED = 42

# execute the sibling notebook inside this kernel; presumably this defines
# log_to_mlflow, which the (currently commented-out) tracking calls below use — TODO confirm
%run mlflow_logging.ipynb

### Read data and remove NaNs ###

In [44]:
## read the wind data from CSV; the TIMESTAMP column is parsed into datetimes
raw_data_path = '../data/GEFCom2014Data/Wind/raw_data_incl_features.csv'
data = pd.read_csv(raw_data_path, parse_dates=['TIMESTAMP'])
data.head()

Unnamed: 0,ZONEID,TIMESTAMP,TARGETVAR,U10,V10,U100,V100,HOUR,MONTH,WEEKDAY,IS_HOLIDAY,WS10,WS100,WD10,WD100,WD100CARD,WD10CARD,U100NORM,V100NORM
0,1,2012-01-01 01:00:00,0.0,2.1246,-2.681966,2.86428,-3.666076,1,1,6,1,3.42153,4.652334,321.614439,321.999735,NW,NW,0.615665,-0.788008
1,1,2012-01-01 02:00:00,0.054879,2.521695,-1.79696,3.344859,-2.464761,2,1,6,1,3.096451,4.154892,305.47368,306.385781,NW,NW,0.805041,-0.593219
2,1,2012-01-01 03:00:00,0.110234,2.67221,-0.822516,3.508448,-1.214093,3,1,6,1,2.795932,3.712577,287.108562,289.088098,WNW,WNW,0.945017,-0.327022
3,1,2012-01-01 04:00:00,0.165116,2.457504,-0.143642,3.215233,-0.355546,4,1,6,1,2.461699,3.234831,273.34516,276.310236,W,W,0.993941,-0.109912
4,1,2012-01-01 05:00:00,0.15694,2.245898,0.389576,2.957678,0.332701,5,1,6,1,2.279435,2.976332,260.159324,263.581938,W,W,0.993733,0.111782


In [45]:
## drop every row that contains at least one NaN, then summarise the remaining frame
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175265 entries, 0 to 175433
Data columns (total 19 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   ZONEID      175265 non-null  int64         
 1   TIMESTAMP   175265 non-null  datetime64[ns]
 2   TARGETVAR   175265 non-null  float64       
 3   U10         175265 non-null  float64       
 4   V10         175265 non-null  float64       
 5   U100        175265 non-null  float64       
 6   V100        175265 non-null  float64       
 7   HOUR        175265 non-null  int64         
 8   MONTH       175265 non-null  int64         
 9   WEEKDAY     175265 non-null  int64         
 10  IS_HOLIDAY  175265 non-null  int64         
 11  WS10        175265 non-null  float64       
 12  WS100       175265 non-null  float64       
 13  WD10        175265 non-null  float64       
 14  WD100       175265 non-null  float64       
 15  WD100CARD   175265 non-null  object        
 16  WD

In [46]:
# one-hot encode the cardinal wind-direction columns; drop_first=True omits the
# first category of each column so the dummies are not perfectly collinear
cardinal_columns = ['WD100CARD', 'WD10CARD']
data = pd.get_dummies(data, columns=cardinal_columns, drop_first=True)

### Train-test-split ###

In [47]:
## random 75/25 train-test-split, stratified by ZONEID so that every zone is
## represented proportionally in both parts
# NOTE(review): the rows look like hourly observations; a random split puts
# neighbouring hours into train and test — confirm this is intended
data_train, data_test = train_test_split(
    data,
    test_size=0.25,
    stratify=data['ZONEID'],
    random_state=RSEED,
)

### Baseline model and track it to MLFlow ###

The baseline model always predicts the zone-specific mean electricity production observed in the training data.

In [48]:
## baseline model for every zone and aggregated over all zones

# zones to loop over
zones = np.sort(data.ZONEID.unique())

# baseline predictions of the individual zones are collected in a list and
# concatenated once after the loop (avoids re-copying a growing DataFrame)
zone_predictions = []

# scores (RMSE) of zone-dependent baseline models are saved in dictionary
score_baseline = {}

# loop over all zones
for zone in zones:

    # get train and test targets of the individual zone
    y_train = data_train.loc[data_train.ZONEID == zone, 'TARGETVAR']
    y_test = data_test.loc[data_test.ZONEID == zone, 'TARGETVAR']

    # baseline prediction for individual zone: the mean of the zone's train target
    y_pred = np.full(len(y_test), y_train.mean())

    # RMSE for current zone; the square root is taken explicitly because the
    # `squared=False` argument of mean_squared_error was removed from scikit-learn
    score_baseline['ZONE' + str(zone)] = np.sqrt(mean_squared_error(y_test, y_pred))

    # keep the test index so the predictions can be aligned with the observations later
    y_pred = pd.DataFrame(y_pred, index=y_test.index, columns=['pred'])
    zone_predictions.append(y_pred)

# merge final baseline predictions with observations of all zones (joined on the
# index) to ensure the same order in both columns
y_finalpred = (
    pd.concat(zone_predictions, axis=0)
    .join(data_test.TARGETVAR)
    .rename(columns={'TARGETVAR': 'test'})
)

# RMSE for whole dataset
score_baseline['TOTAL'] = np.sqrt(mean_squared_error(y_finalpred['test'], y_finalpred['pred']))

# report the scores; tracking on MLFlow is currently disabled (call commented out)
for key, value in score_baseline.items():
    print(f'RMSE baseline model for {key}: {round(value,3)}')
    # log_to_mlflow(ZONEID=key, Model='Baseline', features=None, train_RMSE=None, test_RMSE=value, 
    #              nan_removed=True, zero_removed=False, mean=None, 
    #              hyperparameter=None, model_parameters=None, info='Predict zone-dependent means of electricity production')

RMSE baseline model for ZONE1: 0.289
RMSE baseline model for ZONE2: 0.265
RMSE baseline model for ZONE3: 0.297
RMSE baseline model for ZONE4: 0.333
RMSE baseline model for ZONE5: 0.334
RMSE baseline model for ZONE6: 0.335
RMSE baseline model for ZONE7: 0.266
RMSE baseline model for ZONE8: 0.282
RMSE baseline model for ZONE9: 0.286
RMSE baseline model for ZONE10: 0.342
RMSE baseline model for TOTAL: 0.304


### Linear regression model ###

In [49]:
# define features: every column except the zone id, the target, the timestamp
# and the WD10/WD100 columns (their one-hot encoded cardinal versions are kept)
excluded_columns = {'ZONEID', 'TARGETVAR', 'TIMESTAMP', 'WD100', 'WD10'}
features = [column for column in data.columns if column not in excluded_columns]
features

['U10',
 'V10',
 'U100',
 'V100',
 'HOUR',
 'MONTH',
 'WEEKDAY',
 'IS_HOLIDAY',
 'WS10',
 'WS100',
 'U100NORM',
 'V100NORM',
 'WD100CARD_E',
 'WD100CARD_ENE',
 'WD100CARD_ESE',
 'WD100CARD_N',
 'WD100CARD_NE',
 'WD100CARD_NNE',
 'WD100CARD_NNW',
 'WD100CARD_NW',
 'WD100CARD_S',
 'WD100CARD_SE',
 'WD100CARD_SSE',
 'WD100CARD_SSW',
 'WD100CARD_SW',
 'WD100CARD_W',
 'WD100CARD_WNW',
 'WD100CARD_WSW',
 'WD10CARD_E',
 'WD10CARD_ENE',
 'WD10CARD_ESE',
 'WD10CARD_N',
 'WD10CARD_NE',
 'WD10CARD_NNE',
 'WD10CARD_NNW',
 'WD10CARD_NW',
 'WD10CARD_S',
 'WD10CARD_SE',
 'WD10CARD_SSE',
 'WD10CARD_SSW',
 'WD10CARD_SW',
 'WD10CARD_W',
 'WD10CARD_WNW',
 'WD10CARD_WSW']

Linear regression model, fitted separately for each of the different sites (zones).

In [50]:
# zones to loop over
zones = np.sort(data.ZONEID.unique())

# predictions of the individual zones are collected in lists and concatenated
# once after the loop (avoids re-copying growing DataFrames)
train_predictions, test_predictions = [], []

# save scores (RMSE) of linear regression models for different zones in dictionary
trainscore, testscore = {}, {}

# loop over zones
for zone in zones:

    # boolean masks selecting the rows of the current zone
    in_zone_train = data_train.ZONEID == zone
    in_zone_test = data_test.ZONEID == zone

    # split train and test data in feature and TARGETVAR parts for the current zone
    X_train = data_train.loc[in_zone_train, features]
    y_train = data_train.loc[in_zone_train, 'TARGETVAR']

    X_test = data_test.loc[in_zone_test, features]
    y_test = data_test.loc[in_zone_test, 'TARGETVAR']

    # initialize and train the model
    model = LR()
    model.fit(X_train, y_train)

    # predict train data with the model and clip the predictions to [0, 1]
    # (presumably TARGETVAR is normalised to that range — TODO confirm)
    y_pred = np.clip(model.predict(X_train), 0, 1)
    # the square root is taken explicitly because the `squared=False` argument of
    # mean_squared_error was removed from scikit-learn
    trainscore['ZONE' + str(zone)] = np.sqrt(mean_squared_error(y_train, y_pred))

    # keep the train index so predictions can be aligned with observations later
    y_pred = pd.DataFrame(y_pred, index=y_train.index, columns=['pred'])
    train_predictions.append(y_pred)

    # predict test data with the model, clipped the same way
    y_pred = np.clip(model.predict(X_test), 0, 1)
    testscore['ZONE' + str(zone)] = np.sqrt(mean_squared_error(y_test, y_pred))

    # keep the test index so predictions can be aligned with observations later
    y_pred = pd.DataFrame(y_pred, index=y_test.index, columns=['pred'])
    test_predictions.append(y_pred)

# merge final train and test predictions with observations (joined on the index)
# to ensure the same order in both columns
y_trainpred = pd.concat(train_predictions, axis=0).join(data_train.TARGETVAR)
y_testpred = pd.concat(test_predictions, axis=0).join(data_test.TARGETVAR)

# the observation column is called 'test' in both frames (also for the train data)
y_trainpred = y_trainpred.rename(columns={'TARGETVAR': 'test'})
y_testpred = y_testpred.rename(columns={'TARGETVAR': 'test'})

# RMSE over all zones together
testscore['TOTAL'] = np.sqrt(mean_squared_error(y_testpred['test'], y_testpred['pred']))
trainscore['TOTAL'] = np.sqrt(mean_squared_error(y_trainpred['test'], y_trainpred['pred']))

# report the scores; tracking on MLFlow is currently disabled (call commented out)
for key in testscore.keys():
    print(f'train-RMSE/test-RMSE linear regression model for {key}: {round(trainscore[key],3)} {round(testscore[key],3)}')
    # log_to_mlflow(ZONEID=key, Model='Linear Regression', features=features, train_RMSE=trainscore[key], test_RMSE=testscore[key], 
    #               nan_removed=True, zero_removed=False, mean=None, 
    #               hyperparameter=None, model_parameters=None, info=None)


train-RMSE/test-RMSE linear regression model for ZONE1: 0.181 0.183
train-RMSE/test-RMSE linear regression model for ZONE2: 0.154 0.149
train-RMSE/test-RMSE linear regression model for ZONE3: 0.152 0.154
train-RMSE/test-RMSE linear regression model for ZONE4: 0.177 0.179
train-RMSE/test-RMSE linear regression model for ZONE5: 0.18 0.183
train-RMSE/test-RMSE linear regression model for ZONE6: 0.188 0.185
train-RMSE/test-RMSE linear regression model for ZONE7: 0.138 0.139
train-RMSE/test-RMSE linear regression model for ZONE8: 0.17 0.17
train-RMSE/test-RMSE linear regression model for ZONE9: 0.165 0.164
train-RMSE/test-RMSE linear regression model for ZONE10: 0.203 0.205
train-RMSE/test-RMSE linear regression model for TOTAL: 0.172 0.172


A single linear regression model fitted on the data of all sites together, ignoring potential differences between the zones.

In [51]:
## linear regression model using data of all sites directly together
X_train = data_train[features]
y_train = data_train.TARGETVAR
X_test = data_test[features]
y_test = data_test.TARGETVAR

# one model for all zones; ZONEID is not part of the features
model = LR()
model.fit(X_train, y_train)

# predictions are clipped to [0, 1]
# (presumably TARGETVAR is normalised to that range — TODO confirm)
y_pred_train = np.clip(model.predict(X_train), 0, 1)
y_pred_test = np.clip(model.predict(X_test), 0, 1)

# the square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was removed from scikit-learn
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f'RMSE train data: {round(rmse_train,4)}')
print(f'RMSE test data: {round(rmse_test,4)}')

RMSE train data: 0.1938
RMSE test data: 0.1933
