## Simple models by Jerome, 25 November 2021 ##

---

In [68]:
# load modules
import sys
# make modules in the parent directory importable from this notebook
sys.path.append("..")
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns

# seed passed as random_state to train_test_split so the split is reproducible
RSEED = 42

# execute the logging notebook in this namespace
# (presumably defines log_to_mlflow, which is called further below - confirm)
%run mlflow_logging.ipynb

### Read and manipulate data ###

In [69]:
# read data
# Load the GEFCom2014 wind CSV; the TIMESTAMP column is parsed to datetime while reading.
data = pd.read_csv(
    '../data/GEFCom2014Data/Wind/raw_data_incl_features.csv',
    parse_dates=['TIMESTAMP'],
)
data.head()

Unnamed: 0,ZONEID,TIMESTAMP,TARGETVAR,U10,V10,U100,V100,HOUR,MONTH,WEEKDAY,IS_HOLIDAY,WS10,WS100,WD10,WD100,WD100CARD,WD10CARD,U100NORM,V100NORM
0,1,2012-01-01 01:00:00,0.0,2.1246,-2.681966,2.86428,-3.666076,1,1,6,1,3.42153,4.652334,321.614439,321.999735,NW,NW,0.615665,-0.788008
1,1,2012-01-01 02:00:00,0.054879,2.521695,-1.79696,3.344859,-2.464761,2,1,6,1,3.096451,4.154892,305.47368,306.385781,NW,NW,0.805041,-0.593219
2,1,2012-01-01 03:00:00,0.110234,2.67221,-0.822516,3.508448,-1.214093,3,1,6,1,2.795932,3.712577,287.108562,289.088098,WNW,WNW,0.945017,-0.327022
3,1,2012-01-01 04:00:00,0.165116,2.457504,-0.143642,3.215233,-0.355546,4,1,6,1,2.461699,3.234831,273.34516,276.310236,W,W,0.993941,-0.109912
4,1,2012-01-01 05:00:00,0.15694,2.245898,0.389576,2.957678,0.332701,5,1,6,1,2.279435,2.976332,260.159324,263.581938,W,W,0.993733,0.111782


In [70]:
# remove NaNs
# Drop every row that has a missing value in any column.
# Disabled alternative: fill missing values with the mean of TARGETVAR instead of dropping.
#data.fillna(np.nanmean(data.TARGETVAR),inplace=True)
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175265 entries, 0 to 175433
Data columns (total 19 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   ZONEID      175265 non-null  int64         
 1   TIMESTAMP   175265 non-null  datetime64[ns]
 2   TARGETVAR   175265 non-null  float64       
 3   U10         175265 non-null  float64       
 4   V10         175265 non-null  float64       
 5   U100        175265 non-null  float64       
 6   V100        175265 non-null  float64       
 7   HOUR        175265 non-null  int64         
 8   MONTH       175265 non-null  int64         
 9   WEEKDAY     175265 non-null  int64         
 10  IS_HOLIDAY  175265 non-null  int64         
 11  WS10        175265 non-null  float64       
 12  WS100       175265 non-null  float64       
 13  WD10        175265 non-null  float64       
 14  WD100       175265 non-null  float64       
 15  WD100CARD   175265 non-null  object        
 16  WD

### Train-test-split ###

In [71]:
# train-test-split
# Random 75/25 split of the cleaned rows; RSEED fixes the shuffle so the split is reproducible.
# NOTE(review): rows are shuffled regardless of TIMESTAMP, so train and test interleave in time -
# confirm this is intended for a forecasting task.
data_train, data_test = train_test_split(
    data,
    test_size=0.25,
    random_state=RSEED,
)
# Disabled experiment: drop training rows with zero production although WS100 >= 4.
#data_train = data_train[~((data_train.TARGETVAR == 0) & (data_train.WS100 >= 4))]

### Baseline model and load its results to MLFlow ###

The baseline model always predicts the average electricity production of the training data.

In [72]:
# baseline model, standard deviation and mean of TARGETVAR in train data
# The baseline predicts the train-set mean of TARGETVAR for every test row.
train_mean = data_train.TARGETVAR.mean()
y_pred_baseline = np.full(len(data_test), train_mean)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
rmse_baseline = np.sqrt(mean_squared_error(data_test.TARGETVAR, y_pred_baseline))

print(f'RMSE baseline model: {rmse_baseline}')
print(f'Standard deviation of TARGETVAR in data_train: {data_train.TARGETVAR.std()}')
print(f'Mean of TARGETVAR in data_train: {train_mean}')

RMSE baseline model: 0.3108326090619405
Standard deviation of TARGETVAR in data_train: 0.30983699669128106
Mean of TARGETVAR in data_train: 0.3575744161948021


In [73]:
## baseline models for every zone and aggregated over all zones

# zones to loop over, in ascending ZONEID order
zones = list(np.sort(data_train.ZONEID.unique()))

# test RMSE of the baseline per zone (keys 'ZONE<id>') plus the key 'TOTAL' for all zones
score_baseline = {}

# per-zone prediction frames; collected in a list and concatenated once after the loop
# (growing a DataFrame with pd.concat inside the loop is quadratic and starts from an
# empty frame, which recent pandas versions warn about)
zone_preds = []

for zone in zones:

    # get train and test target of the individual zone
    y_train = data_train.loc[data_train.ZONEID == zone, 'TARGETVAR']
    y_test = data_test.loc[data_test.ZONEID == zone, 'TARGETVAR']

    # baseline prediction for the individual zone: the zone's train mean for every test row
    y_pred = np.full(len(y_test), y_train.mean())

    # RMSE for current zone, computed as sqrt(MSE) because `squared=False` was
    # deprecated in scikit-learn 1.4 and removed in 1.6
    score_baseline['ZONE' + str(zone)] = np.sqrt(mean_squared_error(y_test, y_pred))

    # keep the test index so predictions can be aligned with the observations afterwards
    zone_preds.append(pd.DataFrame({'pred': y_pred}, index=y_test.index))

# merge final baseline predictions with observations (index join) to ensure the same row order
y_finalpred = (
    pd.concat(zone_preds, axis=0)
    .join(data_test.TARGETVAR)
    .rename(columns={'TARGETVAR': 'test'})
)

# RMSE for whole dataset
score_baseline['TOTAL'] = np.sqrt(mean_squared_error(y_finalpred['test'], y_finalpred['pred']))

# track on MLFlow: one logged run per zone and one for the total
for key, value in score_baseline.items():
    log_to_mlflow(ZONEID=key, Model='Baseline', features=None, train_RMSE=None, test_RMSE=value, 
                  nan_removed=True, zero_removed=False, mean=None, 
                  hyperparameter=None, model_parameters=None, info='Predict zone-dependent means of electricity production')

Active run_id: e1aabd913d2b43d78a5c78a71d8faa3d
Active run_id: 1094f8d586f54080a861f29ca7f13e7d
Active run_id: e040afa1bcb44b63a35796706a86cc69
Active run_id: dc3ebdf1485149e380d951c15a7319b5
Active run_id: 5c2280c1a01c480fb8dbe015cb9e7b5a
Active run_id: a958fe1db2b4413581742ad561a9cbcf
Active run_id: b4a564d03aae4756bc5f1feda7da5a8e
Active run_id: 5c9fba78f7ae4de5b759e32e493d01d5
Active run_id: 4b8e48f0ae044c858edcfadfe0db7902
Active run_id: 48624423603e4b749a49c125a15d8e23
Active run_id: 0179008311354498967303550f949286


### Linear regression model for data of all sites and data of different sites separately ###

Define features.

In [74]:
# list the available columns before choosing the model features
data_train.columns

Index(['ZONEID', 'TIMESTAMP', 'TARGETVAR', 'U10', 'V10', 'U100', 'V100',
       'HOUR', 'MONTH', 'WEEKDAY', 'IS_HOLIDAY', 'WS10', 'WS100', 'WD10',
       'WD100', 'WD100CARD', 'WD10CARD', 'U100NORM', 'V100NORM'],
      dtype='object')

In [75]:
# define features
# Every column is used as a model input except the zone id, the target, the timestamp
# and the two string-valued cardinal wind-direction columns (RMSE = 0.1759).
# Alternative feature set: features = ['WS100'] gives RMSE = 0.1831 when a separate
# model is trained for each site.
non_feature_columns = ['ZONEID', 'TARGETVAR', 'TIMESTAMP', 'WD100CARD', 'WD10CARD']

# Index.drop raises KeyError if one of the listed columns is missing, so an
# unexpected schema still fails loudly here
features = data_train.columns.drop(non_feature_columns).to_list()

Linear regression model for all sites together.

In [76]:
# model using data of all sites and the model's performance
X_train = data_train[features]
y_train = data_train.TARGETVAR
X_test = data_test[features]
y_test = data_test.TARGETVAR

# one linear regression fitted on the rows of all zones together
model = LR()
model.fit(X_train, y_train)

# predictions are limited to the interval [0, 1] (vectorized form of the former
# element-wise thresholding; presumably TARGETVAR is a normalized production - confirm)
y_pred_train = np.clip(model.predict(X_train), 0, 1)
y_pred_test = np.clip(model.predict(X_test), 0, 1)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f'RMSE train data: {rmse_train}')
print(f'RMSE test data: {rmse_test}')

RMSE train data: 0.19419942483439473
RMSE test data: 0.1943233575191369


Linear regression model for the different sites.

In [77]:
# sorted array of the distinct ZONEID values present in the training data
zones = np.sort(data_train.ZONEID.unique()) 

In [78]:
# test predictions of the individual zones; collected in a list and concatenated once
# after the loop (growing a DataFrame with pd.concat inside the loop is quadratic and
# starts from an empty frame, which recent pandas versions warn about)
zone_test_preds = []

for zone in zones:

    # boolean masks selecting the rows of the current zone
    in_zone_train = data_train.ZONEID == zone
    in_zone_test = data_test.ZONEID == zone

    # split train and test data in feature and TARGETVAR parts for the current zone
    X_train = data_train.loc[in_zone_train, features]
    y_train = data_train.loc[in_zone_train, 'TARGETVAR']

    X_test = data_test.loc[in_zone_test, features]
    y_test = data_test.loc[in_zone_test, 'TARGETVAR']

    # initialize and train a separate model for this zone
    model = LR()
    model.fit(X_train, y_train)

    # predict with the model; predictions are limited to the interval [0, 1]
    y_pred_train = np.clip(model.predict(X_train), 0, 1)

    # keep the test index so the predictions can be joined with the observations later
    y_pred_test = pd.DataFrame(
        {'pred': np.clip(model.predict(X_test), 0, 1)},
        index=y_test.index,
    )

    # RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
    # was deprecated in scikit-learn 1.4 and removed in 1.6
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

    # print model performance for individual sites
    print('RMSE train/test data in zone{}: {} {}'.format(
          zone, round(rmse_train, 4), round(rmse_test, 4)))

    # remember the predictions of this site
    zone_test_preds.append(y_pred_test)

# merge final test predictions with observations (index join) to ensure the same row order
y_finalpred = pd.concat(zone_test_preds, axis=0).join(data_test.TARGETVAR)

# overall test RMSE across all zones, each row predicted by its own zone's model
rmse_total = np.sqrt(mean_squared_error(y_finalpred['TARGETVAR'], y_finalpred['pred']))
print('\nRMSE test data with model train and predictions on all sites separately: {}'.format(
       round(rmse_total, 4)))

RMSE train/test data in zone1: 0.1841 0.1878
RMSE train/test data in zone2: 0.1553 0.1579
RMSE train/test data in zone3: 0.1592 0.1586
RMSE train/test data in zone4: 0.1805 0.1847
RMSE train/test data in zone5: 0.1837 0.1825
RMSE train/test data in zone6: 0.1917 0.1986
RMSE train/test data in zone7: 0.1423 0.1422
RMSE train/test data in zone8: 0.1745 0.1682
RMSE train/test data in zone9: 0.1654 0.166
RMSE train/test data in zone10: 0.2085 0.2026

RMSE test data with model train and predictions on all sites separately: 0.1759
