# 2. Modelling SVR Linear

---

In [1]:
## load modules and run mlflow_logging.ipynb to get function to track model information on MLFLow
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import LinearSVR, SVR
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from datetime import datetime
import pickle
import os


from sklearn.model_selection import GridSearchCV

from modeling.functions import modelling, log_to_mlflow, get_features, save_models, load_models, save_results



In [2]:
data = pd.read_csv('../data/GEFCom2014Data/Wind/raw_data_incl_features.csv', parse_dates=['TIMESTAMP'], index_col='TIMESTAMP')
data.head()
data.interpolate(method='linear', inplace = True)
data.info()
RSEED = 42

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 175440 entries, 2012-01-01 01:00:00 to 2014-01-01 00:00:00
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ZONEID      175440 non-null  int64  
 1   TARGETVAR   175440 non-null  float64
 2   U10         175440 non-null  float64
 3   V10         175440 non-null  float64
 4   U100        175440 non-null  float64
 5   V100        175440 non-null  float64
 6   HOUR        175440 non-null  int64  
 7   MONTH       175440 non-null  int64  
 8   WEEKDAY     175440 non-null  int64  
 9   IS_HOLIDAY  175440 non-null  int64  
 10  WS10        175440 non-null  float64
 11  WS100       175440 non-null  float64
 12  WD10        175440 non-null  float64
 13  WD100       175440 non-null  float64
 14  WD100CARD   175440 non-null  object 
 15  WD10CARD    175440 non-null  object 
 16  U100NORM    175440 non-null  float64
 17  V100NORM    175440 non-null  float64
dtypes: float64

In [3]:
data = pd.get_dummies(data, columns = ['WD100CARD','WD10CARD'])
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 175440 entries, 2012-01-01 01:00:00 to 2014-01-01 00:00:00
Data columns (total 48 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ZONEID         175440 non-null  int64  
 1   TARGETVAR      175440 non-null  float64
 2   U10            175440 non-null  float64
 3   V10            175440 non-null  float64
 4   U100           175440 non-null  float64
 5   V100           175440 non-null  float64
 6   HOUR           175440 non-null  int64  
 7   MONTH          175440 non-null  int64  
 8   WEEKDAY        175440 non-null  int64  
 9   IS_HOLIDAY     175440 non-null  int64  
 10  WS10           175440 non-null  float64
 11  WS100          175440 non-null  float64
 12  WD10           175440 non-null  float64
 13  WD100          175440 non-null  float64
 14  U100NORM       175440 non-null  float64
 15  V100NORM       175440 non-null  float64
 16  WD100CARD_E    175440 non-null  uint8  


In [4]:
data_ts = pd.DataFrame()
# df_zone = data[data.ZONEID == 1]
# data_ts = pd.concat([data_ts, df_zone])
# data_ts
for zone in data.ZONEID.unique():
    df_zone = data[data.ZONEID == zone]
    df_zone['TARGETVAR_lag24'] = df_zone['TARGETVAR'].shift(periods = 24, freq = 'H')
    data_ts = pd.concat([data_ts, df_zone], axis = 0)

In [5]:
## train-test-split
#data_train, data_test = train_test_split(data, test_size=0.25, random_state=RSEED, stratify=data.ZONEID)
data_train = data_ts['2012-01-02 01:00:00':'2013-07-01 00:00:00']
data_test = data_ts['2013-07-01 01:00:00':]

In [6]:
data_test[data_test.TARGETVAR_lag24.isna()]

Unnamed: 0_level_0,ZONEID,TARGETVAR,U10,V10,U100,V100,HOUR,MONTH,WEEKDAY,IS_HOLIDAY,...,WD10CARD_NW,WD10CARD_S,WD10CARD_SE,WD10CARD_SSE,WD10CARD_SSW,WD10CARD_SW,WD10CARD_W,WD10CARD_WNW,WD10CARD_WSW,TARGETVAR_lag24
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [7]:
# define features and feature dict
feature_dict = get_features(data_ts)

features = feature_dict['all']
#features.append('TARGETVAR_lag24')
features

['U10',
 'V10',
 'U100',
 'V100',
 'HOUR',
 'MONTH',
 'WEEKDAY',
 'IS_HOLIDAY',
 'WS10',
 'WS100',
 'WD10',
 'WD100',
 'U100NORM',
 'V100NORM',
 'WD100CARD_E',
 'WD100CARD_ENE',
 'WD100CARD_ESE',
 'WD100CARD_N',
 'WD100CARD_NE',
 'WD100CARD_NNE',
 'WD100CARD_NNW',
 'WD100CARD_NW',
 'WD100CARD_S',
 'WD100CARD_SE',
 'WD100CARD_SSE',
 'WD100CARD_SSW',
 'WD100CARD_SW',
 'WD100CARD_W',
 'WD100CARD_WNW',
 'WD100CARD_WSW',
 'WD10CARD_E',
 'WD10CARD_ENE',
 'WD10CARD_ESE',
 'WD10CARD_N',
 'WD10CARD_NE',
 'WD10CARD_NNE',
 'WD10CARD_NNW',
 'WD10CARD_NW',
 'WD10CARD_S',
 'WD10CARD_SE',
 'WD10CARD_SSE',
 'WD10CARD_SSW',
 'WD10CARD_SW',
 'WD10CARD_W',
 'WD10CARD_WNW',
 'WD10CARD_WSW',
 'TARGETVAR_lag24']

In [8]:
for key in list(feature_dict.keys())[3:]:
    del feature_dict[key]

len(feature_dict.keys())

3

In [9]:
#model = SVR(C = 0.1, kernel='rbf')
model = LinearRegression()
scaler = MinMaxScaler()

model_dict = {}
results = {}
results_train = {}
#param_grid = [ 
    # {           'C': [0.1, 1, 5, 10], 
    #             'degree': [5, 6, 7],
    #             'kernel': ['poly']
    #             },
    # {           'C': [0.1, 1, 10, 100] ,
    #             'kernel': ['rbf']
    #             }
#                 {           'C': [0.1] ,
#                 'kernel': ['rbf']
#                 }
# ]
for key in [feature_dict.keys()][3:]:
    del feature_dict[key]

for key in feature_dict.keys():
    print(f'Features: {key}')
    features = feature_dict[key]
    #features.append('TARGETVAR_lag24')
    results_train[key],results[key], model_dict[key] = modelling(data_train, data_test, features, 
                                                                    model = model, 
                                                                    scaler = scaler, 
                                                                    print_scores = True, 
                                                                    log = False, 
                                                                    infotext_mlflow = None, 
                                                                    save_model = True, 
#                                                                    perform_gridCV = True, 
#                                                                    param_grid = param_grid, 
                                                                    n_jobs = 3)
    results[key] = {k : np.round(value,5) for k,value in results[key].items()}

Features: all
Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.0, 1.0

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.01, 1.0

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.06, 1.08

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.02, 1.09

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.02, 1.09

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.0, 1.09

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.01, 1.01

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.01, 1.0

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.0, 1.05

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.13, 1.16

train-RMSE/test-RMSE LinearRegression for ZONE1: 0.179 0.191

train-RMSE/test-RMSE LinearReg

In [10]:
features

['U10',
 'V10',
 'U100',
 'V100',
 'HOUR',
 'MONTH',
 'WEEKDAY',
 'IS_HOLIDAY',
 'WS10',
 'WS100',
 'WD100CARD_E',
 'WD100CARD_ENE',
 'WD100CARD_ESE',
 'WD100CARD_N',
 'WD100CARD_NE',
 'WD100CARD_NNE',
 'WD100CARD_NNW',
 'WD100CARD_NW',
 'WD100CARD_S',
 'WD100CARD_SE',
 'WD100CARD_SSE',
 'WD100CARD_SSW',
 'WD100CARD_SW',
 'WD100CARD_W',
 'WD100CARD_WNW',
 'WD100CARD_WSW',
 'WD10CARD_E',
 'WD10CARD_ENE',
 'WD10CARD_ESE',
 'WD10CARD_N',
 'WD10CARD_NE',
 'WD10CARD_NNE',
 'WD10CARD_NNW',
 'WD10CARD_NW',
 'WD10CARD_S',
 'WD10CARD_SE',
 'WD10CARD_SSE',
 'WD10CARD_SSW',
 'WD10CARD_SW',
 'WD10CARD_W',
 'WD10CARD_WNW',
 'WD10CARD_WSW',
 'TARGETVAR_lag24']

In [11]:
path = save_models(model_dict)

FileExistsError: [Errno 17] File exists: '../saved_models/211207_1204_LinearRegression'

In [None]:
save_results(results_train, results, path)

In [12]:
results

{'all': {'ZONE1': 0.19101,
  'ZONE2': 0.17429,
  'ZONE3': 0.15518,
  'ZONE4': 0.17749,
  'ZONE5': 0.18026,
  'ZONE6': 0.19367,
  'ZONE7': 0.15365,
  'ZONE8': 0.1966,
  'ZONE9': 0.16669,
  'ZONE10': 0.21266,
  'TOTAL': 0.28413},
 'no_deg': {'ZONE1': 0.19025,
  'ZONE2': 0.17386,
  'ZONE3': 0.15509,
  'ZONE4': 0.1769,
  'ZONE5': 0.18047,
  'ZONE6': 0.19339,
  'ZONE7': 0.15074,
  'ZONE8': 0.19505,
  'ZONE9': 0.16393,
  'ZONE10': 0.2139,
  'TOTAL': 0.28373},
 'no_deg_norm': {'ZONE1': 0.19066,
  'ZONE2': 0.17404,
  'ZONE3': 0.15544,
  'ZONE4': 0.17663,
  'ZONE5': 0.18027,
  'ZONE6': 0.19327,
  'ZONE7': 0.15079,
  'ZONE8': 0.19505,
  'ZONE9': 0.16442,
  'ZONE10': 0.21303,
  'TOTAL': 0.28375}}

In [None]:
(results['all']['ZONE1'])

0.19101

In [None]:
rmses = []
for key in results.keys():
    mse = 0
    for zone in results[key].keys():
        #print(key, zone)
        mse_zone = np.power(results[key][zone], 2) 
        mse = mse + mse_zone
    mse = mse / len(results[key].keys())
    rmses.append(np.power(mse, 0.5))    

rmses


    # ((y_pred_1 - y_test_1) ^ 2 + (y_pred_2 - y_test_2) ^ 2 + (y2_pred_1 - y2_test_1) ^ 2 + (y2_pred_2 - y2_test_2) ^ 2) / len(test_all)

    # ((y_pred_1 - y_test_1) ^ 2 + (y_pred_2 - y_test_2) ^ 2) / len(test1) + ((y2_pred_1 - y2_test_1) ^ 2 + (y2_pred_2 - y2_test_2) ^ 2) / len(test2)
    

[0.19269539417527248, 0.19203077667631585, 0.19201405722687928]