# 2. Modelling SVR Linear

---

In [1]:
## Load modules; the modelling and MLflow-logging helper functions are imported from modeling.functions
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import LinearSVR, SVR
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from datetime import datetime
import pickle
import os


from sklearn.model_selection import GridSearchCV

from modeling.functions import modelling, log_to_mlflow, get_features, save_models, load_models, save_results, modelling_fc



In [2]:
# Random seed for any stochastic step; defined before anything else in the cell.
RSEED = 42

# Load the wind data with TIMESTAMP parsed as datetime and used as the index.
data = pd.read_csv('../data/GEFCom2014Data/Wind/raw_data_incl_features.csv', parse_dates=['TIMESTAMP'], index_col='TIMESTAMP')

# Fill gaps by linear interpolation. Assigning the result (instead of inplace=True)
# keeps the lineage of `data` explicit.
# NOTE(review): the frame holds the rows of all zones, so interpolation runs in file
# row order — confirm that no gap spans a zone boundary.
data = data.interpolate(method='linear')

# Summary of columns, dtypes and non-null counts after interpolation.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 175440 entries, 2012-01-01 01:00:00 to 2014-01-01 00:00:00
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ZONEID      175440 non-null  int64  
 1   TARGETVAR   175440 non-null  float64
 2   U10         175440 non-null  float64
 3   V10         175440 non-null  float64
 4   U100        175440 non-null  float64
 5   V100        175440 non-null  float64
 6   HOUR        175440 non-null  int64  
 7   MONTH       175440 non-null  int64  
 8   WEEKDAY     175440 non-null  int64  
 9   IS_HOLIDAY  175440 non-null  int64  
 10  WS10        175440 non-null  float64
 11  WS100       175440 non-null  float64
 12  WD10        175440 non-null  float64
 13  WD100       175440 non-null  float64
 14  WD100CARD   175440 non-null  object 
 15  WD10CARD    175440 non-null  object 
 16  U100NORM    175440 non-null  float64
 17  V100NORM    175440 non-null  float64
dtypes: float64

In [3]:
# One-hot encode the two cardinal wind-direction columns; all other columns are kept.
cardinal_columns = ['WD100CARD', 'WD10CARD']
data = pd.get_dummies(data=data, columns=cardinal_columns)

# Confirm the new dummy columns and their dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 175440 entries, 2012-01-01 01:00:00 to 2014-01-01 00:00:00
Data columns (total 48 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ZONEID         175440 non-null  int64  
 1   TARGETVAR      175440 non-null  float64
 2   U10            175440 non-null  float64
 3   V10            175440 non-null  float64
 4   U100           175440 non-null  float64
 5   V100           175440 non-null  float64
 6   HOUR           175440 non-null  int64  
 7   MONTH          175440 non-null  int64  
 8   WEEKDAY        175440 non-null  int64  
 9   IS_HOLIDAY     175440 non-null  int64  
 10  WS10           175440 non-null  float64
 11  WS100          175440 non-null  float64
 12  WD10           175440 non-null  float64
 13  WD100          175440 non-null  float64
 14  U100NORM       175440 non-null  float64
 15  V100NORM       175440 non-null  float64
 16  WD100CARD_E    175440 non-null  uint8  


In [4]:
# Add the target value from 24 hours earlier as a feature. The lag is computed
# per zone so that one zone's series is never shifted into another zone's rows.
zone_frames = []
for zone in data.ZONEID.unique():
    # .copy() yields an independent frame; assigning a column to a filtered
    # selection triggers SettingWithCopyWarning and is not guaranteed to stick.
    df_zone = data[data.ZONEID == zone].copy()
    # shift(..., freq='H') moves the timestamps forward by 24 hours; the assignment
    # then aligns on the index, so each row receives TARGETVAR from 24 h before
    # (NaN where that timestamp does not exist, e.g. during the first day).
    df_zone['TARGETVAR_lag24'] = df_zone['TARGETVAR'].shift(periods = 24, freq = 'H')
    zone_frames.append(df_zone)

# Concatenate once, in zone order, instead of growing the frame inside the loop.
data_ts = pd.concat(zone_frames, axis = 0) if zone_frames else pd.DataFrame()

In [5]:
## train-test-split
# Chronological split (no shuffling): training covers the first 18 months, testing
# the remainder. The training window starts 24 hours after the first timestamp in
# the data, i.e. at the first row for which TARGETVAR_lag24 can exist.
TRAIN_START = pd.Timestamp('2012-01-02 01:00:00')
TRAIN_END = pd.Timestamp('2013-07-01 00:00:00')
TEST_START = pd.Timestamp('2013-07-01 01:00:00')

# data_ts is concatenated zone by zone, so its DatetimeIndex is not monotonic.
# Explicit boolean masks (inclusive on both ends, like label slicing) select the
# same rows without relying on label-slice behaviour for unsorted indexes.
in_train = (data_ts.index >= TRAIN_START) & (data_ts.index <= TRAIN_END)
in_test = data_ts.index >= TEST_START

data_train = data_ts[in_train]
data_test = data_ts[in_test]

In [6]:
# Preview the test split (pandas abbreviates the display of a frame this large).
data_test

Unnamed: 0_level_0,ZONEID,TARGETVAR,U10,V10,U100,V100,HOUR,MONTH,WEEKDAY,IS_HOLIDAY,...,WD10CARD_NW,WD10CARD_S,WD10CARD_SE,WD10CARD_SSE,WD10CARD_SSW,WD10CARD_SW,WD10CARD_W,WD10CARD_WNW,WD10CARD_WSW,TARGETVAR_lag24
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-07-01 01:00:00,1,0.625035,5.896003,-1.520128,9.461001,-2.106530,1,7,0,0,...,0,0,0,0,0,0,0,1,0,0.375811
2013-07-01 02:00:00,1,0.791185,5.886435,-0.900037,9.019789,-1.276092,2,7,0,0,...,0,0,0,0,0,0,1,0,0,0.519688
2013-07-01 03:00:00,1,0.867400,5.899591,-0.693670,8.685795,-1.147814,3,7,0,0,...,0,0,0,0,0,0,1,0,0,0.520346
2013-07-01 04:00:00,1,0.896814,5.807502,-0.680772,8.629487,-1.117739,4,7,0,0,...,0,0,0,0,0,0,1,0,0,0.428907
2013-07-01 05:00:00,1,0.647214,4.936254,-0.752703,7.652959,-1.130014,5,7,0,0,...,0,0,0,0,0,0,1,0,0,0.665915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-12-31 20:00:00,10,0.792143,1.032363,-6.281558,2.041033,-11.220655,20,12,1,0,...,0,0,0,0,0,0,0,0,0,0.548199
2013-12-31 21:00:00,10,0.792143,1.702361,-6.202448,2.846245,-10.486079,21,12,1,0,...,0,0,0,0,0,0,0,0,0,0.472581
2013-12-31 22:00:00,10,0.792143,5.086629,-1.261378,7.382256,-3.097656,22,12,1,0,...,0,0,0,0,0,0,0,1,0,0.392045
2013-12-31 23:00:00,10,0.792143,4.183751,-1.580172,5.789054,-2.116548,23,12,1,0,...,0,0,0,0,0,0,0,1,0,0.181852


In [7]:
# Build the feature dictionary from the lag-augmented frame. It is indexed below by
# a feature-combination name ('all') and yields a list of column names.
feature_dict = get_features(data_ts)

# Inspect the full feature set. The displayed list already ends with
# 'TARGETVAR_lag24', so the lag column does not have to be appended by hand.
features = feature_dict['all']
features

['U10',
 'V10',
 'U100',
 'V100',
 'HOUR',
 'MONTH',
 'WEEKDAY',
 'IS_HOLIDAY',
 'WS10',
 'WS100',
 'WD10',
 'WD100',
 'U100NORM',
 'V100NORM',
 'WD100CARD_E',
 'WD100CARD_ENE',
 'WD100CARD_ESE',
 'WD100CARD_N',
 'WD100CARD_NE',
 'WD100CARD_NNE',
 'WD100CARD_NNW',
 'WD100CARD_NW',
 'WD100CARD_S',
 'WD100CARD_SE',
 'WD100CARD_SSE',
 'WD100CARD_SSW',
 'WD100CARD_SW',
 'WD100CARD_W',
 'WD100CARD_WNW',
 'WD100CARD_WSW',
 'WD10CARD_E',
 'WD10CARD_ENE',
 'WD10CARD_ESE',
 'WD10CARD_N',
 'WD10CARD_NE',
 'WD10CARD_NNE',
 'WD10CARD_NNW',
 'WD10CARD_NW',
 'WD10CARD_S',
 'WD10CARD_SE',
 'WD10CARD_SSE',
 'WD10CARD_SSW',
 'WD10CARD_SW',
 'WD10CARD_W',
 'WD10CARD_WNW',
 'WD10CARD_WSW',
 'TARGETVAR_lag24']

In [None]:
# Keep only the first three feature combinations. The keys are snapshotted into a
# list first because a dict must not change size while it is being iterated.
surplus_keys = list(feature_dict)[3:]
for key in surplus_keys:
    feature_dict.pop(key)

# Number of feature combinations that remain.
len(feature_dict)

In [None]:
# Baseline estimator and scaler handed to `modelling` for every feature combination.
# (An SVR with C = 0.1 and an RBF kernel was tried here before.)
model = LinearRegression()
scaler = MinMaxScaler()

# Outputs of `modelling`, keyed by feature-combination name.
model_dict = {}
results = {}
results_train = {}

# Grid search is currently disabled.
# NOTE(review): `modelling` presumably accepts perform_gridCV / param_grid to turn
# it back on — confirm in modeling.functions.

# Restrict the run to the first three feature combinations. The keys view has to be
# materialised with list(...) before slicing: `[feature_dict.keys()][3:]` wraps the
# view in a one-element list, so its slice is always empty and nothing is removed.
for key in list(feature_dict.keys())[3:]:
    del feature_dict[key]

for key in feature_dict.keys():
    print(f'Features: {key}')
    features = feature_dict[key]
    results_train[key], results[key], model_dict[key] = modelling(
        data_train,
        data_test,
        features,
        model = model,
        scaler = scaler,
        print_scores = True,
        log = False,
        infotext_mlflow = None,
        save_model = True,
        n_jobs = 3,
    )
    # Round the test scores to five decimals for readable tables.
    results[key] = {k : np.round(value,5) for k,value in results[key].items()}

In [None]:
# Show `features` as currently bound (after the modelling loop this is the list of
# the last feature combination that was processed).
features

In [None]:
# Persist the fitted models; the return value is reused as the target of save_results.
# NOTE(review): presumably the directory the models were written to — confirm in modeling.functions.
path = save_models(model_dict)

In [None]:
# Store the train and test scores using the `path` returned by save_models.
save_results(results_train, results, path)

In [None]:
# Display the rounded test scores, keyed by feature-combination name.
results

In [None]:
# Test score of zone 1 for the 'all' feature combination.
results['all']['ZONE1']

In [None]:
def result_to_df(result_dict, model_dict, save = False, file_path = None):
    """Summarise the best feature combination per zone in a DataFrame.

    Parameters
    ----------
    result_dict : dict
        Maps feature-combination name -> {row label -> score}. The row labels
        become the index of the result; eleven rows are expected (ten zones
        followed by one aggregate row), matching the parameter list built below.
    model_dict : dict
        Maps zone number (1..10) -> fitted estimator exposing ``get_params()``.
    save : bool, default False
        If True, write the summary to ``file_path`` as CSV.
    file_path : str or path-like, optional
        Destination of the CSV file; required when ``save`` is True.

    Returns
    -------
    pandas.DataFrame
        Columns ``FC_MIN`` (feature combination with the lowest score),
        ``MIN`` (that lowest score), ``MODEL`` (class name of the zone-1
        estimator) and ``BEST_PARAMS`` (estimator parameters per zone, NaN for
        the aggregate row).

    Raises
    ------
    ValueError
        If ``save`` is True but no ``file_path`` is given.
    """
    # Use the argument, not a notebook-global `results`, so the function is self-contained.
    scores = pd.DataFrame(result_dict)

    # One parameter dict per zone; the trailing aggregate row has no single model.
    params = [model_dict[zone].get_params() for zone in range(1, 11)]
    params.append(np.nan)

    # Both statistics are taken from the purely numeric score table, before any
    # string column exists that would break the row-wise minimum.
    df_results = pd.DataFrame(index=scores.index)
    df_results['FC_MIN'] = scores.idxmin(axis = 1)
    df_results['MIN'] = scores.min(axis = 1)
    df_results['MODEL'] = model_dict[1].__class__.__name__
    df_results['BEST_PARAMS'] = pd.Series(params, index=scores.index)

    if save:
        if file_path is None:
            # to_csv(None) would only return a string and write nothing.
            raise ValueError("file_path must be provided when save=True")
        df_results.to_csv(file_path)
    return df_results

In [None]:
# One row per zone for the 'all' feature combination: start from the fitted estimators.
tmp_df = pd.DataFrame.from_dict(model_dict['all'], orient= 'index', columns = ['MODEL'])

# Extract the parameters first, then replace the estimator object by its class name.
tmp_df['BEST_PARAMS'] = tmp_df.MODEL.apply(lambda x: x.get_params())
tmp_df['MODEL'] = tmp_df.MODEL.apply(lambda x: x.__class__.__name__)

# Re-label the index as 'ZONE<n>' so it matches the keys of results['all'].
tmp_df['ZONE'] = tmp_df.index
tmp_df.ZONE = tmp_df.ZONE.apply(lambda x: f'ZONE{x}')
tmp_df = tmp_df.set_index('ZONE')

# Right join: keep every row of the score table, including rows that have no
# per-zone model.
tmp_df = tmp_df.join(pd.DataFrame.from_dict(results['all'], orient= 'index', columns = ['TESTSCORE']), how = 'right')

# Rows without a model inherit the model name from the row above. Assigning the
# result back replaces the chained in-place fillna on a column selection, which is
# not guaranteed to modify tmp_df and relied on the deprecated `method=` argument.
tmp_df['MODEL'] = tmp_df['MODEL'].ffill()


tmp_df

In [None]:
# NOTE(review): in the visible notebook `df_results` is first bound by the
# modelling_fc cell further down; on a fresh kernel this cell raises NameError
# when the notebook is run top to bottom.
df_results

In [None]:
# NOTE(review): scratch cell — concatenating df_results with itself duplicates every
# row, and each re-run doubles the frame again (not idempotent).
df_results = pd.concat([df_results, df_results], axis = 0)
df_results
# Disabled follow-up: collapse the duplicated index labels again by taking the minimum.
#df_results.groupby(df_results.index).min()

In [8]:
# Fit a LinearRegression for every feature combination in feature_dict and collect
# the scores in one frame. No scaler, no grid search, no per-zone parameters.
df_results = modelling_fc(
    data_train,
    data_test,
    feature_dict,
    LinearRegression(),
    scaler=None,
    print_scores=True,
    log=None,
    infotext_mlflow=None,
    save_model=True,
    perform_gridCV=False,
    param_grid=None,
    zone_params=None,
    n_jobs=-1,
)

train-RMSE/test-RMSE LinearRegression for ZONE1: 0.179 0.191

train-RMSE/test-RMSE LinearRegression for ZONE2: 0.142 0.174

train-RMSE/test-RMSE LinearRegression for ZONE3: 0.151 0.155

train-RMSE/test-RMSE LinearRegression for ZONE4: 0.176 0.177

train-RMSE/test-RMSE LinearRegression for ZONE5: 0.177 0.18

train-RMSE/test-RMSE LinearRegression for ZONE6: 0.18 0.194

train-RMSE/test-RMSE LinearRegression for ZONE7: 0.135 0.151

train-RMSE/test-RMSE LinearRegression for ZONE8: 0.16 0.195

train-RMSE/test-RMSE LinearRegression for ZONE9: 0.164 0.164

train-RMSE/test-RMSE LinearRegression for ZONE10: 0.197 0.213

train-RMSE/test-RMSE LinearRegression for TOTAL: 0.167 0.18

train-RMSE/test-RMSE LinearRegression for ZONE1: 0.179 0.19

train-RMSE/test-RMSE LinearRegression for ZONE2: 0.142 0.174

train-RMSE/test-RMSE LinearRegression for ZONE3: 0.152 0.155

train-RMSE/test-RMSE LinearRegression for ZONE4: 0.176 0.177

train-RMSE/test-RMSE LinearRegression for ZONE5: 0.177 0.18

train-RMSE/te

In [9]:
# Show the result table returned by modelling_fc.
df_results

Unnamed: 0,MODEL,BEST_PARAMS,FC,TESTSCORE,TRAINSCORE
ZONE1,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",all,0.190719,0.178812
ZONE2,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",all,0.173791,0.142371
ZONE3,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",all,0.155079,0.151462
ZONE4,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",all,0.176968,0.175807
ZONE5,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",all,0.180246,0.177331
...,...,...,...,...,...
ZONE7,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",no_card_100Norm,0.153016,0.139261
ZONE8,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",no_card_100Norm,0.196777,0.163027
ZONE9,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",no_card_100Norm,0.164749,0.165077
ZONE10,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",no_card_100Norm,0.215218,0.201563
