In [1]:
## Load modules; log_to_mlflow (imported below from modeling.functions) is the helper for tracking model information on MLflow
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import LinearSVR, SVR
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from datetime import datetime
import pickle
import os


from sklearn.model_selection import GridSearchCV

from modeling.functions import modelling, log_to_mlflow, get_features, save_models, load_models



In [2]:
# Random seed, defined up front because later cells reuse it (e.g. for the train/test split).
RSEED = 42

# Read the wind data set and drop incomplete rows in a single, non-mutating
# expression (no inplace=True). The former mid-cell `data.head()` was removed:
# only the last expression of a cell is displayed, so it had no effect.
data = pd.read_csv(
    '../data/GEFCom2014Data/Wind/raw_data_incl_features.csv',
    parse_dates=['TIMESTAMP'],
).dropna()
data.info()

# One-hot encode the two cardinal wind-direction columns; every other column is kept as is.
data = pd.get_dummies(data, columns=['WD100CARD', 'WD10CARD'])
data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175265 entries, 0 to 175433
Data columns (total 19 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   ZONEID      175265 non-null  int64         
 1   TIMESTAMP   175265 non-null  datetime64[ns]
 2   TARGETVAR   175265 non-null  float64       
 3   U10         175265 non-null  float64       
 4   V10         175265 non-null  float64       
 5   U100        175265 non-null  float64       
 6   V100        175265 non-null  float64       
 7   HOUR        175265 non-null  int64         
 8   MONTH       175265 non-null  int64         
 9   WEEKDAY     175265 non-null  int64         
 10  IS_HOLIDAY  175265 non-null  int64         
 11  WS10        175265 non-null  float64       
 12  WS100       175265 non-null  float64       
 13  WD10        175265 non-null  float64       
 14  WD100       175265 non-null  float64       
 15  WD100CARD   175265 non-null  object        
 16  WD

Unnamed: 0,ZONEID,TIMESTAMP,TARGETVAR,U10,V10,U100,V100,HOUR,MONTH,WEEKDAY,...,WD10CARD_NNW,WD10CARD_NW,WD10CARD_S,WD10CARD_SE,WD10CARD_SSE,WD10CARD_SSW,WD10CARD_SW,WD10CARD_W,WD10CARD_WNW,WD10CARD_WSW
0,1,2012-01-01 01:00:00,0.0,2.1246,-2.681966,2.86428,-3.666076,1,1,6,...,0,1,0,0,0,0,0,0,0,0
1,1,2012-01-01 02:00:00,0.054879,2.521695,-1.79696,3.344859,-2.464761,2,1,6,...,0,1,0,0,0,0,0,0,0,0
2,1,2012-01-01 03:00:00,0.110234,2.67221,-0.822516,3.508448,-1.214093,3,1,6,...,0,0,0,0,0,0,0,0,1,0
3,1,2012-01-01 04:00:00,0.165116,2.457504,-0.143642,3.215233,-0.355546,4,1,6,...,0,0,0,0,0,0,0,1,0,0
4,1,2012-01-01 05:00:00,0.15694,2.245898,0.389576,2.957678,0.332701,5,1,6,...,0,0,0,0,0,0,0,1,0,0


In [3]:
## Train/test split
# Hold out 25 % of the rows for testing. Stratifying on ZONEID keeps the zone
# proportions (approximately) equal in both partitions; RSEED makes the split reproducible.
data_train, data_test = train_test_split(
    data,
    test_size=0.25,
    random_state=RSEED,
    stratify=data['ZONEID'],
)

# Feature-set lookup produced by the project helper; later cells index it by feature-set name.
feature_dict = get_features(data)

In [4]:
# Identifier of the saved model run handed to load_models.
# NOTE(review): looks like a YYMMDD_HHMM timestamp plus model type — confirm.
saved_run = '211201_0937_SVR'
model_dict = load_models(saved_run)

In [5]:
# Display the loaded models. As the output shows, this is a dict keyed by
# feature-set name, then by zone id, whose values are SVR estimators.
model_dict

{'no_deg_comp': {2: SVR(C=10),
  10: SVR(C=10),
  4: SVR(C=10),
  8: SVR(C=10, degree=5, kernel='poly'),
  6: SVR(C=10),
  3: SVR(C=10),
  1: SVR(C=10),
  5: SVR(C=10),
  9: SVR(C=10),
  7: SVR(C=10)},
 'no_ten': {2: SVR(C=10),
  10: SVR(C=10, degree=6, kernel='poly'),
  4: SVR(C=10),
  8: SVR(C=10, degree=6, kernel='poly'),
  6: SVR(C=10, degree=6, kernel='poly'),
  3: SVR(C=10, degree=5, kernel='poly'),
  1: SVR(C=5, degree=6, kernel='poly'),
  5: SVR(C=10, degree=6, kernel='poly'),
  9: SVR(C=5, degree=5, kernel='poly'),
  7: SVR(C=10, degree=5, kernel='poly')},
 'no_comp': {2: SVR(C=10),
  10: SVR(C=10),
  4: SVR(C=10),
  8: SVR(C=10),
  6: SVR(C=10),
  3: SVR(C=10),
  1: SVR(C=10),
  5: SVR(C=10),
  9: SVR(C=10),
  7: SVR(C=10)},
 'all': {2: SVR(C=10),
  10: SVR(C=10),
  4: SVR(C=10),
  8: SVR(C=5, degree=6, kernel='poly'),
  6: SVR(C=10),
  3: SVR(C=10),
  1: SVR(C=10),
  5: SVR(C=10),
  9: SVR(C=10),
  7: SVR(C=10)},
 'no_deg_norm_U10V10': {2: SVR(C=10),
  10: SVR(C=10),
  4: SV

In [6]:
# Accumulators for a score table with one entry per (feature set, zone) pair.
features = []
zones = []
train_score = []
test_score = []

# Feature set and zone under inspection.
key = 'no_deg_comp'
zone = 2

print(f'Features: {key}, ZONEID: {zone}')

# model_dict stores one estimator per zone, so restrict both partitions to the
# selected zone before building the design matrices. Previously `zone` was only
# printed and the matrices contained the rows of every zone.
train_zone = data_train[data_train.ZONEID == zone]
test_zone = data_test[data_test.ZONEID == zone]

X_train = train_zone[feature_dict[key]]
X_test = test_zone[feature_dict[key]]
y_train = train_zone.TARGETVAR
y_test = test_zone.TARGETVAR
features.append(key)
zones.append(zone)

# Hyper-parameters of the stored estimator for this feature set / zone (shown as the cell output).
model_dict[key][zone].get_params()

# Evaluation draft, intentionally left disabled.
# NOTE(review): presumably disabled because refitting is slow — confirm before enabling.
# Fixed relative to the earlier draft: y_pred_train is now actually computed
# before it is clipped, and the results frame uses the `zones` list instead of
# the scalar `zone`.
# model = SVR(**model_dict[key][zone].get_params())
# model.fit(X_train, y_train)
#
# y_pred_train = model.predict(X_train)
# y_pred_train = [1 if value >= 1 else 0 if value <= 0 else value for value in y_pred_train]
# train_score.append(mean_squared_error(y_train, y_pred_train, squared=False))
#
# y_pred_test = model.predict(X_test)
# y_pred_test = [1 if value >= 1 else 0 if value <= 0 else value for value in y_pred_test]
# test_score.append(mean_squared_error(y_test, y_pred_test, squared=False))
#
# results = pd.DataFrame({'features': features, 'zone': zones, 'train_score': train_score, 'test_score': test_score})
        

Features: no_deg_comp, ZONEID: 2


{'C': 10,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [7]:
# Number of feature sets available in feature_dict.
len(feature_dict)

13