In [6]:
## Load modules and import the helper functions (from modeling.functions) used for modelling and for tracking model information on MLflow
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor 
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from modeling.functions import modelling, log_to_mlflow, get_features 

In [7]:
# Load the wind data set (raw measurements plus engineered features);
# TIMESTAMP is parsed to datetime while reading.
data = pd.read_csv('../data/GEFCom2014Data/Wind/raw_data_incl_features.csv', parse_dates=['TIMESTAMP'])

# Drop every row that contains a missing value. Reassigning (instead of
# inplace=True) keeps the step explicit; the resulting frame is the same.
data = data.dropna()

# Print column dtypes and non-null counts of the cleaned frame.
data.info()

# Seed passed as random_state to the train/test split further down.
RSEED = 42

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175265 entries, 0 to 175433
Data columns (total 19 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   ZONEID      175265 non-null  int64         
 1   TIMESTAMP   175265 non-null  datetime64[ns]
 2   TARGETVAR   175265 non-null  float64       
 3   U10         175265 non-null  float64       
 4   V10         175265 non-null  float64       
 5   U100        175265 non-null  float64       
 6   V100        175265 non-null  float64       
 7   HOUR        175265 non-null  int64         
 8   MONTH       175265 non-null  int64         
 9   WEEKDAY     175265 non-null  int64         
 10  IS_HOLIDAY  175265 non-null  int64         
 11  WS10        175265 non-null  float64       
 12  WS100       175265 non-null  float64       
 13  WD10        175265 non-null  float64       
 14  WD100       175265 non-null  float64       
 15  WD100CARD   175265 non-null  object        
 16  WD

In [8]:
# One-hot encode the two cardinal wind-direction columns; each category
# becomes its own indicator column prefixed with the original column name.
data = pd.get_dummies(
    data=data,
    columns=['WD100CARD', 'WD10CARD'],
)
data.head(n=5)

Unnamed: 0,ZONEID,TIMESTAMP,TARGETVAR,U10,V10,U100,V100,HOUR,MONTH,WEEKDAY,...,WD10CARD_NNW,WD10CARD_NW,WD10CARD_S,WD10CARD_SE,WD10CARD_SSE,WD10CARD_SSW,WD10CARD_SW,WD10CARD_W,WD10CARD_WNW,WD10CARD_WSW
0,1,2012-01-01 01:00:00,0.0,2.1246,-2.681966,2.86428,-3.666076,1,1,6,...,0,1,0,0,0,0,0,0,0,0
1,1,2012-01-01 02:00:00,0.054879,2.521695,-1.79696,3.344859,-2.464761,2,1,6,...,0,1,0,0,0,0,0,0,0,0
2,1,2012-01-01 03:00:00,0.110234,2.67221,-0.822516,3.508448,-1.214093,3,1,6,...,0,0,0,0,0,0,0,0,1,0
3,1,2012-01-01 04:00:00,0.165116,2.457504,-0.143642,3.215233,-0.355546,4,1,6,...,0,0,0,0,0,0,0,1,0,0
4,1,2012-01-01 05:00:00,0.15694,2.245898,0.389576,2.957678,0.332701,5,1,6,...,0,0,0,0,0,0,0,1,0,0


In [9]:
## Train/test split: hold out 25 % of the rows, stratified by ZONEID so that
## every zone is represented proportionally in both subsets.
data_train, data_test = train_test_split(
    data,
    test_size=0.25,
    random_state=RSEED,
    stratify=data['ZONEID'],
)

In [10]:
# Build the feature-set lookup with the project helper get_features.
# As printed further down in this notebook, it maps a feature-set name
# (e.g. 'all', 'no_deg') to a list of column names.
feature_dict = get_features(data)

In [11]:
# Baseline: a single KNN regressor fitted on all zones together, using the
# complete feature set.
features = feature_dict['all']

X_train = data_train[features]
X_test = data_test[features]
y_train = data_train['TARGETVAR']
y_test = data_test['TARGETVAR']

# Fit the scaler on the training split only and reuse its parameters for the
# test split, so no information from the test data enters the preprocessing.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = KNeighborsRegressor(n_neighbors = 5)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Test RMSE. Taking the square root of the MSE explicitly avoids the
# `squared=False` argument, which newer scikit-learn releases deprecated and
# later removed; the printed value is the same.
print(np.sqrt(mean_squared_error(y_test, y_pred)))



0.17190262600704964


In [13]:
# Evaluate one feature set ('no_deg_norm') through the project's modelling helper.
results = {}
key = 'no_deg_norm'

model = KNeighborsRegressor(n_neighbors=5)

# modelling returns a pair; only its second element (scores keyed by zone
# plus 'TOTAL') is stored.
_, results[key] = modelling(
    data_train,
    data_test,
    feature_dict[key],
    model,
    scaler=MinMaxScaler(),
    print_scores=True,
    log=None,
    infotext_mlflow=None,
)

# Round every score to five decimals for a compact display.
results[key] = {zone: np.round(score, 5) for zone, score in results[key].items()}

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: 0.0, 1.02

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: 0.0, 1.0

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: 0.0, 1.03

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: 0.0, 1.02

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.01, 1.0

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.0, 1.09

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.01, 1.02

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.0, 1.04

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.06, 1.0

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: 0.0, 1.01

train-RMSE/test-RMSE linear regression model for ZONE1: 0.121 0.155
train-RMSE/test-RMSE linear regression model fo

In [14]:
# Display the rounded per-zone scores (plus 'TOTAL') collected in the previous cell.
results

{'no_deg_norm': {'ZONE1': 0.1546,
  'ZONE2': 0.13742,
  'ZONE3': 0.14788,
  'ZONE4': 0.16996,
  'ZONE5': 0.17409,
  'ZONE6': 0.17383,
  'ZONE7': 0.12947,
  'ZONE8': 0.14949,
  'ZONE9': 0.14912,
  'ZONE10': 0.19746,
  'TOTAL': 0.15951}}

In [16]:
# Run the same evaluation for every feature set defined in feature_dict.
# The same model and scaler objects are handed to modelling on each iteration.
results = {}

scaler = MinMaxScaler()
model = KNeighborsRegressor(n_neighbors=5)

for key in feature_dict:
    # Keep only the second element of the returned pair: the score dict.
    _, results[key] = modelling(
        data_train,
        data_test,
        feature_dict[key],
        model=model,
        scaler=scaler,
        print_scores=True,
        log=None,
        infotext_mlflow=None,
    )
    # Round to five decimals so the summary stays readable.
    results[key] = {zone: np.round(score, 5) for zone, score in results[key].items()}

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.0, 1.02

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: 0.0, 1.0

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: 0.0, 1.03

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: 0.0, 1.02

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.01, 1.0

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.0, 1.09

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.01, 1.02

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.0, 1.04

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.06, 1.0

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.0, 1.01

train-RMSE/test-RMSE linear regression model for ZONE1: 0.123 0.157
train-RMSE/test-RMSE linear regression model 

In [17]:
# Display the rounded scores for every feature set: one inner dict per
# feature-set name, keyed by zone plus 'TOTAL'.
results

{'all': {'ZONE1': 0.15672,
  'ZONE2': 0.13847,
  'ZONE3': 0.14915,
  'ZONE4': 0.17287,
  'ZONE5': 0.17664,
  'ZONE6': 0.1737,
  'ZONE7': 0.13239,
  'ZONE8': 0.15114,
  'ZONE9': 0.15061,
  'ZONE10': 0.19886,
  'TOTAL': 0.1612},
 'no_deg': {'ZONE1': 0.15446,
  'ZONE2': 0.13704,
  'ZONE3': 0.14726,
  'ZONE4': 0.17058,
  'ZONE5': 0.17479,
  'ZONE6': 0.17238,
  'ZONE7': 0.1301,
  'ZONE8': 0.15009,
  'ZONE9': 0.14943,
  'ZONE10': 0.19772,
  'TOTAL': 0.15956},
 'no_deg_norm': {'ZONE1': 0.1546,
  'ZONE2': 0.13742,
  'ZONE3': 0.14788,
  'ZONE4': 0.16996,
  'ZONE5': 0.17409,
  'ZONE6': 0.17383,
  'ZONE7': 0.12947,
  'ZONE8': 0.14949,
  'ZONE9': 0.14912,
  'ZONE10': 0.19746,
  'TOTAL': 0.15951},
 'no_deg_norm_U10V10': {'ZONE1': 0.15585,
  'ZONE2': 0.13825,
  'ZONE3': 0.14844,
  'ZONE4': 0.17083,
  'ZONE5': 0.17514,
  'ZONE6': 0.17471,
  'ZONE7': 0.13098,
  'ZONE8': 0.15026,
  'ZONE9': 0.14975,
  'ZONE10': 0.19832,
  'TOTAL': 0.16041},
 'no_deg_norm_WS10': {'ZONE1': 0.15791,
  'ZONE2': 0.13948,
  

In [21]:
# List the columns belonging to each feature set, one set per paragraph.
for key in feature_dict:
    values = feature_dict[key]
    print(key, ':\n', values, '\n')

all :
 ['U10', 'V10', 'U100', 'V100', 'HOUR', 'MONTH', 'WEEKDAY', 'IS_HOLIDAY', 'WS10', 'WS100', 'WD10', 'WD100', 'U100NORM', 'V100NORM', 'WD100CARD_E', 'WD100CARD_ENE', 'WD100CARD_ESE', 'WD100CARD_N', 'WD100CARD_NE', 'WD100CARD_NNE', 'WD100CARD_NNW', 'WD100CARD_NW', 'WD100CARD_S', 'WD100CARD_SE', 'WD100CARD_SSE', 'WD100CARD_SSW', 'WD100CARD_SW', 'WD100CARD_W', 'WD100CARD_WNW', 'WD100CARD_WSW', 'WD10CARD_E', 'WD10CARD_ENE', 'WD10CARD_ESE', 'WD10CARD_N', 'WD10CARD_NE', 'WD10CARD_NNE', 'WD10CARD_NNW', 'WD10CARD_NW', 'WD10CARD_S', 'WD10CARD_SE', 'WD10CARD_SSE', 'WD10CARD_SSW', 'WD10CARD_SW', 'WD10CARD_W', 'WD10CARD_WNW', 'WD10CARD_WSW'] 

no_deg :
 ['U10', 'V10', 'U100', 'V100', 'HOUR', 'MONTH', 'WEEKDAY', 'IS_HOLIDAY', 'WS10', 'WS100', 'U100NORM', 'V100NORM', 'WD100CARD_E', 'WD100CARD_ENE', 'WD100CARD_ESE', 'WD100CARD_N', 'WD100CARD_NE', 'WD100CARD_NNE', 'WD100CARD_NNW', 'WD100CARD_NW', 'WD100CARD_S', 'WD100CARD_SE', 'WD100CARD_SSE', 'WD100CARD_SSW', 'WD100CARD_SW', 'WD100CARD_W', 'WD100