# Linear regression

In [10]:
## load modules; the modelling and MLflow-logging helpers are imported from modeling.functions
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from modeling.functions import modelling, log_to_mlflow, get_features 

## fixed seed, passed as random_state to train_test_split so the split is reproducible
RSEED = 42

### Read data and remove NaNs ###

In [11]:
## load the wind data CSV (raw data incl. features); TIMESTAMP is parsed to datetime on read
data = pd.read_csv(
    '../data/GEFCom2014Data/Wind/raw_data_incl_features.csv',
    parse_dates=['TIMESTAMP'],
)
## preview the first rows
data.head()

Unnamed: 0,ZONEID,TIMESTAMP,TARGETVAR,U10,V10,U100,V100,HOUR,MONTH,WEEKDAY,IS_HOLIDAY,WS10,WS100,WD10,WD100,WD100CARD,WD10CARD,U100NORM,V100NORM
0,1,2012-01-01 01:00:00,0.0,2.1246,-2.681966,2.86428,-3.666076,1,1,6,1,3.42153,4.652334,321.614439,321.999735,NW,NW,0.615665,-0.788008
1,1,2012-01-01 02:00:00,0.054879,2.521695,-1.79696,3.344859,-2.464761,2,1,6,1,3.096451,4.154892,305.47368,306.385781,NW,NW,0.805041,-0.593219
2,1,2012-01-01 03:00:00,0.110234,2.67221,-0.822516,3.508448,-1.214093,3,1,6,1,2.795932,3.712577,287.108562,289.088098,WNW,WNW,0.945017,-0.327022
3,1,2012-01-01 04:00:00,0.165116,2.457504,-0.143642,3.215233,-0.355546,4,1,6,1,2.461699,3.234831,273.34516,276.310236,W,W,0.993941,-0.109912
4,1,2012-01-01 05:00:00,0.15694,2.245898,0.389576,2.957678,0.332701,5,1,6,1,2.279435,2.976332,260.159324,263.581938,W,W,0.993733,0.111782


In [12]:
## remove NaNs: drop every row that contains at least one missing value
## (rebinding the name instead of inplace=True keeps the step an explicit
## transformation, as recommended by pandas)
data = data.dropna()
## summary of the remaining rows, columns and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175265 entries, 0 to 175433
Data columns (total 19 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   ZONEID      175265 non-null  int64         
 1   TIMESTAMP   175265 non-null  datetime64[ns]
 2   TARGETVAR   175265 non-null  float64       
 3   U10         175265 non-null  float64       
 4   V10         175265 non-null  float64       
 5   U100        175265 non-null  float64       
 6   V100        175265 non-null  float64       
 7   HOUR        175265 non-null  int64         
 8   MONTH       175265 non-null  int64         
 9   WEEKDAY     175265 non-null  int64         
 10  IS_HOLIDAY  175265 non-null  int64         
 11  WS10        175265 non-null  float64       
 12  WS100       175265 non-null  float64       
 13  WD10        175265 non-null  float64       
 14  WD100       175265 non-null  float64       
 15  WD100CARD   175265 non-null  object        
 16  WD

In [13]:
## one-hot encode the two cardinal wind-direction columns (values such as NW, WNW, W)
## dtype is pinned to uint8: pandas >= 2.0 returns bool dummies by default, while the
## 0/1 integer columns in the saved output come from the older uint8 default
data = pd.get_dummies(data, columns=['WD100CARD', 'WD10CARD'], dtype=np.uint8)
data.head()

Unnamed: 0,ZONEID,TIMESTAMP,TARGETVAR,U10,V10,U100,V100,HOUR,MONTH,WEEKDAY,...,WD10CARD_NNW,WD10CARD_NW,WD10CARD_S,WD10CARD_SE,WD10CARD_SSE,WD10CARD_SSW,WD10CARD_SW,WD10CARD_W,WD10CARD_WNW,WD10CARD_WSW
0,1,2012-01-01 01:00:00,0.0,2.1246,-2.681966,2.86428,-3.666076,1,1,6,...,0,1,0,0,0,0,0,0,0,0
1,1,2012-01-01 02:00:00,0.054879,2.521695,-1.79696,3.344859,-2.464761,2,1,6,...,0,1,0,0,0,0,0,0,0,0
2,1,2012-01-01 03:00:00,0.110234,2.67221,-0.822516,3.508448,-1.214093,3,1,6,...,0,0,0,0,0,0,0,0,1,0
3,1,2012-01-01 04:00:00,0.165116,2.457504,-0.143642,3.215233,-0.355546,4,1,6,...,0,0,0,0,0,0,0,1,0,0
4,1,2012-01-01 05:00:00,0.15694,2.245898,0.389576,2.957678,0.332701,5,1,6,...,0,0,0,0,0,0,0,1,0,0


### Train-test-split ###

In [14]:
## train-test-split: hold out 25 % of the rows, stratified on ZONEID so every zone
## keeps the same share in both parts; RSEED fixes the shuffle
## NOTE(review): rows are hourly time-stamped observations, so a random split mixes
## earlier and later hours in train and test -- confirm this is intended
data_train, data_test = train_test_split(
    data,
    test_size=0.25,
    stratify=data['ZONEID'],
    random_state=RSEED,
)

In [15]:
## build the feature-set mapping: each key names a feature set and its value is what
## gets passed to modelling() as the feature selection (contents are defined in get_features)
feature_dict = get_features(data)

In [23]:
## fit one linear regression per feature set and collect the scores, rounded to 5 decimals
results = {}
for feature_set, features in feature_dict.items():
    # modelling() returns a pair; only its second element (a dict) is kept --
    # presumably the test-set scores per zone, confirm against modelling()
    _, scores = modelling(
        data_train,
        data_test,
        features,
        LR(),
        scaler=None,
        print_scores=True,
        log=None,
        infotext_mlflow=None,
    )
    results[feature_set] = {zone: np.round(score, 5) for zone, score in scores.items()}



train-RMSE/test-RMSE linear regression model for ZONE1: 0.181 0.183
train-RMSE/test-RMSE linear regression model for ZONE2: 0.154 0.149
train-RMSE/test-RMSE linear regression model for ZONE3: 0.152 0.154
train-RMSE/test-RMSE linear regression model for ZONE4: 0.177 0.179
train-RMSE/test-RMSE linear regression model for ZONE5: 0.18 0.183
train-RMSE/test-RMSE linear regression model for ZONE6: 0.188 0.185
train-RMSE/test-RMSE linear regression model for ZONE7: 0.138 0.139
train-RMSE/test-RMSE linear regression model for ZONE8: 0.17 0.17
train-RMSE/test-RMSE linear regression model for ZONE9: 0.165 0.164
train-RMSE/test-RMSE linear regression model for ZONE10: 0.203 0.205
train-RMSE/test-RMSE linear regression model for TOTAL: 0.172 0.172
train-RMSE/test-RMSE linear regression model for ZONE1: 0.181 0.183
train-RMSE/test-RMSE linear regression model for ZONE2: 0.154 0.149
train-RMSE/test-RMSE linear regression model for ZONE3: 0.152 0.154
train-RMSE/test-RMSE linear regression model for Z

In [25]:
## display the collected scores per feature set
## NOTE(review): the saved output shows unrounded (dict, dict) tuples, which does not
## match what the loop cell produces now -- it looks stale; re-run to refresh
results

{'all': ({'ZONE1': 0.18067391137916264,
   'ZONE2': 0.15358796465696406,
   'ZONE3': 0.15240057484533304,
   'ZONE4': 0.17695706095941097,
   'ZONE5': 0.17953875715194142,
   'ZONE6': 0.1878124719179502,
   'ZONE7': 0.13811223313680113,
   'ZONE8': 0.16984257543452808,
   'ZONE9': 0.1647455968769258,
   'ZONE10': 0.2026926245990398,
   'TOTAL': 0.17159857205907939},
  {'ZONE1': 0.18271294860319284,
   'ZONE2': 0.149293234934103,
   'ZONE3': 0.15411879173698456,
   'ZONE4': 0.17875169191112553,
   'ZONE5': 0.1830966492806365,
   'ZONE6': 0.18476674334916202,
   'ZONE7': 0.13931214687432686,
   'ZONE8': 0.17039206782964286,
   'ZONE9': 0.16410315187098454,
   'ZONE10': 0.20507115488777033,
   'TOTAL': 0.17219134111457876}),
 'no_deg': ({'ZONE1': 0.18077282591748908,
   'ZONE2': 0.1536430082962325,
   'ZONE3': 0.15242158189124694,
   'ZONE4': 0.17698641006548807,
   'ZONE5': 0.1796783796466053,
   'ZONE6': 0.18782982140861162,
   'ZONE7': 0.13814059436991954,
   'ZONE8': 0.169864878402350

https://www.kaggle.com/ankitp013/interpreting-ml-models-eli5-lime-shap-yellowbrick

https://lightgbm.readthedocs.io/en/latest/

* linear regression, KNN, SVM, Boosting, RandomForest, Neural Networks