In [22]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor

In [23]:
data_all = pd.read_csv('../data/GEFCom2014Data/Wind/raw_data_incl_features.csv', date_parser='TIMESTAMP')

In [24]:
#data_all.dropna(inplace=True, axis=0)
data_all.fillna(0, inplace=True)
#data_all.info()
data_all_dummies = pd.get_dummies(data_all,columns=['WD100CARD', 'WD10CARD'], drop_first=True)
zone1 = data_all_dummies[data_all_dummies['ZONEID']==1]

In [25]:
features = data_all_dummies.columns
features
features_remove = [ 'ZONEID', 'TIMESTAMP', 'TARGETVAR', 'U10', 'V10', 'U100', 'V100', 
                    'WD10', 'WD100', 'U100NORM', 'V100NORM']
features = [ff for ff in features if ff not in features_remove]

In [26]:
y = zone1['TARGETVAR']
X = zone1[features]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

In [27]:
def fit_predict(reg, X_train, X_test, y_train, y_test):
    reg.fit(X_train, y_train)
    y_pred_train = reg.predict(X_train)
    y_pred_train = [0 if ff<0 else 1 if ff>1 else ff for ff in y_pred_train]
    y_pred_test = reg.predict(X_test)
    y_pred_test = [0 if ff<0 else 1 if ff>1 else ff for ff in y_pred_test]
    rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
    print(f'RMSE train: {rmse_train}')
    rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
    print(f'RMSE test: {rmse_test}')
    return rmse_train, rmse_test

def remove_windy_zeros(X_train, y_train):
    z = pd.concat([X_train, y_train], axis=1)
    z = z[~((z['TARGETVAR']==0) & (z['WS100']>=4))]
    X_train = z[features]
    y_train = z['TARGETVAR']
    return X_train, y_train

In [28]:
def get_train_test_split(data, features, random_state=42, test_size=0.25):
    y = data['TARGETVAR']
    X = data[features]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)
    return X_train, X_test, y_train, y_test

In [29]:
reg_lr = LinearRegression()
fit_predict(reg_lr, X_train, X_test, y_train, y_test)

RMSE train: 0.18470434495625396
RMSE test: 0.18083657845849982


(0.18470434495625396, 0.18083657845849982)

## Day / Night

In [30]:
data_all_dummies['DAYNIGHT'] = data_all_dummies['HOUR'].apply(lambda x : 0 if x<8 else 0 if x>19 else 1)

In [31]:
features = data_all_dummies.columns
features
features_remove = [ 'ZONEID', 'TIMESTAMP', 'TARGETVAR', 'U10', 'V10', 'U100', 'V100', 
                    'WD10', 'WD100', 'U100NORM', 'V100NORM', 'HOUR']
features = [ff for ff in features if ff not in features_remove]

In [32]:
#X_train, y_train = remove_windy_zeros(X_train, y_train)
reg = LinearRegression()
fit_predict(reg, X_train, X_test, y_train, y_test)

RMSE train: 0.18470434495625396
RMSE test: 0.18083657845849982


(0.18470434495625396, 0.18083657845849982)

In [33]:
# linear model, without hours, but with day/night
rmse_all_train, rmse_all_test = 0,0
count = 0
test_size = 0.25
for idx in range(1,11):
    zone = data_all_dummies[data_all_dummies['ZONEID']==idx]
    X_train, X_test, y_train, y_test = get_train_test_split(zone, features, test_size)
    #X_train, y_train = remove_windy_zeros(X_train, y_train)
    reg = LinearRegression()
    print('Zone ',idx)
    rmse_train, rmse_test = fit_predict(reg, X_train, X_test, y_train, y_test)
    rmse_all_train += rmse_train*len(y_train)
    rmse_all_test += rmse_test*len(y_test)

rmse_all_train /= data_all_dummies.shape[0]*(1-test_size)
rmse_all_test /= data_all_dummies.shape[0]*test_size
print('rmse_all_train',rmse_all_train)
print('rmse_all_test', rmse_all_test)

Zone  1
RMSE train: 0.18514396788274817
RMSE test: 0.1812698344905208
Zone  2
RMSE train: 0.15647048081492076
RMSE test: 0.15309229455174744
Zone  3
RMSE train: 0.15454437194297424
RMSE test: 0.1558680198521382
Zone  4
RMSE train: 0.17707019364885718
RMSE test: 0.18552130087587593
Zone  5
RMSE train: 0.1838313374859388
RMSE test: 0.186358656272047
Zone  6
RMSE train: 0.19098991292105771
RMSE test: 0.192383322262734
Zone  7
RMSE train: 0.14178523086357306
RMSE test: 0.14017645966860248
Zone  8
RMSE train: 0.17370232284095863
RMSE test: 0.16696160981799527
Zone  9
RMSE train: 0.16703701275060442
RMSE test: 0.16300918328112998
Zone  10
RMSE train: 0.20629486024239366
RMSE test: 0.2016621883662275
rmse_all_train 0.17368696913940268
rmse_all_test 0.17263028694390187


In [34]:
## random forest
rmse_all_train, rmse_all_test = 0,0
count = 0
test_size = 0.25
for idx in range(1,11):
    zone = data_all_dummies[data_all_dummies['ZONEID']==idx]
    X_train, X_test, y_train, y_test = get_train_test_split(zone, features, test_size)
    X_train, y_train = remove_windy_zeros(X_train, y_train)
    reg = ExtraTreesRegressor( 
        n_estimators=20,
        n_jobs=-1, 
        max_depth=15
    )
    print('Zone ',idx)
    rmse_train, rmse_test = fit_predict(reg, X_train, X_test, y_train, y_test)
    rmse_all_train += rmse_train*len(y_train)
    rmse_all_test += rmse_test*len(y_test)

rmse_all_train /= data_all_dummies.shape[0]*(1-test_size)
rmse_all_test /= data_all_dummies.shape[0]*test_size
print('rmse_all_train',rmse_all_train)
print('rmse_all_test', rmse_all_test)

Zone  1
RMSE train: 0.10689273154652121
RMSE test: 0.15836206278332407
Zone  2
RMSE train: 0.08216707458980613
RMSE test: 0.13370049013245477
Zone  3
RMSE train: 0.09795436214610449
RMSE test: 0.14305428910200627
Zone  4
RMSE train: 0.11356476438600815
RMSE test: 0.1711264948307706
Zone  5
RMSE train: 0.11032909722357144
RMSE test: 0.16758118989106363
Zone  6
RMSE train: 0.11703740615774477
RMSE test: 0.17339798329158504
Zone  7
RMSE train: 0.07856419186792533
RMSE test: 0.1228511223235335
Zone  8
RMSE train: 0.08623720751072099
RMSE test: 0.15025479494794527
Zone  9
RMSE train: 0.08874049524254987
RMSE test: 0.14946469047490576
Zone  10
RMSE train: 0.12844269403092423
RMSE test: 0.18504286861081715
rmse_all_train 0.09869536267578019
rmse_all_test 0.15548359863884062


In [49]:
# neural networks
from sklearn.neural_network import MLPRegressor
regr = MLPRegressor(random_state=1, max_iter=5000, learning_rate_init=0.001, 
learning_rate='adaptive', tol=0.0001)

rmse_all_train, rmse_all_test = 0,0
count = 0
test_size = 0.25
for idx in range(1,2):
    zone = data_all_dummies[data_all_dummies['ZONEID']==idx]
    X_train, X_test, y_train, y_test = get_train_test_split(zone, features, test_size)
    X_train, y_train = remove_windy_zeros(X_train, y_train)
    print('Zone ',idx)
    rmse_train, rmse_test = fit_predict(regr, X_train, X_test, y_train, y_test)
    rmse_all_train += rmse_train*len(y_train)
    rmse_all_test += rmse_test*len(y_test)

rmse_all_train /= data_all_dummies.shape[0]*(1-test_size)
rmse_all_test /= data_all_dummies.shape[0]*test_size
print('rmse_all_train',rmse_all_train)
print('rmse_all_test', rmse_all_test)

Zone  1
RMSE train: 0.15539718231905505
RMSE test: 0.1641129237186552
rmse_all_train 0.0150555044855093
rmse_all_test 0.016411292371865522
