In [1]:
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
warnings.filterwarnings('ignore',category=DeprecationWarning)
import sys
sys.path.append('..')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
import time
from cuml.ensemble import RandomForestRegressor as cuRF
from cuml.tsa import ARIMA
import cudf
import cupy
import math
import xgboost as xgb
from tqdm import tqdm
from scripts.function_utils import normalize_training, total_averaged_metrics

#### Output file

In [2]:
sourceFile = open("../../gpu_logs/teste_with_pub_building.txt", 'w')

### Load the data

In [3]:
df = pd.read_csv("../../data/porto.csv", decimal=",", index_col=[0])
## Leaving the first house (public building out) due to its different profile
df['Energy'] = df['Energy'].astype(np.float32)
df = df.iloc[:, [0, 2, 1]].reset_index(drop=True)
number_of_houses = df.Location.nunique()
df['Time'] = pd.to_datetime(df['Time'])
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,Time,Location,Energy
0,2019-01-01 00:00:00,0,2.964000
1,2019-01-01 00:15:00,0,2.584000
2,2019-01-01 00:30:00,0,3.071000
3,2019-01-01 00:45:00,0,2.694000
4,2019-01-01 01:00:00,0,2.569000
...,...,...,...
1791931,2020-01-01 22:45:00,50,0.753222
1791932,2020-01-01 23:00:00,50,0.716855
1791933,2020-01-01 23:15:00,50,0.735802
1791934,2020-01-01 23:30:00,50,0.485237


In [4]:
#trunc = lambda x: math.trunc(10000 * x) / 10000
#df_trunc = pd.DataFrame(df['Energy']).applymap(trunc)
#df = pd.concat([df.iloc[:, 0:-1], df_trunc], axis=1)
#df

### Auxiliary functions

In [5]:
def plot_results(preds: np.array, actuals: np.array, title: str):
    
    plt.scatter(actuals, preds, c='b', label='predicted')
    plt.xlabel('actual')
    plt.ylabel('predicted')
    plt.title(title)
    plt.xlim(0, plt.xlim()[1])
    plt.ylim(0, plt.ylim()[1])
    _ = plt.plot([0, 100], [0, 100], '--r', label='y=x')
    plt.legend()
    plt.show()

def truncate_metric(metric):
    m = math.trunc(10000 * metric) / 10000
    return m 
    
def performance_metrics(preds: np.array, actuals: np.array, filename):

    # calculate performance metrics
    
    mse = truncate_metric(mean_squared_error(actuals, preds))
    wape = truncate_metric(np.sum(np.abs(preds - actuals)) / np.sum(np.abs(actuals))) * 100
    r2 = truncate_metric(r2_score(actuals, preds))
    
    # print performance metrics
    print('MSE: %.4f' % mse, file=filename)
    print('WAPE: %.2f' % wape, file=filename)
    print('R2: %.4f' % r2, file=filename)
    print('MSE: %.4f' % mse)
    print('WAPE: %.2f' % wape)
    print('R2: %.4f' % r2)
    return mse, wape, r2

#@jit(target_backend='cuda')
def build_model(estimator, X_train: np.array, y_train: np.array, X_test: np.array):
    
    model = estimator
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    
    return model, preds

def validate(estimator, X_train, y_train):
    scores = cross_validate(estimator, X_train, y_train, scoring=['r2', 'neg_mean_squared_error'])
    return scores
    

In [6]:
def past_timesteps(df, number_of_timesteps):
    df = df.sort_values(by=['Location', 'Time'])
    for i in tqdm(range(1, (number_of_timesteps + 1))):
        df.loc[df['Time'].shift(i) == df['Time'] - pd.Timedelta(i * 15, 'm'), f"lag_{i}"] = df['Energy'].shift(i)
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

In [23]:
def test_leave_house_out(df, estimator, locations, filename):
    test = df[df['Location'].isin(locations)]
    train = df[~df['Location'].isin(locations)]
    print("Train set: ", train.shape)
    print("Test set: ", test.shape)
    X_train = train.drop(['Time', 'Energy', 'Location'], axis=1)
    X_test = test.drop(['Time', 'Energy', 'Location'], axis=1)
    y_train = train['Energy']
    y_test = test['Energy']

    X_train_norm, scaler = normalize_training(X_train)
    X_test_norm = scaler.transform(X_test)
    model = estimator
    init = time.time()
    model.fit(X_train_norm, y_train)
    y_pred = model.predict(X_test_norm)
    end = time.time()
    print('Elapsed time: {:.4f} s'.format(end - init), file=filename)
    mse, wape, r2 = performance_metrics(y_pred, y_test.values.reshape(-1), filename)
    return mse, wape, r2, model

## Save Dataframe with past timesteps

In [8]:
number_of_timesteps = 672

In [17]:
df_new = past_timesteps(df, number_of_timesteps)
df_new

  0%|          | 0/672 [00:00<?, ?it/s]

  df.loc[df['Time'].shift(i) == df['Time'] - pd.Timedelta(i * 15, 'm'), f"lag_{i}"] = df['Energy'].shift(i)
  df.loc[df['Time'].shift(i) == df['Time'] - pd.Timedelta(i * 15, 'm'), f"lag_{i}"] = df['Energy'].shift(i)
  df.loc[df['Time'].shift(i) == df['Time'] - pd.Timedelta(i * 15, 'm'), f"lag_{i}"] = df['Energy'].shift(i)
  df.loc[df['Time'].shift(i) == df['Time'] - pd.Timedelta(i * 15, 'm'), f"lag_{i}"] = df['Energy'].shift(i)
  df.loc[df['Time'].shift(i) == df['Time'] - pd.Timedelta(i * 15, 'm'), f"lag_{i}"] = df['Energy'].shift(i)
  df.loc[df['Time'].shift(i) == df['Time'] - pd.Timedelta(i * 15, 'm'), f"lag_{i}"] = df['Energy'].shift(i)
  df.loc[df['Time'].shift(i) == df['Time'] - pd.Timedelta(i * 15, 'm'), f"lag_{i}"] = df['Energy'].shift(i)
  df.loc[df['Time'].shift(i) == df['Time'] - pd.Timedelta(i * 15, 'm'), f"lag_{i}"] = df['Energy'].shift(i)
  df.loc[df['Time'].shift(i) == df['Time'] - pd.Timedelta(i * 15, 'm'), f"lag_{i}"] = df['Energy'].shift(i)
  df.loc[df['Time'].shift(i)

Unnamed: 0,Time,Location,Energy,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,...,lag_663,lag_664,lag_665,lag_666,lag_667,lag_668,lag_669,lag_670,lag_671,lag_672
0,2019-01-08 00:00:00,0,4.940000,3.952000,3.803000,4.095000,3.926000,4.917000,4.990000,4.524000,...,3.223000,2.994000,2.922000,2.558000,3.174000,2.569000,2.694000,3.071000,2.584000,2.964000
1,2019-01-08 00:15:00,0,4.149000,4.940000,3.952000,3.803000,4.095000,3.926000,4.917000,4.990000,...,2.553000,3.223000,2.994000,2.922000,2.558000,3.174000,2.569000,2.694000,3.071000,2.584000
2,2019-01-08 00:30:00,0,4.401000,4.149000,4.940000,3.952000,3.803000,4.095000,3.926000,4.917000,...,3.047000,2.553000,3.223000,2.994000,2.922000,2.558000,3.174000,2.569000,2.694000,3.071000
3,2019-01-08 00:45:00,0,4.431000,4.401000,4.149000,4.940000,3.952000,3.803000,4.095000,3.926000,...,3.310000,3.047000,2.553000,3.223000,2.994000,2.922000,2.558000,3.174000,2.569000,2.694000
4,2019-01-08 01:00:00,0,3.988000,4.431000,4.401000,4.149000,4.940000,3.952000,3.803000,4.095000,...,2.514000,3.310000,3.047000,2.553000,3.223000,2.994000,2.922000,2.558000,3.174000,2.569000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1757659,2020-01-01 22:45:00,50,0.753222,1.099100,0.453906,0.481770,0.483173,0.467528,1.001887,0.552398,...,0.676236,1.194125,2.010091,1.710115,1.515525,2.055415,2.535657,1.772033,1.482914,1.723943
1757660,2020-01-01 23:00:00,50,0.716855,0.753222,1.099100,0.453906,0.481770,0.483173,0.467528,1.001887,...,0.642552,0.676236,1.194125,2.010091,1.710115,1.515525,2.055415,2.535657,1.772033,1.482914
1757661,2020-01-01 23:15:00,50,0.735802,0.716855,0.753222,1.099100,0.453906,0.481770,0.483173,0.467528,...,0.697247,0.642552,0.676236,1.194125,2.010091,1.710115,1.515525,2.055415,2.535657,1.772033
1757662,2020-01-01 23:30:00,50,0.485237,0.735802,0.716855,0.753222,1.099100,0.453906,0.481770,0.483173,...,0.780631,0.697247,0.642552,0.676236,1.194125,2.010091,1.710115,1.515525,2.055415,2.535657


In [18]:
df_new['DayOfWeek'] = df_new['Time'].dt.dayofweek
df_new['Weekend'] = df_new['Time'].dt.dayofweek.isin([5, 6]).astype(int)
df_new['Hour'] = df_new['Time'].dt.hour
df_new

Unnamed: 0,Time,Location,Energy,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,...,lag_666,lag_667,lag_668,lag_669,lag_670,lag_671,lag_672,DayOfWeek,Weekend,Hour
0,2019-01-08 00:00:00,0,4.940000,3.952000,3.803000,4.095000,3.926000,4.917000,4.990000,4.524000,...,2.558000,3.174000,2.569000,2.694000,3.071000,2.584000,2.964000,1,0,0
1,2019-01-08 00:15:00,0,4.149000,4.940000,3.952000,3.803000,4.095000,3.926000,4.917000,4.990000,...,2.922000,2.558000,3.174000,2.569000,2.694000,3.071000,2.584000,1,0,0
2,2019-01-08 00:30:00,0,4.401000,4.149000,4.940000,3.952000,3.803000,4.095000,3.926000,4.917000,...,2.994000,2.922000,2.558000,3.174000,2.569000,2.694000,3.071000,1,0,0
3,2019-01-08 00:45:00,0,4.431000,4.401000,4.149000,4.940000,3.952000,3.803000,4.095000,3.926000,...,3.223000,2.994000,2.922000,2.558000,3.174000,2.569000,2.694000,1,0,0
4,2019-01-08 01:00:00,0,3.988000,4.431000,4.401000,4.149000,4.940000,3.952000,3.803000,4.095000,...,2.553000,3.223000,2.994000,2.922000,2.558000,3.174000,2.569000,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1757659,2020-01-01 22:45:00,50,0.753222,1.099100,0.453906,0.481770,0.483173,0.467528,1.001887,0.552398,...,1.710115,1.515525,2.055415,2.535657,1.772033,1.482914,1.723943,2,0,22
1757660,2020-01-01 23:00:00,50,0.716855,0.753222,1.099100,0.453906,0.481770,0.483173,0.467528,1.001887,...,2.010091,1.710115,1.515525,2.055415,2.535657,1.772033,1.482914,2,0,23
1757661,2020-01-01 23:15:00,50,0.735802,0.716855,0.753222,1.099100,0.453906,0.481770,0.483173,0.467528,...,1.194125,2.010091,1.710115,1.515525,2.055415,2.535657,1.772033,2,0,23
1757662,2020-01-01 23:30:00,50,0.485237,0.735802,0.716855,0.753222,1.099100,0.453906,0.481770,0.483173,...,0.676236,1.194125,2.010091,1.710115,1.515525,2.055415,2.535657,2,0,23


In [20]:
df_selected = df_new[["Time","Location","lag_1","lag_2","lag_3","lag_4","lag_96","lag_192","lag_288","lag_384","lag_480","lag_576","lag_672","DayOfWeek","Hour","Energy"]]
df_selected

Unnamed: 0,Time,Location,lag_1,lag_2,lag_3,lag_4,lag_96,lag_192,lag_288,lag_384,lag_480,lag_576,lag_672,DayOfWeek,Hour,Energy
0,2019-01-08 00:00:00,0,3.952000,3.803000,4.095000,3.926000,2.615000,3.321000,3.066000,2.351000,2.851000,3.054000,2.964000,1,0,4.940000
1,2019-01-08 00:15:00,0,4.940000,3.952000,3.803000,4.095000,2.634000,2.873000,3.152000,2.843000,3.102000,2.677000,2.584000,1,0,4.149000
2,2019-01-08 00:30:00,0,4.149000,4.940000,3.952000,3.803000,3.052000,3.407000,2.633000,2.949000,2.546000,3.522000,3.071000,1,0,4.401000
3,2019-01-08 00:45:00,0,4.401000,4.149000,4.940000,3.952000,2.893000,2.673000,2.590000,2.605000,3.016000,2.729000,2.694000,1,0,4.431000
4,2019-01-08 01:00:00,0,4.431000,4.401000,4.149000,4.940000,2.349000,2.390000,2.330000,3.023000,3.031000,2.368000,2.569000,1,1,3.988000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1757659,2020-01-01 22:45:00,50,1.099100,0.453906,0.481770,0.483173,0.641933,0.778195,0.753222,0.641933,0.778195,0.753222,1.723943,2,22,0.753222
1757660,2020-01-01 23:00:00,50,0.753222,1.099100,0.453906,0.481770,0.752850,0.931837,0.716855,0.752850,0.931837,0.716855,1.482914,2,23,0.716855
1757661,2020-01-01 23:15:00,50,0.716855,0.753222,1.099100,0.453906,0.935428,0.937079,0.735802,0.935428,0.937079,0.735802,1.772033,2,23,0.735802
1757662,2020-01-01 23:30:00,50,0.735802,0.716855,0.753222,1.099100,0.715781,0.803087,0.485237,0.715781,0.803087,0.485237,2.535657,2,23,0.485237


In [21]:
df_selected.to_csv("../../data/porto_full_selected.csv", index=False)

## Predict  5 folds with 10 random houses for test

In [9]:
df_test = pd.read_csv("../../data/porto_full_selected.csv", nrows=100)

float_cols = [c for c in df_test if df_test[c].dtype == "float64"]
float32_cols = {c: np.float32 for c in float_cols}

df = pd.read_csv("../../data/porto_full_selected.csv", engine='c', dtype=float32_cols)
df

Unnamed: 0,Time,Location,lag_1,lag_2,lag_3,lag_4,lag_96,lag_192,lag_288,lag_384,lag_480,lag_576,lag_672,DayOfWeek,Hour,Energy
0,2019-01-08 00:00:00,0,3.952000,3.803000,4.095000,3.926000,2.615000,3.321000,3.066000,2.351000,2.851000,3.054000,2.964000,1,0,4.940000
1,2019-01-08 00:15:00,0,4.940000,3.952000,3.803000,4.095000,2.634000,2.873000,3.152000,2.843000,3.102000,2.677000,2.584000,1,0,4.149000
2,2019-01-08 00:30:00,0,4.149000,4.940000,3.952000,3.803000,3.052000,3.407000,2.633000,2.949000,2.546000,3.522000,3.071000,1,0,4.401000
3,2019-01-08 00:45:00,0,4.401000,4.149000,4.940000,3.952000,2.893000,2.673000,2.590000,2.605000,3.016000,2.729000,2.694000,1,0,4.431000
4,2019-01-08 01:00:00,0,4.431000,4.401000,4.149000,4.940000,2.349000,2.390000,2.330000,3.023000,3.031000,2.368000,2.569000,1,1,3.988000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1757659,2020-01-01 22:45:00,50,1.099100,0.453906,0.481770,0.483173,0.641933,0.778195,0.753222,0.641933,0.778195,0.753222,1.723943,2,22,0.753222
1757660,2020-01-01 23:00:00,50,0.753222,1.099100,0.453906,0.481770,0.752850,0.931837,0.716855,0.752850,0.931837,0.716855,1.482914,2,23,0.716855
1757661,2020-01-01 23:15:00,50,0.716855,0.753222,1.099100,0.453906,0.935428,0.937079,0.735802,0.935428,0.937079,0.735802,1.772033,2,23,0.735802
1757662,2020-01-01 23:30:00,50,0.735802,0.716855,0.753222,1.099100,0.715781,0.803087,0.485237,0.715781,0.803087,0.485237,2.535657,2,23,0.485237


In [10]:
cu_df = cudf.read_csv("../../data/porto_full_selected.csv")
cu_df

Unnamed: 0,Time,Location,lag_1,lag_2,lag_3,lag_4,lag_96,lag_192,lag_288,lag_384,lag_480,lag_576,lag_672,DayOfWeek,Hour,Energy
0,2019-01-08 00:00:00,0,3.952000,3.803000,4.095000,3.926000,2.615000,3.321000,3.066000,2.351000,2.851000,3.054000,2.964000,1,0,4.940000
1,2019-01-08 00:15:00,0,4.940000,3.952000,3.803000,4.095000,2.634000,2.873000,3.152000,2.843000,3.102000,2.677000,2.584000,1,0,4.149000
2,2019-01-08 00:30:00,0,4.149000,4.940000,3.952000,3.803000,3.052000,3.407000,2.633000,2.949000,2.546000,3.522000,3.071000,1,0,4.401000
3,2019-01-08 00:45:00,0,4.401000,4.149000,4.940000,3.952000,2.893000,2.673000,2.590000,2.605000,3.016000,2.729000,2.694000,1,0,4.431000
4,2019-01-08 01:00:00,0,4.431000,4.401000,4.149000,4.940000,2.349000,2.390000,2.330000,3.023000,3.031000,2.368000,2.569000,1,1,3.988000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1757659,2020-01-01 22:45:00,50,1.099100,0.453906,0.481770,0.483173,0.641933,0.778195,0.753222,0.641933,0.778195,0.753222,1.723943,2,22,0.753222
1757660,2020-01-01 23:00:00,50,0.753222,1.099100,0.453906,0.481770,0.752850,0.931837,0.716855,0.752850,0.931837,0.716855,1.482914,2,23,0.716855
1757661,2020-01-01 23:15:00,50,0.716855,0.753222,1.099100,0.453906,0.935428,0.937079,0.735802,0.935428,0.937079,0.735802,1.772034,2,23,0.735802
1757662,2020-01-01 23:30:00,50,0.735802,0.716855,0.753222,1.099100,0.715781,0.803087,0.485237,0.715781,0.803087,0.485237,2.535657,2,23,0.485237


In [11]:
#df = df[["Time", "Location", "energy_lag_1", "energy_lag_3", "energy_lag_95", "energy_lag_671", "Energy"]]
#df

In [12]:
num_houses_test = 10
locations = []
for i in range(10):
    np.random.seed(i*5)
    l = np.random.choice(df['Location'].unique(), size=num_houses_test, replace=False)
    locations.append(l)
locations

[array([29, 11, 10, 22,  2, 28, 45, 32, 26,  4]),
 array([33, 29, 44, 19, 40, 21,  6, 32,  2,  3]),
 array([27, 38, 45, 23, 32, 20,  3, 31, 21, 48]),
 array([36, 26, 22, 45, 49, 44,  3, 34, 37, 33]),
 array([47,  4, 38, 18, 46, 43, 29, 12, 17,  8]),
 array([37, 47,  7, 42, 17, 14, 35, 34, 45, 40]),
 array([42, 20, 32, 44, 10, 29, 33, 25, 34, 19]),
 array([49, 41, 20, 28, 32, 46, 23, 42, 18, 14]),
 array([29, 34, 50, 39, 46,  0, 21,  4, 11,  2]),
 array([48,  7,  2,  9,  5, 28, 27, 38,  0, 20])]

In [13]:
## Number of NaNs
df.isna().sum().sum()

0

In [14]:
#kf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42) 
#locations = []
#for _, test_index in kf.split(df['Location'].unique()):
#    locations.append(test_index)
#locations

In [15]:
df.Location.value_counts()

0     34464
38    34464
28    34464
29    34464
30    34464
31    34464
32    34464
33    34464
34    34464
35    34464
36    34464
37    34464
39    34464
26    34464
40    34464
41    34464
42    34464
43    34464
44    34464
45    34464
46    34464
47    34464
48    34464
49    34464
27    34464
25    34464
1     34464
12    34464
2     34464
3     34464
4     34464
5     34464
6     34464
7     34464
8     34464
9     34464
10    34464
11    34464
13    34464
24    34464
14    34464
15    34464
16    34464
17    34464
18    34464
19    34464
20    34464
21    34464
22    34464
23    34464
50    34464
Name: Location, dtype: int64

### Linear Regression

In [22]:
metrics_list_lr = []
print("\n----------------------------", file=sourceFile)
print("\nLinear Regression\n", file=sourceFile)
print("----------------------------\n", file=sourceFile)
for i in range(10):
    print("\nIteration", i, file=sourceFile)
    print("\nIteration", i)
    mse, wape, r2, model_lr = test_leave_house_out(df, LinearRegression(), locations[i], sourceFile)
    metrics_list_lr.append((mse, wape, r2))


Iteration 0
Train set:  (1413024, 16)
Test set:  (344640, 16)


MSE: 0.0001
WAPE: 32.57
R2: 0.7334

Iteration 1
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0000
WAPE: 31.77
R2: 0.7331

Iteration 2
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0000
WAPE: 31.82
R2: 0.7247

Iteration 3
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0000
WAPE: 29.89
R2: 0.7718

Iteration 4
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0000
WAPE: 29.67
R2: 0.8017

Iteration 5
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0000
WAPE: 31.67
R2: 0.7451

Iteration 6
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0000
WAPE: 32.96
R2: 0.7242

Iteration 7
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0000
WAPE: 31.77
R2: 0.7506

Iteration 8
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0026
WAPE: 24.70
R2: 0.8930

Iteration 9
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0029
WAPE: 26.70
R2: 0.8788


### XGBoost

In [30]:
metrics_list_xgb = []
print("\n----------------------------", file=sourceFile)
print("\nXGBoost\n", file=sourceFile)
print("----------------------------\n", file=sourceFile)
for i in range(10):
    print("\nIteration", i, file=sourceFile)
    print("\nIteration", i)
    mse, wape, r2, model_xgb = test_leave_house_out(df, xgb.XGBRegressor(tree_method='gpu_hist', seed=0, colsample_bytree=0.7, learning_rate=0.1, max_depth=12, n_estimators=1000), locations[i], sourceFile)
    metrics_list_xgb.append((mse, wape, r2))


Iteration 0
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.1149
WAPE: 21.15
R2: 0.8820

Iteration 1
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0825
WAPE: 21.91
R2: 0.8783

Iteration 2
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0645
WAPE: 21.10
R2: 0.8862

Iteration 3
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0379
WAPE: 16.23
R2: 0.9390

Iteration 4
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.1400
WAPE: 22.22
R2: 0.8295

Iteration 5
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0445
WAPE: 17.07
R2: 0.9272

Iteration 6
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0670
WAPE: 19.81
R2: 0.8931

Iteration 7
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 0.0708
WAPE: 21.40
R2: 0.8927

Iteration 8
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 1.1390
WAPE: 19.48
R2: 0.6027

Iteration 9
Train set:  (1413024, 16)
Test set:  (344640, 16)
MSE: 1.1301
WAPE: 19.01
R2: 0.6073


### Random Forest

In [31]:
class cuMinMaxScaler():
    def __init__(self):
        self.feature_range = (0,1)

    def _reset(self):

        if hasattr(self, 'scale_'):
            del self.scale_
            del self.min_

    def fit(self, X): #X is assumed to be a cuDF dataframe, no type checking

        self._reset()        

        X = X.dropna()

        data_min = X.min(axis = 0) #cuDF series
        data_max = X.max(axis = 0) #cuDF series

        data_range = data_max - data_min #cuDF series

        data_range[data_range==0] = 1 #replaced with 1 is range is 0

        feature_range = self.feature_range

        self.scale_ = (feature_range[1] - feature_range[0]) / data_range # element-wise divison, produces #cuDF series
        self.min_ = feature_range[0] - data_min * self.scale_ # element-wise multiplication, produces #cuDF series

        return self

    def transform(self, X):

        X *= self.scale_ # element-wise divison, match dataframe column to series index
        X += self.min_ # element-wise addition, match dataframe column to series index

        return X

In [32]:
def normalize_training_gpu(X_train):
    scaler = cuMinMaxScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    return X_train, scaler

In [33]:
from cuml.metrics import mean_squared_error as mse_gpu
from cuml.metrics import r2_score as r2_gpu
def performance_metrics_gpu(preds: cupy.array, actuals: cupy.array, filename):

    # calculate performance metrics
    
    mse = mse_gpu(actuals, preds)
    wape = cupy.sum(cupy.abs(preds - actuals)) / cupy.sum(cupy.abs(actuals)) * 100
    r2 = r2_gpu(actuals, preds)
    
    # print performance metrics
    print('MSE: %.4f' % mse, file=filename)
    print('WAPE: %.2f' % wape, file=filename)
    print('R2: %.4f' % r2, file=filename)
    print('MSE: %.4f' % mse)
    print('WAPE: %.2f' % wape)
    print('R2: %.4f' % r2)
    return mse, wape, r2

In [34]:
def test_leave_house_out_gpu(df, estimator, locations, filename):
    test = df[df['Location'].isin(locations)]
    train = df[~df['Location'].isin(locations)]
    print("Train set: ", train.shape)
    print("Test set: ", test.shape)
    X_train = train.drop(['Time', 'Energy', 'Location'], axis=1)
    X_test = test.drop(['Time', 'Energy', 'Location'], axis=1)
    y_train = train['Energy']
    y_test = test['Energy']
    #X_train_norm, scaler = normalize_training_gpu(X_train)
    #X_test_norm = scaler.transform(X_test)
    model = estimator
    init = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    end = time.time()
    print('Elapsed time: {:.4f} s'.format(end - init), file=filename)
    mse, wape, r2 = performance_metrics_gpu(y_pred, y_test.values.reshape(-1), filename)
    return mse, wape, r2, model

In [35]:
metrics_list_rf = []
print("\n----------------------------", file=sourceFile)
print("\nRandom Forest\n", file=sourceFile)
print("----------------------------\n", file=sourceFile)
for i in range(10):
    print("\nIteration", i, file=sourceFile)
    print("\nIteration", i)
    mse, wape, r2, model_rf = test_leave_house_out_gpu(cu_df, cuRF(max_depth=16, n_estimators=500, max_batch_size=60), locations[i], sourceFile)
    metrics_list_rf.append((mse, wape, r2))


Iteration 0
Train set:  (1413024, 16)
Test set:  (344640, 16)


  ret = func(*args, **kwargs)


MSE: 0.1914
WAPE: 28.93
R2: 0.8035

Iteration 1
Train set:  (1413024, 16)
Test set:  (344640, 16)


  ret = func(*args, **kwargs)


MSE: 0.1336
WAPE: 28.12
R2: 0.8029

Iteration 2
Train set:  (1413024, 16)
Test set:  (344640, 16)


  ret = func(*args, **kwargs)


MSE: 0.1123
WAPE: 27.78
R2: 0.8019

Iteration 3
Train set:  (1413024, 16)
Test set:  (344640, 16)


  ret = func(*args, **kwargs)


MSE: 0.0990
WAPE: 25.83
R2: 0.8407

Iteration 4
Train set:  (1413024, 16)
Test set:  (344640, 16)


  ret = func(*args, **kwargs)


MSE: 0.1832
WAPE: 28.07
R2: 0.7770

Iteration 5
Train set:  (1413024, 16)
Test set:  (344640, 16)


  ret = func(*args, **kwargs)


MSE: 0.1027
WAPE: 26.77
R2: 0.8322

Iteration 6
Train set:  (1413024, 16)
Test set:  (344640, 16)


  ret = func(*args, **kwargs)


## Averaged Metrics

In [None]:
print("\n\n\n", file=sourceFile)

In [None]:
print("\nLinear Regression", file=sourceFile)
total_averaged_metrics(metrics_list_lr, sourceFile)

In [None]:
print("\nXGBoost", file=sourceFile)
total_averaged_metrics(metrics_list_xgb, sourceFile)

In [None]:
print("\nRandom Forest", file=sourceFile)
total_averaged_metrics(metrics_list_rf, sourceFile)

In [None]:
sourceFile.close()

### Feature Importance

In [None]:
print('XGBoost\n')
for i,v in enumerate(model_xgb.feature_importances_):
    print('Feature: %0d, Score: %.5f' % (i,v))

In [None]:
model_xgb.feature_importances_

In [None]:
plt.figure(figsize=(22,6))
plt.bar(df.columns[2:-1],model_xgb.feature_importances_)
plt.title('Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.savefig("images/porto_xgb_final7days_feature_importance.png")
plt.show()

In [None]:
plt.figure(figsize=(12,6))
plt.bar(df.columns[2:-1],model_lr.coef_)
plt.title('Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.show()

In [None]:
columns = df.drop(["Energy", "Location", "Time"], axis=1).columns
feature_importance = pd.Series(model_xgb.feature_importances_, index=columns)
feature_importance

In [None]:
features = feature_importance.nlargest(40).index
features

In [None]:
df_40 = df[['Time', 'Location'] + [*features] + ['Energy']]
df_40

In [None]:
df_40.to_csv("data/porto_best40features.csv", index=False)

## Hyperparameter Tuning

In [None]:
df_test = pd.read_csv("data/porto_final_7days.csv", nrows=100)

float_cols = [c for c in df_test if df_test[c].dtype == "float64"]
float32_cols = {c: np.float32 for c in float_cols}

df_tuning = pd.read_csv("data/porto_final_7days.csv", engine='c', dtype=float32_cols)
df_tuning.drop(['Time', 'Location'],axis=1, inplace=True)
df_tuning

In [None]:
params_rf = {
    #'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    #'max_features': [2, 3],
    #'min_samples_leaf': [3, 4, 5],
    #'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300],
    'max_batch_size': [1200]
}

In [None]:
params_xgb = { 
    'max_depth': [3,6,10],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 500, 1000],
    'colsample_bytree': [0.3, 0.7]
}

In [None]:
train_features = df_tuning.drop('Energy', axis=1)
train_labels = df_tuning['Energy']
test_features = 0
test_labels = 0

### XGBoost Best Parameters

In [None]:
xgb_model = xgb.XGBRegressor(tree_method="gpu_hist", seed=42)
grid_search = GridSearchCV(estimator = xgb_model, param_grid = params_xgb, cv = 5, n_jobs = 4, verbose = 2, scoring="r2")

In [None]:
grid_search.fit(train_features, train_labels)

In [None]:
grid_search.best_params_
#{'colsample_bytree': 0.7,
#'learning_rate': 0.1,
#'max_depth': 10,
#'n_estimators': 1000}

In [None]:
grid_search.best_score_

### Random Forest Best Parameters

In [None]:
rf = cuRF()
grid_search_rf = GridSearchCV(estimator = rf, param_grid = params_rf, cv = 5, n_jobs = 1, verbose = 2, scoring="r2")

In [None]:
grid_search_rf.fit(train_features, train_labels)

In [None]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [None]:
print(grid_search.best_params_)
best_grid = grid_search.best_estimator_
#grid_accuracy = evaluate(best_grid, test_features, test_labels)

In [None]:
base_model = xgb.XGBRegressor(n_estimators = 10, random_state = 42, tree_method="gpu_hist")
base_model.fit(train_features, train_labels)
base_accuracy = evaluate(base_model, test_features, test_labels)

best_random = grid_search.best_estimator_
random_accuracy = evaluate(best_random, test_features, test_labels)

print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

In [None]:
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))