In [1]:
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
warnings.filterwarnings('ignore',category=DeprecationWarning)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
import time
from cuml.ensemble import RandomForestRegressor as cuRF
from cuml.tsa import ARIMA
import cudf
import cupy
import math
import xgboost as xgb
from tqdm import tqdm

#### Output file

In [2]:
# Log file that receives the per-split metrics printed by the cells below; it stays
# open until the explicit sourceFile.close() call near the end of the notebook.
# NOTE(review): mode 'w' truncates any previous log, and the "gpu_logs" directory
# is presumably expected to exist already -- confirm.
sourceFile = open("gpu_logs/teste_rf.txt", 'w')

### Load the data

In [3]:
# Read the raw consumption table; decimal="," parses comma decimal separators.
df = pd.read_csv("data/porto.csv", decimal=",", index_col=[0])
df['Energy'] = df['Energy'].astype(np.float32)
# Leave the first house (a public building) out due to its different profile.
# NOTE(review): 35136 presumably equals that house's row count (366 days x 96
# quarter-hour readings) -- confirm against the raw file.
# The column positions [0, 2, 1] reorder the frame so that 'Energy' comes last.
df = df.iloc[35136:, [0, 2, 1]].reset_index(drop=True)
number_of_houses = df.Location.nunique()
df['Time'] = pd.to_datetime(df['Time'])
# The index was already reset above, so this second reset changes nothing.
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,Time,Location,Energy
0,2019-01-01 00:00:00,1,2.322959
1,2019-01-01 00:15:00,1,2.371797
2,2019-01-01 00:30:00,1,2.415961
3,2019-01-01 00:45:00,1,2.302538
4,2019-01-01 01:00:00,1,2.363063
...,...,...,...
1756795,2020-01-01 22:45:00,50,0.753222
1756796,2020-01-01 23:00:00,50,0.716855
1756797,2020-01-01 23:15:00,50,0.735802
1756798,2020-01-01 23:30:00,50,0.485237


In [4]:
#trunc = lambda x: math.trunc(10000 * x) / 10000
#df_trunc = pd.DataFrame(df['Energy']).applymap(trunc)
#df = pd.concat([df.iloc[:, 0:-1], df_trunc], axis=1)
#df

### Auxiliary functions

In [5]:
def plot_results(preds: np.array, actuals: np.array, title: str):
    """Scatter predicted against actual values with a dashed ``y = x`` reference line.

    Args:
        preds: Predicted values (plotted on the y axis).
        actuals: Ground-truth values (plotted on the x axis).
        title: Title of the figure.

    Both axes are clamped to start at 0. The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    plt.scatter(actuals, preds, c='b', label='predicted')
    plt.xlabel('actual')
    plt.ylabel('predicted')
    plt.title(title)

    # Start both axes at zero while keeping the automatically chosen upper limits.
    x_upper = plt.xlim()[1]
    y_upper = plt.ylim()[1]
    plt.xlim(0, x_upper)
    plt.ylim(0, y_upper)

    # Draw the diagonal up to the larger axis limit rather than a fixed 100, so it
    # spans the visible area whatever the magnitude of the data is.
    diagonal_end = max(x_upper, y_upper)
    plt.plot([0, diagonal_end], [0, diagonal_end], '--r', label='y=x')
    plt.legend()
    plt.show()

def truncate_metric(metric):
    """Cut ``metric`` down to four decimal places, truncating toward zero (no rounding)."""
    scaled = math.trunc(10000 * metric)
    return scaled / 10000
    
def performance_metrics(preds: np.array, actuals: np.array, filename):
    """Compute MSE, WAPE (in percent) and R2, and report them twice.

    Args:
        preds: Predicted values.
        actuals: Ground-truth values.
        filename: Despite its name, a writable file object; the report is printed
            to it first and then echoed to standard output.

    Returns:
        ``(mse, wape, r2)``. Each value goes through ``truncate_metric``; the WAPE
        ratio is truncated before being scaled to a percentage.
    """
    mse = truncate_metric(mean_squared_error(actuals, preds))
    total_abs_error = np.sum(np.abs(preds - actuals))
    total_abs_actual = np.sum(np.abs(actuals))
    wape = truncate_metric(total_abs_error / total_abs_actual) * 100
    r2 = truncate_metric(r2_score(actuals, preds))

    report = ('MSE: %.4f' % mse, 'WAPE: %.2f' % wape, 'R2: %.4f' % r2)
    # print(file=None) falls back to sys.stdout, so this writes the log first
    # and the notebook output second.
    for destination in (filename, None):
        for line in report:
            print(line, file=destination)
    return mse, wape, r2

#@jit(target_backend='cuda')
def build_model(estimator, X_train: np.array, y_train: np.array, X_test: np.array):
    """Fit ``estimator`` on the training data and predict on ``X_test``.

    Returns:
        ``(model, preds)`` where ``model`` is the very object passed in (fitted in
        place) and ``preds`` is the output of its ``predict`` on ``X_test``.
    """
    estimator.fit(X_train, y_train)
    return estimator, estimator.predict(X_test)

def validate(estimator, X_train, y_train):
    """Cross-validate ``estimator`` on the training data, scoring R2 and negated MSE.

    Returns whatever ``cross_validate`` returns for those two scorers.
    """
    metric_names = ['r2', 'neg_mean_squared_error']
    return cross_validate(estimator, X_train, y_train, scoring=metric_names)
    

In [6]:
def total_averaged_metrics(metrics_list, filename):
    """Average the per-run ``(mse, wape, r2)`` triples and report the three means.

    Args:
        metrics_list: Sequence of ``(mse, wape, r2)`` tuples, one per run.
        filename: Despite its name, a writable file object; the summary is printed
            to it first and then echoed to standard output.

    Each mean is rounded to three decimals. Nothing is returned.
    """
    n_runs = len(metrics_list)
    avg_mse = np.round(sum(mse for mse, _, _ in metrics_list) / n_runs, 3)
    avg_wape = np.round(sum(wape for _, wape, _ in metrics_list) / n_runs, 3)
    avg_r2 = np.round(sum(r2 for _, _, r2 in metrics_list) / n_runs, 3)

    summary = (
        "Total Averaged MSE: {}".format(avg_mse),
        "Total Averaged WAPE: {}".format(avg_wape),
        "Total Averaged R2: {}".format(avg_r2),
    )
    for line in summary:
        print(line, file=filename)
    for line in summary:
        print(line)

def past_timesteps(df, number_of_timesteps, step_minutes=15):
    """Add ``lag_1`` ... ``lag_<number_of_timesteps>`` columns holding past 'Energy' values.

    Args:
        df: Frame with 'Location', 'Time' (datetime) and 'Energy' columns. It is not
            modified; a sorted copy is built and returned.
        number_of_timesteps: How many lag columns to create.
        step_minutes: Expected spacing between consecutive readings, in minutes.

    Returns:
        The frame sorted by ('Location', 'Time') with the lag columns added, rows
        containing any missing value dropped, and a fresh 0..n-1 index.

    A lag is only filled when the row ``i`` positions earlier belongs to the same
    location and is exactly ``i * step_minutes`` older. Gaps in the series, and the
    boundary between two houses, therefore yield NaN and the row is dropped.
    """
    df = df.sort_values(by=['Location', 'Time'])
    # These source columns are never modified in the loop, so look them up once.
    times = df['Time']
    houses = df['Location']
    energy = df['Energy']
    for i in tqdm(range(1, (number_of_timesteps + 1))):
        offset = pd.Timedelta(minutes=i * step_minutes)
        # Without the location check, a lag could be taken from the previous house
        # whenever its last timestamps happen to line up with this house's first ones.
        contiguous = (times.shift(i) == times - offset) & (houses.shift(i) == houses)
        df.loc[contiguous, f"lag_{i}"] = energy.shift(i)
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

In [7]:
def normalize_training(X_train):
    """Fit a [0, 1] min-max scaler on ``X_train``.

    Returns:
        ``(scaled_X_train, scaler)``; the fitted scaler is returned so the same
        transformation can be applied to other data.
    """
    fitted_scaler = MinMaxScaler(feature_range=(0,1)).fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler

In [8]:
def test_leave_house_out(df, estimator, locations, filename):
    """Train on every house except ``locations`` and evaluate on the held-out ones.

    Args:
        df: Frame with 'Time', 'Location', 'Energy' and the feature columns.
        estimator: Unfitted model exposing ``fit`` / ``predict``; it is fitted in place.
        locations: Location ids that form the test set.
        filename: Despite its name, a writable file object used as the log.

    Returns:
        ``(mse, wape, r2, model)`` where ``model`` is the fitted ``estimator``.

    Features are min-max scaled with statistics learned on the training split only.
    The logged elapsed time covers fitting plus prediction.
    """
    held_out = df['Location'].isin(locations)
    test = df[held_out]
    train = df[~held_out]
    print("Train set: ", train.shape)
    print("Test set: ", test.shape)

    non_feature_cols = ['Time', 'Energy', 'Location']
    X_train = train.drop(non_feature_cols, axis=1)
    X_test = test.drop(non_feature_cols, axis=1)
    y_train, y_test = train['Energy'], test['Energy']

    X_train_norm, scaler = normalize_training(X_train)
    X_test_norm = scaler.transform(X_test)

    started = time.time()
    estimator.fit(X_train_norm, y_train)
    y_pred = estimator.predict(X_test_norm)
    elapsed = time.time() - started
    print('Elapsed time: {:.4f} s'.format(elapsed), file=filename)

    mse, wape, r2 = performance_metrics(y_pred, y_test.values.reshape(-1), filename)
    return mse, wape, r2, estimator

## Save Dataframe with past timesteps

In [None]:
# Number of lagged readings to build per row; with the 15-minute step used by
# past_timesteps, 96 lags cover the previous 24 hours.
number_of_timesteps = 96

In [None]:
# Build the lag_1 ... lag_<number_of_timesteps> columns; rows without a complete
# history are dropped by past_timesteps.
df_new = past_timesteps(df, number_of_timesteps)
df_new

In [None]:
# Calendar features derived from the timestamp; the weekend flag marks
# dayofweek values 5 and 6.
df_new = df_new.assign(
    DayOfWeek=lambda frame: frame['Time'].dt.dayofweek,
    Weekend=lambda frame: frame['Time'].dt.dayofweek.isin([5, 6]).astype(int),
    Hour=lambda frame: frame['Time'].dt.hour,
)
df_new

In [None]:
# Write the engineered frame to disk without the index column.
df_new.to_csv("data/porto_cluster2_96.csv", index=False)

## Predict: 10 random train/test splits, each holding out 10 random houses for test

In [None]:
# NOTE(review): this loads a different file from the one written above
# ("data/porto_cluster2_96.csv"); how "porto_final_7days.csv" is produced is not
# shown in this notebook.
df = pd.read_csv("data/porto_final_7days.csv")
df

In [None]:
# Keep the identifiers, the four most recent lags, one lag per previous day
# (multiples of 96 readings, up to 672), the calendar features and the target.
df_selected = df[[
    "Time", "Location",
    "energy_lag_1", "energy_lag_2", "energy_lag_3", "energy_lag_4",
    "energy_lag_96", "energy_lag_192", "energy_lag_288", "energy_lag_384",
    "energy_lag_480", "energy_lag_576", "energy_lag_672",
    "DayOfWeek", "Hour",
    "Energy",
]]
df_selected

In [None]:
# Overwrites the file that was loaded above, keeping only the selected columns.
# `index` is a boolean flag; use False explicitly, as the other to_csv calls do.
df_selected.to_csv("data/porto_final_7days.csv", index=False)

In [9]:
# Peek at the first 100 rows to find the columns inferred as float64, then load the
# whole file with those columns stored as float32.
df_test = pd.read_csv("data/porto_final_7days.csv", nrows=100)

float_cols = df_test.select_dtypes(include="float64").columns.tolist()
float32_cols = dict.fromkeys(float_cols, np.float32)

df = pd.read_csv("data/porto_final_7days.csv", engine='c', dtype=float32_cols)
df

Unnamed: 0,Time,Location,energy_lag_1,energy_lag_2,energy_lag_3,energy_lag_4,energy_lag_96,energy_lag_192,energy_lag_288,energy_lag_384,energy_lag_480,energy_lag_576,energy_lag_672,DayOfWeek,Hour,Energy
0,2019-01-08 00:00:00,1,1.362799,1.311378,1.251099,1.477207,1.349390,1.556923,1.070138,0.982795,0.916242,0.345928,2.322959,1,0,1.243963
1,2019-01-08 00:15:00,1,1.243963,1.362799,1.311378,1.251099,1.220344,1.430336,1.457278,4.612334,0.450002,0.372131,2.371797,1,0,1.289234
2,2019-01-08 00:30:00,1,1.289234,1.243963,1.362799,1.311378,0.517908,1.304366,1.992285,4.270957,1.416435,0.521968,2.415961,1,0,1.253928
3,2019-01-08 00:45:00,1,1.253928,1.289234,1.243963,1.362799,0.245668,1.218622,2.743436,2.589786,1.370795,0.407560,2.302538,1,0,1.233753
4,2019-01-08 01:00:00,1,1.233753,1.253928,1.289234,1.243963,0.254402,0.792239,2.213350,2.268585,1.366982,0.414080,2.363063,1,1,1.287758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723195,2020-01-01 22:45:00,50,1.099100,0.453906,0.481770,0.483173,0.641933,0.778195,0.753222,0.641933,0.778195,0.753222,1.723943,2,22,0.753222
1723196,2020-01-01 23:00:00,50,0.753222,1.099100,0.453906,0.481770,0.752850,0.931837,0.716855,0.752850,0.931837,0.716855,1.482914,2,23,0.716855
1723197,2020-01-01 23:15:00,50,0.716855,0.753222,1.099100,0.453906,0.935428,0.937079,0.735802,0.935428,0.937079,0.735802,1.772033,2,23,0.735802
1723198,2020-01-01 23:30:00,50,0.735802,0.716855,0.753222,1.099100,0.715781,0.803087,0.485237,0.715781,0.803087,0.485237,2.535657,2,23,0.485237


In [10]:
# GPU copy of the same file, loaded with cuDF; it is the input of the cuML random
# forest runs below. Unlike `df`, no float32 dtype mapping is passed here.
cu_df = cudf.read_csv("data/porto_final_7days.csv")
cu_df

Unnamed: 0,Time,Location,energy_lag_1,energy_lag_2,energy_lag_3,energy_lag_4,energy_lag_96,energy_lag_192,energy_lag_288,energy_lag_384,energy_lag_480,energy_lag_576,energy_lag_672,DayOfWeek,Hour,Energy
0,2019-01-08 00:00:00,1,1.362799,1.311378,1.251098,1.477207,1.349390,1.556923,1.070138,0.982795,0.916242,0.345928,2.322959,1,0,1.243963
1,2019-01-08 00:15:00,1,1.243963,1.362799,1.311378,1.251098,1.220344,1.430336,1.457278,4.612334,0.450002,0.372131,2.371797,1,0,1.289234
2,2019-01-08 00:30:00,1,1.289234,1.243963,1.362799,1.311378,0.517908,1.304366,1.992285,4.270957,1.416435,0.521968,2.415961,1,0,1.253928
3,2019-01-08 00:45:00,1,1.253928,1.289234,1.243963,1.362799,0.245668,1.218622,2.743436,2.589786,1.370796,0.407560,2.302538,1,0,1.233753
4,2019-01-08 01:00:00,1,1.233753,1.253928,1.289234,1.243963,0.254402,0.792239,2.213350,2.268585,1.366982,0.414080,2.363063,1,1,1.287758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723195,2020-01-01 22:45:00,50,1.099100,0.453906,0.481770,0.483173,0.641933,0.778195,0.753222,0.641933,0.778195,0.753222,1.723943,2,22,0.753222
1723196,2020-01-01 23:00:00,50,0.753222,1.099100,0.453906,0.481770,0.752850,0.931837,0.716855,0.752850,0.931837,0.716855,1.482914,2,23,0.716855
1723197,2020-01-01 23:15:00,50,0.716855,0.753222,1.099100,0.453906,0.935428,0.937079,0.735802,0.935428,0.937079,0.735802,1.772034,2,23,0.735802
1723198,2020-01-01 23:30:00,50,0.735802,0.716855,0.753222,1.099100,0.715781,0.803087,0.485237,0.715781,0.803087,0.485237,2.535657,2,23,0.485237


In [11]:
#df = df[["Time", "Location", "energy_lag_1", "energy_lag_3", "energy_lag_95", "energy_lag_671", "Energy"]]
#df

In [12]:
num_houses_test = 10
locations = []
for split in range(10):
    # Re-seeding with a distinct fixed value per split keeps each draw reproducible.
    np.random.seed(split*5)
    held_out = np.random.choice(df['Location'].unique(), size=num_houses_test, replace=False)
    locations.append(held_out)
locations

[array([29, 12, 11, 42,  3, 28, 39, 32, 23,  5]),
 array([43, 30,  7, 20, 29, 18,  3, 44,  4, 22]),
 array([38, 24, 45, 43, 48, 21,  4, 31,  8,  7]),
 array([36, 38, 42, 27, 39, 44,  4, 34, 31, 37]),
 array([13, 15,  5,  9, 46,  2, 26, 25, 48, 24]),
 array([37, 47,  8, 42, 18, 15, 35, 34, 46, 40]),
 array([42, 21, 33, 44, 11, 30, 34, 26, 35, 20]),
 array([40, 27, 23, 32, 30, 44, 42, 18, 26, 24]),
 array([34, 30, 50, 39, 46,  1, 19,  5, 12,  3]),
 array([48,  8,  3, 10,  6, 27, 14, 38,  1, 20])]

In [13]:
# Total count of missing values over all columns of df.
df.isna().sum().sum()

0

In [14]:
#kf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42) 
#locations = []
#for _, test_index in kf.split(df['Location'].unique()):
#    locations.append(test_index)
#locations

In [15]:
# Number of rows available for each house.
df.Location.value_counts()

1     34464
38    34464
28    34464
29    34464
30    34464
31    34464
32    34464
33    34464
34    34464
35    34464
36    34464
37    34464
39    34464
2     34464
40    34464
41    34464
42    34464
43    34464
44    34464
45    34464
46    34464
47    34464
48    34464
49    34464
27    34464
26    34464
25    34464
24    34464
3     34464
4     34464
5     34464
6     34464
7     34464
8     34464
9     34464
10    34464
11    34464
12    34464
13    34464
14    34464
15    34464
16    34464
17    34464
18    34464
19    34464
20    34464
21    34464
22    34464
23    34464
50    34464
Name: Location, dtype: int64

### Linear Regression

In [16]:
# Leave-houses-out evaluation of a plain linear regression, one run per entry of
# `locations`; only the model of the last run is kept in model_lr.
metrics_list_lr = []
print(
    "\n----------------------------",
    "\nLinear Regression\n",
    "----------------------------\n",
    sep="\n",
    file=sourceFile,
)
for i in range(10):
    print("\nIteration", i, file=sourceFile)
    print("\nIteration", i)
    lr_estimator = LinearRegression()
    mse, wape, r2, model_lr = test_leave_house_out(df, lr_estimator, locations[i], sourceFile)
    metrics_list_lr.append((mse, wape, r2))


Iteration 0
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.3149
WAPE: 32.53
R2: 0.7527

Iteration 1
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.1936
WAPE: 32.52
R2: 0.7764

Iteration 2
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.1503
WAPE: 32.00
R2: 0.7412

Iteration 3
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.1327
WAPE: 30.51
R2: 0.7738

Iteration 4
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.2266
WAPE: 31.92
R2: 0.7364

Iteration 5
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.1211
WAPE: 32.58
R2: 0.7593

Iteration 6
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.2331
WAPE: 32.95
R2: 0.7428

Iteration 7
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.2122
WAPE: 32.90
R2: 0.7490

Iteration 8
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.2011
WAPE: 29.83
R2: 0.8086

Iteration 9
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.0896
WAPE: 31.72
R2: 0.7626


### XGBoost

In [17]:
# Leave-houses-out evaluation of XGBoost, one run per entry of `locations`; only
# the model of the last run is kept in model_xgb.
metrics_list_xgb = []
print(
    "\n----------------------------",
    "\nXGBoost\n",
    "----------------------------\n",
    sep="\n",
    file=sourceFile,
)
for i in range(10):
    print("\nIteration", i, file=sourceFile)
    print("\nIteration", i)
    # A fresh estimator per split; the hyperparameters equal the best ones recorded
    # in the tuning section.
    # NOTE(review): tree_method='gpu_hist' is deprecated in XGBoost >= 2.0 in favour
    # of tree_method='hist' with device='cuda' -- confirm the installed version.
    xgb_estimator = xgb.XGBRegressor(
        tree_method='gpu_hist',
        seed=0,
        colsample_bytree=0.7,
        learning_rate=0.1,
        max_depth=10,
        n_estimators=1000,
    )
    mse, wape, r2, model_xgb = test_leave_house_out(df, xgb_estimator, locations[i], sourceFile)
    metrics_list_xgb.append((mse, wape, r2))


Iteration 0
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.1602
WAPE: 22.91
R2: 0.8741

Iteration 1
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.1167
WAPE: 24.62
R2: 0.8651

Iteration 2
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.0847
WAPE: 24.44
R2: 0.8542

Iteration 3
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.0593
WAPE: 20.58
R2: 0.8988

Iteration 4
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.0854
WAPE: 20.04
R2: 0.9005

Iteration 5
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.0498
WAPE: 21.21
R2: 0.9009

Iteration 6
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.0955
WAPE: 20.41
R2: 0.8946

Iteration 7
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.1163
WAPE: 23.30
R2: 0.8624

Iteration 8
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.1217
WAPE: 21.68
R2: 0.8842

Iteration 9
Train set:  (1378560, 16)
Test set:  (344640, 16)
MSE: 0.0541
WAPE: 25.18
R2: 0.8566


### Random Forest

In [18]:
class cuMinMaxScaler():
    """Minimal min-max scaler for cuDF dataframes with a ``fit`` / ``transform`` API.

    Args:
        feature_range: ``(low, high)`` target range of the scaled features.
            Defaults to ``(0, 1)``.

    After ``fit`` the per-column factors are stored in ``scale_`` and ``min_`` so
    that ``transform`` computes ``X * scale_ + min_``.
    """
    def __init__(self, feature_range=(0, 1)):
        self.feature_range = feature_range

    def _reset(self):
        """Drop the statistics learned by a previous ``fit`` call, if any."""
        if hasattr(self, 'scale_'):
            del self.scale_
            del self.min_

    def fit(self, X):
        """Learn the per-column minimum and range of ``X``.

        ``X`` is assumed to be a cuDF dataframe (no type checking is done). Rows
        containing missing values are ignored when computing the statistics.

        Returns:
            The scaler itself, so calls can be chained.
        """
        self._reset()

        X = X.dropna()

        data_min = X.min(axis=0)  # one value per column
        data_max = X.max(axis=0)  # one value per column
        data_range = data_max - data_min

        # Constant columns have a zero range; use 1 there to avoid dividing by zero.
        data_range[data_range == 0] = 1

        low, high = self.feature_range
        self.scale_ = (high - low) / data_range
        self.min_ = low - data_min * self.scale_

        return self

    def transform(self, X):
        """Return a scaled copy of ``X`` using the statistics learned by ``fit``.

        Dataframe columns are matched to the index of ``scale_`` / ``min_``.
        """
        # Out-of-place arithmetic so the caller's dataframe is never modified;
        # in-place `*=` / `+=` would rescale it as a side effect.
        return X * self.scale_ + self.min_

In [19]:
def normalize_training_gpu(X_train):
    """Fit a ``cuMinMaxScaler`` on ``X_train``.

    Returns:
        ``(scaled_X_train, scaler)``; the fitted scaler is returned so the same
        transformation can be applied to other data.
    """
    gpu_scaler = cuMinMaxScaler()
    gpu_scaler.fit(X_train)
    return gpu_scaler.transform(X_train), gpu_scaler

In [20]:
from cuml.metrics import mean_squared_error as mse_gpu
from cuml.metrics import r2_score as r2_gpu
def performance_metrics_gpu(preds: cupy.array, actuals: cupy.array, filename):
    """Compute MSE, WAPE (in percent) and R2 with cuML/CuPy, and report them twice.

    Args:
        preds: Predicted values.
        actuals: Ground-truth values.
        filename: Despite its name, a writable file object; the report is printed
            to it first and then echoed to standard output.

    Returns:
        ``(mse, wape, r2)`` exactly as computed (unlike ``performance_metrics``, no
        truncation is applied).
    """
    mse = mse_gpu(actuals, preds)
    total_abs_error = cupy.sum(cupy.abs(preds - actuals))
    total_abs_actual = cupy.sum(cupy.abs(actuals))
    wape = total_abs_error / total_abs_actual * 100
    r2 = r2_gpu(actuals, preds)

    report = ('MSE: %.4f' % mse, 'WAPE: %.2f' % wape, 'R2: %.4f' % r2)
    # print(file=None) falls back to sys.stdout, so this writes the log first
    # and the notebook output second.
    for destination in (filename, None):
        for line in report:
            print(line, file=destination)
    return mse, wape, r2

In [21]:
def test_leave_house_out_gpu(df, estimator, locations, filename):
    """GPU variant of the leave-houses-out evaluation.

    Args:
        df: Frame with 'Time', 'Location', 'Energy' and the feature columns.
        estimator: Unfitted model exposing ``fit`` / ``predict``; it is fitted in place.
        locations: Location ids that form the test set.
        filename: Despite its name, a writable file object used as the log.

    Returns:
        ``(mse, wape, r2, model)`` where ``model`` is the fitted ``estimator``.

    The features are passed to the estimator unscaled. The logged elapsed time
    covers fitting plus prediction.
    """
    held_out = df['Location'].isin(locations)
    test = df[held_out]
    train = df[~held_out]
    print("Train set: ", train.shape)
    print("Test set: ", test.shape)

    non_feature_cols = ['Time', 'Energy', 'Location']
    X_train = train.drop(non_feature_cols, axis=1)
    X_test = test.drop(non_feature_cols, axis=1)
    y_train, y_test = train['Energy'], test['Energy']

    # Feature scaling is currently disabled:
    #X_train_norm, scaler = normalize_training_gpu(X_train)
    #X_test_norm = scaler.transform(X_test)

    started = time.time()
    estimator.fit(X_train, y_train)
    print("Model fitted")
    y_pred = estimator.predict(X_test)
    elapsed = time.time() - started
    print('Elapsed time: {:.4f} s'.format(elapsed), file=filename)

    mse, wape, r2 = performance_metrics_gpu(y_pred, y_test.values.reshape(-1), filename)
    return mse, wape, r2, estimator

In [22]:
# Leave-houses-out evaluation of the cuML random forest on the cuDF copy of the
# data, one run per entry of `locations`; only the last model is kept in model_rf.
metrics_list_rf = []
print(
    "\n----------------------------",
    "\nRandom Forest\n",
    "----------------------------\n",
    sep="\n",
    file=sourceFile,
)
for i in range(10):
    print("\nIteration", i, file=sourceFile)
    print("\nIteration", i)
    rf_estimator = cuRF(max_depth=16, n_estimators=500, max_batch_size=60)
    mse, wape, r2, model_rf = test_leave_house_out_gpu(cu_df, rf_estimator, locations[i], sourceFile)
    metrics_list_rf.append((mse, wape, r2))


Iteration 0
Train set:  (1378560, 16)
Test set:  (344640, 16)
Model fitted


  ret = func(*args, **kwargs)


MSE: 0.2317
WAPE: 27.72
R2: 0.8180

Iteration 1
Train set:  (1378560, 16)
Test set:  (344640, 16)


  ret = func(*args, **kwargs)


Model fitted
MSE: 0.1564
WAPE: 28.80
R2: 0.8195

Iteration 2
Train set:  (1378560, 16)
Test set:  (344640, 16)
Model fitted


  ret = func(*args, **kwargs)


MSE: 0.1085
WAPE: 27.56
R2: 0.8133

Iteration 3
Train set:  (1378560, 16)
Test set:  (344640, 16)


  ret = func(*args, **kwargs)


Model fitted
MSE: 0.0851
WAPE: 24.83
R2: 0.8551

Iteration 4
Train set:  (1378560, 16)
Test set:  (344640, 16)


  ret = func(*args, **kwargs)


Model fitted
MSE: 0.1379
WAPE: 25.19
R2: 0.8396

Iteration 5
Train set:  (1378560, 16)
Test set:  (344640, 16)
Model fitted


  ret = func(*args, **kwargs)


MSE: 0.0813
WAPE: 26.57
R2: 0.8385

Iteration 6
Train set:  (1378560, 16)
Test set:  (344640, 16)


  ret = func(*args, **kwargs)


Model fitted
MSE: 0.1505
WAPE: 26.13
R2: 0.8340

Iteration 7
Train set:  (1378560, 16)
Test set:  (344640, 16)


  ret = func(*args, **kwargs)


Model fitted
MSE: 0.1599
WAPE: 27.53
R2: 0.8110

Iteration 8
Train set:  (1378560, 16)
Test set:  (344640, 16)


  ret = func(*args, **kwargs)


Model fitted
MSE: 0.1664
WAPE: 25.60
R2: 0.8417

Iteration 9
Train set:  (1378560, 16)
Test set:  (344640, 16)


  ret = func(*args, **kwargs)


Model fitted
MSE: 0.0720
WAPE: 28.22
R2: 0.8094


## Averaged Metrics

In [None]:
# Blank lines in the log to set the averaged metrics apart from the per-split ones.
print("\n\n\n", file=sourceFile)

In [None]:
# Mean of the per-split Linear Regression metrics (written to the log and echoed).
print("\nLinear Regression", file=sourceFile)
total_averaged_metrics(metrics_list_lr, sourceFile)

Total Averaged MSE: 0.188
Total Averaged WAPE: 31.946
Total Averaged R2: 0.76


In [None]:
# Mean of the per-split XGBoost metrics (written to the log and echoed).
print("\nXGBoost", file=sourceFile)
total_averaged_metrics(metrics_list_xgb, sourceFile)

Total Averaged MSE: 0.094
Total Averaged WAPE: 22.437
Total Averaged R2: 0.879


In [None]:
# Mean of the per-split Random Forest metrics (written to the log and echoed).
print("\nRandom Forest", file=sourceFile)
total_averaged_metrics(metrics_list_rf, sourceFile)

Total Averaged MSE: 0.135
Total Averaged WAPE: 26.84
Total Averaged R2: 0.828


In [None]:
# Flush and release the log file; any later print(..., file=sourceFile) would fail.
sourceFile.close()

### Feature Importance

In [None]:
# List the importance score of every feature by position, using the XGBoost model
# fitted in the last evaluation split.
print('XGBoost\n')
for position, score in enumerate(model_xgb.feature_importances_):
    print('Feature: %0d, Score: %.5f' % (position, score))

In [None]:
# Raw importance array, in the same order as the feature columns.
model_xgb.feature_importances_

In [None]:
# Bar chart of the XGBoost importances from the last fitted model (model_xgb is
# overwritten on every evaluation split).
# df.columns[2:-1] skips 'Time', 'Location' and the trailing 'Energy' target.
plt.figure(figsize=(22,6))
plt.bar(df.columns[2:-1],model_xgb.feature_importances_)
plt.title('Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Importance')
# NOTE(review): savefig presumably needs the "images" directory to exist -- confirm.
plt.savefig("images/porto_xgb_final7days_feature_importance.png")
plt.show()

In [None]:
# Linear-regression coefficients per feature, from the last fitted model_lr.
# NOTE(review): the title says "Feature Importance" but the bars are the raw,
# signed coefficients learned on the min-max-scaled features.
plt.figure(figsize=(12,6))
plt.bar(df.columns[2:-1],model_lr.coef_)
plt.title('Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.show()

In [None]:
# Label the XGBoost importances with their feature names, i.e. every column except
# the target and the two identifier columns.
columns = df.drop(["Energy", "Location", "Time"], axis=1).columns
feature_importance = pd.Series(model_xgb.feature_importances_, index=columns)
feature_importance

In [None]:
# Feature names ordered by decreasing importance, capped at the top 40.
# NOTE(review): the frame loaded above has only 13 feature columns, so the cap is
# presumably a leftover from a wider feature set -- confirm.
features = feature_importance.nlargest(40).index
features

In [None]:
# Rebuild the frame as identifiers + selected features + target, in that order.
df_40 = df[['Time', 'Location'] + [*features] + ['Energy']]
df_40

In [None]:
# Save the reduced frame without the index column.
df_40.to_csv("data/porto_best40features.csv", index=False)

## Hyperparameter Tuning

In [28]:
# Same float32-downcasting load as used for `df`, followed by removal of the two
# identifier columns so only the features and the target remain.
df_test = pd.read_csv("data/porto_final_7days.csv", nrows=100)

float_cols = df_test.select_dtypes(include="float64").columns.tolist()
float32_cols = dict.fromkeys(float_cols, np.float32)

df_tuning = pd.read_csv(
    "data/porto_final_7days.csv", engine='c', dtype=float32_cols
).drop(columns=['Time', 'Location'])
df_tuning

Unnamed: 0,energy_lag_1,energy_lag_2,energy_lag_3,energy_lag_4,energy_lag_96,energy_lag_192,energy_lag_288,energy_lag_384,energy_lag_480,energy_lag_576,energy_lag_672,DayOfWeek,Hour,Energy
0,1.362799,1.311378,1.251099,1.477207,1.349390,1.556923,1.070138,0.982795,0.916242,0.345928,2.322959,1,0,1.243963
1,1.243963,1.362799,1.311378,1.251099,1.220344,1.430336,1.457278,4.612334,0.450002,0.372131,2.371797,1,0,1.289234
2,1.289234,1.243963,1.362799,1.311378,0.517908,1.304366,1.992285,4.270957,1.416435,0.521968,2.415961,1,0,1.253928
3,1.253928,1.289234,1.243963,1.362799,0.245668,1.218622,2.743436,2.589786,1.370795,0.407560,2.302538,1,0,1.233753
4,1.233753,1.253928,1.289234,1.243963,0.254402,0.792239,2.213350,2.268585,1.366982,0.414080,2.363063,1,1,1.287758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723195,1.099100,0.453906,0.481770,0.483173,0.641933,0.778195,0.753222,0.641933,0.778195,0.753222,1.723943,2,22,0.753222
1723196,0.753222,1.099100,0.453906,0.481770,0.752850,0.931837,0.716855,0.752850,0.931837,0.716855,1.482914,2,23,0.716855
1723197,0.716855,0.753222,1.099100,0.453906,0.935428,0.937079,0.735802,0.935428,0.937079,0.735802,1.772033,2,23,0.735802
1723198,0.735802,0.716855,0.753222,1.099100,0.715781,0.803087,0.485237,0.715781,0.803087,0.485237,2.535657,2,23,0.485237


In [30]:
# Grid explored for the cuML random forest: 4 depths x 3 forest sizes with a single
# max_batch_size value (12 candidates). The commented-out keys are not searched.
params_rf = {
    #'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    #'max_features': [2, 3],
    #'min_samples_leaf': [3, 4, 5],
    #'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300],
    'max_batch_size': [1200]
}

In [31]:
# Grid explored for XGBoost: 3 x 3 x 3 x 2 = 54 parameter combinations.
params_xgb = { 
    'max_depth': [3,6,10],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 500, 1000],
    'colsample_bytree': [0.3, 0.7]
}

In [33]:
# The whole dataset is handed to the grid searches below.
train_features = df_tuning.drop('Energy', axis=1)
train_labels = df_tuning['Energy']
# NOTE(review): placeholders only -- no held-out split is created here, so the
# evaluate(...) calls further down that receive these values presumably fail.
test_features = 0
test_labels = 0

### XGBoost Best Parameters

In [None]:
# Exhaustive 5-fold search over params_xgb, selecting by R2, with 4 parallel jobs.
# NOTE(review): with tree_method="gpu_hist" the parallel workers presumably share
# one GPU -- confirm there is enough device memory.
xgb_model = xgb.XGBRegressor(tree_method="gpu_hist", seed=42)
grid_search = GridSearchCV(estimator = xgb_model, param_grid = params_xgb, cv = 5, n_jobs = 4, verbose = 2, scoring="r2")

In [None]:
# Run the XGBoost grid search on the full dataset.
grid_search.fit(train_features, train_labels)

In [None]:
# Best parameter combination found by the search. The values noted below are
# presumably the result of an earlier run; they match the XGBoost settings used in
# the evaluation loop.
grid_search.best_params_
#{'colsample_bytree': 0.7,
#'learning_rate': 0.1,
#'max_depth': 10,
#'n_estimators': 1000}

In [None]:
# Cross-validated R2 score of the best parameter combination.
grid_search.best_score_

### Random Forest Best Parameters

In [None]:
# Same 5-fold, R2-scored search for the cuML random forest over params_rf;
# n_jobs = 1 keeps the fits sequential.
rf = cuRF()
grid_search_rf = GridSearchCV(estimator = rf, param_grid = params_rf, cv = 5, n_jobs = 1, verbose = 2, scoring="r2")

In [None]:
# Run the random-forest grid search on the full dataset.
grid_search_rf.fit(train_features, train_labels)

In [None]:
def evaluate(model, test_features, test_labels):
    """Report the mean absolute error and a MAPE-based "accuracy" of ``model``.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_features: Features passed to ``model.predict``.
        test_labels: True target values, aligned with the predictions.

    Returns:
        ``100 - MAPE`` in percent. Samples whose label is exactly 0 are left out of
        the MAPE because their relative error is undefined; the result is NaN when
        every label is 0.
    """
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)

    nonzero = test_labels != 0
    if np.all(nonzero):
        relative_errors = errors / test_labels
    else:
        # Dividing by a zero label would turn the whole average into inf/NaN.
        relative_errors = errors[nonzero] / test_labels[nonzero]
    mape = 100 * np.mean(relative_errors) if np.size(relative_errors) else float('nan')
    accuracy = 100 - mape

    print('Model Performance')
    # The error is expressed in the unit of the target, not in degrees.
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [None]:
# Keep a handle on the estimator selected by the XGBoost grid search.
print(grid_search.best_params_)
best_grid = grid_search.best_estimator_
# Evaluation of the tuned model is currently disabled:
#grid_accuracy = evaluate(best_grid, test_features, test_labels)

In [None]:
# Baseline: a small 10-tree XGBoost model fitted on the same training data.
base_model = xgb.XGBRegressor(n_estimators = 10, random_state = 42, tree_method="gpu_hist")
base_model.fit(train_features, train_labels)
# NOTE(review): test_features / test_labels are the 0 placeholders defined earlier,
# so these evaluate calls presumably fail until a real held-out set is provided.
base_accuracy = evaluate(base_model, test_features, test_labels)

# Despite the name, best_random is the estimator selected by the grid search.
best_random = grid_search.best_estimator_
random_accuracy = evaluate(best_random, test_features, test_labels)

# Relative accuracy gain of the tuned model over the baseline, in percent.
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

In [None]:
# grid_accuracy is not assigned by any active cell (only in a commented-out line),
# so it is computed here from the tuned estimator before being reported.
grid_accuracy = evaluate(best_grid, test_features, test_labels)
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))