## Import libraries

In [1]:
import numpy as np
import pandas as pd
import sys
sys.path.append('..')
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from cuml import RandomForestRegressor as cuRF
import xgboost as xgb
import math
import time
import re
from scripts.clustering_utils import read_clusters_from_file, cluster_predict, aggregate_cluster_predictions, dataframe_by_cluster, split_into_clusters, select_past_timesteps, normalize_training, performance_metrics, expanding_window_split_location

In [2]:
n_clusters = 16

## Read Dataset

In [3]:
df = pd.read_csv("../../data/loureiro_weather_45houses_selected.csv")
#df_index = pd.DataFrame(pd.date_range('2019-01-01', periods=35136, freq='15T'))
#df = pd.concat([pd.DataFrame(df_index).rename(columns={0: "Time"}),df],axis=1)
df

Unnamed: 0,Time,Location,lag_1,lag_2,lag_3,lag_4,lag_96,lag_192,lag_288,lag_384,...,Avg_Temp,Avg_Rel_Humidity,Avg_Wind_Direction,Avg_Wind_Speed,Max_Inst_Wind_Speed,Inst_Temp,Quantity_Precip,Max_Inst_Precip,Total_Global_Rad,Energy
0,2022-05-12 12:00:00,Energy_1,0.016,0.058,0.038,0.022,0.041,0.023,0.044,0.033,...,22.30,55.0,348.0,2.40,3.30,26.20,0.0,0.0,547.70,0.039
1,2022-05-12 12:15:00,Energy_1,0.039,0.016,0.058,0.038,0.487,0.034,0.058,0.034,...,21.65,62.0,328.5,3.55,4.80,26.25,0.0,0.0,372.85,0.061
2,2022-05-12 12:30:00,Energy_1,0.061,0.039,0.016,0.058,0.340,0.037,0.038,0.055,...,20.70,67.0,321.0,4.20,5.00,25.60,0.0,0.0,412.80,0.038
3,2022-05-12 12:45:00,Energy_1,0.038,0.061,0.039,0.016,0.058,0.046,0.026,0.058,...,20.20,67.5,325.0,4.15,5.15,25.05,0.0,0.0,282.10,0.069
4,2022-05-12 13:00:00,Energy_1,0.069,0.038,0.061,0.039,0.044,0.024,0.026,0.036,...,20.30,72.0,325.0,3.10,4.10,24.50,0.0,0.0,359.00,0.063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1680475,2023-06-05 10:45:00,Energy_96,0.009,0.028,0.039,0.036,0.010,0.014,0.039,0.026,...,19.80,78.0,319.0,2.20,3.30,23.30,0.0,0.0,443.55,0.010
1680476,2023-06-05 11:00:00,Energy_96,0.010,0.009,0.028,0.039,0.011,0.041,0.032,0.009,...,20.50,76.0,351.0,2.70,4.10,23.90,0.0,0.0,506.50,0.009
1680477,2023-06-05 11:15:00,Energy_96,0.009,0.010,0.009,0.028,0.041,0.038,0.009,0.009,...,21.00,74.0,343.0,2.50,4.15,24.30,0.0,0.0,516.75,0.010
1680478,2023-06-05 11:30:00,Energy_96,0.010,0.009,0.010,0.009,0.037,0.018,0.010,0.010,...,20.80,76.0,334.0,2.80,5.00,24.50,0.0,0.0,518.00,0.039


In [4]:
names = ["Linear Regression", "XGBoost", "Random Forest"]

In [5]:
## Complete script
sourceFile = open("../../gpu_logs/new_cluster_preds_loureiro_feature.txt", "w")
all_clusters = read_clusters_from_file("new_clusters_loureiro_dtw.txt", n_clusters)
estimators = [LinearRegression(), xgb.XGBRegressor(tree_method='gpu_hist', seed=0, colsample_bytree=0.7, learning_rate=0.01, max_depth=10, n_estimators=700), cuRF(bootstrap=True, max_depth=12, min_samples_leaf=5, min_samples_split=2, n_estimators=500)]
all_cluster_metrics = []
n_c = 8
for c in all_clusters[6:8]:
    print("Number of Clusters: {}".format(n_c))
    print("Number of Clusters: {}".format(n_c), file=sourceFile)
    location_to_value = dict(zip(df.Location.unique(), c))
    df_cluster = df.copy()
    df_cluster['Cluster'] = df['Location'].map(location_to_value)
    for e in estimators:
        print(e)
        model = e
        init = time.time()
        preds_list = []
        y_test_list = []
        for i in range(10):
            X_train, X_test, y_train, y_test = expanding_window_split_location(df_cluster, cv=i, n_splits=10)
            X_train = X_train.drop(['Time', 'Location', 'Energy'], axis=1)
            X_test = X_test.drop(['Time', 'Location', 'Energy'], axis=1)
            y_train = y_train['Energy']
            y_test = y_test['Energy']
            X_train_norm, scaler = normalize_training(X_train)
            X_test_norm = scaler.transform(X_test)
            model.fit(X_train_norm, y_train)
            y_pred = model.predict(X_test_norm)
            preds_list.append(y_pred)
            y_test_list.append(y_test)
        end = time.time()
        print('Elapsed time training and predicting: {:.4f} s'.format(end - init))
        mse, wape, r2 = performance_metrics(np.asarray(preds_list).reshape(-1), np.asarray(y_test_list).reshape(-1), sourceFile)
        all_cluster_metrics.append((mse, wape, r2))
    n_c += 1
sourceFile.close()

Number of Clusters: 8
LinearRegression()
Elapsed time training and predicting: 34.7878 s
RMSE: 0.0720
WAPE: 36.97
R2: 0.7041
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.7, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=10, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=700, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)
Elapsed time training and predicting: 89.2052 s
RMSE: 0.0701
WAPE: 35.17
R2: 0.7198
RandomForestRegressor()


  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)


Elapsed time training and predicting: 230.3859 s
RMSE: 0.0714
WAPE: 35.17
R2: 0.7090
Number of Clusters: 9
LinearRegression()
Elapsed time training and predicting: 35.0700 s
RMSE: 0.0720
WAPE: 36.97
R2: 0.7041
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.7,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=0,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=10, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=700, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, ...)
Elapsed time training and predicting: 89.3778 s
RMSE: 0.0701
WAPE: 35.16
R2: 0.7200
RandomF

  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)


Elapsed time training and predicting: 229.9652 s
RMSE: 0.0716
WAPE: 35.20
R2: 0.7078


  ret = func(*args, **kwargs)


In [None]:
model.feature_importances_

## Best predictions

In [6]:
with open('../../gpu_logs/new_cluster_preds_loureiro_feature.txt', 'r') as f:
    l = f.readlines()
    lines = [s for s in l if re.search(r'\w+', s)]

In [10]:
def get_cluster_metrics(lines):
    cursor = 0
    model_names, mse, wape, r2, number_clust = [], [], [], [], []
    for i in lines:
        if i.startswith("Number"):
            number_clust.append(int(i[-3:-1].strip()))
        else:
            if cursor == 0:
                model_names.append(i[:-1])
                cursor += 1
            elif cursor == 1:
                mse.append(i.split(":")[1][:-1])
                cursor += 1
            elif cursor == 2:
                wape.append(i.split(":")[1][:-1])
                cursor += 1
            elif cursor == 3:
                r2.append(i.split(":")[1][:-1])
                cursor = 0
    mse = list(map(float, mse))
    wape = list(map(float, wape))
    wape = list(map(lambda x: x, wape))
    r2 = list(map(float, r2))
    return model_names, mse, wape, r2, number_clust

In [15]:
lines

['Number of Clusters: 8\n',
 'RMSE: 0.0720\n',
 'WAPE: 36.97\n',
 'R2: 0.7041\n',
 'RMSE: 0.0701\n',
 'WAPE: 35.17\n',
 'R2: 0.7198\n',
 'RMSE: 0.0714\n',
 'WAPE: 35.17\n',
 'R2: 0.7090\n',
 'Number of Clusters: 9\n',
 'RMSE: 0.0720\n',
 'WAPE: 36.97\n',
 'R2: 0.7041\n',
 'RMSE: 0.0701\n',
 'WAPE: 35.16\n',
 'R2: 0.7200\n',
 'RMSE: 0.0716\n',
 'WAPE: 35.20\n',
 'R2: 0.7078\n']

In [11]:
def best_number_of_cluster(lines):
    model_names, mse, wape, r2, number_clust = get_cluster_metrics(lines)
    model_names = names
    min_mse, min_index_mse, mod_index_mse = min(mse), int(np.argmin(mse)/3), np.argmin(mse)%3
    min_wape, min_index_wape, mod_index_wape = min(wape), int(np.argmin(wape)/3), np.argmin(wape)%3
    max_r2, max_index_r2, mod_index_r2 = max(r2), int(np.argmax(r2)/3), np.argmax(r2)%3
    return min_mse, min_wape, max_r2, number_clust[max_index_r2], model_names[mod_index_r2]

In [12]:
best_mse, best_wape, best_r2, best_n_clust, best_alg = best_number_of_cluster(lines)
best_mse, best_wape, best_r2, best_n_clust, best_alg

(0.072, 0.0714, 35.17, 8, 'XGBoost')