## Import libraries

In [2]:
import numpy as np
import pandas as pd
import sys
sys.path.append('..')
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from cuml import RandomForestRegressor as cuRF
import xgboost as xgb
import math
import time
import re
from scripts.clustering_utils import read_clusters_from_file, cluster_predict, aggregate_cluster_predictions, dataframe_by_cluster, split_into_clusters, select_past_timesteps, normalize_training, performance_metrics, expanding_window_split_location

In [3]:
n_clusters = 16

## Read Dataset

In [3]:
df = pd.read_csv("../../data/porto_full_selected.csv")
#df_index = pd.DataFrame(pd.date_range('2019-01-01', periods=35136, freq='15T'))
#df = pd.concat([pd.DataFrame(df_index).rename(columns={0: "Time"}),df],axis=1)
df

Unnamed: 0,Time,Location,lag_1,lag_2,lag_3,lag_4,lag_96,lag_192,lag_288,lag_384,lag_480,lag_576,lag_672,DayOfWeek,Hour,Energy
0,2019-01-08 00:00:00,0,3.952000,3.803000,4.095000,3.926000,2.615000,3.321000,3.066000,2.351000,2.851000,3.054000,2.964000,1,0,4.940000
1,2019-01-08 00:15:00,0,4.940000,3.952000,3.803000,4.095000,2.634000,2.873000,3.152000,2.843000,3.102000,2.677000,2.584000,1,0,4.149000
2,2019-01-08 00:30:00,0,4.149000,4.940000,3.952000,3.803000,3.052000,3.407000,2.633000,2.949000,2.546000,3.522000,3.071000,1,0,4.401000
3,2019-01-08 00:45:00,0,4.401000,4.149000,4.940000,3.952000,2.893000,2.673000,2.590000,2.605000,3.016000,2.729000,2.694000,1,0,4.431000
4,2019-01-08 01:00:00,0,4.431000,4.401000,4.149000,4.940000,2.349000,2.390000,2.330000,3.023000,3.031000,2.368000,2.569000,1,1,3.988000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1757659,2020-01-01 22:45:00,50,1.099100,0.453906,0.481770,0.483173,0.641933,0.778195,0.753221,0.641933,0.778195,0.753221,1.723943,2,22,0.753221
1757660,2020-01-01 23:00:00,50,0.753221,1.099100,0.453906,0.481770,0.752850,0.931837,0.716855,0.752850,0.931837,0.716855,1.482914,2,23,0.716855
1757661,2020-01-01 23:15:00,50,0.716855,0.753221,1.099100,0.453906,0.935428,0.937079,0.735802,0.935428,0.937079,0.735802,1.772034,2,23,0.735802
1757662,2020-01-01 23:30:00,50,0.735802,0.716855,0.753221,1.099100,0.715781,0.803087,0.485237,0.715781,0.803087,0.485237,2.535657,2,23,0.485237


In [8]:
names = ["Linear Regression", "XGBoost", "Random Forest"]

In [None]:
## Complete script
sourceFile = open("../../gpu_logs/new_cluster_preds_porto_feature.txt", "w")
all_clusters = read_clusters_from_file("new_clusters_porto_dtw.txt", n_clusters)
estimators = [LinearRegression(), xgb.XGBRegressor(tree_method='gpu_hist', seed=0,colsample_bytree=0.7, learning_rate=0.01, max_depth=16, n_estimators=700), cuRF(bootstrap=True, max_depth=16, min_samples_leaf=1, min_samples_split=2,n_estimators=700)]
all_cluster_metrics = []
n_c = 6
for c in all_clusters[4:6]:
    print("Number of Clusters: {}".format(n_c))
    print("Number of Clusters: {}".format(n_c), file=sourceFile)
    location_to_value = dict(zip(range(51), c))
    df_cluster = df.copy()
    df_cluster['Cluster'] = df['Location'].map(location_to_value)
    for e in estimators:
        print(e)
        model = e
        init = time.time()
        preds_list = []
        y_test_list = []
        for i in range(10):
            X_train, X_test, y_train, y_test = expanding_window_split_location(df_cluster, cv=i, n_splits=10)
            X_train = X_train.drop(['Time', 'Location', 'Energy'], axis=1)
            X_test = X_test.drop(['Time', 'Location', 'Energy'], axis=1)
            y_train = y_train['Energy']
            y_test = y_test['Energy']
            X_train_norm, scaler = normalize_training(X_train)
            X_test_norm = scaler.transform(X_test)
            model.fit(X_train_norm, y_train)
            y_pred = model.predict(X_test_norm)
            preds_list.append(y_pred)
            y_test_list.append(y_test)
        end = time.time()
        print('Elapsed time training and predicting: {:.4f} s'.format(end - init))
        mse, wape, r2 = performance_metrics(np.asarray(preds_list).reshape(-1), np.asarray(y_test_list).reshape(-1), sourceFile)
        all_cluster_metrics.append((mse, wape, r2))
    n_c += 1
sourceFile.close()

In [None]:
model.feature_importances_

In [None]:
df_selected_features = pd.read_csv("../../data/porto_full_selected.csv")
df_selected_features

## Best predictions

In [4]:
with open('../../gpu_logs/new_cluster_preds_porto_feature.txt', 'r') as f:
    l = f.readlines()
    lines = [s for s in l if re.search(r'\w+', s)]

In [5]:
def get_cluster_metrics(lines):
    cursor = 0
    model_names, mse, wape, r2, number_clust = [], [], [], [], []
    for i in lines:
        if i.startswith("Number"):
            number_clust.append(int(i[-3:-1].strip()))
        else:
            if cursor == 0:
                model_names.append(i[:-1])
                cursor += 1
            elif cursor == 1:
                mse.append(i.split(":")[1][:-1])
                cursor += 1
            elif cursor == 2:
                wape.append(i.split(":")[1][:-1])
                cursor += 1
            elif cursor == 3:
                r2.append(i.split(":")[1][:-1])
                cursor = 0
    mse = list(map(float, mse))
    wape = list(map(float, wape))
    wape = list(map(lambda x: x/100, wape))
    r2 = list(map(float, r2))
    return model_names, mse, wape, r2, number_clust

In [6]:
def best_number_of_cluster(lines):
    model_names, mse, wape, r2, number_clust = get_cluster_metrics(lines)
    model_names = names
    min_mse, min_index_mse, mod_index_mse = min(mse), int(np.argmin(mse)/3), np.argmin(mse)%3
    min_wape, min_index_wape, mod_index_wape = min(wape), int(np.argmin(wape)/3), np.argmin(wape)%3
    max_r2, max_index_r2, mod_index_r2 = max(r2), int(np.argmax(r2)/3), np.argmax(r2)%3
    return min_mse, min_wape, max_r2, number_clust[max_index_r2], model_names[mod_index_r2]

In [9]:
best_mse, best_wape, best_r2, best_n_clust, best_alg = best_number_of_cluster(lines)
best_mse, best_wape, best_r2, best_n_clust, best_alg

(0.4481, 0.006518, 29.7, 6, 'XGBoost')

In [11]:
lines

['Number of Clusters: 6\n',
 'RMSE: 0.4481\n',
 'WAPE: 30.24\n',
 'R2: 0.8281\n',
 'RMSE: 0.6377\n',
 'WAPE: 30.73\n',
 'R2: 0.6519\n',
 'RMSE: 0.6549\n',
 'WAPE: 29.70\n',
 'R2: 0.6329\n',
 'Number of Clusters: 7\n',
 'RMSE: 0.4481\n',
 'WAPE: 30.24\n',
 'R2: 0.8281\n',
 'RMSE: 0.6379\n',
 'WAPE: 30.79\n',
 'R2: 0.6518\n',
 'RMSE: 0.6533\n',
 'WAPE: 29.54\n',
 'R2: 0.6347\n']