## Import libraries

In [10]:
import numpy as np
import pandas as pd
import sys
sys.path.append('..')
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from cuml import RandomForestRegressor as cuRF
import xgboost as xgb
import math
import time
import re
from scripts.clustering_utils import read_clusters_from_file, cluster_predict, aggregate_cluster_predictions, dataframe_by_cluster, split_into_clusters, select_past_timesteps, normalize_training, performance_metrics

In [2]:
n_clusters = 16

## Read Dataset

In [3]:
df = pd.read_csv("../../data/porto_full_selected.csv")
#df_index = pd.DataFrame(pd.date_range('2019-01-01', periods=35136, freq='15T'))
#df = pd.concat([pd.DataFrame(df_index).rename(columns={0: "Time"}),df],axis=1)
df

Unnamed: 0,Time,Location,lag_1,lag_2,lag_3,lag_4,lag_96,lag_192,lag_288,lag_384,lag_480,lag_576,lag_672,DayOfWeek,Hour,Energy
0,2019-01-08 00:00:00,0,3.952000,3.803000,4.095000,3.926000,2.615000,3.321000,3.066000,2.351000,2.851000,3.054000,2.964000,1,0,4.940000
1,2019-01-08 00:15:00,0,4.940000,3.952000,3.803000,4.095000,2.634000,2.873000,3.152000,2.843000,3.102000,2.677000,2.584000,1,0,4.149000
2,2019-01-08 00:30:00,0,4.149000,4.940000,3.952000,3.803000,3.052000,3.407000,2.633000,2.949000,2.546000,3.522000,3.071000,1,0,4.401000
3,2019-01-08 00:45:00,0,4.401000,4.149000,4.940000,3.952000,2.893000,2.673000,2.590000,2.605000,3.016000,2.729000,2.694000,1,0,4.431000
4,2019-01-08 01:00:00,0,4.431000,4.401000,4.149000,4.940000,2.349000,2.390000,2.330000,3.023000,3.031000,2.368000,2.569000,1,1,3.988000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1757659,2020-01-01 22:45:00,50,1.099100,0.453906,0.481770,0.483173,0.641933,0.778195,0.753221,0.641933,0.778195,0.753221,1.723943,2,22,0.753221
1757660,2020-01-01 23:00:00,50,0.753221,1.099100,0.453906,0.481770,0.752850,0.931837,0.716855,0.752850,0.931837,0.716855,1.482914,2,23,0.716855
1757661,2020-01-01 23:15:00,50,0.716855,0.753221,1.099100,0.453906,0.935428,0.937079,0.735802,0.935428,0.937079,0.735802,1.772034,2,23,0.735802
1757662,2020-01-01 23:30:00,50,0.735802,0.716855,0.753221,1.099100,0.715781,0.803087,0.485237,0.715781,0.803087,0.485237,2.535657,2,23,0.485237


In [4]:
names = ["Linear Regression", "XGBoost", "Random Forest"]

In [None]:
## Complete script
sourceFile = open("../../gpu_logs/cluster_preds_porto_feature_.txt", "w")
all_clusters = read_clusters_from_file("clusters_porto.txt", n_clusters)
estimators = [LinearRegression(), xgb.XGBRegressor(tree_method='gpu_hist', seed=0, colsample_bytree=0.7, learning_rate=0.1, max_depth=10, n_estimators=1000), cuRF()]
all_cluster_metrics = []
n_c = 2
for c in all_clusters:
    print("Number of Clusters: {}".format(n_c))
    print("Number of Clusters: {}".format(n_c), file=sourceFile)
    location_to_value = dict(zip(range(51), c))
    print(c)
    df_cluster = df.copy()
    df_cluster['Cluster'] = df['Location'].map(location_to_value)
    X_train, X_test, y_train, y_test = train_test_split(df.drop(['Energy', 'Time', 'Location'], axis=1), df['Energy'], train_size=0.8, random_state=42)
    preds_list = []
    y_test_list = []
    X_train_norm, scaler = normalize_training(X_train)
    X_test_norm = scaler.transform(X_test)
    i = 0
    for e in estimators:
        i += 1
        model = e
        init = time.time()
        model.fit(X_train_norm, y_train)
        y_pred = model.predict(X_test_norm)
        end = time.time()
        print('Elapsed time training and predicting: {:.4f} s'.format(end - init))
        mse, wape, r2 = performance_metrics(y_pred, y_test.values.reshape(-1), sourceFile)
        all_cluster_metrics.append((mse, wape, r2))
    n_c += 1
sourceFile.close()

In [14]:
model.feature_importances_

array([0.578169  , 0.0589796 , 0.03171524, 0.02450593, 0.02615522,
       0.03403929, 0.04955789, 0.05141533, 0.01846606, 0.02036919,
       0.02062017, 0.06286871, 0.02313831], dtype=float32)

In [None]:
df_selected_features = pd.read_csv("../../data/porto_full_selected.csv")
df_selected_features

## Best predictions

In [None]:
with open('../../gpu_logs/cluster_preds_porto.txt', 'r') as f:
    l = f.readlines()
    lines = [s for s in l if re.search(r'\w+', s)]

In [None]:
def get_cluster_metrics(lines):
    cursor = 0
    model_names, mse, wape, r2, number_clust = [], [], [], [], []
    for i in lines:
        if i.startswith("Number"):
            number_clust.append(int(i[-3:-1].strip()))
        else:
            if cursor == 0:
                model_names.append(i[:-1])
                cursor += 1
            elif cursor == 1:
                mse.append(i.split(":")[1][:-1])
                cursor += 1
            elif cursor == 2:
                wape.append(i.split(":")[1][:-1])
                cursor += 1
            elif cursor == 3:
                r2.append(i.split(":")[1][:-1])
                cursor = 0
    mse = list(map(float, mse))
    wape = list(map(float, wape))
    wape = list(map(lambda x: x/100, wape))
    r2 = list(map(float, r2))
    return model_names, mse, wape, r2, number_clust

In [None]:
def best_number_of_cluster(lines):
    model_names, mse, wape, r2, number_clust = get_cluster_metrics(lines)
    min_mse, min_index_mse, mod_index_mse = min(mse), int(np.argmin(mse)/3), np.argmin(mse)%3
    min_wape, min_index_wape, mod_index_wape = min(wape), int(np.argmin(wape)/3), np.argmin(wape)%3
    max_r2, max_index_r2, mod_index_r2 = max(r2), int(np.argmax(r2)/3), np.argmax(r2)%3
    return max_r2, number_clust[max_index_r2], model_names[mod_index_r2]

In [None]:
best_score, best_n_clust, best_alg = best_number_of_cluster(lines)
best_score, best_n_clust, best_alg

In [None]:
model_names, mse, wape, r2, number_clust = get_cluster_metrics(lines)
mse[(best_n_clust-2)*3+1], r2[(best_n_clust-2)*3+1], wape[(best_n_clust-2)*3+1]