## Import libraries

In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from cuml import RandomForestRegressor as cuRF
import xgboost as xgb
import math
import time
import re
from scripts.function_utils import select_past_timesteps

In [2]:
# Read by read_clusters_from_file as the upper bound of the slice of lines it
# parses from the clusters file (lines[2:n_clusters + 1]).
# Presumably the largest number of clusters that was evaluated — TODO confirm.
n_clusters = 16

## Read Dataset

In [4]:
# Load the raw readings; the CSV uses "," as the decimal separator.
df = pd.read_csv("../datasets/Dataset.csv", decimal=",")
# 35136 consecutive 15-minute timestamps starting at 2019-01-01 00:00.
# "15min" is used instead of the "15T" alias, which pandas 2.2+ flags as deprecated.
df_index = pd.DataFrame(pd.date_range('2019-01-01', periods=35136, freq='15min'))
# Prepend the timestamps as a "Time" column; concat aligns both frames on
# their default integer index.
df = pd.concat([df_index.rename(columns={0: "Time"}), df], axis=1)
# Discard the CSV column labelled "0".
df = df.drop(["0"], axis=1)
df

Unnamed: 0,Time,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,2019-01-01 00:00:00,2.322959,1.544607,0.778310,1.962012,2.677445,0.237877,0.689194,0.358525,0.814643,...,0.898895,0.203825,0.221624,0.319531,0.830996,0.924987,0.219128,0.274880,0.990488,0.779475
1,2019-01-01 00:15:00,2.371797,1.544607,0.778310,1.962012,2.733737,0.192929,0.558967,0.358525,0.660712,...,0.917793,0.165311,0.179747,0.319531,0.848467,0.944434,0.177722,0.222940,1.011313,0.795863
2,2019-01-01 00:30:00,2.415961,1.319880,0.665072,1.676555,2.784640,0.382869,1.109272,0.377198,1.311186,...,0.934883,0.328060,0.356708,0.336174,0.864266,0.962019,0.352691,0.442426,1.030144,0.810682
3,2019-01-01 00:45:00,2.302538,1.319880,0.665072,1.676555,2.653908,0.442052,1.280743,0.377198,1.513868,...,0.890992,0.378772,0.411848,0.336174,0.823691,0.916855,0.407209,0.510816,0.981781,0.772623
4,2019-01-01 01:00:00,2.363063,0.913154,0.460128,1.159919,2.723669,0.192242,0.556976,0.668500,0.658358,...,0.914413,0.164722,0.179106,0.595793,0.845343,0.940956,0.177089,0.222146,1.007588,0.792932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35131,2020-01-01 22:45:00,2.244719,1.455982,0.733653,1.849437,2.587266,0.205654,0.595835,0.171793,0.704291,...,0.868619,0.176214,0.191602,0.153109,0.803007,0.893832,0.189444,0.237645,0.957128,0.753222
35132,2020-01-01 23:00:00,2.136340,1.201186,0.605264,1.525786,2.462348,0.201219,0.582985,0.067223,0.689101,...,0.826680,0.172414,0.187470,0.059912,0.764237,0.850676,0.185359,0.232519,0.910916,0.716855
35133,2020-01-01 23:15:00,2.192805,1.201186,0.605264,1.525786,2.527430,0.228585,0.662271,0.067223,0.782819,...,0.848530,0.195862,0.212966,0.059912,0.784436,0.873160,0.210568,0.264142,0.934992,0.735802
35134,2020-01-01 23:30:00,1.446083,0.259545,0.130782,0.329682,1.666757,0.189302,0.548459,0.070958,0.648292,...,0.559578,0.162203,0.176368,0.063241,0.517310,0.575820,0.174381,0.218749,0.616596,0.485237


## Auxiliary Functions

In [None]:
def read_clusters_from_file(filename, num_clusters=None):
    """Parse cluster-label lines from a text file into integer arrays.

    Only the lines with 0-based index ``2`` through ``num_clusters`` (inclusive)
    are parsed.  Each of those lines is split on ``","``; everything up to and
    including the last ``"["`` is removed from the first token, the final
    token is discarded, and every remaining token is reduced to its digits
    and converted to ``int``.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.
    num_clusters : int, optional
        Upper bound of the parsed line range.  When omitted, the
        notebook-level ``n_clusters`` value is used.

    Returns
    -------
    list of numpy.ndarray
        One array of integer labels per parsed line, in file order.

    Raises
    ------
    ValueError
        If a kept token contains no digit at all (``int('')``).
    """
    if num_clusters is None:
        # Fall back to the notebook-wide setting, as the original code did.
        num_clusters = n_clusters

    # The context manager guarantees the handle is closed, even on errors.
    with open(filename, 'r') as handle:
        lines = handle.readlines()

    all_clusters = []
    for line in lines[2:num_clusters + 1]:
        tokens = line.split(",")
        tokens[0] = tokens[0].split("[")[-1]
        # NOTE(review): assumes each line ends with a comma before the newline,
        # so that the last token is just "\n"; otherwise a real label is dropped.
        tokens.pop()
        labels = [int(re.sub(r'[^0-9]', '', token)) for token in tokens]
        all_clusters.append(np.array(labels))
    return all_clusters

In [None]:
def split_into_clusters(cluster):
    """Group element positions by their cluster label.

    Parameters
    ----------
    cluster : array-like
        One label per element.

    Returns
    -------
    list of numpy.ndarray
        One array of positions per distinct label, ordered by ascending
        label value.  Empty input yields an empty list.
    """
    labels = np.asarray(cluster)
    # Iterate over the labels that actually occur rather than range(k): when
    # the labels are not exactly 0..k-1, range(k) yields empty groups and
    # silently omits the members of the higher labels.
    return [np.where(labels == label)[0] for label in np.unique(labels)]

In [None]:
def dataframe_by_cluster(cl_list, df):
    """Return one column-subset of ``df`` per entry of ``cl_list``.

    Each entry of ``cl_list`` is used as a positional column selector
    (``df.iloc[:, entry]``); the resulting frames are returned in the same
    order as the entries.
    """
    return [df.iloc[:, column_positions] for column_positions in cl_list]

In [None]:
def truncate_metric(metric):
    """Cut ``metric`` down to four decimal places, truncating toward zero (no rounding)."""
    scale = 10000
    return math.trunc(scale * metric) / scale

In [None]:
def performance_metrics(preds: np.array, actuals: np.array, filename):
    """Compute MSE, WAPE and R2, then report them to ``filename`` and to stdout.

    ``filename`` is handed to ``print(..., file=filename)``, so despite its
    name it has to be a writable text stream rather than a path.

    MSE and R2 are truncated to four decimals.  WAPE is the ratio of summed
    absolute errors to summed absolute actuals, truncated to four decimals
    and then multiplied by 100.

    Returns
    -------
    tuple
        ``(mse, wape, r2)``.
    """
    # Metrics, computed in the same order as they are reported.
    mse = truncate_metric(mean_squared_error(actuals, preds))
    total_abs_error = np.sum(np.abs(preds - actuals))
    total_abs_actual = np.sum(np.abs(actuals))
    wape = truncate_metric(total_abs_error / total_abs_actual) * 100
    r2 = truncate_metric(r2_score(actuals, preds))

    report = ('MSE: %.4f' % mse, 'WAPE: %.2f' % wape, 'R2: %.4f' % r2)

    # First the complete report to the supplied stream, then again to stdout.
    for line in report:
        print(line, file=filename)
    for line in report:
        print(line)
    return mse, wape, r2

In [None]:
def normalize_training(X_train):
    """Min-max scale the training features to the [0, 1] range.

    Returns
    -------
    tuple
        ``(scaled_X_train, fitted_scaler)``; the fitted scaler is returned so
        the caller can apply the same transformation to other data.
    """
    fitted_scaler = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler

In [None]:
def cluster_predict(df, estimators, n_runs=5):
    """Fit and time every estimator on an 80/20 split of ``df``.

    All columns except ``Energy`` are used as features and ``Energy`` is the
    target.  The features are min-max scaled with a scaler fitted on the
    training split only; the test split is transformed with that same scaler.
    Each estimator is then fitted and used for prediction ``n_runs`` times,
    and the elapsed time of every run is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``Energy`` column.
    estimators : iterable
        Objects exposing ``fit(X, y)`` and ``predict(X)``.
    n_runs : int, default 5
        Number of timed fit/predict repetitions per estimator.

    Returns
    -------
    tuple
        ``(y_pred, y_test)`` where ``y_pred`` holds the predictions of the
        LAST run of the LAST estimator only; earlier results are discarded.

    Raises
    ------
    ValueError
        If ``estimators`` is empty or ``n_runs`` is smaller than 1.  Without
        this check the function failed with ``UnboundLocalError`` on return.
    """
    estimators = list(estimators)
    if not estimators:
        raise ValueError("estimators must contain at least one estimator")
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    # NOTE(review): train_test_split shuffles rows by default; confirm that a
    # shuffled split is intended for this time-indexed data.
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(['Energy'], axis=1), df['Energy'], train_size=0.8, random_state=42
    )
    X_train_norm, scaler = normalize_training(X_train)
    X_test_norm = scaler.transform(X_test)

    for e in estimators:
        print("\n----------------------------")
        print("\n{}\n".format(e))
        print("----------------------------\n")
        for i in range(n_runs):
            print("\nIteration", i)
            # The same estimator object is refitted on every run; no fresh
            # copy is created between repetitions.
            model = e
            init = time.time()
            model.fit(X_train_norm, y_train)
            y_pred = model.predict(X_test_norm)
            end = time.time()
            print('Elapsed time: {:.4f} s'.format(end - init))
    return y_pred, y_test

In [None]:
## Complete script
# For every label array parsed from "clusters.txt": split it into per-cluster
# index arrays, build one dataset per cluster with select_past_timesteps, run
# cluster_predict on it, and aggregate the per-cluster results.
all_clusters = read_clusters_from_file("clusters.txt")
# NOTE(review): within this cell df_houses is only referenced by the
# commented-out line inside the loop below.
df_houses = df.drop("Time", axis=1)
# This one list of estimator instances is passed to every cluster_predict call.
estimators = [LinearRegression(), xgb.XGBRegressor(), cuRF()]
all_cluster_preds = []
for c in all_clusters:
    cluster_houses = split_into_clusters(c)
    #list_df_clusters = dataframe_by_cluster(cluster_houses, df_houses)
    pred_list = []
    for clust in cluster_houses:
        # select_past_timesteps is imported from scripts.function_utils; presumably it
        # returns a DataFrame with an "Energy" column, which cluster_predict
        # needs — TODO confirm.
        df_cluster = select_past_timesteps("../data/porto_final_7days.csv", clust)
        c_pred, c_test = cluster_predict(df_cluster, estimators=estimators)
        pred_list.append((c_pred, c_test))
    # NOTE(review): aggregate_cluster_predictions is neither defined nor imported
    # in the visible part of this notebook — confirm it exists before running.
    agg_pred = aggregate_cluster_predictions(pred_list)
    all_cluster_preds.append(agg_pred)