## Import libraries

In [1]:
import numpy as np
import pandas as pd
import sys
sys.path.append('..')
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from cuml import RandomForestRegressor as cuRF
import xgboost as xgb
import math
import time
import re
from scripts.clustering_utils import read_clusters_from_file, cluster_predict, aggregate_cluster_predictions, dataframe_by_cluster, split_into_clusters, select_past_timesteps, cluster_predict_ew

In [2]:
# Forwarded to read_clusters_from_file() as its second argument when the
# clustering results are loaded (presumably the largest cluster count stored
# in that file -- TODO confirm against scripts.clustering_utils).
n_clusters = 16

## Read dataset

In [3]:
# Load the dataset; the CSV writes decimals with a comma.
df = pd.read_csv("~/datasets/Dataset.csv", decimal=",")
# One 15-minute timestamp per row, starting at 2019-01-01. The "15min" alias
# replaces "15T" (deprecated since pandas 2.2) and the number of periods is
# taken from the data instead of the hard-coded 35136.
df_index = pd.DataFrame(pd.date_range('2019-01-01', periods=len(df), freq='15min'))
# Prepend the timestamps as a "Time" column; both frames share the default
# integer index, so the rows line up one-to-one.
df = pd.concat([df_index.rename(columns={0: "Time"}), df], axis=1)
df

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,41,42,43,44,45,46,47,48,49,50
0,2019-01-01 00:00:00,2.964,2.322959,1.544607,0.778310,1.962012,2.677445,0.237877,0.689194,0.358525,...,0.898895,0.203825,0.221624,0.319531,0.830996,0.924987,0.219128,0.274880,0.990488,0.779475
1,2019-01-01 00:15:00,2.584,2.371797,1.544607,0.778310,1.962012,2.733737,0.192929,0.558967,0.358525,...,0.917793,0.165311,0.179747,0.319531,0.848467,0.944434,0.177722,0.222940,1.011313,0.795863
2,2019-01-01 00:30:00,3.071,2.415961,1.319880,0.665072,1.676555,2.784640,0.382869,1.109272,0.377198,...,0.934883,0.328060,0.356708,0.336174,0.864266,0.962019,0.352691,0.442426,1.030144,0.810682
3,2019-01-01 00:45:00,2.694,2.302538,1.319880,0.665072,1.676555,2.653908,0.442052,1.280743,0.377198,...,0.890992,0.378772,0.411848,0.336174,0.823691,0.916855,0.407209,0.510816,0.981781,0.772623
4,2019-01-01 01:00:00,2.569,2.363063,0.913154,0.460128,1.159919,2.723669,0.192242,0.556976,0.668500,...,0.914413,0.164722,0.179106,0.595793,0.845343,0.940956,0.177089,0.222146,1.007588,0.792932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35131,2020-01-01 22:45:00,1.664,2.244719,1.455982,0.733653,1.849437,2.587266,0.205654,0.595835,0.171793,...,0.868619,0.176214,0.191602,0.153109,0.803007,0.893832,0.189444,0.237645,0.957128,0.753222
35132,2020-01-01 23:00:00,1.659,2.136340,1.201186,0.605264,1.525786,2.462348,0.201219,0.582985,0.067223,...,0.826680,0.172414,0.187470,0.059912,0.764237,0.850676,0.185359,0.232519,0.910916,0.716855
35133,2020-01-01 23:15:00,1.664,2.192805,1.201186,0.605264,1.525786,2.527430,0.228585,0.662271,0.067223,...,0.848530,0.195862,0.212966,0.059912,0.784436,0.873160,0.210568,0.264142,0.934992,0.735802
35134,2020-01-01 23:30:00,1.697,1.446083,0.259545,0.130782,0.329682,1.666757,0.189302,0.548459,0.070958,...,0.559578,0.162203,0.176368,0.063241,0.517310,0.575820,0.174381,0.218749,0.616596,0.485237


In [4]:
# Display labels for the models, listed in the same order as the `estimators`
# list built in the training cell; passed to cluster_predict() and
# aggregate_cluster_predictions().
names = ["Linear Regression", "XGBoost", "Random Forest"]

In [None]:
# Complete script: for each clustering, run every estimator on each cluster,
# aggregate the per-cluster predictions and write the metrics to the log file.
all_clusters = read_clusters_from_file("new_clusters_porto_dtw.txt", n_clusters)
estimators = [
    LinearRegression(),
    xgb.XGBRegressor(tree_method='gpu_hist', seed=0, colsample_bytree=0.7, learning_rate=0.1, max_depth=10, n_estimators=500),
    cuRF(bootstrap=True, max_depth=12, min_samples_leaf=1, min_samples_split=2, n_estimators=500),
]
all_cluster_preds = []
# The context manager flushes and closes the log even when a fit or prediction
# raises part-way through the run.
with open("../../gpu_logs/new_clusters_porto_dtw_notew.txt", "w") as sourceFile:
    # Only the first three clusterings are evaluated; they are labelled from 6
    # upward in both the console output and the log.
    for n_c, c in enumerate(all_clusters[0:3], start=6):
        print("Number of Clusters: {}".format(n_c))
        print("Number of Clusters: {}".format(n_c), file=sourceFile)
        cluster_houses = split_into_clusters(c)
        clusters_preds, clusters_actuals = [], []
        for i, clust in enumerate(cluster_houses):
            print("Cluster {}".format(i))
            df_cluster = select_past_timesteps("../../data/porto_full_selected.csv", clust)
            c_pred, c_test = cluster_predict(df_cluster, estimators=estimators, names=names)
            clusters_preds.append(c_pred)
            clusters_actuals.append(c_test)
        agg_pred = aggregate_cluster_predictions(clusters_preds, clusters_actuals, names, sourceFile)
        all_cluster_preds.append(agg_pred)

In [None]:
# Load, for inspection, the same CSV whose path the training cell hands to
# select_past_timesteps(); the bare name on the last line displays the frame.
df_selected_features = pd.read_csv("../../data/porto_full_selected.csv")
df_selected_features

## Best predictions

In [15]:
# Load the metrics log that get_cluster_metrics() parses, keeping only the
# lines that contain at least one word character (blank lines are dropped).
has_word = re.compile(r'\w+').search
with open('../../gpu_logs/new_clusters_porto_euclidean.txt', 'r') as f:
    l = list(f)
lines = [s for s in l if has_word(s)]

In [16]:
def get_cluster_metrics(lines):
    """Parse a metrics log into parallel lists.

    The log is read as header lines that start with ``"Number"`` and end in
    the cluster count (``Number of Clusters: <k>``), each followed by
    repeating groups of four lines: a model name, then three
    ``label: value`` lines taken, in order, as MSE, WAPE and R2.

    Lines are stripped of their line terminator instead of blindly dropping
    the last character, so a final line without a trailing newline is parsed
    correctly, and the cluster count may have any number of digits.

    Args:
        lines: iterable of non-blank log lines (trailing newline optional).

    Returns:
        Tuple ``(model_names, mse, wape, r2, number_clust)``. ``mse`` and
        ``r2`` are floats as logged; ``wape`` is the logged value divided by
        100; ``number_clust`` holds one int per header line.

    Raises:
        ValueError: if a count or metric value is not numeric.
        IndexError: if a metric line contains no ``":"``.
    """
    model_names, number_clust = [], []
    mse, wape, r2 = [], [], []
    # Position inside the four-line group: 0 = model name, 1..3 = metrics.
    metric_slots = (mse, wape, r2)
    cursor = 0
    for raw in lines:
        line = raw.rstrip("\r\n")
        if line.startswith("Number"):
            # The count is the last token on the line, with or without a
            # space after the colon.
            number_clust.append(int(line.replace(":", " ").split()[-1]))
        elif cursor == 0:
            model_names.append(line)
            cursor = 1
        else:
            # float() tolerates the whitespace that follows the colon.
            metric_slots[cursor - 1].append(float(line.split(":")[1]))
            cursor = (cursor + 1) % 4
    # WAPE is logged on a 0-100 scale; rescale it to a fraction.
    wape = [value / 100 for value in wape]
    return model_names, mse, wape, r2, number_clust

In [17]:
def best_number_of_cluster(lines, models_per_run=3):
    """Summarise a metrics log and identify the entry with the highest R2.

    Args:
        lines: log lines in the format accepted by get_cluster_metrics().
        models_per_run: number of model entries logged under each
            "Number of Clusters" header. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        Tuple ``(min_mse, min_wape, max_r2, n_clusters, model_name)``. The
        three metric values are each taken independently over the whole log,
        so they may come from different entries; ``n_clusters`` and
        ``model_name`` describe the entry with the highest R2.

    Raises:
        ValueError: if the log contains no metric entries.
    """
    model_names, mse, wape, r2, number_clust = get_cluster_metrics(lines)
    best_entry = int(np.argmax(r2))
    # Entries are logged in groups of `models_per_run` per header, so integer
    # division maps an entry index back to the header it belongs to.
    best_run = best_entry // models_per_run
    # model_names is parallel to r2, so the winning name is read directly.
    return min(mse), min(wape), max(r2), number_clust[best_run], model_names[best_entry]

In [18]:
# Summarise the parsed log; the bare tuple on the last line is what the cell
# displays.
best_mse, best_wape, best_r2, best_n_clust, best_alg = best_number_of_cluster(lines)
best_mse, best_wape, best_r2, best_n_clust, best_alg

(0.4143, 0.2915, 0.8018, 8, 'Linear Regression')

In [None]:
# Show MSE, R2 and WAPE of the second logged model (presumably XGBoost, given
# the order of `names`) at the best cluster count.
model_names, mse, wape, r2, number_clust = get_cluster_metrics(lines)
# Find the run through the logged headers instead of assuming the cluster
# counts start at 2; each run holds 3 entries and offset 1 picks the second.
best_row = number_clust.index(best_n_clust) * 3 + 1
mse[best_row], r2[best_row], wape[best_row]