In [2]:
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
warnings.filterwarnings('ignore',category=DeprecationWarning)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, KFold, RepeatedKFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.linear_model import LinearRegression
import time
import cudf
from cuml.ensemble import RandomForestRegressor as cuRF
import math
import xgboost as xgb
from tqdm import tqdm

#### Output file

In [3]:
# Text log that receives every model's timing and metric lines via print(..., file=sourceFile).
# It is opened in write mode, so an existing log at this path is overwritten.
sourceFile = open("gpu_logs/ensemble_coop.txt", 'w')

### Load the data

In [4]:
# Load the raw per-house energy readings.
df = pd.read_csv("data/coopernico_50houses_location.csv")
# Swap the 2nd and 3rd columns so the frame reads Time, Location, Energy.
# NOTE(review): an earlier comment said the first house (a public building) is left out
# because of its different profile, but nothing here filters rows -- confirm the CSV is
# already filtered.
df = df.iloc[:, [0, 2, 1]].reset_index(drop=True)
number_of_houses = df.Location.nunique()
# Row count of the second entry of value_counts(). `.iloc[1]` is explicit positional access;
# a bare `[1]` on a string-labelled Series relies on a deprecated integer fallback.
num_samples_per_house = df.Location.value_counts().iloc[1]
df

Unnamed: 0,Time,Location,Energy
0,2021-11-01 00:15:00,PT84ZD,0.144
1,2021-11-01 00:30:00,PT84ZD,0.144
2,2021-11-01 00:45:00,PT84ZD,0.144
3,2021-11-01 01:00:00,PT84ZD,0.144
4,2021-11-01 01:15:00,PT84ZD,0.028
...,...,...,...
1138131,2023-02-28 23:00:00,PT37FP,0.000
1138132,2023-02-28 23:15:00,PT37FP,1.000
1138133,2023-02-28 23:30:00,PT37FP,0.000
1138134,2023-02-28 23:45:00,PT37FP,1.000


In [4]:
#trunc = lambda x: math.trunc(10000 * x) / 10000
#df_trunc = pd.DataFrame(df['Energy']).applymap(trunc)
#df = pd.concat([df.iloc[:, 0:-1], df_trunc], axis=1)
#df

### Auxiliary functions

In [5]:
def plot_results(preds: np.array, actuals: np.array, title: str):
    """Scatter predictions against actual values with a dashed y=x reference line.

    Args:
        preds: Predicted values (plotted on the y axis).
        actuals: Ground-truth values (plotted on the x axis).
        title: Title shown above the figure.
    """
    plt.scatter(actuals, preds, c='b', label='predicted')

    # Pin both axes to start at zero while keeping the upper bounds chosen for the scatter.
    _, x_upper = plt.xlim()
    plt.xlim(0, x_upper)
    _, y_upper = plt.ylim()
    plt.ylim(0, y_upper)

    # Reference diagonal: points on this line are perfect predictions.
    plt.plot([0, 100], [0, 100], '--r', label='y=x')

    plt.title(title)
    plt.xlabel('actual')
    plt.ylabel('predicted')
    plt.legend()
    plt.show()

def truncate_metric(metric):
    """Cut ``metric`` off after four decimal places (truncation toward zero, no rounding)."""
    scale = 10000
    return math.trunc(scale * metric) / scale
    
def performance_metrics(preds: np.array, actuals: np.array, filename):
    """Compute MSE, WAPE (in percent) and R2, log them, and return them.

    Args:
        preds: Predicted values.
        actuals: Ground-truth values, same length as ``preds``.
        filename: Despite the name, a writable file-like object; it is handed
            to ``print(..., file=filename)``.

    Returns:
        Tuple ``(mse, wape, r2)``. Each value is truncated to four decimals;
        WAPE is truncated as a ratio and then multiplied by 100.
    """
    mse = truncate_metric(mean_squared_error(actuals, preds))

    # WAPE = sum of absolute errors / sum of absolute actuals.
    absolute_error_total = np.sum(np.abs(preds - actuals))
    absolute_actual_total = np.sum(np.abs(actuals))
    wape = truncate_metric(absolute_error_total / absolute_actual_total) * 100

    r2 = truncate_metric(r2_score(actuals, preds))

    report_lines = ('MSE: %.4f' % mse, 'WAPE: %.2f' % wape, 'R2: %.4f' % r2)
    for line in report_lines:
        print(line, file=filename)
    return mse, wape, r2

#@jit(target_backend='cuda')
def build_model(estimator, X_train: np.array, y_train: np.array, X_test: np.array):
    """Fit ``estimator`` on the training data and predict on ``X_test``.

    Args:
        estimator: Object exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place (no clone is made).
        X_train: Training features.
        y_train: Training targets.
        X_test: Features to predict on.

    Returns:
        Tuple ``(model, preds)`` where ``model`` is the same ``estimator``
        object after fitting and ``preds`` is the output of ``predict``.
    """
    estimator.fit(X_train, y_train)
    return estimator, estimator.predict(X_test)

def validate(estimator, X_train, y_train):
    """Cross-validate ``estimator`` and return scikit-learn's score dictionary.

    Both R2 and negated mean squared error are requested as scoring metrics;
    the fold strategy is left at ``cross_validate``'s default.
    """
    scoring = ['r2', 'neg_mean_squared_error']
    return cross_validate(estimator, X_train, y_train, scoring=scoring)
    

In [14]:
def total_averaged_metrics(metrics_list, filename):
    """Write the mean MSE, WAPE and R2 over all runs to a log.

    Args:
        metrics_list: Sequence of ``(mse, wape, r2)`` triples, one per run.
        filename: Writable file-like object passed to ``print(..., file=...)``.
    """
    run_count = len(metrics_list)
    columns = (
        ("MSE", [mse for mse, _wape, _r2 in metrics_list]),
        ("WAPE", [wape for _mse, wape, _r2 in metrics_list]),
        ("R2", [r2 for _mse, _wape, r2 in metrics_list]),
    )
    for label, values in columns:
        average = np.round(sum(values) / run_count, 3)
        print("Total Averaged {}: {}".format(label, average), file=filename)

def past_timesteps(df, number_of_timesteps, step_minutes=15):
    """Append lagged energy readings as feature columns, one house at a time.

    For every lag ``i`` in ``1..number_of_timesteps`` a column ``energy_lag_{i}``
    is added. It holds the ``Energy`` value recorded ``i`` rows earlier for the
    same ``Location``, but only when that earlier row is exactly
    ``i * step_minutes`` minutes older; otherwise the lag is missing. Rows with
    any missing value are dropped afterwards.

    Lags are computed within each ``Location`` group, so a reading from one
    house can never be used as the history of another house, even when their
    timestamps happen to line up across the group boundary.

    Args:
        df: Frame with ``Time``, ``Location`` and ``Energy`` columns. It is not
            modified; ``Time`` is parsed on an internal copy.
        number_of_timesteps: How many lag columns to create.
        step_minutes: Expected spacing between consecutive readings, in
            minutes. Defaults to 15.

    Returns:
        A new frame sorted by ``Location`` then ``Time``, with the lag columns
        appended after the original ones and a fresh 0..n-1 index.
    """
    # assign() builds a new frame, so the caller's `Time` column keeps its original dtype.
    df = df.assign(Time=pd.to_datetime(df['Time']))
    df = df.sort_values(by=['Location', 'Time'])

    by_location = df.groupby('Location', sort=False)
    lag_columns = {}
    for i in tqdm(range(1, (number_of_timesteps + 1))):
        lagged_time = by_location['Time'].shift(i)
        lagged_energy = by_location['Energy'].shift(i)
        # Keep the lag only where the series is gap-free over the last i steps.
        is_contiguous = lagged_time == df['Time'] - pd.Timedelta(minutes=i * step_minutes)
        lag_columns[f"energy_lag_{i}"] = lagged_energy.where(is_contiguous)

    # One concat instead of many single-column inserts avoids a fragmented frame.
    df = pd.concat([df, pd.DataFrame(lag_columns, index=df.index)], axis=1)
    df = df.dropna().reset_index(drop=True)
    return df

#def past_timesteps(df, number_of_timesteps):
#    # Sort the dataframe by location and time
#    df = df.sort_values(by=['Location', 'Time'])
#
#    # Compute rolling window over time axis to extract past energy values for each location
#    energy_lags = [f"energy_lag_{i}" for i in range(1, number_of_timesteps+1)]
#    past_energy = df.groupby('Location')['Energy'].rolling(window=number_of_timesteps, min_periods=1).apply(np.mean())
#    # Add new columns to the dataframe for each past energy value
#    for i, energy_lag in enumerate(energy_lags):
#        df[energy_lag] = past_energy.apply(lambda x: x[-i-1] if isinstance(x, list) and len(x)>=i+1 else x)
#
#    # Drop rows with missing values
#    df.dropna(inplace=True)
#    df.reset_index(drop=True, inplace=True)
#
#    return df


def last_energy_points_full(df, number_timesteps, num_samples_per_house, num_houses=None):
    """Build a lag-feature table by shifting ``Energy`` over a stacked frame.

    The frame is treated as equally sized per-house blocks of
    ``num_samples_per_house`` consecutive rows. Columns ``Energy_15``,
    ``Energy_30``, ... hold the energy 1, 2, ... rows earlier. The first
    ``number_timesteps`` rows of every house are removed, because their lags
    would come from the previous house (or do not exist).

    Args:
        df: Frame whose first two columns are kept as extra features and whose
            third column is the target; it must also have an ``Energy`` column.
        number_timesteps: Number of lag columns to create.
        num_samples_per_house: Number of rows belonging to each house.
        num_houses: Number of house blocks in ``df``. When omitted, the
            notebook-level ``number_of_houses`` variable is used.

    Returns:
        Frame with the two leading feature columns, the lag columns and a
        final ``Energy`` target column.

    Raises:
        AssertionError: If the number of removed rows is not
            ``num_houses * number_timesteps``.
    """
    if num_houses is None:
        # Backward-compatible fallback to the global house count.
        num_houses = number_of_houses

    other_feats = df.iloc[:, :2]
    X = pd.DataFrame({
        f'Energy_{i*15}': df['Energy'].shift(i)
        for i in range(1, (number_timesteps + 1))
    })
    y = df.iloc[:, 2].copy()
    y.iloc[:number_timesteps] = np.nan

    ## Remove samples in between each house
    # House h starts at row num_samples_per_house * h; blank its first
    # `number_timesteps` rows so no lag reaches back into the previous house.
    boundary_rows = [
        num_samples_per_house * h + i
        for h in range(1, num_houses)
        for i in range(number_timesteps)
    ]
    if boundary_rows:
        X.iloc[boundary_rows] = np.nan
        y.iloc[boundary_rows] = np.nan

    X = pd.concat([other_feats, X], axis=1).dropna().reset_index(drop=True)
    y = y.dropna().reset_index(drop=True)
    dataframe = pd.concat([X, y.rename('Energy')], axis=1)
    assert num_houses == (df.shape[0] - dataframe.shape[0]) / number_timesteps, "Something went wrong with preprocessing"
    return dataframe
    

In [7]:
def normalize_training(X_train):
    """Min-max scale the training features to the [0, 1] range.

    Returns:
        Tuple ``(X_scaled, scaler)``: the transformed training data and the
        fitted ``MinMaxScaler``, so the same scaling can be applied to other data.
    """
    scaler = MinMaxScaler(feature_range=(0,1))
    X_scaled = scaler.fit(X_train).transform(X_train)
    return X_scaled, scaler

### Cross Validate

In [8]:
#scores = validate(xgb.XGBRegressor(seed=0), X_train, y_train)
#cv_mse = np.round(scores['test_neg_mean_squared_error'].mean() * (-1),4)
#cv_r2 = np.round(scores['test_r2'].mean(),5)
#print("CV MSE: {} ".format(cv_mse))
#print("CV R2: {} ".format(cv_r2))
#metrics_list.append((cv_mse,cv_rmse,cv_mae,mape,cv_r2))

## Leave 10 houses for test (demonstration)

In [9]:
# Number of lag columns to build; past_timesteps spaces lags 15 minutes apart,
# so 97 steps reach back 24 h 15 min.
number_of_timesteps = 97

In [15]:
# Build the lagged-energy feature table (one energy_lag_<i> column per past time step).
df_new = past_timesteps(df, number_of_timesteps)
df_new

100%|██████████| 97/97 [00:03<00:00, 24.59it/s]


Unnamed: 0,Time,Location,Energy,energy_lag_1,energy_lag_2,energy_lag_3,energy_lag_4,energy_lag_5,energy_lag_6,energy_lag_7,...,energy_lag_88,energy_lag_89,energy_lag_90,energy_lag_91,energy_lag_92,energy_lag_93,energy_lag_94,energy_lag_95,energy_lag_96,energy_lag_97
0,2021-11-02 00:30:00,PT01NZ,1.000,0.000,1.000,0.000,1.000,0.000,0.000,1.000,...,1.000,0.000,1.000,0.000,0.000,1.000,0.000,1.000,0.000,1.000
1,2021-11-02 00:45:00,PT01NZ,0.000,1.000,0.000,1.000,0.000,1.000,0.000,0.000,...,1.000,1.000,0.000,1.000,0.000,0.000,1.000,0.000,1.000,0.000
2,2021-11-02 01:00:00,PT01NZ,1.000,0.000,1.000,0.000,1.000,0.000,1.000,0.000,...,0.000,1.000,1.000,0.000,1.000,0.000,0.000,1.000,0.000,1.000
3,2021-11-02 01:15:00,PT01NZ,1.000,1.000,0.000,1.000,0.000,1.000,0.000,1.000,...,1.000,0.000,1.000,1.000,0.000,1.000,0.000,0.000,1.000,0.000
4,2021-11-02 01:30:00,PT01NZ,0.000,1.000,1.000,0.000,1.000,0.000,1.000,0.000,...,0.000,1.000,0.000,1.000,1.000,0.000,1.000,0.000,0.000,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091882,2022-10-31 23:00:00,PT92QP,0.020,0.016,0.020,0.020,0.020,0.020,0.016,0.020,...,0.016,0.020,0.020,0.020,0.016,0.020,0.020,0.016,0.020,0.020
1091883,2022-10-31 23:15:00,PT92QP,0.020,0.020,0.016,0.020,0.020,0.020,0.020,0.016,...,0.020,0.016,0.020,0.020,0.020,0.016,0.020,0.020,0.016,0.020
1091884,2022-10-31 23:30:00,PT92QP,0.020,0.020,0.020,0.016,0.020,0.020,0.020,0.020,...,0.020,0.020,0.016,0.020,0.020,0.020,0.016,0.020,0.020,0.016
1091885,2022-10-31 23:45:00,PT92QP,0.016,0.020,0.020,0.020,0.016,0.020,0.020,0.020,...,0.020,0.020,0.020,0.016,0.020,0.020,0.020,0.016,0.020,0.020


In [16]:
# Calendar features derived from the timestamp.
timestamp_parts = df_new['Time'].dt
df_new['DayOfWeek'] = timestamp_parts.dayofweek
# dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
df_new['Weekend'] = timestamp_parts.dayofweek.isin([5, 6]).astype(int)
df_new['Hour'] = timestamp_parts.hour
df_new

Unnamed: 0,Time,Location,Energy,energy_lag_1,energy_lag_2,energy_lag_3,energy_lag_4,energy_lag_5,energy_lag_6,energy_lag_7,...,energy_lag_91,energy_lag_92,energy_lag_93,energy_lag_94,energy_lag_95,energy_lag_96,energy_lag_97,DayOfWeek,Weekend,Hour
0,2021-11-02 00:30:00,PT01NZ,1.000,0.000,1.000,0.000,1.000,0.000,0.000,1.000,...,0.000,0.000,1.000,0.000,1.000,0.000,1.000,1,0,0
1,2021-11-02 00:45:00,PT01NZ,0.000,1.000,0.000,1.000,0.000,1.000,0.000,0.000,...,1.000,0.000,0.000,1.000,0.000,1.000,0.000,1,0,0
2,2021-11-02 01:00:00,PT01NZ,1.000,0.000,1.000,0.000,1.000,0.000,1.000,0.000,...,0.000,1.000,0.000,0.000,1.000,0.000,1.000,1,0,1
3,2021-11-02 01:15:00,PT01NZ,1.000,1.000,0.000,1.000,0.000,1.000,0.000,1.000,...,1.000,0.000,1.000,0.000,0.000,1.000,0.000,1,0,1
4,2021-11-02 01:30:00,PT01NZ,0.000,1.000,1.000,0.000,1.000,0.000,1.000,0.000,...,1.000,1.000,0.000,1.000,0.000,0.000,1.000,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091882,2022-10-31 23:00:00,PT92QP,0.020,0.016,0.020,0.020,0.020,0.020,0.016,0.020,...,0.020,0.016,0.020,0.020,0.016,0.020,0.020,0,0,23
1091883,2022-10-31 23:15:00,PT92QP,0.020,0.020,0.016,0.020,0.020,0.020,0.020,0.016,...,0.020,0.020,0.016,0.020,0.020,0.016,0.020,0,0,23
1091884,2022-10-31 23:30:00,PT92QP,0.020,0.020,0.020,0.016,0.020,0.020,0.020,0.020,...,0.020,0.020,0.020,0.016,0.020,0.020,0.016,0,0,23
1091885,2022-10-31 23:45:00,PT92QP,0.016,0.020,0.020,0.020,0.016,0.020,0.020,0.020,...,0.016,0.020,0.020,0.020,0.016,0.020,0.020,0,0,23


In [18]:
# Write the engineered feature table to CSV, without the row index.
df_new.to_csv("data/coopernico_halfweek.csv", index=False)

In [14]:
def test_leave_house_out(df, estimator, locations, filename):
    """Train on all houses except ``locations`` and evaluate on the held-out ones.

    ``df`` must already contain its feature columns: every column other than
    ``Time``, ``Energy`` and ``Location`` is used as a model input. Features
    are min-max scaled with a scaler fitted on the training split only.

    Args:
        df: Feature table with ``Time``, ``Location`` and ``Energy`` columns.
        estimator: Object exposing ``fit`` and ``predict``; fitted in place.
        locations: Values of ``Location`` that form the test split.
        filename: Writable file-like object that receives the elapsed time and
            the metric lines.

    Returns:
        Tuple ``(mse, wape, r2, model)`` where ``model`` is the fitted ``estimator``.
    """
    is_held_out = df['Location'].isin(locations)
    test = df[is_held_out]
    train = df[~is_held_out]
    # Split sizes go to stdout, not to the log file.
    print("Train set: ", train.shape)
    print("Test set: ", test.shape)

    non_feature_columns = ['Time', 'Energy', 'Location']
    X_train = train.drop(non_feature_columns, axis=1)
    X_test = test.drop(non_feature_columns, axis=1)
    X_train_norm, scaler = normalize_training(X_train)
    X_test_norm = scaler.transform(X_test)

    # The reported time covers fitting plus prediction.
    started = time.time()
    estimator.fit(X_train_norm, train['Energy'])
    y_pred = estimator.predict(X_test_norm)
    elapsed = time.time() - started
    print('Elapsed time: {:.4f} s'.format(elapsed), file=filename)

    y_test = test['Energy'].values.reshape(-1)
    mse, wape, r2 = performance_metrics(y_pred, y_test, filename)
    return mse, wape, r2, estimator

## Predict 10 folds (5 splits × 2 repeats), each with about 10 random houses held out for test

In [8]:
# Load a precomputed feature table and rebind `df` to it.
# NOTE(review): this reads coopernico_oneweek.csv, not the coopernico_halfweek.csv written
# earlier in this notebook -- confirm this is the intended dataset.
df = pd.read_csv("data/coopernico_oneweek.csv")
df

Unnamed: 0,Time,Location,Energy,energy_lag_1,energy_lag_2,energy_lag_3,energy_lag_4,energy_lag_5,energy_lag_6,energy_lag_7,...,energy_lag_666,energy_lag_667,energy_lag_668,energy_lag_669,energy_lag_670,energy_lag_671,energy_lag_672,DayOfWeek,Weekend,Hour
0,2019-01-08 00:00:00,0,4.940000,3.952000,3.803000,4.095000,3.926000,4.917000,4.990000,4.524000,...,2.558000,3.174000,2.569000,2.694000,3.071000,2.584000,2.964000,1,0,0
1,2019-01-08 00:15:00,0,4.149000,4.940000,3.952000,3.803000,4.095000,3.926000,4.917000,4.990000,...,2.922000,2.558000,3.174000,2.569000,2.694000,3.071000,2.584000,1,0,0
2,2019-01-08 00:30:00,0,4.401000,4.149000,4.940000,3.952000,3.803000,4.095000,3.926000,4.917000,...,2.994000,2.922000,2.558000,3.174000,2.569000,2.694000,3.071000,1,0,0
3,2019-01-08 00:45:00,0,4.431000,4.401000,4.149000,4.940000,3.952000,3.803000,4.095000,3.926000,...,3.223000,2.994000,2.922000,2.558000,3.174000,2.569000,2.694000,1,0,0
4,2019-01-08 01:00:00,0,3.988000,4.431000,4.401000,4.149000,4.940000,3.952000,3.803000,4.095000,...,2.553000,3.223000,2.994000,2.922000,2.558000,3.174000,2.569000,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1757659,2020-01-01 22:45:00,50,0.753222,1.099100,0.453906,0.481770,0.483173,0.467528,1.001887,0.552398,...,1.710115,1.515524,2.055415,2.535657,1.772033,1.482914,1.723943,2,0,22
1757660,2020-01-01 23:00:00,50,0.716855,0.753222,1.099100,0.453906,0.481770,0.483173,0.467528,1.001887,...,2.010091,1.710115,1.515524,2.055415,2.535657,1.772033,1.482914,2,0,23
1757661,2020-01-01 23:15:00,50,0.735802,0.716855,0.753222,1.099100,0.453906,0.481770,0.483173,0.467528,...,1.194125,2.010091,1.710115,1.515524,2.055415,2.535657,1.772033,2,0,23
1757662,2020-01-01 23:30:00,50,0.485237,0.735802,0.716855,0.753222,1.099100,0.453906,0.481770,0.483173,...,0.676236,1.194125,2.010091,1.710115,1.515524,2.055415,2.535657,2,0,23


In [9]:
# Total number of missing values across the whole frame.
df.isna().sum().sum()

0

In [10]:
#num_houses_test = 10

In [11]:
# 5 splits x 2 repeats = 10 partitions of the houses into train and test groups.
kf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)
unique_locations = df['Location'].unique()
locations = []
for _, test_index in kf.split(unique_locations):
    # split() yields positions into `unique_locations`, not the location labels themselves.
    # Map them back so the folds stay correct when labels are not simply 0..N-1 in order.
    locations.append(unique_locations[test_index])
locations

[array([ 3, 12, 13, 17, 24, 30, 31, 32, 40, 43, 46]),
 array([ 4,  6,  8, 15, 19, 33, 36, 47, 48, 49]),
 array([ 0,  9, 11, 16, 25, 26, 27, 34, 44, 45]),
 array([ 1,  2,  5, 21, 23, 29, 35, 37, 39, 41]),
 array([ 7, 10, 14, 18, 20, 22, 28, 38, 42, 50]),
 array([ 0,  2, 10, 18, 23, 30, 36, 41, 45, 47, 49]),
 array([ 4,  8, 12, 20, 21, 26, 29, 31, 32, 33]),
 array([ 9, 14, 15, 22, 24, 37, 40, 42, 44, 48]),
 array([ 3,  5, 11, 17, 25, 28, 35, 38, 39, 50]),
 array([ 1,  6,  7, 13, 16, 19, 27, 34, 43, 46])]

In [12]:
#locations = []
#for i in range(10):
#    l = np.random.choice(df['Location'].unique(), size=num_houses_test, replace=False)
#    locations.append(l)
#locations

### Linear Regression

In [17]:
# Leave-houses-out evaluation of a plain linear regression.
metrics_list_lr = []
print("\n----------------------------", file=sourceFile)
print("\nLinear Regression\n", file=sourceFile)
print("----------------------------\n", file=sourceFile)
# Iterate over the folds themselves so the loop cannot drift from the fold count.
for i, test_locations in enumerate(locations):
    print("\nIteration", i, file=sourceFile)
    mse, wape, r2, model_lr = test_leave_house_out(df, LinearRegression(), test_locations, sourceFile)
    metrics_list_lr.append((mse, wape, r2))

Train set:  (1378560, 678)
Test set:  (379104, 678)


### XGBoost

In [None]:
# Leave-houses-out evaluation of a GPU-trained XGBoost regressor.
metrics_list_xgb = []
print("\n----------------------------", file=sourceFile)
print("\nXGBoost\n", file=sourceFile)
print("----------------------------\n", file=sourceFile)
# Iterate over the folds themselves so the loop cannot drift from the fold count.
for i, test_locations in enumerate(locations):
    print("\nIteration", i, file=sourceFile)
    # NOTE(review): `tree_method='gpu_hist'` appears to be deprecated in recent XGBoost
    # releases -- confirm against the installed version before changing it.
    mse, wape, r2, model_xgb = test_leave_house_out(df, xgb.XGBRegressor(tree_method='gpu_hist', seed=0), test_locations, sourceFile)
    metrics_list_xgb.append((mse, wape, r2))

### Random Forest

In [None]:
# Leave-houses-out evaluation of the cuML random forest with default settings.
metrics_list_rf = []
print("\n----------------------------", file=sourceFile)
print("\nRandom Forest\n", file=sourceFile)
print("----------------------------\n", file=sourceFile)
# Iterate over the folds themselves so the loop cannot drift from the fold count.
for i, test_locations in enumerate(locations):
    print("\nIteration", i, file=sourceFile)
    mse, wape, r2, model_rf = test_leave_house_out(df, cuRF(), test_locations, sourceFile)
    metrics_list_rf.append((mse, wape, r2))

## Averaged Metrics

In [None]:
# Blank lines in the log file before the averaged-metrics summary.
print("\n\n\n", file=sourceFile)

In [None]:
# Mean MSE / WAPE / R2 of linear regression over all evaluated folds.
print("\nLinear Regression", file=sourceFile)
total_averaged_metrics(metrics_list_lr, sourceFile)

In [None]:
# Mean MSE / WAPE / R2 of XGBoost over all evaluated folds.
print("\nXGBoost", file=sourceFile)
total_averaged_metrics(metrics_list_xgb, sourceFile)

In [None]:
# Mean MSE / WAPE / R2 of the random forest over all evaluated folds.
print("\nRandom Forest", file=sourceFile)
total_averaged_metrics(metrics_list_rf, sourceFile)

### Feature Importance

In [None]:
# Per-feature scores of the models left over from the last evaluation fold:
# raw coefficients for linear regression, importances for XGBoost.
# Features are identified by column position, not by name.
print('Linear Regression\n')
for feature_idx, coefficient in enumerate(model_lr.coef_):
    print('Feature: %0d, Score: %.5f' % (feature_idx, coefficient))
print('XGBoost\n')
for feature_idx, importance in enumerate(model_xgb.feature_importances_):
    print('Feature: %0d, Score: %.5f' % (feature_idx, importance))

In [None]:
# Bar chart of the XGBoost feature importances, indexed by feature position.
xgb_importances = model_xgb.feature_importances_
plt.bar(range(len(xgb_importances)), xgb_importances)
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.show()

In [None]:
# Bar chart of the linear regression coefficients, indexed by feature position.
lr_coefficients = model_lr.coef_
plt.bar(range(len(lr_coefficients)), lr_coefficients)
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.show()

In [None]:
# Close the log file so all buffered metric lines are written to disk.
sourceFile.close()