In [1]:
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
warnings.filterwarnings('ignore',category=DeprecationWarning)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold, RepeatedKFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
import time
import math
import xgboost as xgb

In [2]:
# Dataset layout constants used by the preprocessing helpers further down.
number_of_houses = 51
num_samples_per_house = 35136

# NOTE: unpickling can execute arbitrary code; only load files you trust.
# Drop the stored index and reorder the columns by position so the frame is
# laid out as Time, Location, Energy (see the displayed output).
df = (
    pd.read_pickle("df_location.pkl")
    .reset_index(drop=True)
    .iloc[:, [0, 2, 1]]
)
df

Unnamed: 0,Time,Location,Energy
0,2019-01-01 00:00:00,0,2.964000
1,2019-01-01 00:15:00,0,2.584000
2,2019-01-01 00:30:00,0,3.071000
3,2019-01-01 00:45:00,0,2.694000
4,2019-01-01 01:00:00,0,2.569000
...,...,...,...
1791931,2020-01-01 22:45:00,50,0.753222
1791932,2020-01-01 23:00:00,50,0.716855
1791933,2020-01-01 23:15:00,50,0.735802
1791934,2020-01-01 23:30:00,50,0.485237


In [23]:
#trunc = lambda x: math.trunc(10000 * x) / 10000
#df_trunc = pd.DataFrame(df['Energy']).applymap(trunc)
#df = pd.concat([df.iloc[:, 0:-1], df_trunc], axis=1)
#df

Unnamed: 0,Time,Location,Energy
0,2019-01-01 00:00:00,0,2.9640
1,2019-01-01 00:15:00,0,2.5840
2,2019-01-01 00:30:00,0,3.0710
3,2019-01-01 00:45:00,0,2.6940
4,2019-01-01 01:00:00,0,2.5690
...,...,...,...
1791931,2020-01-01 22:45:00,50,0.7532
1791932,2020-01-01 23:00:00,50,0.7168
1791933,2020-01-01 23:15:00,50,0.7358
1791934,2020-01-01 23:30:00,50,0.4852


### Auxiliary functions

In [3]:
def plot_results(preds: np.array, actuals: np.array, title: str):
    """Scatter predicted against actual values with a y=x reference line.

    Args:
        preds: Predicted values, plotted on the y axis.
        actuals: Ground-truth values, plotted on the x axis.
        title: Title of the figure.

    Both axes are anchored at zero. The dashed reference line is drawn up to
    the larger of the two upper axis limits, so it spans the visible area
    regardless of the magnitude of the data.
    """
    plt.scatter(actuals, preds, c='b', label='predicted')
    plt.xlabel('actual')
    plt.ylabel('predicted')
    plt.title(title)

    # Keep the automatically chosen upper limits but start both axes at zero.
    x_upper = plt.xlim()[1]
    y_upper = plt.ylim()[1]
    plt.xlim(0, x_upper)
    plt.ylim(0, y_upper)

    # The axis limits are fixed explicitly above, so the diagonal is sized
    # from them instead of from a hard-coded end point.
    diagonal_end = max(x_upper, y_upper)
    plt.plot([0, diagonal_end], [0, diagonal_end], '--r', label='y=x')
    plt.legend()
    plt.show()
    
    
def performance_metrics(preds: np.array, actuals: np.array):
    """Print and return MSE, WAPE and R2 for a set of predictions.

    Args:
        preds: Predicted values (array-like).
        actuals: Ground-truth values, same length as ``preds``.

    Returns:
        Tuple ``(mse, wape, r2)``; WAPE is expressed in percent.

    Also prints a ``WAPE3`` figure, which uses the same absolute-error sum as
    WAPE but divides it by the (signed) sum of the predictions.
    """
    # Convert to plain arrays so the arithmetic below is positional; pandas
    # objects would otherwise be aligned on their index labels.
    preds = np.asarray(preds)
    actuals = np.asarray(actuals)

    # calculate performance metrics
    mse = mean_squared_error(actuals, preds)
    abs_error_sum = np.sum(np.abs(preds - actuals))
    wape = abs_error_sum / np.sum(np.abs(actuals)) * 100
    r2 = r2_score(actuals, preds)
    # Computed as a scalar so the '%f' formatting does not depend on
    # converting a one-element Series to float.
    wape3 = abs_error_sum / np.sum(preds) * 100

    # print performance metrics
    print('WAPE3: %.4f' % wape3)
    print('MSE: %.4f' % mse)
    print('WAPE: %.4f' % wape)
    print('R2: %.4f' % r2)
    return mse, wape, r2

#@jit(target_backend='cuda')
def build_model(estimator, X_train: np.array, y_train: np.array, X_test: np.array):
    """Fit ``estimator`` on the training data and predict on ``X_test``.

    Args:
        estimator: Object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training targets.
        X_test: Features to predict on.

    Returns:
        Tuple ``(model, preds)`` where ``model`` is the same object that was
        passed in (now fitted) and ``preds`` is the output of ``predict``.
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    return estimator, predictions

def validate(estimator, X_train, y_train):
    """Cross-validate ``estimator`` on the training data.

    Scores are computed for R2 and negated mean squared error; the raw result
    of ``cross_validate`` is returned unchanged.
    """
    metric_names = ['r2', 'neg_mean_squared_error']
    return cross_validate(estimator, X_train, y_train, scoring=metric_names)
    

In [4]:
def total_averaged_metrics(metrics_list):
    """Print the mean MSE, WAPE and R2 over a list of ``(mse, wape, r2)`` tuples.

    Each mean is rounded to three decimals before printing. Nothing is returned.
    """
    mse_total = 0
    wape_total = 0
    r2_total = 0
    # Accumulate the three metrics in a single pass over the runs.
    for mse, wape, r2 in metrics_list:
        mse_total += mse
        wape_total += wape
        r2_total += r2

    n_runs = len(metrics_list)
    print("Total Averaged MSE: {}".format(np.round(mse_total / n_runs, 3)))
    print("Total Averaged WAPE: {}".format(np.round(wape_total / n_runs, 3)))
    print("Total Averaged R2: {}".format(np.round(r2_total / n_runs, 3)))


def last_energy_points(df, number_timesteps, num_samples_per_house):
    """Build lagged-energy features and the matching target.

    Args:
        df: Frame with an ``Energy`` column. Rows are assumed to be stacked
            house after house, ``num_samples_per_house`` consecutive rows per
            house (the same layout assumption the positional indexing has
            always relied on).
        number_timesteps: Number of previous readings used as features.
        num_samples_per_house: Number of consecutive rows belonging to one house.

    Returns:
        Tuple ``(X, y)``. ``X`` has one column per lag, named
        ``Energy_<lag*15>``; ``y`` is the ``Energy`` Series for the same rows.
        Both carry a fresh 0..n-1 index. ``df`` is not modified.

    The first ``number_timesteps`` rows of every house are removed because
    their lag window would reach into the previous house (or before the start
    of the data). Rows are removed from ``X`` and ``y`` together so the two
    always stay aligned.
    """
    energy = df['Energy'].reset_index(drop=True)
    lag_columns = {
        f'Energy_{i*15}': energy.shift(i)
        for i in range(1, number_timesteps + 1)
    }
    X = pd.DataFrame(lag_columns, index=energy.index)

    # Position of each row inside its own house block; rows whose position is
    # below number_timesteps do not have a complete history within the house.
    offset_in_house = np.arange(len(energy)) % num_samples_per_house
    has_full_history = offset_in_house >= number_timesteps

    # Also discard rows with a missing lag or target, using a single mask so
    # features and target cannot drift apart.
    no_missing = X.notna().all(axis=1).to_numpy() & energy.notna().to_numpy()
    keep = has_full_history & no_missing

    X = X[keep].reset_index(drop=True)
    y = energy[keep].reset_index(drop=True)
    return X, y

def last_energy_points_full(df, number_timesteps, num_samples_per_house):
    """Return one frame with the leading columns, lag features and the target.

    Args:
        df: Frame whose first two columns are carried over unchanged, whose
            third column is the target, and which has an ``Energy`` column
            used to build the lags. Rows are assumed to be stacked house after
            house, ``num_samples_per_house`` consecutive rows per house.
        number_timesteps: Number of previous readings used as features.
        num_samples_per_house: Number of consecutive rows belonging to one house.

    Returns:
        DataFrame with the first two columns of ``df``, one ``Energy_<lag*15>``
        column per lag, and the target as ``Energy``; the index is 0..n-1.
        ``df`` is not modified.

    Raises:
        AssertionError: if more rows were removed than the
            ``number_timesteps`` leading rows of each house, which happens
            when the input itself contains missing values.
    """
    other_feats = df.iloc[:, :2].reset_index(drop=True)
    target = df.iloc[:, 2].reset_index(drop=True).rename('Energy')
    energy = df['Energy'].reset_index(drop=True)

    lag_columns = {
        f'Energy_{i*15}': energy.shift(i)
        for i in range(1, number_timesteps + 1)
    }
    lags = pd.DataFrame(lag_columns, index=energy.index)
    features = pd.concat([other_feats, lags], axis=1)

    # The first number_timesteps rows of every house would take their lags
    # from the previous house (or from before the data starts).
    offset_in_house = np.arange(len(df)) % num_samples_per_house
    crosses_house_start = offset_in_house < number_timesteps

    # One mask for features and target keeps the two aligned.
    no_missing = features.notna().all(axis=1).to_numpy() & target.notna().to_numpy()
    keep = ~crosses_house_start & no_missing

    dataframe = pd.concat([features[keep], target[keep]], axis=1).reset_index(drop=True)

    expected_removed = int(crosses_house_start.sum())
    assert df.shape[0] - dataframe.shape[0] == expected_removed, "Something went wrong with preprocessing"
    return dataframe

def build_predict_show(df, number_timesteps, estimator, normalize=False, train_size=0.8, start_timestep=1,
                       samples_per_house=None, random_state=None):
    """Train and evaluate ``estimator`` for a growing number of lag features.

    For every ``i`` from ``start_timestep`` to ``number_timesteps`` the data is
    rebuilt with ``i`` lags, split into train/test, cross-validated on the
    training part, fitted, and scored on the held-out part.

    Args:
        df: Source frame passed to ``last_energy_points``.
        number_timesteps: Largest number of lags to try (inclusive).
        estimator: Model exposing ``fit`` / ``predict``.
        normalize: When true, min-max scale the features (scaler fitted on the
            training split only).
        train_size: Fraction of rows used for training.
        start_timestep: Smallest number of lags to try.
        samples_per_house: Rows per house; defaults to the notebook-level
            ``num_samples_per_house``.
        random_state: Forwarded to ``train_test_split`` for reproducible splits.

    Returns:
        Tuple ``(model, preds, scores, metrics_list)`` from the last iteration
        (the first three are ``None`` if no iteration ran). Each entry of
        ``metrics_list`` is ``(cv_mse, cv_rmse, mae, wape, cv_r2)``: ``cv_*``
        come from cross-validation on the training split (``cv_rmse`` is the
        square root of the mean CV MSE), ``mae`` and ``wape`` from the
        held-out split. This layout matches the positions read by
        ``show_graphic_per_timestep``.
    """
    if samples_per_house is None:
        # Fall back to the constant defined next to the data load.
        samples_per_house = num_samples_per_house

    full_start = time.time()
    metrics_list = []
    model = preds = scores = None
    for i in range(start_timestep, number_timesteps + 1):
        start = time.time()
        print("\nIteration ", i)
        X, y = last_energy_points(df, i, samples_per_house)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_size, random_state=random_state
        )
        if normalize:
            scaler = MinMaxScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        scores = validate(estimator, X_train, y_train)
        model, preds = build_model(estimator, X_train, y_train, X_test)

        y_true = y_test.values.reshape(-1)
        mse, wape, r2 = performance_metrics(preds, y_true)
        mae = mean_absolute_error(y_true, preds)

        # The scorer reports negated MSE, hence the sign flip.
        cv_mse = np.round(scores['test_neg_mean_squared_error'].mean() * (-1), 4)
        cv_rmse = np.round(np.sqrt(cv_mse), 4)
        cv_r2 = np.round(scores['test_r2'].mean(), 5)
        print("CV MSE: {} ".format(cv_mse))
        print("CV R2: {} ".format(cv_r2))
        metrics_list.append((cv_mse, cv_rmse, mae, wape, cv_r2))
        print("\nElapsed time: %.3f seconds" % (time.time() - start))
    print("\nFull Elapsed time: %.3f seconds" % (time.time() - full_start))
    return model, preds, scores, metrics_list

def show_graphic_per_timestep(metrics_list, number_timesteps, start_timestep=1):
    """Plot MSE, WAPE and R2 against the number of past timesteps.

    Each entry of ``metrics_list`` is read by position: element 0 is plotted
    as MSE, element 3 as WAPE and element 4 as R2. The x axis runs from
    ``start_timestep`` to ``number_timesteps`` inclusive. One figure is shown
    per metric.
    """
    mse_list = [entry[0] for entry in metrics_list]
    wape_list = [entry[3] for entry in metrics_list]
    r2_list = [entry[4] for entry in metrics_list]

    timesteps = range(start_timestep, (number_timesteps + 1))
    panels = (
        (mse_list, 'MSE per past timestep', 'MSE'),
        (wape_list, 'WAPE per past timestep', 'WAPE'),
        (r2_list, 'R2 per past timestep', 'R2'),
    )
    for values, heading, y_label in panels:
        plt.plot(timesteps, values)
        plt.title(heading)
        plt.xlabel('Number of past timesteps')
        plt.ylabel(y_label)
        plt.show()
    

In [5]:
def normalize_training(X_train):
    """Min-max scale the training features to the range [0, 1].

    Returns:
        Tuple ``(X_train_scaled, scaler)``; the fitted scaler is returned so
        the same transformation can be applied to other data.
    """
    fitted_scaler = MinMaxScaler(feature_range=(0, 1))
    fitted_scaler.fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler

### Cross Validate

In [None]:
# Build the lag features inside this cell so it runs on a fresh kernel instead
# of relying on X_train / y_train left over from an earlier session.
cv_timesteps = 12
cv_frame = last_energy_points_full(df, cv_timesteps, num_samples_per_house)
X_cv = cv_frame.drop(['Time', 'Energy', 'Location'], axis=1)
y_cv = cv_frame['Energy']

scores = validate(xgb.XGBRegressor(seed=0), X_cv, y_cv)
# The scorer reports negated MSE, hence the sign flip.
cv_mse = np.round(scores['test_neg_mean_squared_error'].mean() * (-1),4)
cv_r2 = np.round(scores['test_r2'].mean(),5)
print("CV MSE: {} ".format(cv_mse))
print("CV R2: {} ".format(cv_r2))
# Results are only printed here; metrics_list is first created in the
# XGBoost section further down.

## Leave 10 houses out for testing (demonstration)

In [7]:
# Number of past readings used as lag features (Energy_15 ... Energy_180 for 12).
number_of_timesteps = 12

In [8]:
# Build the modelling frame: Time, Location, one column per lagged reading, and the Energy target.
df_new = last_energy_points_full(df, number_of_timesteps, num_samples_per_house)
df_new

Unnamed: 0,Time,Location,Energy_15,Energy_30,Energy_45,Energy_60,Energy_75,Energy_90,Energy_105,Energy_120,Energy_135,Energy_150,Energy_165,Energy_180,Energy
0,2019-01-01 03:00:00,0,3.047000,2.553000,3.223000,2.994000,2.922000,2.558000,3.174000,2.569000,2.694000,3.071000,2.584000,2.964000,3.310000
1,2019-01-01 03:15:00,0,3.310000,3.047000,2.553000,3.223000,2.994000,2.922000,2.558000,3.174000,2.569000,2.694000,3.071000,2.584000,2.514000
2,2019-01-01 03:30:00,0,2.514000,3.310000,3.047000,2.553000,3.223000,2.994000,2.922000,2.558000,3.174000,2.569000,2.694000,3.071000,2.972000
3,2019-01-01 03:45:00,0,2.972000,2.514000,3.310000,3.047000,2.553000,3.223000,2.994000,2.922000,2.558000,3.174000,2.569000,2.694000,2.976000
4,2019-01-01 04:00:00,0,2.976000,2.972000,2.514000,3.310000,3.047000,2.553000,3.223000,2.994000,2.922000,2.558000,3.174000,2.569000,2.916000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1791319,2020-01-01 22:45:00,50,1.099100,0.453906,0.481770,0.483173,0.467528,1.001887,0.552398,0.533286,0.533451,0.464143,0.552398,0.937120,0.753222
1791320,2020-01-01 23:00:00,50,0.753222,1.099100,0.453906,0.481770,0.483173,0.467528,1.001887,0.552398,0.533286,0.533451,0.464143,0.552398,0.716855
1791321,2020-01-01 23:15:00,50,0.716855,0.753222,1.099100,0.453906,0.481770,0.483173,0.467528,1.001887,0.552398,0.533286,0.533451,0.464143,0.735802
1791322,2020-01-01 23:30:00,50,0.735802,0.716855,0.753222,1.099100,0.453906,0.481770,0.483173,0.467528,1.001887,0.552398,0.533286,0.533451,0.485237


In [9]:
# Count missing values per column; every count in the output below is zero.
df_new.isna().sum()

Time          0
Location      0
Energy_15     0
Energy_30     0
Energy_45     0
Energy_60     0
Energy_75     0
Energy_90     0
Energy_105    0
Energy_120    0
Energy_135    0
Energy_150    0
Energy_165    0
Energy_180    0
Energy        0
dtype: int64

In [10]:
def test_leave_house_out(df, estimator, number_timesteps, num_houses_test, locations):
    """Train on all houses except ``locations`` and evaluate on those houses.

    Args:
        df: Source frame passed to ``last_energy_points_full``.
        estimator: Model exposing ``fit`` / ``predict``.
        number_timesteps: Number of lag features to build.
        num_houses_test: Accepted for call compatibility; not used in the body.
        locations: ``Location`` values whose rows form the test set.

    Returns:
        Tuple ``(mse, wape, r2)`` as reported by ``performance_metrics``.

    Features are min-max scaled with a scaler fitted on the training rows only.
    The printed elapsed time covers fitting plus prediction.
    """
    lagged = last_energy_points_full(df, number_timesteps, num_samples_per_house)
    held_out = lagged['Location'].isin(locations)
    test = lagged[held_out]
    train = lagged[~held_out]
    print("Train set: ", train.shape)
    print("Test set: ", test.shape)

    # Everything except the lag columns is dropped from the model inputs.
    non_feature_columns = ['Time', 'Energy', 'Location']
    X_train, y_train = train.drop(non_feature_columns, axis=1), train['Energy']
    X_test, y_test = test.drop(non_feature_columns, axis=1), test['Energy']

    X_train_norm, scaler = normalize_training(X_train)
    X_test_norm = scaler.transform(X_test)

    tic = time.time()
    estimator.fit(X_train_norm, y_train)
    y_pred = estimator.predict(X_test_norm)
    toc = time.time()
    print('Elapsed time: {:.4f} s'.format(toc - tic))

    mse, wape, r2 = performance_metrics(y_pred, y_test.values.reshape(-1))
    return mse, wape, r2

## Predict 10 folds, each with 10 random houses held out for testing

In [11]:
# Number of houses drawn for each random test group.
num_houses_test = 10

In [38]:
# Split the distinct houses into 5 folds, repeated twice with different
# shuffles, which yields 10 held-out groups.
kf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)
house_ids = df['Location'].unique()
# kf.split yields positions into house_ids; map them back to the actual
# Location labels so this also works when the labels are not 0..n-1 in order.
locations = [house_ids[test_index] for _, test_index in kf.split(house_ids)]
locations

[array([ 3, 12, 13, 17, 24, 30, 31, 32, 40, 43, 46]),
 array([ 4,  6,  8, 15, 19, 33, 36, 47, 48, 49]),
 array([ 0,  9, 11, 16, 25, 26, 27, 34, 44, 45]),
 array([ 1,  2,  5, 21, 23, 29, 35, 37, 39, 41]),
 array([ 7, 10, 14, 18, 20, 22, 28, 38, 42, 50]),
 array([ 0,  2, 10, 18, 23, 30, 36, 41, 45, 47, 49]),
 array([ 4,  8, 12, 20, 21, 26, 29, 31, 32, 33]),
 array([ 9, 14, 15, 22, 24, 37, 40, 42, 44, 48]),
 array([ 3,  5, 11, 17, 25, 28, 35, 38, 39, 50]),
 array([ 1,  6,  7, 13, 16, 19, 27, 34, 43, 46])]

In [13]:
# NOTE(review): this reassigns `locations`, replacing the RepeatedKFold-based
# groups built in the previous cell; keep only the variant you intend to use.
# A seeded generator makes the random groups reproducible across kernel restarts.
rng = np.random.default_rng(42)
house_ids = df['Location'].unique()
# Each group is drawn independently (without replacement inside a group), so
# the same house can appear in more than one group.
locations = [
    rng.choice(house_ids, size=num_houses_test, replace=False)
    for _ in range(10)
]
locations

[array([18, 34, 37, 38, 49, 45, 31, 16,  3,  9]),
 array([ 6,  2, 37, 42, 24,  0, 23,  1,  9, 31]),
 array([32, 11,  7, 26,  0, 31,  4, 42, 45,  1]),
 array([31, 22, 28, 35, 24, 34, 46,  3, 45, 13]),
 array([21, 38, 13, 34, 18,  5, 20, 44,  3, 10]),
 array([ 5, 11,  8, 42, 23, 21, 13, 35, 10, 40]),
 array([23, 11, 17, 50,  2, 10, 40, 42, 41, 21]),
 array([16, 32, 47,  0, 42, 17, 30, 27, 29, 50]),
 array([27, 31, 25, 21,  4, 38, 16, 17, 33, 22]),
 array([27, 13,  5, 17, 42, 36, 29, 43, 16, 26])]

### XGBoost

In [None]:
# One leave-houses-out run per held-out group; iterating over `locations`
# keeps the number of runs in step with the number of groups.
metrics_list = []
for i, held_out_houses in enumerate(locations):
    print("\nIteration", i)
    mse, wape, r2 = test_leave_house_out(df, xgb.XGBRegressor(seed=0), number_of_timesteps, num_houses_test, held_out_houses)
    metrics_list.append((mse, wape, r2))

### Linear Regression

In [None]:
# One leave-houses-out run per held-out group; iterating over `locations`
# keeps the number of runs in step with the number of groups.
metrics_list_lr = []
for i, held_out_houses in enumerate(locations):
    print("\nIteration", i)
    mse, wape, r2 = test_leave_house_out(df, LinearRegression(), number_of_timesteps, num_houses_test, held_out_houses)
    metrics_list_lr.append((mse, wape, r2))

### Random Forest

In [None]:
# One leave-houses-out run per held-out group. The forest is seeded, like the
# XGBoost runs, so repeated executions give the same numbers.
metrics_list_rf = []
for i, held_out_houses in enumerate(locations):
    print("\nIteration", i)
    mse, wape, r2 = test_leave_house_out(df, RandomForestRegressor(random_state=0), number_of_timesteps, num_houses_test, held_out_houses)
    metrics_list_rf.append((mse, wape, r2))

### Gradient Boosting

In [None]:
# One leave-houses-out run per held-out group. The held-out houses are passed
# explicitly (test_leave_house_out requires them), and the model is seeded,
# like the XGBoost runs, so repeated executions give the same numbers.
metrics_list_gb = []
for i, held_out_houses in enumerate(locations):
    print("\nIteration", i)
    mse, wape, r2 = test_leave_house_out(df, GradientBoostingRegressor(random_state=0), number_of_timesteps, num_houses_test, held_out_houses)
    metrics_list_gb.append((mse, wape, r2))

In [None]:
# Metrics averaged over all XGBoost runs.
total_averaged_metrics(metrics_list)

In [None]:
# Metrics averaged over all Linear Regression runs.
total_averaged_metrics(metrics_list_lr)

In [None]:
# Metrics averaged over all Random Forest runs.
total_averaged_metrics(metrics_list_rf)

In [None]:
# Metrics averaged over all Gradient Boosting runs.
total_averaged_metrics(metrics_list_gb)

In [None]:
# No fitted model exists at notebook level (test_leave_house_out keeps its
# model local), so one is fitted here on the full lag-feature frame, scaled
# the same way as in the experiments above.
importance_features = df_new.drop(['Time', 'Energy', 'Location'], axis=1)
importance_features_norm, _importance_scaler = normalize_training(importance_features)
model = xgb.XGBRegressor(seed=0)
model.fit(importance_features_norm, df_new['Energy'])

# Label the bars with the feature names instead of bare positions.
plt.bar(importance_features.columns, model.feature_importances_)
plt.title('Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.xticks(rotation=45)
plt.show()