# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import TargetEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error

from xgboost import XGBRegressor

# Loading the Dataset

In [2]:
# Load dataset
df = pd.read_csv("data/building_dataset.csv")

# Display first few rows
df.head()

Unnamed: 0,BuildingID,BuildingType,YearBuilt,MonthlyElectricityConsumption_kWh,PeakUsageTime_Hour,RenewableCapacity_kWh,RenewableType,RenewableContributionPercentage,EnergySource,EnergyEfficiency_kWh_per_m2,WeatherData_Temperature_C,WeatherData_SolarIntensity_Hours,WeatherData_WindSpeed_km_h
0,B000001,Educational,,673.62,8,3292.66,,30.27,Electricity,21.15,12.51,5.44,71.64
1,B000002,Agricultural,,294.6,10,0.0,Tidal,0.0,Coal,21.01,18.22,8.91,10.85
2,B000003,Retail,2020.0,210.2,20,6483.89,Solar,90.72,Electricity,34.53,39.4,3.59,13.45
3,B000004,Commercial,,174.14,5,12150.11,Wind,45.16,Biomass,48.66,17.97,3.65,40.85
4,B000005,Educational,,61.27,23,3516.48,Solar,96.59,Electricity,25.46,16.18,8.11,72.2


# Data Preprocessing

In [3]:
df = df.drop(columns=['BuildingID', 'YearBuilt', 'RenewableContributionPercentage'])

In [4]:
X = df.drop(columns='MonthlyElectricityConsumption_kWh')
y = df['MonthlyElectricityConsumption_kWh']

categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# Pipeline Models

In [5]:
categorical_transformer = Pipeline(steps=[
    ('encoder', TargetEncoder(random_state=42))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer()),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

pipeline_svr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', SVR())
])

pipeline_dtr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(random_state=42))
])

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [6]:
cv_results_rf = cross_validate(pipeline_rf, X, y, cv=5, scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'r2','neg_root_mean_squared_error'], return_train_score=False, n_jobs=-1)
cv_results_SVR = cross_validate(pipeline_svr, X, y, cv=5, scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'r2','neg_root_mean_squared_error'], return_train_score=False, n_jobs=-1)
cv_results_dtr = cross_validate(pipeline_dtr, X, y, cv=5, scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'r2','neg_root_mean_squared_error'], return_train_score=False, n_jobs=-1)

In [7]:
rf_mse = -(cv_results_rf['test_neg_mean_squared_error'].mean())
rf_mae = -(cv_results_rf['test_neg_mean_absolute_error'].mean())
rf_mape = -(cv_results_rf['test_neg_mean_absolute_percentage_error'].mean())
rf_rmse =-(cv_results_rf['test_neg_root_mean_squared_error'].mean())
rf_r2 = (cv_results_rf['test_r2'].mean())

std_mse = -(cv_results_rf['test_neg_mean_squared_error'].std())
std_mae = -(cv_results_rf['test_neg_mean_absolute_error'].std())
std_mape = -(cv_results_rf['test_neg_mean_absolute_percentage_error'].std())
std_r2 = (cv_results_rf['test_r2'].std())
std_rmse = (cv_results_rf['test_neg_root_mean_squared_error'].std())

print(f'mse = {rf_mse} +- {std_mse}')
print(f'mae = {rf_mae} +- {std_mae}')
print(f'mape = {rf_mape} +- {std_mape}')
print(f'r2 = {rf_r2} +- {std_r2}')
print(f'rmse = {rf_rmse} +- {std_rmse}')

mse = 46920.79774004162 +- -959.7127120080833
mae = 125.79235799957851 +- -1.5832048822131086
mape = 0.6578482431865901 +- -0.010412219337742005
r2 = 0.7034433143924248 +- 0.03328018854381374
rmse = 216.6007286686351 +- 2.2185761780800553


In [8]:
rf_mse = -(cv_results_SVR['test_neg_mean_squared_error'].mean())
rf_mae = -(cv_results_SVR['test_neg_mean_absolute_error'].mean())
rf_mape = -(cv_results_SVR['test_neg_mean_absolute_percentage_error'].mean())
rf_rmse =-(cv_results_SVR['test_neg_root_mean_squared_error'].mean())
rf_r2 = (cv_results_SVR['test_r2'].mean())

std_mse = -(cv_results_SVR['test_neg_mean_squared_error'].std())
std_mae = -(cv_results_SVR['test_neg_mean_absolute_error'].std())
std_mape = -(cv_results_SVR['test_neg_mean_absolute_percentage_error'].std())
std_r2 = (cv_results_SVR['test_r2'].std())
std_rmse = (cv_results_SVR['test_neg_root_mean_squared_error'].std())

print(f'mse = {rf_mse} +- {std_mse}')
print(f'mae = {rf_mae} +- {std_mae}')
print(f'mape = {rf_mape} +- {std_mape}')
print(f'r2 = {rf_r2} +- {std_r2}')
print(f'rmse = {rf_rmse} +- {std_rmse}')

mse = 176338.6639086408 +- -18552.003456182578
mae = 229.24847667871168 +- -6.634792870287341
mape = 1.3950678949599935 +- -0.0313018879593966
r2 = -0.10193919073105402 +- 0.009440657490027286
rmse = 419.34233999272664 +- 22.150977361399377


In [9]:
rf_mse = -(cv_results_dtr['test_neg_mean_squared_error'].mean())
rf_mae = -(cv_results_dtr['test_neg_mean_absolute_error'].mean())
rf_mape = -(cv_results_dtr['test_neg_mean_absolute_percentage_error'].mean())
rf_rmse =-(cv_results_dtr['test_neg_root_mean_squared_error'].mean())
rf_r2 = (cv_results_dtr['test_r2'].mean())

std_mse = -(cv_results_dtr['test_neg_mean_squared_error'].std())
std_mae = -(cv_results_dtr['test_neg_mean_absolute_error'].std())
std_mape = -(cv_results_dtr['test_neg_mean_absolute_percentage_error'].std())
std_r2 = (cv_results_dtr['test_r2'].std())
std_rmse = (cv_results_dtr['test_neg_root_mean_squared_error'].std())

print(f'mse = {rf_mse} +- {std_mse}')
print(f'mae = {rf_mae} +- {std_mae}')
print(f'mape = {rf_mape} +- {std_mape}')
print(f'r2 = {rf_r2} +- {std_r2}')
print(f'rmse = {rf_rmse} +- {std_rmse}')

mse = 89209.73570648827 +- -5868.854684174787
mae = 167.23366639141528 +- -3.6809645324832165
mape = 0.7980081093816169 +- -0.011437524355507473
r2 = 0.4368812046839584 +- 0.0655512760019683
rmse = 298.5185176457725 +- 9.819894554366574
