In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

from green_city.utils import index2datetime
from green_city.regression import plot_ts, error_metrics, train_test_time_split
from green_city.regression import seasons, time_of_day, forecast_dates

import warnings
# Blanket filter: every warning raised after this point is suppressed,
# including deprecation and convergence warnings from the libraries above.
warnings.filterwarnings('ignore')

# Fixed seed constant. Presumably passed as random_state to the models /
# searches imported above -- it is not used in the cells visible here.
RSEED = 42

In [7]:
# Forecast target: which building to model ("all" selects the aggregated
# data set when the data is loaded) and which column is predicted.
building_nr = "all"
column_to_predict = "net_load_kWh"
# NOTE(review): the lag features further down are built from 'net_load_kW',
# not 'net_load_kWh' -- confirm that both columns exist and that this
# difference is intended.

# Notebook-wide parameters that stay the same across experiments;
# they are meant to be logged to the mlflow server.
global_params = dict(
    building_nr=building_nr,
    predicted_feature=column_to_predict,
    resolution="daily",
)

In [8]:
# Set write_data to True for logging to mlflow and the SQL database.
# When it is False (the default) none of the optional dependencies below
# are imported and no connection is opened.
write_data = False

if write_data:

    # ## MLFLOW ##
    import mlflow
    from green_city.mlflow_config import get_mlflow_config

    flow_conf = get_mlflow_config()
    tracking_uri = flow_conf["TRACKING_URI"]
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(flow_conf["EXPERIMENT_NAME"])

    # ## DB CONNECTION ##
    from urllib.parse import quote_plus

    from sqlalchemy import create_engine
    from decouple import Config, RepositoryEnv

    # Credentials are read from a local file that is kept out of the notebook.
    config = Config(RepositoryEnv("../.db_credentials"))
    db_connection_credentials = {
        "database": config('POSTGRES_DB'),
        "user": config('POSTGRES_USER'),
        "password": config('POSTGRES_PASSWORD'),
        "host": config('POSTGRES_HOST'),
        "port": config('POSTGRES_PORT'),
    }
    # User and password must be URL-encoded before they are embedded in the
    # connection URL; otherwise characters such as '@', ':' or '/' in a
    # password are parsed as URL delimiters. The raw dict above is left
    # untouched so it can still be used as plain keyword arguments.
    # NOTE(review): assumes the values in .db_credentials are stored
    # unencoded -- confirm.
    DB_STRING = "postgresql://{user}:{password}@{host}:{port}/{database}".format(
        **{
            **db_connection_credentials,
            "user": quote_plus(db_connection_credentials["user"]),
            "password": quote_plus(db_connection_credentials["password"]),
        }
    )
    db = create_engine(DB_STRING)

In [9]:
# Load data: the aggregated file when building_nr is 'all', otherwise the
# file of the selected building.
if building_nr == 'all':
    data_path = "../data/preprocessed/Agg_buildings.csv"
else:
    data_path = f"../data/preprocessed/Building_{building_nr}.csv"

df = pd.read_csv(data_path)
# Parse the timestamps explicitly: casting with the unit-less dtype
# 'datetime64' via astype() is rejected by pandas >= 2.0, while
# pd.to_datetime works across pandas versions.
df['datetime'] = pd.to_datetime(df['datetime'])
# Index by timestamp and replace every missing value with 0.0.
df = df.set_index('datetime').fillna(0.0)

In [10]:
# --- Calendar features -------------------------------------------------
# Components of the datetime index, stored as categorical columns.
for part in ('hour', 'month', 'year'):
    df[part] = getattr(df.index, part).astype('category')

# Existing indicator columns are converted to categorical as well.
for flag in ('holiday', 'workday'):
    df[flag] = df[flag].astype('category')

# Coarser groupings derived through the project helpers `seasons` and
# `time_of_day`, plus the weekday number taken from the index.
df['season'] = df['month'].apply(seasons).astype('category')
df['time_of_day'] = df['hour'].apply(time_of_day).astype('category')
df['day_of_week'] = df.index.day_of_week.astype('category')

# --- Lag features ------------------------------------------------------
# The load shifted by 1..24 rows, plus a 168-row (24*7) lag.
for lag in range(1, 25):
    df[f'net_load_kW_lag{lag}'] = df['net_load_kW'].shift(lag)
df['net_load_kW_lag168'] = df['net_load_kW'].shift(24 * 7)

# --- Seasonality features ----------------------------------------------
# First and second harmonic (sine and cosine) for the yearly, weekly and
# daily cycle. Every column name carries the '365' suffix regardless of the
# cycle's actual period.
seasonal_cycles = (
    ('year', df.index.dayofyear, 365),
    ('week', df.index.dayofweek, 7),
    ('hour', df.index.hour, 24),
)
for prefix, position, period in seasonal_cycles:
    for suffix, factor in (('', 2), ('_2', 4)):
        angle = factor * np.pi * position / period
        df[f'{prefix}_sin365{suffix}'] = np.sin(angle)
        df[f'{prefix}_cos365{suffix}'] = np.cos(angle)

# --- Shifted weather forecasts -----------------------------------------
# Each 24h-ahead weather prediction column gets a copy shifted by 24 rows.
forecast_columns = (
    'pred_24h_diffuse_solar_W_m2',
    'pred_24h_direct_solar_W_m2',
    'pred_24h_outdoor_temp',
    'pred_24h_outdoor_hum',
)
for forecast_col in forecast_columns:
    df[forecast_col + '_shift'] = df[forecast_col].shift(periods=24)

# Drop every row with a missing value; this includes the leading rows for
# which the shifts above have no earlier data.
df = df.dropna()