In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

from green_city.utils import index2datetime
from green_city.regression import plot_ts, error_metrics, train_test_time_split
from green_city.regression import seasons, time_of_day, forecast_dates

#import warnings
#warnings.filterwarnings('ignore')

# Single seed handed to estimators via `random_state` so model fits are repeatable.
RSEED = 42

In [None]:
#def add_up(cols):
#    def add_up_cols(s):
#        net = sum([s[col] for col in cols])
#        return net
#    return add_up_cols

def load_external_features(path="../data/preprocessed/Building_1.csv",
                           forecast_start="2011-01-01"):
    """Build the exogenous (weather + calendar) feature frame.

    Reads one preprocessed building CSV and returns a frame indexed by the
    parsed ``datetime`` column with, in this order:

    * the four weather columns, where every row at or after ``forecast_start``
      holds the value of the matching ``pred_24h_*`` column instead of the
      recorded value,
    * ``holiday`` copied from the file,
    * integer ``hour``, ``month`` and ``weekday`` taken from the index,
    * first and second harmonic sin/cos encodings of day-of-year (period 365),
      day-of-week (period 7) and hour (period 24).

    Parameters
    ----------
    path : str, optional
        CSV file to read. It must provide ``datetime``, ``holiday``, the four
        weather columns and their ``pred_24h_``-prefixed counterparts.
    forecast_start : str or pandas.Timestamp, optional
        First timestamp for which recorded weather is replaced by predictions.

    Returns
    -------
    pandas.DataFrame
    """
    ex_cols = [
        "outdoor_temp",
        "outdoor_hum",
        "diffuse_solar_W_m2",
        "direct_solar_W_m2",
    ]

    # Parse dates while reading: casting to the unit-less 'datetime64' dtype
    # via .astype() raises a TypeError on pandas >= 2.0.
    df = pd.read_csv(path, parse_dates=["datetime"], index_col="datetime")

    # Explicit copy so the edits below never write into a slice of `df`
    # (avoids SettingWithCopyWarning and stays correct under copy-on-write).
    df_external = df[ex_cols].copy()

    # Replace recorded weather with its prediction from `forecast_start` on.
    in_forecast_period = df_external.index >= pd.Timestamp(forecast_start)
    for col in ex_cols:
        df_external[col] = df[col].mask(in_forecast_period, df[f"pred_24h_{col}"])

    df_external["holiday"] = df["holiday"]
    # Plain calendar fields.
    df_external = df_external.assign(
        hour=df_external.index.hour,
        month=df_external.index.month,
        weekday=df_external.index.weekday,
    )

    # Cyclic encodings. The "365" suffix on the week/hour columns is a
    # historical misnomer; the names are kept because downstream code selects
    # features by column name.
    cycles = (
        ("year", df.index.dayofyear, 365),
        ("week", df.index.dayofweek, 7),
        ("hour", df.index.hour, 24),
    )
    for prefix, position, period in cycles:
        phase = 2 * np.pi * np.asarray(position, dtype=float) / period
        df_external[f"{prefix}_sin365"] = np.sin(phase)
        df_external[f"{prefix}_cos365"] = np.cos(phase)
        df_external[f"{prefix}_sin365_2"] = np.sin(2 * phase)
        df_external[f"{prefix}_cos365_2"] = np.cos(2 * phase)

    return df_external

def load_features(i, energy_usage_cols, data_dir="../data/preprocessed"):
    """Load the selected load components of one building.

    Parameters
    ----------
    i : int
        Zero-based building index; the file read is ``Building_{i+1}.csv``.
    energy_usage_cols : iterable of str
        Columns to return. ``neg_solar_generation_kW`` is derived here as the
        negated ``solar_generation_kW`` column and may be requested as well.
    data_dir : str, optional
        Directory containing the preprocessed building CSV files.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the parsed ``datetime`` column, holding exactly the
        requested columns in the requested order.
    """
    # Parse dates while reading: casting to the unit-less 'datetime64' dtype
    # via .astype() raises a TypeError on pandas >= 2.0.
    building = pd.read_csv(
        f"{data_dir}/Building_{i+1}.csv",
        parse_dates=["datetime"],
        index_col="datetime",
    ).assign(neg_solar_generation_kW=lambda x: -x.solar_generation_kW)
    return building[list(energy_usage_cols)]

# For the last year the actual temperature and humidity have to be replaced
# with their 24h predictions.

# Load components that are forecast separately for every building.
energy_usage_cols = [
    'equipment_electric_power_kW',
    'dhw_heating_kW',
    'neg_solar_generation_kW',
    'electric_load_hp_kW',
]

# Earlier sanity check (kept for reference): the components summed per row
# matched net_load_kW up to rounding.
#df["diff"] = df["my_net"] - df["net_load_kW"]


# Load data: one frame per building, then one wide frame keyed by
# (building index, component name).
dfs = [load_features(building_idx, energy_usage_cols) for building_idx in range(9)]

all_features = pd.DataFrame({
    (building_idx, component): building_df[component]
    for building_idx, building_df in enumerate(dfs)
    for component in energy_usage_cols
})
external_features = load_external_features()

# 9 buildings x 4 components = 36 series to predict individually

# Create a prediction based on all buildings loads separately
- at the end the error can be split up to the individual buildings
- and to the individual components (is heating mispredicted more than solar?)
- because
  + error_tot = error_1_tot + ... + error_9_tot and
  + error_1_tot = error_1_eqp + error_1_solar + error_1_heating + error_1_cooling
- The components for each building are:
  + `equipment_electric_power_kW + dhw_heating_kW + cooling_load_kW - solar_generation_kW + electric_load_hp_kW`
- second step: use weather etc. for predictions

In [None]:
def predict_target_col(df, lags=(24, 24*7)):
    """Fit a random forest on one target series and return its test forecast.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``target`` column; every other column, plus the lagged
        target columns created here, is used as a feature. The frame passed in
        is left unmodified.
    lags : iterable of int, optional
        Row offsets by which ``target`` is shifted to build the
        ``target_shift_<lag>`` features. The defaults correspond to one day
        and one week if rows are hourly (assumed; TODO confirm).

    Returns
    -------
    The output of ``rf.predict(X_test)`` for the test split produced by
    ``train_test_time_split``. No index is attached, so callers have to supply
    the matching timestamps themselves.
    """
    dummy_features = ['hour', 'month', 'holiday', 'weekday']
    target = 'target'

    # Work on a copy: adding the lag columns must not leak into the caller's frame.
    df = df.copy()

    # Lagged copies of the target.
    # NOTE(review): the first `lag` rows are filled with 0 rather than NaN, so
    # they enter training with an artificial lag value.
    for lag in lags:
        df[f"target_shift_{lag}"] = df[target].shift(lag, fill_value=0)

    features = [c for c in df.columns if c != target]

    # Train/test split; the two returned frames are not needed here.
    X_train, y_train, X_test, y_test, _df_train, _df_test = train_test_time_split(
        df, features, target, dummy_features
    )

    # Fit on the training data
    rf = RandomForestRegressor(random_state=RSEED)
    rf.fit(X_train, y_train)

    # Predict train and test data
    y_pred_train = rf.predict(X_train)
    y_pred_test = rf.predict(X_test)

    # Called for its reporting; the return value is not used.
    error_metrics(y_train, y_pred_train, y_test, y_pred_test, title="Random Forest Regressor")
    return y_pred_test

# summing up all single components

In [None]:
# Buildings (first level of `all_features.columns`) whose components are forecast.
# None means all of them, which is what the comparison against the aggregated
# net load needs. A leftover filter used to restrict this to building index 5,
# so the summed forecast covered a single building only.
# For a quick debugging run set this to e.g. {5}.
buildings_to_predict = None

predictions_df_dict = {}
for (building_nr, feature_name) in all_features.columns:
    if buildings_to_predict is not None and building_nr not in buildings_to_predict:
        continue
    target = all_features[(building_nr, feature_name)].rename("target")

    input_df = pd.concat([target, external_features], axis=1)

    # One model per (building, component) series.
    predictions_df_dict[(building_nr, feature_name)] = predict_target_col(input_df)

In [None]:
# Row-wise sum of all component forecasts, labelled with the hourly timestamps
# of 2011. The DataFrame constructor raises if the number of predictions does
# not match the length of the date range.
# An offset object is passed as `freq` because the 'H' alias is deprecated
# since pandas 2.2.
new_preds = pd.DataFrame({
    'pred': pd.DataFrame(predictions_df_dict).sum(axis=1),
    'datetime': pd.date_range('2011-01-01', '2011-12-31 23:00', freq=pd.offsets.Hour()),
}).set_index('datetime')

# Aggregated net load. Dates are parsed while reading, since casting to the
# unit-less 'datetime64' dtype via .astype() raises on pandas >= 2.0.
total_net = pd.read_csv(
    "../data/preprocessed/Agg_buildings.csv",
    parse_dates=["datetime"],
    index_col="datetime",
)["net_load_kW"]

# Aligned on the datetime index, so only the rows present in new_preds are kept.
new_preds["actual"] = total_net

In [None]:
# Overlay actual and predicted load, then show the MAE as the cell output.
new_preds.loc[:, ["actual", "pred"]].plot()
mean_absolute_error(y_true=new_preds["actual"], y_pred=new_preds["pred"])

# only total net

In [None]:
# Directly predict the aggregated net usage, with the net load itself as target.
input_df = pd.concat([total_net, external_features], axis=1).rename(columns={'net_load_kW': 'target'})

# Now we can start predicting
net_prediction = predict_target_col(input_df)

# Label the forecast with the hourly timestamps of 2011. An offset object is
# passed as `freq` because the 'H' alias is deprecated since pandas 2.2.
net_pred_df = pd.DataFrame({
    'pred': net_prediction,
    'datetime': pd.date_range('2011-01-01', '2011-12-31 23:00', freq=pd.offsets.Hour()),
}).set_index('datetime')

# Aligned on the datetime index.
net_pred_df["actual"] = total_net

net_pred_df[["actual", "pred"]].plot()
# Arguments in scikit-learn's (y_true, y_pred) order; MAE is symmetric, so the value is unchanged.
mean_absolute_error(net_pred_df["actual"], net_pred_df["pred"])

# using component predictions as additional inputs
- for this I have to predict in the train years as well, where the error may be different.