In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error
sns.set_theme(style="darkgrid")

In [24]:
load_df = pd.read_csv('data/load.csv')

In [79]:
def df_formatting(load_df):
    load_df = pd.melt(load_df, id_vars=["meter_id", "date"], value_vars=load_df.columns.difference(["meter_id", "date"]), 
                                var_name="hour", value_name="load")
    load_df["hour"] = load_df["hour"].str.strip("h").astype(int) - 1
    load_df["timestamp"] = pd.to_datetime(load_df["date"] + " " + load_df["hour"].astype(str) + ":00:00", format="%m/%d/%Y %H:%M:%S")
    load_df["meter_id"] = load_df["meter_id"].astype(int)
    load_df['month'] = load_df['timestamp'].dt.month
    load_df['day_of_week'] = load_df['timestamp'].dt.dayofweek
    load_df['year'] = load_df['timestamp'].dt.year
    load_df = load_df.drop(columns=["date"])
    return load_df

data_df = df_formatting(load_df)

In [80]:
def train_test_split(df):
    train_df = df[df["year"] < 2011]
    test_df = df[df["year"] == 2011]
    train_df = train_df.drop(columns=["year"])
    test_df = test_df.drop(columns=["year"])
    return train_df, test_df
train_df, test_df = train_test_split(data_df)

In [39]:
def train_historical_averages_baseline(train_df):
    meter_historical_averages = train_df.groupby(["hour", "day_of_week", "month", "meter_id"]).mean().to_dict()["load"]
    time_historical_averages = train_df.groupby(["hour", "day_of_week", "month"]).mean().to_dict()["load"]
    return meter_historical_averages, time_historical_averages

In [81]:
def get_average(row, meter_historical_averages, time_historical_averages):
    try:
        return meter_historical_averages[(row["hour"], row["day_of_week"], row["month"], row["meter_id"])]
    except KeyError:
        return time_historical_averages[(row["hour"], row["day_of_week"], row["month"])]

In [82]:
def test_historical_averages_baseline(X_test, meter_historical_averages, time_historical_averages):
    X_test["load"] = X_test.apply(lambda row: get_average(row, meter_historical_averages, time_historical_averages), axis=1)
    return X_test["load"]

In [84]:
train_df = train_df.dropna()
test_df = test_df.dropna()
y_test = test_df["load"]
X_test = test_df.drop(columns="load")

In [85]:
meter_historical_averages, time_historical_averages = train_historical_averages_baseline(train_df)

In [87]:
y_pred = test_historical_averages_baseline(X_test, meter_historical_averages, time_historical_averages)
pred_df = pd.concat([X_test, y_pred], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)
y_pred = pred_df.groupby("timestamp").sum()["load"]
y_test = test_df.groupby("timestamp").sum()["load"]


rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print("Root Mean Squared Error on the test set: "+"{:.3f}".format(rmse))
print("Mean Absolute Error on the test set: "+"{:.3f}".format(mae))

Root Mean Squared Error on the test set: 119038.104
Mean Absolute Error on the test set: 62794.141
