In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import holidays
from sklearn.metrics import mean_squared_error, mean_absolute_error
import gc
sns.set_theme(style="darkgrid")

In [24]:
load_df = pd.read_csv('data/load.csv')

In [79]:
def df_formatting(load_df):
    load_df = pd.melt(load_df, id_vars=["meter_id", "date"], value_vars=load_df.columns.difference(["meter_id", "date"]), 
                                var_name="hour", value_name="load")
    load_df["hour"] = load_df["hour"].str.strip("h").astype(int) - 1
    load_df["timestamp"] = pd.to_datetime(load_df["date"] + " " + load_df["hour"].astype(str) + ":00:00", format="%m/%d/%Y %H:%M:%S")
    load_df["meter_id"] = load_df["meter_id"].astype(int)
    load_df['month'] = load_df['timestamp'].dt.month
    load_df['day_of_week'] = load_df['timestamp'].dt.dayofweek
    load_df['year'] = load_df['timestamp'].dt.year
    load_df = load_df.drop(columns=["date"])
    return load_df

data_df = df_formatting(load_df)

In [80]:
def train_test_split(df):
    train_df = df[df["year"] < 2011]
    test_df = df[df["year"] == 2011]
    train_df = train_df.drop(columns=["year"])
    test_df = test_df.drop(columns=["year"])
    return train_df, test_df
train_df, test_df = train_test_split(data_df)

In [70]:
test_df[test_df["meter_id"]==499]

Unnamed: 0,meter_id,hour,load,month,day_of_week
397218,499,0,0.0,5,6
397219,499,0,0.0,5,0
397220,499,0,0.0,5,1
397221,499,0,0.0,5,2
397222,499,0,0.0,5,3
...,...,...,...,...,...
9539107,499,8,878.0,12,1
9539108,499,8,970.0,12,2
9539109,499,8,1234.0,12,3
9539110,499,8,1138.0,12,4


In [67]:
dict_train = train_df.groupby("meter_id")["load"].count().to_dict()
dict_test = test_df.groupby("meter_id")["load"].count().to_dict()
print(set(list(dict_train.keys())).difference(set(list(dict_test.keys()))))
print(set(list(dict_test.keys())).difference(set(list(dict_train.keys()))))

{4, 5, 166, 486, 270, 84, 186, 446}
{499}


In [72]:
filtered_train = {key: value for key, value in dict_train.items() if value < 51000}
print(filtered_train)

{4: 15838, 5: 37987, 84: 7295, 166: 16774, 186: 8015, 270: 42355, 446: 18933, 453: 48955, 454: 45283, 455: 45283, 456: 40196, 458: 37268, 468: 26301, 469: 26301, 470: 25557, 472: 22102, 477: 19726, 478: 18982, 481: 15013, 482: 13895, 486: 2232, 491: 9503, 492: 9071, 496: 7343}


In [76]:
filtered_test = {key: value for key, value in dict_test.items() if value < 8759}
print(filtered_test)

{236: 8016, 499: 5880}


In [39]:
def train_historical_averages_baseline(train_df):
    meter_historical_averages = train_df.groupby(["hour", "day_of_week", "month", "meter_id"]).mean().to_dict()["load"]
    time_historical_averages = train_df.groupby(["hour", "day_of_week", "month"]).mean().to_dict()["load"]
    return meter_historical_averages, time_historical_averages

In [81]:
def get_average(row, meter_historical_averages, time_historical_averages):
    try:
        return meter_historical_averages[(row["hour"], row["day_of_week"], row["month"], row["meter_id"])]
    except KeyError:
        return time_historical_averages[(row["hour"], row["day_of_week"], row["month"])]

In [82]:
def test_historical_averages_baseline(X_test, meter_historical_averages, time_historical_averages):
    X_test["load"] = X_test.apply(lambda row: get_average(row, meter_historical_averages, time_historical_averages), axis=1)
    return X_test["load"]

In [84]:
train_df = train_df.dropna()
test_df = test_df.dropna()
y_test = test_df["load"]
X_test = test_df.drop(columns="load")

In [85]:
meter_historical_averages, time_historical_averages = train_historical_averages_baseline(train_df)

In [87]:
y_pred = test_historical_averages_baseline(X_test, meter_historical_averages, time_historical_averages)
pred_df = pd.concat([X_test, y_pred], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)
y_pred = pred_df.groupby("timestamp").sum()["load"]
y_test = test_df.groupby("timestamp").sum()["load"]


rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print("Root Mean Squared Error on the test set: "+"{:.3f}".format(rmse))
print("Mean Absolute Error on the test set: "+"{:.3f}".format(mae))

Root Mean Squared Error on the test set: 119038.104
Mean Absolute Error on the test set: 62794.141


In [58]:
print(y_pred)
print(y_test)

2191       4229.037037
2192       4050.925926
2193       3768.814815
2194       3562.296296
2195       3767.846154
              ...     
9539107    8661.061927
9539108    8305.477960
9539109    8103.914361
9539110    8130.542413
9539111    7938.129758
Name: load, Length: 1406577, dtype: float64
2191       4592.0
2192       2499.0
2193       3052.0
2194       6146.0
2195       4543.0
            ...  
9539107     878.0
9539108     970.0
9539109    1234.0
9539110    1138.0
9539111     744.0
Name: load, Length: 1406577, dtype: float64
