# Non-time-series baseline

In [2]:
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import holidays
# Set seaborn's global plotting theme to the "darkgrid" style.
sns.set_theme(style="darkgrid")

# Data preprocessing and feature engineering

In [3]:
# Read the four raw CSV inputs (load, meter hierarchy, relative humidity, temperature)
# in a single pass; the order of the paths matches the order of the target names.
load_df, hierarchy_df, humidity_df, temperature_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/load.csv',
        "data/hierarchy.csv",
        "data/relative humidity.csv",
        "data/temperature.csv",
    )
)

In [4]:
def _add_hourly_timestamp(weather_df):
    """Return a copy of a weather frame with `date`/`hr` replaced by a `timestamp` column."""
    # `hr` is shifted down by one before being used as the hour of the timestamp.
    timestamp = pd.to_datetime(
        weather_df["date"] + " " + (weather_df["hr"] - 1).astype(str) + ":00:00",
        format="%d%b%Y %H:%M:%S",
    )
    # drop() returns a new frame, so the caller's frame is never modified.
    return weather_df.drop(columns=["date", "hr"]).assign(timestamp=timestamp)


def df_formatting(load_df, hierarchy_df, humidity_df, temperature_df):
    """Reshape the raw load data to long format and join hierarchy and weather onto it.

    None of the input frames is modified; the timestamp columns are built on
    derived frames only, so the cell can be re-run without leaking state into
    the raw inputs.

    Parameters
    ----------
    load_df : pandas.DataFrame
        Wide load table with `meter_id`, `date` (parsed as ``%m/%d/%Y``) and one
        column per hour. Hour column names are stripped of "h" and converted to
        int, then shifted down by one (presumably `h1`..`h24` -> 0..23; confirm
        against the data).
    hierarchy_df : pandas.DataFrame
        Table keyed by `meter_id`; every other column is cast to `str` after the join.
    humidity_df, temperature_df : pandas.DataFrame
        Weather tables with `date` (parsed as ``%d%b%Y``) and `hr` columns plus
        one column per measurement.

    Returns
    -------
    pandas.DataFrame
        One row per meter and hour with `meter_id`, `load`, `timestamp`, the
        hierarchy columns and the weather columns (left-joined on `timestamp`).
    """
    id_columns = ["meter_id", "date"]
    long_load = pd.melt(
        load_df,
        id_vars=id_columns,
        value_vars=load_df.columns.difference(id_columns),
        var_name="hour",
        value_name="load",
    )
    long_load["hour"] = long_load["hour"].str.strip("h").astype(int) - 1
    long_load["timestamp"] = pd.to_datetime(
        long_load["date"] + " " + long_load["hour"].astype(str) + ":00:00",
        format="%m/%d/%Y %H:%M:%S",
    )
    long_load["meter_id"] = long_load["meter_id"].astype(int)
    long_load = long_load.drop(columns=["date", "hour"])

    data_df = long_load.merge(hierarchy_df, on="meter_id", how="left")
    # Hierarchy labels are categorical identifiers, so keep them as strings.
    hierarchy_columns = hierarchy_df.columns.difference(["meter_id"])
    data_df[hierarchy_columns] = data_df[hierarchy_columns].astype(str)

    # Left joins keep every load row even when a weather reading is missing.
    for weather_df in (humidity_df, temperature_df):
        data_df = data_df.merge(_add_hourly_timestamp(weather_df), on="timestamp", how="left")
    return data_df

# Build the long-format modelling table from the four raw inputs.
joined_data_df = df_formatting(
    load_df=load_df,
    hierarchy_df=hierarchy_df,
    humidity_df=humidity_df,
    temperature_df=temperature_df,
)

In [5]:
# Allow up to 100 rows to be displayed so the dtype listing is not truncated.
pd.options.display.max_rows = 100
joined_data_df.dtypes

meter_id              int32
load                float64
timestamp    datetime64[ns]
mid_level            object
aggregate            object
rh_ws1              float64
rh_ws2              float64
rh_ws3              float64
rh_ws4              float64
rh_ws5              float64
rh_ws6              float64
rh_ws7              float64
rh_ws8              float64
rh_ws9              float64
rh_ws10             float64
rh_ws11             float64
rh_ws12             float64
rh_ws13             float64
rh_ws14             float64
rh_ws15             float64
rh_ws16             float64
rh_ws17             float64
rh_ws18             float64
rh_ws19             float64
rh_ws20             float64
rh_ws21             float64
rh_ws22             float64
rh_ws23             float64
rh_ws24             float64
rh_ws25             float64
rh_ws26             float64
rh_ws27             float64
rh_ws28             float64
t_ws1               float64
t_ws2               float64
t_ws3               

In [6]:
def feature_engineering(df):
    """Derive calendar features and one-hot encodings from the joined frame.

    Adds `year` and `is_holiday` (US holidays for Massachusetts) and replaces
    `meter_id`, `mid_level`, `aggregate`, month and day of week with one-hot
    indicator columns (the first level of each is dropped).

    The input frame is not modified: the features are computed as standalone
    Series and attached to a new frame, so re-running the cell or reusing the
    input afterwards is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime `timestamp` column plus `meter_id`,
        `mid_level` and `aggregate`.

    Returns
    -------
    pandas.DataFrame
        The remaining input columns, then `year` and `is_holiday`, followed by
        the `meter_*`, `mid_level_*`, `aggregate_*`, `month_*` and `dow_*`
        indicator columns.
    """
    timestamps = df['timestamp']
    year = timestamps.dt.year
    month = timestamps.dt.month
    day_of_week = timestamps.dt.dayofweek

    # Build the holiday calendar for exactly the years present in the data
    # instead of a hard-coded range, so no year is silently left without holidays.
    holiday_years = sorted(int(y) for y in year.dropna().unique())
    # NOTE(review): recent `holidays` releases use `subdiv=` in place of `state=`;
    # confirm against the installed version.
    holidays_MA = holidays.US(years=holiday_years, state="MA")
    is_holiday = timestamps.dt.date.isin(list(holidays_MA.keys()))
    # TODO: add a `holiday_name` feature; a row-wise df.apply lookup was too slow.

    # drop() returns a new frame, so the assignments below never touch the caller's df.
    base_df = df.drop(columns=["meter_id", "mid_level", "aggregate"])
    # Discard stale calendar columns if the input already carries them.
    base_df = base_df.drop(columns=["month", "day_of_week"], errors="ignore")
    base_df['year'] = year
    base_df['is_holiday'] = is_holiday

    one_hot_sources = [
        (df['meter_id'], "meter"),
        (df['mid_level'], "mid_level"),
        (df['aggregate'], "aggregate"),
        (month, "month"),
        (day_of_week, "dow"),
    ]
    # drop_first avoids perfectly collinear indicator columns.
    one_hot_frames = [
        pd.get_dummies(values, drop_first=True, prefix=prefix)
        for values, prefix in one_hot_sources
    ]
    features_df = pd.concat([base_df] + one_hot_frames, axis=1)
    return features_df

# Turn the joined table into the model-ready feature matrix.
transformed_data_df = feature_engineering(df=joined_data_df)

In [7]:
# Inspect the column labels of the engineered feature frame.
transformed_data_df.columns

Index(['load', 'timestamp', 'rh_ws1', 'rh_ws2', 'rh_ws3', 'rh_ws4', 'rh_ws5',
       'rh_ws6', 'rh_ws7', 'rh_ws8',
       ...
       'month_9', 'month_10', 'month_11', 'month_12', 'dow_1', 'dow_2',
       'dow_3', 'dow_4', 'dow_5', 'dow_6'],
      dtype='object', length=261)

In [9]:
def train_test_split(transformed_data, split_year=2011):
    """Split the feature frame chronologically on its `year` column.

    Rows with ``year < split_year`` form the training set and rows with
    ``year >= split_year`` form the test set. Using one boundary for both
    masks guarantees that every row lands in exactly one of the two sets;
    the previous fixed bounds (``< 2011`` / ``== 2012``) left all of 2011 in
    neither set.

    Note: this name is the same as scikit-learn's `train_test_split`; take care
    not to import that one into the same namespace.

    Parameters
    ----------
    transformed_data : pandas.DataFrame
        Frame with a numeric `year` column.
    split_year : int, default 2011
        First year that belongs to the test set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``.
    """
    is_test = transformed_data["year"] >= split_year
    train_df = transformed_data[~is_test]
    test_df = transformed_data[is_test]
    return train_df, test_df
# Chronological hold-out split of the engineered features.
train_df, test_df = train_test_split(transformed_data=transformed_data_df)