In [1]:
from pathlib import Path
from dateutil.relativedelta import relativedelta
import datetime as dt

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


# All locations are relative to the working directory: two levels up, then
# into data/02_intermediate.
ROOT_DIR = Path("../../")
DATA_DIR = ROOT_DIR.joinpath("data")
TRAIN_DATA_PATH = DATA_DIR.joinpath("02_intermediate", "train_power.csv")
FUTURE_DATA_PATH = DATA_DIR.joinpath("02_intermediate", "cv_power.csv")

In [2]:
def load_from_file(path):
    """Read a CSV file into a DataFrame, parsing its ``datetime`` column.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; handed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the ``datetime`` column parsed as dates.
    """
    datetime_columns = ["datetime"]
    frame = pd.read_csv(path, parse_dates=datetime_columns)
    return frame


def preprocess(df):
    """Swap the ``datetime`` column for calendar features.

    Three columns derived from ``df.datetime`` are appended, in this order:
    ``hour_of_day``, ``day_of_week`` and ``week_of_year`` (the ISO calendar
    week). The ``datetime`` column itself is then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``datetime`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left untouched.
    """
    timestamps = df.datetime.dt
    calendar_features = {
        "hour_of_day": timestamps.hour,
        "day_of_week": timestamps.dayofweek,
        "week_of_year": timestamps.isocalendar().week,
    }
    with_features = df.assign(**calendar_features)
    return with_features.drop(columns=["datetime"])

def calculate_errors_from_year_of_data(y, y_hat):
    y = np.array(y)
    y_hat = np.array(y_hat)
    
    errors = []
    error_periods_days = (1, 7, 30, 365)
    for period in error_periods_days:
        errors.append(rmse(y[:period*24], y_hat[:period*24]))
    return errors
    
    

def rmse(y, y_hat):
    """Return the root-mean-square error between ``y`` and ``y_hat``.

    Both arguments must support element-wise subtraction with each other
    (for example two NumPy arrays of the same shape).
    """
    residuals = y - y_hat
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)


In [3]:
# Round 1: fit a random forest on the training file only.
train_data = load_from_file(TRAIN_DATA_PATH).pipe(preprocess)

# A fixed random_state makes the fitted forest, and therefore every error
# reported further down, reproducible across kernel restarts.
model = RandomForestRegressor(random_state=42)

# Every column except the target is used as a feature.
X = train_data.drop(columns="power_kw")
y = train_data.power_kw

model.fit(X, y)

In [4]:
future_data = load_from_file(FUTURE_DATA_PATH).pipe(preprocess)

# The data is two years long, so we can simply halve it by row count.
# NOTE(review): this assumes both years contribute the same number of rows
# (e.g. no leap-year difference or gaps) - confirm against the file.
half_of_future_data = len(future_data) // 2
first_year_future = future_data.iloc[:half_of_future_data]
second_year_future = future_data.iloc[half_of_future_data:]

# Forecast the first year from its feature columns (everything but the target).
first_year_forecast = model.predict(
    first_year_future.drop(columns="power_kw")
)

In [5]:
# Score the first-year forecast against the observed power.
first_errors = calculate_errors_from_year_of_data(
    y=first_year_future.power_kw,
    y_hat=first_year_forecast,
)
first_errors

[0.027473807904974997,
 0.18314781964327626,
 0.18877052252406148,
 0.15570872703570582]

In [6]:
# Round 2: refit from scratch on the training data plus the first future year.

# A fixed random_state makes the fitted forest, and therefore the errors
# computed from it, reproducible across kernel restarts.
model = RandomForestRegressor(random_state=42)

# ignore_index avoids duplicate row labels in the combined frame (both inputs
# carry their own row labels); the row order is unchanged.
second_train = pd.concat([train_data, first_year_future], ignore_index=True)

# Every column except the target is used as a feature.
X = second_train.drop(columns="power_kw")
y = second_train.power_kw

model.fit(X, y)

In [7]:
# Forecast the second year from its feature columns (everything but the target).
second_year_forecast = model.predict(
    second_year_future.drop(columns="power_kw")
)

In [8]:
# Score the second-year forecast against the observed power.
second_errors = calculate_errors_from_year_of_data(
    y=second_year_future.power_kw,
    y_hat=second_year_forecast,
)
second_errors

[0.05491256000204222,
 0.1043116929673631,
 0.08261396958651213,
 0.15449941219679283]

In [9]:
# Element-wise average of the two rounds' errors, one value per period.
overall_errors = np.add(first_errors, second_errors) / 2
overall_errors

array([0.04119318, 0.14372976, 0.13569225, 0.15510407])

In [10]:
# Index 3 is the fourth entry of the averaged errors, i.e. the 365-day period
# of the (1, 7, 30, 365) periods used by calculate_errors_from_year_of_data.
year_error = overall_errors[3]
year_error

0.15510406961624934