In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from datetime import datetime
from sklearn.preprocessing import StandardScaler


In [2]:
def get_lag_response_and_dependent(df: pd.DataFrame, input_size: int, horizon_size: int) -> tuple[np.ndarray, np.ndarray]:
    """Slice a frame into sliding (lag window, forecast horizon) sample pairs.

    The frame is flattened row by row, so every sample consists of
    ``input_size`` consecutive rows followed by the next ``horizon_size`` rows,
    with the columns of each row kept adjacent. Consecutive samples are
    shifted by exactly one row.

    Args:
        df: Source frame with ``n`` rows and ``d`` columns.
        input_size: Number of rows in each lag window.
        horizon_size: Number of rows in each forecast horizon.

    Returns:
        ``(X, y)`` as float64 arrays with
        ``n - input_size - horizon_size + 1`` rows each; ``X`` has
        ``d * input_size`` columns and ``y`` has ``d * horizon_size`` columns.
        Both are views into one shared buffer.

    Raises:
        ValueError: If the frame is too short for the requested window sizes
            (the sample count would be negative).
    """
    n, d = df.shape
    w = input_size
    h = horizon_size

    n_samples = n - w - h + 1
    if n_samples < 0:
        raise ValueError(
            f"frame has {n} rows, too few for input_size={w} and horizon_size={h}"
        )

    # Flatten once up front: flatten() copies the whole frame, so calling it
    # inside the loop repeated that copy for every sample.
    flat = df.values.flatten()
    span = (w + h) * d

    res = np.zeros([n_samples, span])
    for i in range(n_samples):
        # Sample i starts at row i, i.e. at offset i * d in the flattened data.
        res[i] = flat[i * d:i * d + span]

    X = res[:, :w * d]
    y = res[:, w * d:]
    return X, y

def read_ett(filename: str) -> pd.DataFrame:
    """Load a CSV file and parse its ``date`` column as datetimes.

    Args:
        filename: Path of the CSV file to read. It must contain a ``date``
            column.

    Returns:
        The loaded frame, with ``date`` converted via ``pd.to_datetime``.
    """
    # Read the path the caller passed in; previously the argument was ignored
    # in favour of a hard-coded path.
    ett = pd.read_csv(filename)
    ett["date"] = pd.to_datetime(ett["date"])
    return ett


In [3]:
sc = StandardScaler()

etth1 = read_ett("data/ETT-small-20231205T092053Z-001/ETT-small/ETTh1.csv")

# Split chronologically on the `date` column: rows in the first 16 months
# form the training frame, rows in the following 4 months form the test frame.
# NOTE: the test frame starts right at the boundary, with no lag-window
# padding taken from the training rows.
start_date = etth1["date"].min()
train_end = start_date + pd.DateOffset(months=16)
test_end = start_date + pd.DateOffset(months=20)
train = etth1.loc[etth1["date"] < train_end].drop(columns="date")
test = etth1.loc[(etth1["date"] >= train_end) & (etth1["date"] < test_end)].drop(columns="date")

# Fit the scaler once, on the first 75% of the training rows only. An earlier
# fit on 75% of the whole dataset was discarded by this refit, so it is gone.
cut_off = int(len(train) * 0.75)
sc.fit(train.iloc[0:cut_off])

# Rebuilding the frames from the transformed values drops the original column
# names; the resulting default labels are what downstream code sees through
# `train.columns`.
train = pd.DataFrame(sc.transform(train))
test = pd.DataFrame(sc.transform(test))

# Build Model

## One-step horizon

In [4]:
# Single-step baseline: one row of all variables is the input, the next row
# of all variables is the target.
X_train, y_train = get_lag_response_and_dependent(train, input_size=1, horizon_size=1)
X_test, y_test = get_lag_response_and_dependent(test, input_size=1, horizon_size=1)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. MSE and MAE are
# symmetric, so the printed numbers are the same as with the swapped order.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"mse: {mse}, mae: {mae}")

mse: 0.17092914014227018, mae: 0.2661943578259878


## Multi-day horizon

In [5]:
%%time
# Multi-horizon evaluation: for each horizon, fit one intercept-free linear
# model per column on that column's own lag window, then score all columns
# together.
lr = LinearRegression(fit_intercept=False)
input_size = 672
d = train.shape[1]

# Pad test with the window size: prepend the last `input_size` training rows
# so the first test row already has a full lag window. This does not depend
# on the horizon or the column, so it is built once.
# (Assumes train has at least `input_size` rows.)
test_padded = pd.concat([train.iloc[-input_size:], test])

for h in [24, 48, 168, 336, 720]:
    # The padded frame yields len(test) - h + 1 windows for horizon h.
    n_windows = test.shape[0] - h + 1
    y_pred = np.empty((n_windows, h, d))
    y_truth = np.empty((n_windows, h, d))

    # `j` is the position along the last axis; `p` is the column label. They
    # are kept separate so the array indexing does not rely on the labels
    # being 0..d-1.
    for j, p in enumerate(train.columns):
        X_train, y_train = get_lag_response_and_dependent(train[[p]], input_size=input_size, horizon_size=h)
        X_test, y_test = get_lag_response_and_dependent(test_padded[[p]], input_size=input_size, horizon_size=h)

        lr.fit(X_train, y_train)
        y_pred[:, :, j] = lr.predict(X_test)
        y_truth[:, :, j] = y_test

    mae = np.mean(np.abs(y_pred - y_truth))
    mse = np.mean((y_pred - y_truth)**2)
    print(f"horizon: {h}, mse: {mse}, mae: {mae}")

horizon: 24, mse: 0.3041115416681452, mae: 0.35498849323911175
horizon: 48, mse: 0.3420245243955978, mae: 0.37838870757472964
horizon: 168, mse: 0.40871408005425447, mae: 0.42405140637411987
horizon: 336, mse: 0.44459149947044807, mae: 0.4510744424444384
horizon: 720, mse: 0.502096515529627, mae: 0.5028583444464004
CPU times: user 1min 18s, sys: 44.2 s, total: 2min 3s
Wall time: 19.3 s
