In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from typing import Tuple, Iterator

In [2]:
class DataLoader:
    """Base class for loaders that turn a multivariate time series into
    sliding-window regression datasets.

    Subclasses must provide ``__init__``, ``load_data`` and
    ``get_train_test_data``. ``get_datasets`` reads ``self.df``, so a subclass
    constructor is expected to set that attribute.
    """

    def __init__(self, filepath: str):
        raise NotImplementedError()

    def load_data(self) -> pd.DataFrame:
        """Read the raw data set. Must be implemented by subclasses."""
        raise NotImplementedError()

    def get_train_test_data(self, df: pd.DataFrame, input_size: int = 672, train_size_months: int = 16, padding: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Split ``df`` into a train and a test frame. Must be implemented by subclasses.

        The signature mirrors the way ``get_datasets`` invokes this method
        (``df`` positionally, ``input_size`` and ``train_size_months`` by
        keyword), so a subclass that forgets to override it fails with
        ``NotImplementedError`` rather than a ``TypeError``.
        """
        raise NotImplementedError()

    def get_lag_response_and_dependent(self, df: pd.DataFrame, input_size: int, horizon_size: int) -> Tuple[np.ndarray, np.ndarray]:
        """Build sliding windows over the rows of ``df``.

        Every window covers ``input_size + horizon_size`` consecutive rows,
        flattened row by row, and consecutive windows are shifted by one row.

        Args:
            df: Frame with ``n`` rows and ``d`` columns.
            input_size: Number of rows that form the input part of a window.
            horizon_size: Number of rows that form the target part of a window.

        Returns:
            ``(X, y)`` where ``X`` has shape ``(n + 1 - input_size - horizon_size,
            d * input_size)`` and ``y`` has the same number of rows and
            ``d * horizon_size`` columns. Column ``t * d + c`` holds column ``c``
            of the ``t``-th row of the respective part.

        Raises:
            ValueError: If ``df`` has fewer than ``input_size + horizon_size - 1`` rows.
        """
        n, d = df.shape
        w = input_size
        h = horizon_size
        n_windows = n + 1 - w - h
        if n_windows < 0:
            raise ValueError(
                f"Need at least {w + h - 1} rows for input_size={w} and "
                f"horizon_size={h}, got {n}."
            )
        window_width = d * (w + h)
        windows = np.zeros([n_windows, window_width])
        flat = df.values.flatten()
        for i in range(n_windows):
            # Advancing by d elements in the flattened array moves one row forward.
            windows[i, :] = flat[i * d:i * d + window_width]
        X = windows[:, 0:w * d]
        y = windows[:, w * d:]
        return X, y

    def get_datasets(self, train_size_months = 16, input_size = 672, horizon_size = 24, cut_off=0.75) -> Iterator[Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]:
        """Yield one standardized, windowed train/test dataset per column.

        Args:
            train_size_months: Forwarded to ``get_train_test_data``.
            input_size: Number of rows in the input part of each window.
            horizon_size: Number of rows in the target part of each window.
            cut_off: Fraction of the training rows (taken from the start) on
                which the ``StandardScaler`` is fitted. The fitted scaler is
                then applied to all training and test rows.

        Yields:
            ``(X_train, y_train, X_test, y_test)`` arrays restricted to a single
            column of the data, once for every column of the training frame.
        """
        df = self.df.copy()
        # Build train and test dataset
        train, test = self.get_train_test_data(df, input_size = input_size, train_size_months = train_size_months)

        scaler = StandardScaler()
        # Separate local so the fractional ``cut_off`` argument is not rebound to a row count.
        n_fit_rows = int(len(train) * cut_off)
        scaler.fit(train.iloc[0:n_fit_rows])
        train = pd.DataFrame(scaler.transform(train))
        test = pd.DataFrame(scaler.transform(test))

        X_train, y_train = self.get_lag_response_and_dependent(train, input_size, horizon_size)
        X_test, y_test = self.get_lag_response_and_dependent(test, input_size, horizon_size)
        d = train.shape[1]
        for p in range(0, d):
            # Windows are flattened row by row, so column p sits at every d-th position.
            x_cols = [i * d + p for i in range(0, input_size)]
            y_cols = [i * d + p for i in range(0, horizon_size)]
            yield X_train[:, x_cols], y_train[:, y_cols], X_test[:, x_cols], y_test[:, y_cols]
    

class ETTDataLoader(DataLoader):
    """Loader for ETT CSV files with a ``date`` column and the seven series
    HUFL, HULL, MUFL, MULL, LUFL, LULL and OT.
    """

    def __init__(self, filepath:str, months=20):
        """Load the CSV at ``filepath`` and keep its first ``months`` months.

        Args:
            filepath: Path of the CSV file; the file name without its
                extension is stored as ``dataset_name``.
            months: Length, in calendar months counted from the earliest
                ``date`` value, of the period kept by ``load_data``.
        """
        self.filepath = filepath
        self.months = months
        self.dataset_name = self.filepath.split("/")[-1].split(".")[0]
        self.df = self.load_data()
        self.vars = ["HUFL", "HULL", "MUFL", "MULL", "LUFL", "LULL", "OT"]

    def load_data(self) -> pd.DataFrame:
        """Read the CSV, validate its columns and trim it to ``self.months`` months.

        Returns:
            The rows whose ``date`` is earlier than the earliest date plus
            ``self.months`` calendar months, with ``date`` parsed to datetime.

        Raises:
            AssertionError: If one of the expected columns is missing.
        """
        # Read file
        ett = pd.read_csv(self.filepath)

        # Check if it contains all columns
        expected_columns = ["date", "HUFL", "HULL", "MUFL", "MULL", "LUFL", "LULL", "OT"]
        for expected in expected_columns:
            assert expected in ett.columns, f"Column: {expected} not in dataframe."

        # Convert data types
        ett["date"] = pd.to_datetime(ett["date"])

        # Get correct rows
        start_date = ett["date"].min()
        ett = ett.loc[ett["date"] < (start_date + pd.DateOffset(months=self.months))]
        return ett

    def get_train_test_data(self, df:pd.DataFrame, input_size = 672, train_size_months = 16, padding=True) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Split ``df`` chronologically into train and test frames.

        Args:
            df: Frame with a datetime ``date`` column; it is not modified.
            input_size: Number of trailing training rows used as padding.
            train_size_months: Rows dated before the earliest date plus this
                many calendar months form the training frame; the remaining
                rows form the test frame.
            padding: When true, the last ``input_size`` training rows are
                prepended to the test frame, so the first test window can
                target the first real test row.

        Returns:
            ``(train, test)``, both without the ``date`` column.
        """
        start_date = df["date"].min()
        mask = df["date"] < (start_date + pd.DateOffset(months=train_size_months))
        # Boolean-mask selection and ``drop`` both return new frames, so the
        # caller's frame is left untouched without an explicit copy.
        train = df.loc[mask].drop(columns = "date")
        test = df.loc[~mask].drop(columns = "date")
        # ``iloc[-0:]`` selects every row, so a zero-length padding request must
        # be skipped explicitly instead of prepending the whole training frame.
        if padding and input_size > 0:
            test = pd.concat([train.iloc[-input_size:], test])
        return train, test
        

In [3]:
def run_experiment(dataLoader: DataLoader, horizons, input_sizes):
    """Fit a linear model per series and print pooled test errors per horizon.

    For every ``(horizon, input_size)`` pair, the datasets produced by
    ``dataLoader.get_datasets`` are iterated. A ``LinearRegression`` without
    intercept is fitted on each training set and evaluated on the matching
    test set. Squared and absolute errors are summed over all datasets and
    divided by the total number of predicted values before being printed.

    Args:
        dataLoader: Loader whose ``get_datasets`` yields
            ``(X_train, y_train, X_test, y_test)`` tuples.
        horizons: Sequence of forecast horizon sizes.
        input_sizes: Sequence of input window sizes, one per horizon.

    Raises:
        ValueError: If ``horizons`` and ``input_sizes`` differ in length.
    """
    # Fail before any fitting starts instead of part-way through the run.
    if len(horizons) != len(input_sizes):
        raise ValueError(
            f"horizons and input_sizes must have the same length, "
            f"got {len(horizons)} and {len(input_sizes)}."
        )
    lr = LinearRegression(fit_intercept = False)
    for horizon, input_size in zip(horizons, input_sizes):
        squared_error = absolute_error = n_values = 0
        datasets = dataLoader.get_datasets(horizon_size=horizon, input_size=input_size)
        for X_train, y_train, X_test, y_test in datasets:
            lr = lr.fit(X_train, y_train)
            predict = lr.predict(X_test)
            residual = predict - y_test
            n_values += predict.shape[0] * predict.shape[1]
            absolute_error += np.sum(np.abs(residual))
            squared_error += np.sum(residual ** 2)
        print(f"mse: {squared_error/n_values}, mae: {absolute_error/n_values}")
    
    

In [4]:
%%time
dl = ETTDataLoader("../exercise2/data/ETT-small-20231205T092053Z-001/ETT-small/ETTh1.csv")
run_experiment(dl, horizons = [24, 48, 168, 336, 720], input_sizes = [672, 672, 672, 672, 672])

mse: 0.3041115416681451, mae: 0.35498849323911175
mse: 0.3420245243955977, mae: 0.37838870757472953
mse: 0.4087140800542539, mae: 0.42405140637412
mse: 0.4445914994704474, mae: 0.4510744424444385
mse: 0.5020965155296265, mae: 0.5028583444464008
CPU times: user 1min 19s, sys: 42 s, total: 2min 1s
Wall time: 19.8 s


In [None]:
%%time
dl = ETTDataLoader("../exercise2/data/ETT-small-20231205T092053Z-001/ETT-small/ETTh2.csv")
run_experiment(dl, horizons = [24, 48, 168, 336, 720], input_sizes = [672, 672, 672, 672, 672])

mse: 0.17109527657886725, mae: 0.26583507708955284
mse: 0.22350248970878317, mae: 0.30426933417387936
mse: 0.33732819848543993, mae: 0.3857293397885034
mse: 0.42400305895576784, mae: 0.44269177032103946


In [None]:
%%time
dl = ETTDataLoader("../exercise2/data/ETT-small-20231205T092053Z-001/ETT-small/ETTm1.csv")
run_experiment(dl, horizons = [24, 48, 168, 336, 720], input_sizes = [672, 672, 672, 672, 672])

In [None]:
# Display the raw ETTm1 CSV as parsed by pandas, without any of the loader's filtering.
pd.read_csv(filepath_or_buffer="../exercise2/data/ETT-small-20231205T092053Z-001/ETT-small/ETTm1.csv")