# Notes
- The original authors build univariate models
    - Created one multivariate model per variable in dataset
    - Computed mse and mae based on all of their results
- Reference articles ceated univariate and multivariate models.
    - The univariate models were only build with the OT variable
- The splits assume a month is 30 days long

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from typing import Tuple, Iterator, List
from tqdm import tqdm

In [2]:
class DataLoader:
    def __init__(self, filepath:str):
        self.filepath = filepath
        self.dataset_name = self.filepath.split("/")[-1].split(".")[0]
        self.df = self.load_data(self.filepath)
        self.columns = self.df.columns
    
    def load_data(self, filepath:str) -> pd.DataFrame:
        df = pd.read_csv(filepath)
        df = df.drop(columns="date")
        return df

    def get_train_test_data(self, df:pd.DataFrame, input_size = 672, padding=True) -> Tuple[pd.DataFrame, pd.DataFrame]:
        n, d = df.shape
        breakpoint = int(n*0.8)
        train = df.iloc[0:breakpoint]
        test = df.iloc[breakpoint-input_size:] if padding else df.iloc[breakpoint:]
        return train, test
    
    def get_lag_response_and_dependent(self, df: pd.DataFrame, input_size: int, horizon_size:int)-> pd.DataFrame:
        n, d = df.shape
        w = input_size
        h = horizon_size
        X = np.zeros([n + 1 - w - h, d*w])
        y = np.zeros([n + 1 - w - h, d*h])
        x_flat = df.values.flatten()
        for i in range(0,n + 1 - w - h):
            X[i] = x_flat[i*d:d*(i+w)]
            y[i] = x_flat[d*(i+w):d*(i+w) + d*h]
        return X, y

    def build_dataset(self, input_size = 672, horizon_size = 24, cut_off=0.75) -> Iterator[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]]:
        df = self.df.copy()
        #Build train and test dataset
        train, test = self.get_train_test_data(df, input_size = input_size)

        sc = StandardScaler()
        cut_off = int(len(train)*cut_off)
        self.columns = train.columns
        sc.fit(train.iloc[0:cut_off])
        train = pd.DataFrame(sc.transform(train))
        test = pd.DataFrame(sc.transform(test))
        
        X_train, y_train = self.get_lag_response_and_dependent(train, input_size, horizon_size)
        X_test, y_test = self.get_lag_response_and_dependent(test, input_size, horizon_size)

        X_train = pd.DataFrame(X_train)
        y_train = pd.DataFrame(y_train)
        X_test = pd.DataFrame(X_test)
        y_test = pd.DataFrame(y_test)

        # Create hierarhical columns
        X_cols_lvl2 = [ f"t={i-input_size}" for i in range(0,input_size) for _ in range(len(self.columns))]
        X_cols_lvl1 = [col for _ in range(input_size) for col in self.columns]
        X_cols = pd.MultiIndex.from_tuples(zip(X_cols_lvl1, X_cols_lvl2))
        y_cols_lvl2 = [ f"t={i}" for i in range(0,horizon_size) for _ in range(len(self.columns))]
        y_cols_lvl1 = [col for _ in range(horizon_size) for col in self.columns]
        y_cols = pd.MultiIndex.from_tuples(zip(y_cols_lvl1, y_cols_lvl2))
        X_train.columns = X_cols
        y_train.columns = y_cols
        X_test.columns = X_cols
        y_test.columns = y_cols
        return X_train, y_train, X_test, y_test
        
class ETTDataLoader(DataLoader):
    def __init__(self, filepath:str, months=20, day_length=24):
        self.filepath = filepath
        self.months = months
        self.dataset_name = self.filepath.split("/")[-1].split(".")[0]
        self.boundries = [0, day_length*16*30, day_length*20*30]
        self.df = self.load_data(self.filepath)
        self.columns = self.df.columns
        
    def load_data(self, filepath:str) -> pd.DataFrame:
        # Read file
        ett = pd.read_csv(filepath)
        ett = ett.drop(columns="date")
        
        #get correct rows
        ett = ett.iloc[0:self.boundries[2]]
        return ett
        
    def get_train_test_data(self, df:pd.DataFrame, input_size = 672, padding=True, day_length=20) -> Tuple[pd.DataFrame, pd.DataFrame]:
        tmp = df.copy()
        train = tmp.iloc[0: self.boundries[1]]
        test = tmp.iloc[self.boundries[1]:]
        if padding:
            test = pd.concat([train.iloc[-input_size:], test])
        return train, test

def run_Experiment(data_loaders, horizons, input_size=672):
    res = []
    lr = LinearRegression()
    dataset_names = []
    for j, loader in enumerate(dataLoaders):
        mse = np.zeros([1, len(horizons)])
        mae = np.zeros([1, len(horizons)])
        for i, h in enumerate(horizons):
            diffs = []
            for col in tqdm(loader.columns):
                X_train, y_train, X_test, y_test = loader.build_dataset(horizon_size=h, input_size=input_size)
                lr.fit(X_train[col], y_train[col])
                y_pred = lr.predict(X_test[col])
                diff = y_pred - y_test[col].values
                diffs += list(diff)
            diffs = np.array(diffs)
            mae[0,i] = np.mean(np.abs(diffs))
            mse[0,i] = np.mean(diffs**2)
        res.append(mae[0])
        res.append(mse[0])
    
    
    dataset_names = [l.dataset_name for l in dataLoaders for i in range(2)]
    metrics = [metric for metric in ("mae", "mse") for i in range(len(dataLoaders))]
    ml_index = pd.MultiIndex.from_arrays([dataset_names, metrics])
    return pd.DataFrame(res, index=ml_index, columns=horizons)

In [3]:
def run_Experiment(data_loaders, horizons, input_size=672):
    res = []
    lr = LinearRegression()
    dataset_names = []
    for j, loader in enumerate(dataLoaders):
        mse = np.zeros([1, len(horizons)])
        mae = np.zeros([1, len(horizons)])
        for i, h in enumerate(horizons):
            diffs = []
            for col in tqdm(loader.columns):
                X_train, y_train, X_test, y_test = loader.build_dataset(horizon_size=h, input_size=input_size)
                lr.fit(X_train[col], y_train[col])
                y_pred = lr.predict(X_test[col])
                diff = y_pred - y_test[col].values
                diffs += list(diff)
            diffs = np.array(diffs)
            mae[0,i] = np.mean(np.abs(diffs))
            mse[0,i] = np.mean(diffs**2)
        res.append(mae[0])
        res.append(mse[0])
    
    
    dataset_names = [l.dataset_name for l in dataLoaders for i in range(2)]
    metrics = [metric for metric in ("mae", "mse") for i in range(len(dataLoaders))]
    ml_index = pd.MultiIndex.from_arrays([dataset_names, metrics])
    return pd.DataFrame(res, index=ml_index, columns=horizons)

In [6]:
## ETTH
etth1DataLoader = ETTDataLoader("../exercise2/data/ETT-small-20231205T092053Z-001/ETT-small/ETTh1.csv")
etth2DataLoader = ETTDataLoader("../exercise2/data/ETT-small-20231205T092053Z-001/ETT-small/ETTh2.csv")

dataLoaders = [etth1DataLoader, etth2DataLoader]
horizons = [24, 48, 168, 336, 720]
etth_res = run_Experiment(dataLoaders, horizons)
etth_res.to_csv("data/reproduced_results/etth.csv")

## ETTM

ettm1DataLoader = ETTDataLoader("../exercise2/data/ETT-small-20231205T092053Z-001/ETT-small/ETTm1.csv")
dataLoaders = [ettm1DataLoader]
horizons = [24, 48, 96, 228, 672]
ettm_res = run_Experiment(dataLoaders, horizons)
ettm_res.ro_csv("data/reproduced_results/ettm.csv")

## ILI
iliLoader = DataLoader("../exercise2/data/illness-20231205T092100Z-001/illness/national_illness.csv")
dataLoaders = [iliLoader]
horizons = [24, 36, 48, 60]
ili_res = run_Experiment(dataLoaders, horizons, input_size=96)
ili_res.to_csv("data/reproduced_results/ili.csv")

##WTH
wthLoader = DataLoader("../exercise2/data/WTH.csv-20231205T092445Z-001/WTH.csv")
dataLoaders = [wthLoader]
horizons = [24, 48, 168, 338, 720]
wth_res = run_Experiment(dataLoaders, horizons)
wth_res.to_csv("data/reproduced_results/wth.csv")

## WEATHER
weatherLoader = DataLoader("../exercise2/data/weather-20231205T093714Z-001/weather/weather.csv")
dataLoaders = [weatherLoader]
horizons = [96, 192, 336, 720]
weather_res = run_Experiment(dataLoaders, horizons)
weather_res.to_csv("data/reproduced_results/weather.csv")

#exchange
exchange_rate = DataLoader("../exercise2/data/exchange_rate-20231205T092055Z-001/exchange_rate/exchange_rate.csv")
dataLoaders = [exchange_rate]
horizons = [24, 36, 48, 60]
exchange_res = run_Experiment(dataLoaders, horizons, input_size=31)
exchange_res.to_csv("data/reproduced_results/exchange.csv")

#ECL
ECL = DataLoader("../exercise2/data/ECL.csv-20231205T092501Z-001/ECL.csv")
dataLoaders = [ECL]
horizons = [48, 168, 336, 720, 960]
ecl_res = run_Experiment(dataLoaders, horizons)
ecl_res.to_csv("data/reproduced_results/ecl.csv")

100%|█████████████████████████████████████████████| 7/7 [00:03<00:00,  1.84it/s]
100%|█████████████████████████████████████████████| 7/7 [00:04<00:00,  1.73it/s]
100%|█████████████████████████████████████████████| 7/7 [00:04<00:00,  1.48it/s]
100%|█████████████████████████████████████████████| 7/7 [00:05<00:00,  1.20it/s]
100%|█████████████████████████████████████████████| 7/7 [00:08<00:00,  1.18s/it]


Unnamed: 0,Unnamed: 1,24,48,168,336,720
ETTh1,mae,0.357471,0.378909,0.421562,0.44532,0.501034
ETTh1,mse,0.304926,0.340621,0.404459,0.433502,0.490237


In [None]:
exchange_rate = DataLoader("../exercise2/data/exchange_rate-20231205T092055Z-001/exchange_rate/exchange_rate.csv")