In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from typing import Tuple, Iterator, List

In [38]:
class DataLoader:
    """Load a CSV time series and build lagged regression datasets from it.

    The CSV is expected to contain a ``date`` column, which is dropped on load;
    every remaining column is treated as one variable of the series.

    Attributes:
        filepath: Path the data was read from.
        dataset_name: File name without directory, cut at the first dot.
        df: The loaded frame (as returned by ``load_data``).
        columns: Column labels of ``df``; overwritten by ``build_dataset`` with
            the columns of the training split.
    """

    def __init__(self, filepath:str):
        self.filepath = filepath
        self.dataset_name = self.filepath.split("/")[-1].split(".")[0]
        self.df = self.load_data(self.filepath)
        self.columns = self.df.columns

    def load_data(self, filepath:str) -> pd.DataFrame:
        """Read the CSV at ``filepath`` and return it without its ``date`` column."""
        return pd.read_csv(filepath).drop(columns="date")

    def get_train_test_data(self, df:pd.DataFrame, input_size = 672, padding=True) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Split ``df`` chronologically: first 80 % of the rows train, the rest test.

        Args:
            df: Frame to split; row order is taken as time order.
            input_size: Number of rows preceding the split point that are
                prepended to the test split when ``padding`` is true.
            padding: If true, the test split starts ``input_size`` rows before
                the split point, so the first test target has a full input window.

        Returns:
            ``(train, test)`` as positional slices of ``df``.
        """
        split_at = int(len(df)*0.8)
        train = df.iloc[:split_at]
        # Clamp at row 0: a negative start would be read by iloc as an offset
        # from the END of the frame and silently return only the last few rows.
        test_start = max(split_at - input_size, 0) if padding else split_at
        test = df.iloc[test_start:]
        return train, test

    def get_lag_response_and_dependent(self, df: pd.DataFrame, input_size: int, horizon_size:int) -> Tuple[np.ndarray, np.ndarray]:
        """Turn ``df`` into sliding-window inputs and targets.

        Window ``i`` uses rows ``i .. i+input_size-1`` as input and the following
        ``horizon_size`` rows as target. Each window is flattened row by row, so
        the layout is ``[row0 col0, row0 col1, ..., row1 col0, ...]``.

        Args:
            df: Frame with one row per time step and one column per variable.
            input_size: Number of rows in every input window.
            horizon_size: Number of rows in every target window.

        Returns:
            ``(X, y)`` float arrays of shape ``(n_windows, d*input_size)`` and
            ``(n_windows, d*horizon_size)`` where ``n, d = df.shape`` and
            ``n_windows = n + 1 - input_size - horizon_size``.

        Raises:
            ValueError: If ``n_windows`` would be negative.
        """
        n, d = df.shape
        w, h = input_size, horizon_size
        n_windows = n + 1 - w - h
        if n_windows < 0:
            raise ValueError(
                f"Not enough rows ({n}) for input_size={w} and horizon_size={h}"
            )
        values = df.to_numpy()
        X = np.zeros([n_windows, d*w])
        y = np.zeros([n_windows, d*h])
        for i in range(n_windows):
            X[i] = values[i:i + w].ravel()
            y[i] = values[i + w:i + w + h].ravel()
        return X, y

    def _lag_columns(self, offsets) -> pd.MultiIndex:
        """Build ``(column, "t=<offset>")`` labels, all columns of one offset together."""
        return pd.MultiIndex.from_tuples(
            [(col, f"t={t}") for t in offsets for col in self.columns]
        )

    def build_dataset(self, input_size = 672, horizon_size = 24, cut_off=0.75) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Build standardized, windowed train and test sets from ``self.df``.

        Steps: split via ``get_train_test_data``, fit a ``StandardScaler`` on the
        first ``cut_off`` fraction of the training rows, transform both splits
        with it, window them, and label the columns hierarchically.

        Args:
            input_size: Rows per input window.
            horizon_size: Rows per target window.
            cut_off: Fraction of the training rows used to fit the scaler.

        Returns:
            ``(X_train, y_train, X_test, y_test)``. Columns are a two-level
            MultiIndex ``(variable, "t=k")`` with ``k`` running from
            ``-input_size`` to ``-1`` for inputs and from ``0`` to
            ``horizon_size-1`` for targets.
        """
        df = self.df.copy()
        # Build train and test dataset
        train, test = self.get_train_test_data(df, input_size = input_size)
        self.columns = train.columns

        # Only the leading part of the training data determines mean and scale.
        n_fit_rows = int(len(train)*cut_off)
        sc = StandardScaler()
        sc.fit(train.iloc[0:n_fit_rows])
        train = pd.DataFrame(sc.transform(train))
        test = pd.DataFrame(sc.transform(test))

        X_train, y_train = self.get_lag_response_and_dependent(train, input_size, horizon_size)
        X_test, y_test = self.get_lag_response_and_dependent(test, input_size, horizon_size)

        # Create hierarchical columns; the same labels apply to train and test.
        X_cols = self._lag_columns(range(-input_size, 0))
        y_cols = self._lag_columns(range(0, horizon_size))

        X_train = pd.DataFrame(X_train, columns=X_cols)
        y_train = pd.DataFrame(y_train, columns=y_cols)
        X_test = pd.DataFrame(X_test, columns=X_cols)
        y_test = pd.DataFrame(y_test, columns=y_cols)
        return X_train, y_train, X_test, y_test
        
class ETTDataLoader(DataLoader):
    """DataLoader variant that splits by calendar months using the ``date`` column.

    Unlike the base loader, ``load_data`` keeps the ``date`` column (parsed to
    datetimes); it is dropped only when the train/test split is made.

    Args:
        filepath: Path of the CSV file; it must contain a ``date`` column.
        months: Only rows dated earlier than ``first date + months`` are kept.
        train_months: Rows dated earlier than ``first date + train_months`` form
            the training split; all remaining kept rows form the test split.
    """

    def __init__(self, filepath:str, months=20, train_months=16):
        # Both spans must exist before the parent constructor runs, because it
        # calls load_data(), which reads self.months.
        self.months = months
        self.train_months = train_months
        super().__init__(filepath)

    def load_data(self, filepath:str) -> pd.DataFrame:
        """Read the CSV, parse ``date`` and keep the first ``self.months`` months."""
        # Read file
        ett = pd.read_csv(filepath)

        # Convert data types
        ett["date"] = pd.to_datetime(ett["date"])

        # Keep rows strictly before the end of the requested span
        end_date = ett["date"].min() + pd.DateOffset(months=self.months)
        return ett.loc[ett["date"] < end_date]

    def get_train_test_data(self, df:pd.DataFrame, input_size = 672, padding=True) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Split ``df`` at ``first date + self.train_months`` and drop ``date``.

        Args:
            df: Frame with a datetime ``date`` column.
            input_size: Number of trailing training rows prepended to the test
                split when ``padding`` is true.
            padding: If true, prepend those rows so the first test target has a
                full input window.

        Returns:
            ``(train, test)``, both without the ``date`` column.
        """
        split_date = df["date"].min() + pd.DateOffset(months=self.train_months)
        is_train = df["date"] < split_date
        train = df.loc[is_train].drop(columns = "date")
        test = df.loc[~is_train].drop(columns = "date")
        # iloc[-0:] selects the WHOLE frame, so zero padding must be skipped
        # explicitly rather than expressed as a slice.
        if padding and input_size > 0:
            test = pd.concat([train.iloc[-input_size:], test])
        return train, test
        

In [43]:
# ETTh1: date-based split via the ETT loader (default month spans), then a
# 672-step input window predicting the next 24 steps.
ettDataLoader = ETTDataLoader(
    "../exercise2/data/ETT-small-20231205T092053Z-001/ETT-small/ETTh1.csv"
)
X_train, y_train, X_test, y_test = ettDataLoader.build_dataset(
    input_size=672,
    horizon_size=24,
)
X_train

Unnamed: 0_level_0,HUFL,HULL,MUFL,MULL,LUFL,LULL,OT,HUFL,HULL,MUFL,...,LUFL,LULL,OT,HUFL,HULL,MUFL,MULL,LUFL,LULL,OT
Unnamed: 0_level_1,t=-672,t=-672,t=-672,t=-672,t=-672,t=-672,t=-672,t=-671,t=-671,t=-671,...,t=-2,t=-2,t=-2,t=-1,t=-1,t=-1,t=-1,t=-1,t=-1,t=-1
0,-0.358148,-0.004038,-0.623418,-0.158013,1.389325,0.886239,1.466613,-0.381198,0.028197,-0.642773,...,3.183258,1.028760,2.431783,1.635176,0.672899,1.337040,0.415657,2.585608,0.839764,2.377779
1,-0.381198,0.028197,-0.642773,-0.176753,1.329462,0.934262,1.165416,-0.473399,-0.132979,-0.681302,...,2.585608,0.839764,2.377779,1.600601,1.027486,1.362727,0.544759,2.226429,0.839764,2.470421
2,-0.473399,-0.132979,-0.681302,-0.213714,0.971264,0.697242,1.165416,-0.484924,-0.036273,-0.681302,...,2.226429,0.839764,2.470421,1.220445,0.543959,1.015784,0.045531,1.718082,0.650768,2.331348
3,-0.484924,-0.036273,-0.681302,-0.194973,1.000705,0.791741,0.864329,-0.438824,-0.036273,-0.642773,...,1.718082,0.650768,2.331348,1.277898,0.769605,1.060644,0.637421,1.270580,0.556270,2.570747
4,-0.438824,-0.036273,-0.642773,-0.158013,1.060568,0.791741,0.524494,-0.392724,0.060432,-0.636261,...,1.270580,0.556270,2.570747,0.978418,0.511724,0.790760,0.359956,1.150854,0.745266,2.354618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11012,-0.312048,-1.421903,-0.192725,-0.454218,-0.612657,-2.841014,-0.981162,-0.415774,-1.744254,-0.263452,...,0.433477,0.415297,-0.880836,-0.081546,-0.390379,-0.121998,-0.454218,0.164583,0.461772,-0.911680
11013,-0.415774,-1.744254,-0.263452,-0.768643,-0.852110,-2.794539,-0.788194,-0.058496,-1.228492,0.083671,...,0.164583,0.461772,-0.911680,-0.242897,-0.422614,-0.218411,-0.472438,-0.165155,0.509795,-0.911680
11014,-0.058496,-1.228492,0.083671,-0.213714,-0.702943,-2.511045,-0.811354,-0.093071,-0.777200,0.147886,...,-0.165155,0.509795,-0.911680,0.229459,-0.454849,0.237968,-0.546359,-0.074870,0.980737,-0.911680
11015,-0.093071,-0.777200,0.147886,0.082492,-0.911973,-2.557519,-0.795878,0.114208,-0.648260,0.269985,...,-0.074870,0.980737,-0.911680,0.240984,0.253843,0.334381,0.100712,-0.224037,0.886239,-0.934841


In [44]:
# National illness data: generic 80/20 row split; the scaler is fitted on the
# first 87.5 % of the training rows.
iliDataLoader = DataLoader(
    "../exercise2/data/illness-20231205T092100Z-001/illness/national_illness.csv"
)
X_train, y_train, X_test, y_test = iliDataLoader.build_dataset(
    input_size=672,
    horizon_size=24,
    cut_off=0.875,
)
X_train

Unnamed: 0_level_0,% WEIGHTED ILI,%UNWEIGHTED ILI,AGE 0-4,AGE 5-24,ILITOTAL,NUM. OF PROVIDERS,OT,% WEIGHTED ILI,%UNWEIGHTED ILI,AGE 0-4,...,ILITOTAL,NUM. OF PROVIDERS,OT,% WEIGHTED ILI,%UNWEIGHTED ILI,AGE 0-4,AGE 5-24,ILITOTAL,NUM. OF PROVIDERS,OT
Unnamed: 0_level_1,t=-672,t=-672,t=-672,t=-672,t=-672,t=-672,t=-672,t=-671,t=-671,t=-671,...,t=-2,t=-2,t=-2,t=-1,t=-1,t=-1,t=-1,t=-1,t=-1,t=-1
0,-0.422367,-0.473102,-0.981728,-0.692729,-0.819749,-1.150344,-1.384259,-0.332141,-0.429835,-0.934329,...,-0.520820,0.084803,0.372641,-0.762174,-0.774676,-0.630688,-0.489794,-0.521153,0.036207,0.376380
1,-0.332141,-0.429835,-0.934329,-0.676955,-0.796772,-1.087574,-1.341516,-0.343662,-0.504463,-0.953570,...,-0.521153,0.036207,0.376380,-0.752654,-0.749833,-0.584226,-0.454951,-0.474200,0.068604,0.508836
2,-0.343662,-0.504463,-0.953570,-0.675543,-0.806873,-0.994432,-1.314812,-0.200736,-0.398605,-0.913210,...,-0.474200,0.068604,0.508836,-0.754629,-0.714674,-0.463145,-0.403629,-0.421473,0.080753,0.620943
3,-0.200736,-0.398605,-0.913210,-0.636227,-0.759919,-0.929638,-1.249107,-0.219372,-0.355259,-0.868626,...,-0.421473,0.080753,0.620943,-0.717053,-0.650145,-0.487080,-0.342654,-0.386508,0.064555,0.549583
4,-0.219372,-0.355259,-0.868626,-0.602326,-0.725731,-0.836495,-1.180551,-0.130636,-0.265684,-0.838590,...,-0.386508,0.064555,0.549583,-0.695857,-0.598302,-0.450474,-0.371611,-0.385731,0.054430,0.394659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,-0.355036,-0.351012,-0.644297,-0.586788,-0.638705,-0.644136,-0.922246,-0.532857,-0.585744,-0.806208,...,1.345349,1.326025,1.076155,1.324583,1.596049,2.364424,0.799619,1.715541,1.226808,0.908814
73,-0.532857,-0.585744,-0.806208,-0.637875,-0.724732,-0.757527,-0.927247,-0.610221,-0.608897,-0.820757,...,1.715541,1.226808,0.908814,1.103595,1.189455,1.896995,0.715337,1.635398,1.421192,1.272084
74,-0.610221,-0.608897,-0.820757,-0.668009,-0.743824,-0.866868,-0.969200,-0.581104,-0.572274,-0.842814,...,1.635398,1.421192,1.272084,1.084519,1.135375,1.570358,1.199603,1.715208,1.366521,1.446821
75,-0.581104,-0.572274,-0.842814,-0.673188,-0.747820,-0.860793,-1.031672,-0.556476,-0.600412,-0.912271,...,1.715208,1.366521,1.446821,1.445169,1.470355,1.748225,1.977440,2.146341,1.431316,1.536993


In [42]:
# Same illness dataset as the cell above, but inspect the targets this time:
# the columns run over t=0 .. t=23, one group of variables per horizon step.
iliDataLoader = DataLoader("../exercise2/data/illness-20231205T092100Z-001/illness/national_illness.csv")
X_train, y_train, X_test, y_test = iliDataLoader.build_dataset(horizon_size=24, input_size=672, cut_off=0.875)
y_train

Unnamed: 0_level_0,% WEIGHTED ILI,%UNWEIGHTED ILI,AGE 0-4,AGE 5-24,ILITOTAL,NUM. OF PROVIDERS,OT,% WEIGHTED ILI,%UNWEIGHTED ILI,AGE 0-4,...,ILITOTAL,NUM. OF PROVIDERS,OT,% WEIGHTED ILI,%UNWEIGHTED ILI,AGE 0-4,AGE 5-24,ILITOTAL,NUM. OF PROVIDERS,OT
Unnamed: 0_level_1,t=0,t=0,t=0,t=0,t=0,t=0,t=0,t=1,t=1,t=1,...,t=22,t=22,t=22,t=23,t=23,t=23,t=23,t=23,t=23,t=23
0,-0.752654,-0.749833,-0.584226,-0.454951,-0.474200,0.068604,0.508836,-0.754629,-0.714674,-0.463145,...,0.689992,1.388794,1.090862,0.415000,0.417849,1.132965,0.737938,0.965499,1.425241,1.458771
1,-0.754629,-0.714674,-0.463145,-0.403629,-0.421473,0.080753,0.620943,-0.717053,-0.650145,-0.487080,...,0.965499,1.425241,1.458771,0.512732,0.511280,1.155022,0.848822,1.054412,1.386769,1.441824
2,-0.717053,-0.650145,-0.487080,-0.342654,-0.386508,0.064555,0.549583,-0.695857,-0.598302,-0.450474,...,1.054412,1.386769,1.441824,0.862163,0.837479,1.394368,1.103314,1.368548,1.372596,1.398815
3,-0.695857,-0.598302,-0.450474,-0.371611,-0.385731,0.054430,0.394659,-0.596736,-0.544977,-0.319538,...,1.368548,1.372596,1.398815,1.158844,1.140169,1.559564,1.313076,1.691232,1.358422,1.409048
4,-0.596736,-0.544977,-0.319538,-0.236243,-0.264516,0.094927,0.690229,-0.549678,-0.463931,-0.232716,...,1.691232,1.358422,1.409048,1.148325,1.217472,1.701294,1.501886,1.884598,1.283503,1.551525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,1.103595,1.189455,1.896995,0.715337,1.635398,1.421192,1.272084,1.084519,1.135375,1.570358,...,-0.214787,0.346006,0.671103,-0.680839,-0.575878,-0.293256,-0.388091,-0.311803,0.299435,0.609334
73,1.084519,1.135375,1.570358,1.199603,1.715208,1.366521,1.446821,1.445169,1.470355,1.748225,...,-0.311803,0.299435,0.609334,-0.682819,-0.624190,-0.349104,-0.418225,-0.355760,0.297410,0.590024
74,1.445169,1.470355,1.748225,1.977440,2.146341,1.431316,1.536993,1.678239,1.578246,2.010567,...,-0.355760,0.297410,0.590024,-0.717981,-0.651207,-0.423724,-0.470725,-0.427246,0.202243,0.386325
75,1.678239,1.578246,2.010567,2.430865,2.422071,1.415117,1.714464,2.248504,2.080190,2.242404,...,-0.427246,0.202243,0.386325,-0.750981,-0.656231,-0.443904,-0.514042,-0.457216,0.271087,0.278328


In [45]:
# Weather data: generic 80/20 row split; the scaler is fitted on the first
# 87.5 % of the training rows.
wthDataLoader = DataLoader(
    "../exercise2/data/WTH.csv-20231205T092445Z-001/WTH.csv"
)
X_train, y_train, X_test, y_test = wthDataLoader.build_dataset(
    input_size=672,
    horizon_size=24,
    cut_off=0.875,
)
X_train

Unnamed: 0_level_0,Visibility,DryBulbFarenheit,DryBulbCelsius,WetBulbFarenheit,DewPointFarenheit,DewPointCelsius,RelativeHumidity,WindSpeed,WindDirection,StationPressure,...,DryBulbCelsius,WetBulbFarenheit,DewPointFarenheit,DewPointCelsius,RelativeHumidity,WindSpeed,WindDirection,StationPressure,Altimeter,WetBulbCelsius
Unnamed: 0_level_1,t=-672,t=-672,t=-672,t=-672,t=-672,t=-672,t=-672,t=-672,t=-672,t=-672,...,t=-1,t=-1,t=-1,t=-1,t=-1,t=-1,t=-1,t=-1,t=-1,t=-1
0,0.299896,-1.665559,-1.685078,-1.742165,-1.019867,-1.038262,0.736390,0.264077,-0.046853,0.674032,...,-1.043812,-0.689234,0.213261,0.197426,1.737958,-1.169162,-1.309143,-0.813049,-0.787999,-0.678641
1,0.299896,-1.665559,-1.685078,-1.742165,-1.019867,-1.038262,0.736390,-0.145419,0.147346,0.616836,...,-1.043812,-0.689234,0.213261,0.197426,1.737958,-1.169162,-1.309143,-0.813049,-0.787999,-0.678641
2,0.299896,-1.665559,-1.685078,-1.742165,-1.019867,-1.038262,0.736390,-0.145419,0.535743,0.674032,...,-1.043812,-0.689234,0.213261,0.197426,1.737958,-1.169162,-1.309143,-0.813049,-0.831478,-0.678641
3,0.299896,-1.665559,-1.685078,-1.742165,-1.019867,-1.038262,0.736390,0.264077,0.438644,0.674032,...,-1.043812,-0.689234,0.213261,0.197426,1.737958,-1.169162,-1.309143,-0.813049,-0.831478,-0.678641
4,0.299896,-1.665559,-1.685078,-1.661170,-0.882853,-0.914693,1.016829,0.059329,-0.143952,0.616836,...,-1.043812,-0.770229,0.076247,0.073857,1.457519,0.059329,0.050246,-0.813049,-0.831478,-0.737567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27351,0.299896,-0.837637,-0.830057,-0.851223,-0.540317,-0.543987,0.095386,-0.145419,0.535743,-0.469876,...,0.131842,-0.041276,-0.266289,-0.296849,-0.625744,-1.169162,-1.309143,0.960009,0.994631,-0.030448
27352,0.299896,-0.837637,-0.830057,-0.851223,-0.540317,-0.543987,0.095386,0.468826,1.215438,-0.469876,...,0.131842,-0.122271,-0.677331,-0.667555,-0.946246,-1.169162,-1.309143,0.960009,0.994631,-0.163033
27353,0.299896,-0.955912,-0.936935,-0.932218,-0.540317,-0.543987,0.295699,-1.169162,-1.309143,-0.469876,...,0.131842,-0.122271,-0.677331,-0.667555,-0.946246,-0.554916,1.603835,0.960009,0.951152,-0.163033
27354,0.299896,-1.133324,-1.150690,-1.094207,-0.540317,-0.543987,0.576139,-1.169162,-1.309143,-0.412681,...,-0.081913,-0.284260,-0.540317,-0.543987,-0.705869,-1.169162,-1.309143,0.960009,0.951152,-0.236691
