## Imports

In [None]:
from autogluon.tabular import TabularPredictor
import autosklearn
import autosklearn.regression
from tpot import TPOTRegressor
import h2o 
from h2o.automl import H2OAutoML

In [None]:
# lets us clear the displayed output at every iteration to avoid excessively polluting the notebook 
from IPython.display import clear_output

In [None]:
from datetime import datetime
import pandas as pd
import numpy as np

## AutoML calling function

In [None]:
# Each framework requires a specific input format; this is handled with multiple if-branches in a single function.
def call_automl(framework_name, max_time_given, X_train, y_train, X_test, dataset_name, iteration, identifier):
    """Train one AutoML framework on the training split and predict the test split.

    Args:
        framework_name: One of 'tpot', 'autosklearn', 'autogluon' or 'h2o'.
        max_time_given: Training time budget in seconds (integer-divided by 60
            to get TPOT's budget in minutes).
        X_train: Training features as a pandas DataFrame.
        y_train: Training target. A Series is converted to a one-column
            DataFrame. The AutoGluon and H2O branches expect that column to
            be named 'target'.
        X_test: Features to predict on.
        dataset_name: Dataset label; only forwarded to auto-sklearn.
        iteration: Index of the current test partition (currently unused).
        identifier: Run identifier used to name the files written under
            './exports/'.

    Returns:
        The predictions for X_test. TPOT and H2O outputs are flattened to a
        1-D numpy array; auto-sklearn and AutoGluon outputs are returned as
        produced by their `predict`.

    Raises:
        ValueError: If framework_name is not one of the supported frameworks.

    NOTE: exported files are named from `identifier` only, so repeated calls
    with the same identifier overwrite the previous export.
    """
    if isinstance(y_train, pd.Series):
        y_train = y_train.to_frame()

    if framework_name == 'tpot':
        automl = TPOTRegressor(max_time_mins=max_time_given // 60)
        automl.fit(X_train, y_train)

        # save the best pipeline found by TPOT
        automl.export('./exports/' + identifier + '_tpot_exported_pipeline.py')

        predictions = automl.predict(X_test)

        # flatten a 2-D prediction array into 1-D
        if len(predictions.shape) > 1:
            predictions = np.concatenate(predictions, axis=0)

    elif framework_name == 'autosklearn':
        automl = autosklearn.regression.AutoSklearnRegressor(
            time_left_for_this_task=max_time_given
        )
        automl.fit(X_train, y_train, dataset_name=dataset_name)

        predictions = automl.predict(X_test)

        # No leaderboard export here: saving it raised an error with
        # auto-sklearn, so it is skipped for this framework.

    elif framework_name == 'autogluon':
        automl = TabularPredictor(label='target', path='./AutoGluonModel/')
        automl.fit(pd.concat([X_train, y_train], axis=1), time_limit=max_time_given)

        # save leaderboard
        with open("./exports/" + identifier + ".out", "w") as text_file:
            text_file.write(str(automl.fit_summary(verbosity=1)))

        predictions = automl.predict(X_test)

    elif framework_name == 'h2o':
        h2o.init()

        x = list(X_train.columns.values)
        y = "target"

        automl = H2OAutoML(max_runtime_secs=max_time_given)
        df = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
        automl.train(x=x, y=y, training_frame=df)

        dft = h2o.H2OFrame(X_test)
        predictions = automl.predict(dft)

        lb = automl.leaderboard

        # save leaderboard
        with open("./exports/" + identifier + ".out", "w") as text_file:
            text_file.write(h2o.as_list(lb.head(rows=lb.nrows)).to_string())

        # H2OFrame -> pandas -> flat 1-D numpy array
        predictions = h2o.as_list(predictions)
        predictions = predictions.values
        predictions = np.concatenate(predictions, axis=0)

    else:
        # Without this branch an unknown name would only surface as an
        # UnboundLocalError on the return statement below.
        raise ValueError(
            "Unsupported framework_name %r; expected one of "
            "'tpot', 'autosklearn', 'autogluon', 'h2o'" % (framework_name,)
        )

    return predictions

## Additional functions

In [None]:
def timestamp(string):
    """Print and return `string` followed by the current wall-clock time.

    The time is formatted as HH:MM:SS and appended after the marker
    "--> Current Time =".
    """
    clock = datetime.now().strftime("%H:%M:%S")
    message = "".join((string, "--> Current Time =", clock))
    print(message)
    return message
 
def train_test_split(considered_slice, fixed_lag, num_test_days, horizon, variables):
    """Embed a multivariate series and split it chronologically into train and test.

    The series is reduced with `get_top_correlated`, embedded with
    `build_XY_multivariate`, and the last `num_test_days` samples become the
    test set. Only the first target ('Y0', renamed to 'target') is used, which
    covers the SISO and MISO settings; MIMO is not implemented.

    Args:
        considered_slice: DataFrame holding the part of the series to use.
        fixed_lag: Number of lagged values per variable used as inputs.
        num_test_days: Number of trailing samples held out for testing;
            must be at least 1.
        horizon: Forecast horizon passed to the embedding.
        variables: Number of columns kept by `get_top_correlated`.

    Returns:
        X_train, X_test, y_train, y_test, each with a fresh 0-based index.

    Raises:
        ValueError: If num_test_days is smaller than 1.
    """
    # Guard against the negative-zero slicing pitfall: with 0, `iloc[-0:]`
    # is the whole frame and `iloc[:-0]` is empty, i.e. a silently wrong split.
    if num_test_days < 1:
        raise ValueError(
            "num_test_days must be a positive integer, got %r" % (num_test_days,)
        )

    considered_slice = get_top_correlated(considered_slice, variables)
    _, sequence = build_XY_multivariate(considered_slice, fixed_lag, horizon)

    # works both for SISO and MISO (target 0); MIMO not implemented
    dataset = sequence[0].rename({'Y0': 'target'}, axis='columns')

    test_set = dataset.iloc[-num_test_days:, :]
    train_set = dataset.iloc[:-num_test_days, :]

    y_train = train_set['target'].reset_index(drop=True)
    X_train = train_set.drop(columns='target').reset_index(drop=True)
    y_test = test_set['target'].reset_index(drop=True)
    X_test = test_set.drop(columns='target').reset_index(drop=True)

    return X_train, X_test, y_train, y_test

# Returns a DF with `num_variables` columns in total: column 0 first, then the
# columns most correlated (in absolute value) with column 0.
def get_top_correlated(multi_time_series, num_variables):
    """Select column 0 and the columns most correlated with it.

    Args:
        multi_time_series: DataFrame containing a column labelled 0.
        num_variables: Total number of columns to keep, column 0 included.

    Returns:
        A DataFrame whose first column is column 0, followed by the other
        columns in decreasing order of absolute correlation with column 0,
        truncated to `num_variables` columns.
    """
    ranking = multi_time_series.corr().abs()[0].sort_values(kind="quicksort", ascending=False)
    # The first returned column is used as the target by the caller, so
    # column 0 is pinned in front explicitly. Its self-correlation alone does
    # not guarantee that: a duplicated / perfectly correlated column can tie
    # with it and sort ahead of it.
    ordered = [0] + [label for label in ranking.index if label != 0]
    return multi_time_series[ordered[:num_variables]]

# Embeds a single variable: `lag` consecutive values as inputs, one later value as target.
def build_XY_univariate(uni_time_series, lag, horizon=1):
    """Turn one series into a supervised table of lagged inputs and a target.

    Columns 0..lag-1 hold the series shifted back by 0..lag-1 steps, and
    'target' holds the series shifted back by lag + horizon - 1 steps. Rows
    containing any missing value (in particular the trailing rows created by
    the shifts) are dropped.
    """
    lagged_inputs = {step: uni_time_series.shift(-step) for step in range(lag)}
    embedded = pd.DataFrame(lagged_inputs)
    embedded['target'] = uni_time_series.shift(-(lag + horizon - 1))
    return embedded.dropna()
 
def build_XY_multivariate(multi_time_series, lag, horizon=1):
    """Embed every column of a multivariate series into lagged inputs and targets.

    Each column is embedded with `build_XY_univariate`; the resulting input
    columns are named X{i}_{j} (variable i, lag position j) and the targets
    Y{i}.

    Args:
        multi_time_series: DataFrame (or Series, converted to a one-column
            DataFrame) with one variable per column.
        lag: Number of lagged values per variable used as inputs.
        horizon: Forecast horizon; 1 means the value right after the lags.

    Returns:
        A tuple (multi_Xy, single_target_list). multi_Xy holds all inputs
        followed by all targets. single_target_list[i] holds all inputs plus
        only the target Y{i}.

    NOTE(review): the result is aligned on a 0-based integer index, so the
    input is presumably expected to carry a default RangeIndex and no
    missing values - confirm against callers.
    """
    if isinstance(multi_time_series, pd.Series):
        multi_time_series = multi_time_series.to_frame()

    # A complete sample needs `lag` input rows plus `horizon - 1` further rows
    # to reach its target. Ignoring the horizon here would leave all-NaN rows
    # at the end of the result, because the concat below outer-joins on index.
    num_rows = len(multi_time_series.index) - lag - (horizon - 1)
    num_cols = len(multi_time_series.columns)

    X_names_list = ["X" + str(i) + "_" + str(j) for i in range(num_cols) for j in range(lag)]
    y_names_list = ["Y" + str(i) for i in range(num_cols)]

    # Empty frames fix the row index; per-variable pieces are appended to them.
    X_parts = [pd.DataFrame(index=np.arange(num_rows))]
    y_parts = [pd.DataFrame(index=np.arange(num_rows))]

    for column_idx in range(num_cols):
        single_col = multi_time_series.iloc[:, column_idx]
        single_Xy = build_XY_univariate(single_col, lag, horizon)
        y_parts.append(single_Xy['target'])
        X_parts.append(single_Xy.drop(columns='target'))

    multi_X = pd.concat(X_parts, axis=1)
    multi_y = pd.concat(y_parts, axis=1)

    multi_y.columns = y_names_list
    multi_X.columns = X_names_list

    multi_Xy = pd.concat([multi_X, multi_y], axis=1)

    single_target_list = [pd.concat([multi_X, multi_y[column]], axis=1) for column in multi_y.columns]
    return multi_Xy, single_target_list

def execute(model_name, max_time_given, series_name, time_series, test_parts, num_test_days, fixed_lag, horizon, variables, identifier):
    """Evaluate one AutoML configuration on growing prefixes of a series.

    The series is cut into `test_parts` windows of equal length. Partition i
    uses the first i + 1 windows, holds out its last `num_test_days` samples,
    trains through `call_automl`, and records one row per test sample. The
    collected rows are written to './results/<identifier>.csv'.

    Each row compares the model prediction with a naive forecast read from
    column position `fixed_lag - 1` of the test inputs; the 'MASE' column is
    the ratio of the model's absolute error to the naive absolute error for
    that single sample.
    """
    result_columns = ['Model', 'Horizon', 'Variables', 'Max Time Given', 'Data', 'Lag',
                      'Test partition', 'Test Day', 'Truth', 'Prediction', 'Error',
                      'Naive pred', 'Naive error', 'MASE']
    performances = pd.DataFrame(columns=result_columns)
    window_size = len(time_series) // test_parts

    for i in range(test_parts):
        # growing prefix: every partition starts at the beginning of the series
        window_end = window_size * (i + 1)
        considered_slice = time_series[:window_end]
        X_train, X_test, y_train, y_test = train_test_split(
            considered_slice, fixed_lag, num_test_days, horizon, variables
        )

        predictions = call_automl(
            model_name, max_time_given, X_train, y_train, X_test, series_name, i, identifier
        )

        print(predictions)
        for index_prediction in range(len(predictions)):
            single_pred = predictions[index_prediction]
            truth = y_test[index_prediction]
            pred_naive = X_test.iloc[index_prediction, (fixed_lag - 1)]
            error_model = abs(single_pred - truth)
            error_naive = abs(pred_naive - truth)

            record = {
                'Model': model_name,
                'Horizon': horizon,
                'Variables': variables,
                'Max Time Given': max_time_given,
                'Data': series_name,
                'Lag': fixed_lag,
                'Test partition': i,
                'Test Day': index_prediction,
                'Truth': truth,
                'Prediction': single_pred,
                'Error': error_model,
                'Naive pred': pred_naive,
                'Naive error': error_naive,
                'MASE': error_model / error_naive,
            }
            # append as a new last row, in the declared column order
            performances.loc[len(performances)] = [record[name] for name in result_columns]

    performances.to_csv("./results/" + identifier + ".csv", index=False)
    


## Load unpreprocessed data

In [None]:
# Both files are read with header=None, so pandas assigns integer column
# labels (0, 1, ...) instead of taking names from the first row.
exchange = pd.read_csv(
    "../../multivariate-time-series-data/exchange_rate/exchange_rate.txt",
    header=None,
)
electricity = pd.read_csv(
    "../../multivariate-time-series-data/electricity/electricity.txt",
    header=None,
)

## Execution

In [None]:
# Experiment grid.
fixed_lag = 5
test_parts = 3
# Every configuration is scored on test_parts * test_size data points,
# taken at test_parts different moments of time.
test_size = 17
frameworks = ["autogluon", "autosklearn", "tpot", "h2o"]
datasets_names = ["Exchange", "Electricity"]
datasets = [exchange, electricity]
times = [60, 120, 300, 600]  # passed to execute() as max_time_given
variables_num = [1, 3, 5, 8]
horizons = [1, 2]

# Create the progress log; mode "w" truncates a log left by an earlier run.
with open("current_state.tmp", "w") as text_file:
    text_file.write(timestamp("ABSOLUTE BEGIN") + "\n")

# Run every combination of budget, horizon, variable count, framework and dataset.
for max_time_given in times:
    for horizon in horizons:
        for variables in variables_num:
            for framework_name in frameworks:
                for index, dataset in enumerate(datasets):

                    dataset_name = datasets_names[index]
                    # e.g. "<framework>_<dataset>_h<horizon>_v<variables>_<time>_l<lag>"
                    identifier = "_".join([
                        framework_name,
                        dataset_name,
                        'h' + str(horizon),
                        'v' + str(variables),
                        str(max_time_given),
                        'l' + str(fixed_lag),
                    ])

                    # log before execution
                    with open("current_state.tmp", "a") as text_file:
                        text_file.write(timestamp("Begin of one cycle") + ' --- ' + identifier + "\n")

                    execute(
                        framework_name,
                        max_time_given,
                        dataset_name,
                        dataset,
                        test_parts,
                        test_size,
                        fixed_lag,
                        horizon,
                        variables,
                        identifier,
                    )

                    # log after execution
                    with open("current_state.tmp", "a") as text_file:
                        text_file.write(timestamp("End of one cycle") + ' --- ' + identifier + "\n")

                    # drop the cell output of this cycle to keep the notebook readable
                    clear_output()