In [1]:
%matplotlib inline
import os
import pickle as pkl
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from utils.metrics import *

# Default figure size in inches, applied to both matplotlib and seaborn.
width, height = 6, 4

# Creates an (empty) figure of that size; this is what produces the
# "<Figure size ... with 0 Axes>" output of this cell.
plt.figure(figsize=(width, height))
sns.set(rc={"figure.figsize": (width, height)})

# Directory template for results; the helpers below fill it with
# (model, dataset, 'raw' or compressor name).
root_baseline_data_path = '../output/%s/%s/%s/'

# Number of rows subtracted from the validation/test start indices in get_borders.
seq_len = 96

def get_borders(data_name, data_len):
    """Return the start and end row indices of the train/validation/test splits.

    Args:
        data_name: Dataset name. Names containing ``'ett'`` use a fixed split
            that ignores ``data_len``; every other name uses a 70/10/20 split.
        data_len: Total number of rows in the dataset.

    Returns:
        A pair ``(starts, ends)`` of three-element lists ordered
        train, validation, test. The validation and test starts are moved
        back by the module-level ``seq_len``.
    """
    if 'ett' in data_name:
        # Fixed ETT split: 12 blocks for training, then 4 and 4 blocks,
        # where one block is 30 * 24 * 4 rows.
        block = 30 * 24 * 4
        train_end = 12 * block
        vali_end = train_end + 4 * block
        test_end = train_end + 8 * block
        starts = [0, train_end - seq_len, vali_end - seq_len]
        ends = [train_end, vali_end, test_end]
        return starts, ends

    n_train = int(data_len * 0.7)
    n_test = int(data_len * 0.2)
    # Validation takes whatever is left after train and test are rounded down.
    n_vali = data_len - n_train - n_test
    starts = [0, n_train - seq_len, data_len - n_test - seq_len]
    ends = [n_train, n_train + n_vali, data_len]
    return starts, ends

def get_raw_columns(columns):
    """Return the raw (uncompressed) column names found in ``columns``.

    Elsewhere in this notebook raw columns are addressed as ``<variable>-R``
    and compressed ones as ``<variable>-E<eb>``. The check is therefore a
    suffix test (mirroring ``get_eb_columns``) rather than a substring test,
    so that a compressed column whose variable name happens to contain an
    ``R`` (e.g. ``RH-E0.01``) is not mistaken for a raw column.

    Args:
        columns: Iterable of column-name strings.

    Returns:
        List of the names ending in ``R``, in their original order.
    """
    return [col for col in columns if col.endswith('R')]


def get_eb_columns(columns, eb):
    """Return the column names that end with the suffix ``E<eb>``.

    Args:
        columns: Iterable of column-name strings.
        eb: Error bound; it is formatted into the suffix with an f-string,
            so ``1`` and ``1.0`` select different columns.

    Returns:
        List of matching names, in their original order.
    """
    suffix = f'E{eb}'
    selected = []
    for name in columns:
        if name.endswith(suffix):
            selected.append(name)
    return selected


def get_eb(exp_str):
    """Extract the error bound encoded in ``exp_str`` as a fraction.

    The first substring matching ``eb[_.0-9]+`` is used. The ``eb`` prefix and
    any underscores are removed and the remainder is parsed as a float.
    Values ``>= 1`` are multiplied by 0.01 (so ``5`` becomes ``0.05``);
    values below 1 are returned unchanged.

    Args:
        exp_str: String (e.g. a file name) containing a token such as
            ``eb_0.05`` or ``eb5``.

    Returns:
        The error bound as a float.

    Raises:
        ValueError: If no ``eb...`` token is present or it is not a number.
    """
    found = re.findall('eb[_.0-9]+', exp_str)
    if not found:
        raise ValueError(f'no "eb<number>" token found in {exp_str!r}')

    token = found[0].replace('eb', '').replace('_', '')
    # The character class also swallows a "." that directly follows the number
    # (for instance the one starting a file extension, "eb_0.05.pkl"), which
    # float() would reject; drop any such trailing dots.
    token = token.rstrip('.')

    eb = float(token)
    if eb >= 1:
        eb *= 0.01

    return eb


def load(path):
    """Read the file at ``path`` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(path, 'rb') as handle:
        payload = pkl.load(handle)
    return payload


def metrics_ensemble(pred, true):
    """Evaluate ``pred`` against ``true`` with six metrics.

    The metric functions (MAE, RMSE, RSE, NRMSE, CORR, PSNR) are not defined
    in this notebook; they are expected to come from the ``utils.metrics``
    star import.

    Args:
        pred: Predicted (or decompressed) values.
        true: Reference values.

    Returns:
        Dict with keys ``mae``, ``rmse``, ``nrmse``, ``rse``, ``corr`` and
        ``psnr`` (in that order), each holding the corresponding metric value.
    """
    # Evaluation order of the metric functions.
    scorers = (
        ('mae', MAE),
        ('rmse', RMSE),
        ('rse', RSE),
        ('nrmse', NRMSE),
        ('corr', CORR),
        ('psnr', PSNR),
    )
    scores = {label: scorer(pred, true) for label, scorer in scorers}

    # Key order of the returned dict (it determines column order downstream).
    report_order = ('mae', 'rmse', 'nrmse', 'rse', 'corr', 'psnr')
    return {label: scores[label] for label in report_order}


def load_pkl(path):
    """Return the object pickled in the file at ``path``.

    Behaves the same as ``load`` defined earlier in this notebook; both names
    are in use by other helpers.

    NOTE: unpickling can execute arbitrary code, so only use this on trusted files.
    """
    stream = open(path, 'rb')
    try:
        return pkl.load(stream)
    finally:
        stream.close()


def get_baseline(model:str, data:str):
    """Compute metrics of the forecasts stored in the ``raw`` result directory.

    Loads ``true.pkl`` and ``output.pkl`` from
    ``root_baseline_data_path % (model, data, 'raw')`` and scores the latter
    against the former with ``metrics_ensemble``.

    Args:
        model: Model name used as the first path component.
        data: Dataset name used as the second path component.

    Returns:
        One-row DataFrame of metrics whose index is named ``eb`` and holds
        the single value ``0.0``.
    """
    raw_dir = root_baseline_data_path % (model, data, 'raw')
    truth = load(raw_dir + 'true.pkl')
    forecast = load(raw_dir + 'output.pkl')

    baseline = pd.DataFrame(metrics_ensemble(forecast, truth), index=[0.0])
    baseline.index.name = 'eb'
    return baseline


def get_forecasting_results(model: str, data_file: str, data_name: str,
                            eblc_name: str, target_ot: str, ebs_values: tuple):
    """Collect forecasting metrics and decompression error for one compressor.

    Two tables are produced:

    * decompression error: metrics of each decompressed target column against
      the raw target column on the test split, obtained from
      ``get_transformation_error`` (which performs exactly this computation)
      and aggregated per error bound with the median;
    * forecasting results: metrics of every prediction file found under
      ``root_baseline_data_path % (model, data_name, eblc_name) + 'predictions'``
      against ``true.pkl`` of the ``raw`` run, aggregated per error bound
      (parsed from the file name by ``get_eb``) with the median.

    Args:
        model: Model name used in the result paths.
        data_file: Parquet file name under ``../data/compressed/<eblc_name>/``.
        data_name: Dataset name used for the split borders and result paths.
        eblc_name: Compressor name.
        target_ot: Target variable name (column prefix in the parquet file).
        ebs_values: Error bounds whose decompressed columns are evaluated.

    Returns:
        Tuple ``(fr, de)``: forecasting results and decompression error, both
        indexed by ``eb``; ``fr`` is sorted by index.
    """
    # Reuse the shared helper instead of repeating its body here.
    transformation_error = get_transformation_error(data_file=data_file,
                                                    data_name=data_name,
                                                    eblc_name=eblc_name,
                                                    target_ot=target_ot,
                                                    ebs_values=ebs_values)
    de = transformation_error.groupby(['eb']).median()

    raw_data_path = root_baseline_data_path % (model, data_name, 'raw') + 'true.pkl'
    raw_data = load(raw_data_path)

    predictions_dir = root_baseline_data_path % (model, data_name, eblc_name) + 'predictions'
    tmetrics = []
    ebs = []
    for root, _dirs, files in os.walk(predictions_dir):
        for file in files:
            predictions = load(os.path.join(root, file))
            tmetrics.append(metrics_ensemble(predictions, raw_data))
            ebs.append(get_eb(file))

    df = pd.DataFrame(tmetrics)
    df['eb'] = ebs
    # Several prediction files may share an error bound; keep their median.
    fr = df.groupby(['eb']).median()
    fr.sort_index(inplace=True)
    return fr, de


def get_solar_baseline_true():
    """Score the solar baseline forecasts and collect the ground truth per variable.

    Walks ``../output/arima/solar/raw/``. File names are expected to look like
    ``<kind>_<var>.pkl``; the two files sharing a ``<var>`` are paired, the one
    whose name contains ``true`` is treated as ground truth and the other as
    the prediction.

    Returns:
        Tuple ``(df, all_true)`` where ``df`` is a one-row DataFrame with the
        mean of every metric over all variables (index named ``eb``) and
        ``all_true`` maps each variable name to its ground-truth object.
    """
    raw_dir = '../output/arima/solar/raw/'
    first_file_for = dict()
    all_results = list()
    all_true = dict()
    for root, _dirs, files in os.walk(raw_dir):
        for file in files:
            # Remove the extension with splitext: str.strip('.pkl') strips any
            # of the characters '.', 'p', 'k', 'l' from both ends and would
            # mangle names such as 'true_lp.pkl'.
            stem = os.path.splitext(file)[0]
            _kind, var = stem.split('_')

            if var not in first_file_for:
                # First file of the pair: remember it until its partner shows up.
                first_file_for[var] = file
                continue

            ifile = first_file_for[var]
            pf = load_pkl(f'{raw_dir}{ifile}')
            nf = load_pkl(f'{raw_dir}{file}')
            if 'true' in ifile:
                truth, prediction = pf, nf
            else:
                truth, prediction = nf, pf
            all_true[var] = truth
            all_results.append(metrics_ensemble(prediction, truth))

    df = pd.DataFrame(all_results)
    df = df.mean().to_frame().T
    # Use the public attribute rather than the private ``_name``.
    df.index.name = 'eb'
    return df, all_true

def get_forecasting_results_solar(model: str, eblc_name: str, ebs_values: tuple):
    """Collect baseline, forecasting and decompression metrics for the solar data.

    Args:
        model: Model name used to locate the prediction files.
        eblc_name: Compressor name; selects the parquet file and the
            prediction directory.
        ebs_values: Error bounds whose decompressed columns are evaluated.

    Returns:
        Tuple ``(solar_baseline, fr, de)``:
        ``solar_baseline`` is the table from ``get_solar_baseline_true``;
        ``fr`` holds the mean metrics of the prediction files per error bound,
        sorted by index; ``de`` holds the mean metrics of the decompressed
        columns against the raw columns on the test split per error bound.
    """
    points = pd.read_parquet(f'../data/compressed/{eblc_name}/solar_output_data_points.parquet')
    starts, ends = get_borders('solar', len(points))
    test_rows = slice(starts[2], ends[2])

    # Decompression error: all raw columns vs. all columns of one error bound.
    raw_values = points[get_raw_columns(points.columns)].values[test_rows]
    decomp_rows = []
    decomp_ebs = []
    for eb in ebs_values:
        decomp_values = points[get_eb_columns(points.columns, eb)].values[test_rows]
        decomp_rows.append(metrics_ensemble(decomp_values, raw_values))
        decomp_ebs.append(eb)

    decomp_table = pd.DataFrame(decomp_rows)
    decomp_table['eb'] = decomp_ebs
    de = decomp_table.groupby(['eb']).mean()

    solar_baseline, raw_data_map = get_solar_baseline_true()

    # Forecasting error: each prediction file is scored against the ground
    # truth of the variable named in its file name ("v<digits>").
    predictions_dir = root_baseline_data_path % (model, 'solar', eblc_name) + 'predictions'
    forecast_rows = []
    forecast_ebs = []
    for root, _dirs, files in os.walk(predictions_dir):
        for file in files:
            var = re.findall('v[0-9]+', file)[0]
            predictions = load(root + os.sep + file)
            forecast_rows.append(metrics_ensemble(predictions, raw_data_map[var]))
            forecast_ebs.append(get_eb(file))

    forecast_table = pd.DataFrame(forecast_rows)
    forecast_table['eb'] = forecast_ebs
    fr = forecast_table.groupby(['eb']).mean().sort_index()
    return solar_baseline, fr, de


def get_transformation_error(data_file: str, data_name: str,
                            eblc_name: str, target_ot: str, ebs_values: tuple):
    """Measure how far each decompressed target column is from the raw one.

    Reads ``../data/compressed/<eblc_name>/<data_file>`` and, on the test
    split given by ``get_borders``, scores column ``<target_ot>-E<eb>`` against
    column ``<target_ot>-R`` for every ``eb`` in ``ebs_values``.

    Args:
        data_file: Parquet file name.
        data_name: Dataset name passed to ``get_borders``.
        eblc_name: Compressor name (directory of the parquet file).
        target_ot: Target variable name (column prefix).
        ebs_values: Error bounds to evaluate.

    Returns:
        DataFrame with one row per error bound: the metric columns from
        ``metrics_ensemble`` followed by an ``eb`` column.
    """
    points = pd.read_parquet(f'../data/compressed/{eblc_name}/{data_file}')
    starts, ends = get_borders(data_name, len(points))
    test_rows = slice(starts[2], ends[2])
    raw_values = points[f'{target_ot}-R'].values[test_rows]

    evaluated_ebs = list(ebs_values)
    metric_rows = [
        metrics_ensemble(points[[f'{target_ot}-E{eb}']].values[test_rows][:, 0], raw_values)
        for eb in evaluated_ebs
    ]

    error_table = pd.DataFrame(metric_rows)
    error_table['eb'] = evaluated_ebs
    return error_table


def concat_baseline_forecasting_result(br, fr, de):
    """Stack baseline and forecasting metrics into one table indexed by decompression error.

    Args:
        br: Baseline metrics (placed first).
        fr: Forecasting metrics per error bound (placed after ``br``).
        de: Decompression-error metrics indexed by error bound; must have as
            many rows as ``fr``.

    Returns:
        DataFrame indexed by ``error`` (0.0 for the baseline, then the
        ``nrmse`` values of ``de``) with columns ``mae``, ``rmse``, ``nrmse``,
        ``rse``, ``corr`` taken from ``br``/``fr``, ``data_corr`` (1.0 for the
        baseline, then the ``corr`` values of ``de``) and ``eb`` (0.0 for the
        baseline, then the index of ``de``).
    """
    stacked = pd.concat([br, fr], axis=0)

    # The baseline row gets zero decompression error and perfect data correlation.
    table_columns = {'error': [0.0] + list(de['nrmse'].values)}
    for metric in ('mae', 'rmse', 'nrmse', 'rse', 'corr'):
        table_columns[metric] = stacked[metric].values
    table_columns['data_corr'] = [1.0] + list(de['corr'].values)

    metric_indexed_results = pd.DataFrame(table_columns).set_index('error')
    metric_indexed_results['eb'] = [0.0] + list(de.index)
    return metric_indexed_results



<Figure size 432x288 with 0 Axes>

In [2]:
# Build the ARIMA results table: for every dataset and every compressor
# (sz, pmc, swing) combine the baseline metrics, the forecasting metrics and
# the decompression error, then write everything to one CSV file.

# Target column per dataset; only looked up in the generic (last) branch below.
# The 'wind' entry is not used by the dataset list of this loop.
target_variables_map = {'ettm1':'OT', 'ettm2': 'OT', 'aus_electrical_demand': 'y', 'weather': 'OT', 'wind': 'active power'}
# bounds[0] is passed for 'sz', bounds[1] for the other compressors.
# NOTE(review): bounds[1] looks like the same bounds expressed in percent — confirm.
bounds = [(0.01, 0.03, 0.05, 0.07, 0.10, 0.15, 0.20, 0.25, 0.30, 0.40, 0.50, 0.65, 0.8), (1, 3, 5, 7, 10, 15, 20, 25, 30, 40, 50, 65, 80)]
all_results = pd.DataFrame()
for data in ['ettm1', 'ettm2', 'weather', 'aus_electrical_demand', 'solar']:
    data_results = pd.DataFrame()
    for eblc in ['sz', 'pmc', 'swing']:
        print(eblc, data)
        if 'solar' in data:
            # Solar has its own loader, which also returns the baseline.
            baseline_results, forecasting_results, dec_error = get_forecasting_results_solar(model='arima', eblc_name=eblc, ebs_values=bounds[0] if eblc == 'sz' else bounds[1])
        elif 'aus' in data:
            # For 'aus' the non-sz bounds are converted to floats, so the column
            # lookup in get_forecasting_results formats them as e.g. 'E1.0'
            # instead of 'E1'.
            baseline_results = get_baseline('arima', 'aus')
            forecasting_results, dec_error = get_forecasting_results(model='arima',
                                                                     data_file=f'{data}_points.parquet',
                                                                     data_name='aus',
                                                                     eblc_name=eblc,
                                                                     target_ot='y',
                                                                     ebs_values=bounds[0] if eblc == 'sz' else np.asarray(bounds[1])*1.0)
        else:
            # For non-sz compressors, spaces in the target name are replaced
            # with underscores before the column lookup.
            baseline_results = get_baseline('arima', data)
            forecasting_results, dec_error = get_forecasting_results(model='arima',
                                                                     data_file=f'{data}_output_data_points.parquet',
                                                                     data_name=data,
                                                                     eblc_name=eblc,
                                                                     target_ot=target_variables_map[data] if eblc == 'sz' else target_variables_map[data].replace(' ', '_'),
                                                                     ebs_values=bounds[0] if eblc == 'sz' else bounds[1])
        concatenated = concat_baseline_forecasting_result(baseline_results, forecasting_results, dec_error)
        concatenated['eblc'] = eblc
        # The table is indexed by 'error' and the baseline row has error 0.0,
        # so label 0 addresses the baseline row.
        concatenated.at[0, 'eblc'] = 'baseline'
        data_results = pd.concat([data_results, concatenated])
        # NOTE(review): presumably meant to collapse the baseline row that is
        # repeated for every compressor — confirm.
        data_results.drop_duplicates(inplace=True)

    data_results['data'] = data
    all_results = pd.concat([all_results, data_results])
# Written with the 'error' index as the first CSV column.
all_results.to_csv('../results/tfe/arima_results.csv')

sz ettm1
pmc ettm1
swing ettm1
sz ettm2
pmc ettm2
swing ettm2
sz weather


  return (u/d).mean()


pmc weather
swing weather
sz aus_electrical_demand
pmc aus_electrical_demand
swing aus_electrical_demand
sz solar
pmc solar
swing solar


In [3]:
# Metrics of the ARIMA forecasts stored in the 'raw' result directory of the 'aus' dataset.
get_baseline('arima', 'aus')

Unnamed: 0_level_0,mae,rmse,nrmse,rse,corr,psnr
eb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,0.089336,0.161029,0.035564,0.189757,0.981988,28.979742


In [4]:
# Metrics of the ARIMA forecasts stored in the 'raw' result directory of the 'wind' dataset.
get_baseline('arima', 'wind')

Unnamed: 0_level_0,mae,rmse,nrmse,rse,corr,psnr
eb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,0.088439,0.151244,0.033403,0.178219,0.984094,29.524227


In [16]:
# Transformation (decompression) error of the 'aus' dataset for every compressor.
# Relies on `bounds` defined in the results cell above.
# Layout of the CSV these rows are later appended to (header, then an example row):
#   compression,eb,data,metric,decompression error
#   sz,0.01,ettm1,mae,0.023257426917552948
data_results = pd.DataFrame()
for eblc in ['sz', 'pmc', 'swing']:
    te = get_transformation_error(data_file=f'aus_electrical_demand_points.parquet',
                                 data_name='aus',
                                 eblc_name=eblc,
                                 target_ot='y',
                                 ebs_values=bounds[0] if eblc == 'sz' else np.asarray(bounds[1])*1.0)
    te['compression'] = eblc
    # Non-sz compressors were evaluated with bounds[1]; multiply by 0.01 to put
    # their 'eb' values on the same scale as bounds[0].
    te.eb = te.eb if eblc == 'sz' else (te.eb*0.01)
    data_results = pd.concat([data_results, te])

data_results['data'] = 'aus'

In [34]:
# Reshape to long format: every column other than compression/eb/data becomes
# a (metric, decompression error) pair, one row each.
melted_te_results = data_results.melt(id_vars=['compression', 'eb', 'data'], value_vars=None, var_name='metric', value_name="decompression error")
# Append the new rows to the transformation-error table already on disk.
# NOTE: this rebinds `all_results`, which the ARIMA results cell above also uses.
df = pd.read_csv('../results/te/transformation_error.csv')
all_results = pd.concat([df, melted_te_results], axis=0)

In [37]:
# Overwrites the file that was read in the previous cell. Re-running the
# previous cell followed by this one appends the same rows again.
all_results.to_csv('../results/te/transformation_error.csv', index=None)