In [1]:
import pandas as pd
import os
import numpy as np
from scipy.stats import spearmanr

In [14]:
def compute_rho_eb_te_cr(data_name):
    """Join compression ratio and transformation error for one dataset.

    Reads ``../results/cr/all_cr.csv`` and
    ``../results/te/transformation_error.csv``, keeps the rows whose ``data``
    column equals ``data_name`` (NRMSE rows only for the transformation
    error), joins both tables on ``(eb, compression)`` and displays the
    per-compressor Pearson correlation matrix of the numeric columns.

    Parameters
    ----------
    data_name : str
        Value of the ``data`` column to select in both CSV files.

    Returns
    -------
    pandas.DataFrame
        The joined table with ``eb`` and ``compression`` as regular columns,
        ``cr`` (renamed from ``gzip``) and ``te`` (renamed from
        ``decompression error``).
    """
    cr_df = pd.read_csv('../results/cr/all_cr.csv')
    te_df = pd.read_csv('../results/te/transformation_error.csv')

    # Build new frames instead of mutating filtered slices in place, which
    # triggers SettingWithCopyWarning.
    te_df = (
        te_df[(te_df.data == data_name) & (te_df.metric == 'nrmse')]
        .rename(columns={'decompression error': 'te'})
        .drop(columns=['data'])
    )
    cr_df = (
        cr_df[cr_df.data == data_name]
        .rename(columns={'error_bound': 'eb', 'gzip': 'cr'})
        .drop(columns=['data'])
    )
    # Only keep error bounds for which a transformation error was measured.
    cr_df = cr_df[cr_df.eb.isin(te_df.eb)]

    joined = (
        cr_df.set_index(['eb', 'compression'])
        .join(te_df.set_index(['eb', 'compression']))
        .reset_index()
    )

    # The string ``metric`` column survives the join; restrict the correlation
    # to numeric columns so it also works on pandas >= 2.0, where non-numeric
    # columns are no longer dropped silently.
    numeric_cols = joined.select_dtypes(include='number').columns.tolist()
    display(joined.groupby('compression')[numeric_cols].corr())
    return joined

def compute_rho_tfe_te_cr(data_name):
    """Spearman correlations between compression ratio, TFE and TE.

    Reads ``../results/tfe/all_models_results.csv``, keeps the NRMSE rows of
    ``data_name`` (GRU rows are excluded when ``data_name`` contains
    ``'SOLAR'``), displays the per-compressor Spearman matrix of all numeric
    columns, and computes pairwise Spearman coefficients with p-values for
    ``'compression ratio'``, ``'TFE'`` and ``'te'``.

    Parameters
    ----------
    data_name : str
        Value of the ``data`` column to select.

    Returns
    -------
    pandas.DataFrame
        Two rows (``'correlation'`` and ``'pvalue'``) per compressor in
        ``['PMC', 'SWING', 'SZ']``, one column per pair named
        ``"<col1>_<col2>"``, plus an ``eblc`` column naming the compressor.
    """
    from itertools import combinations

    results = pd.read_csv('../results/tfe/all_models_results.csv')
    keep = (results.data == data_name) & (results.te_metric == 'nrmse')
    if 'SOLAR' in data_name:
        keep &= (results.model != 'GRU')
    # Non-inplace drop: the filtered frame is a slice of the CSV frame.
    results = results[keep].drop(columns=['forecasting error', 'error'])

    # String columns (data, model, *_metric) must be excluded explicitly:
    # pandas >= 2.0 raises instead of dropping them silently.
    numeric_cols = results.select_dtypes(include='number').columns.tolist()
    display(results.groupby('compression')[numeric_cols].corr(method='spearman'))

    columns = ['compression ratio', 'TFE', 'te']
    per_compressor = []
    for eblc in ['PMC', 'SWING', 'SZ']:
        eblc_results = results[results.compression == eblc]
        correlations = {}
        pvalues = {}
        # Every unordered pair once, in the order (0,1), (0,2), (1,2).
        for first, second in combinations(columns, 2):
            corr, pvalue = spearmanr(eblc_results[first], eblc_results[second])
            correlations[f"{first}_{second}"] = corr
            pvalues[f"{first}_{second}"] = pvalue
        eblc_corr = pd.DataFrame([correlations, pvalues], index=['correlation', 'pvalue'])
        eblc_corr['eblc'] = eblc
        per_compressor.append(eblc_corr)
    return pd.concat(per_compressor)

def compute_rho_tfe_te_cr_pweather():
    """Display per-compressor Spearman correlations for the pre-processed weather results.

    Loads ``../results/tfe/pprocessed_weather.csv``, discards the
    ``'forecasting error'`` column and shows the Spearman correlation matrix
    of the remaining columns, grouped by ``compression``. Returns nothing.
    """
    weather_results = pd.read_csv('../results/tfe/pprocessed_weather.csv').drop(
        columns=['forecasting error']
    )
    by_compressor = weather_results.groupby('compression')
    display(by_compressor.corr(method='spearman'))

def get_tfe(model_results):
    """Add the relative forecasting-error change (``tfe``) w.r.t. the baseline.

    ``tfe`` is ``(nrmse - baseline_nrmse) / baseline_nrmse``, where the
    baseline is the single row whose ``eblc`` equals ``'baseline'``. Three
    extra copies of the baseline row, relabelled ``'pmc'``, ``'sz'`` and
    ``'swing'`` with ``tfe == 0``, are appended so every compressor has a
    zero-error reference point.

    Parameters
    ----------
    model_results : pandas.DataFrame
        Must contain the columns ``eblc`` and ``nrmse``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The input rows with a ``tfe`` column, followed by the three relabelled
        baseline rows (indexed 0, 1, 2).

    Raises
    ------
    ValueError
        If ``model_results`` does not contain exactly one baseline row.
    """
    baseline_result = model_results[model_results.eblc == 'baseline']
    if len(baseline_result) != 1:
        raise ValueError(
            f"expected exactly one 'baseline' row, found {len(baseline_result)}"
        )

    baseline_nrmse = baseline_result.nrmse.values
    # ``assign`` returns a new frame, so the caller's frame (often a filtered
    # slice) is left untouched.
    model_results = model_results.assign(
        tfe=(model_results.nrmse.values - baseline_nrmse) / baseline_nrmse
    )

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    baseline_result = pd.concat([baseline_result] * 3, ignore_index=True)
    baseline_result['eblc'] = ['pmc', 'sz', 'swing']
    baseline_result['tfe'] = 0.
    return pd.concat([model_results, baseline_result])

def compute_rho_tfe_te_cr_aus():
    """Collect error/TFE pairs of the ``'aus'`` dataset over the per-model result files.

    Walks ``../results/tfe/per_model/``, skips files whose name contains
    ``'gru'`` or ``'arima'``, keeps the rows with ``data == 'aus'`` and
    passes them through ``get_tfe``.

    Returns
    -------
    pandas.DataFrame
        Columns ``error``, ``tfe`` and ``eblc`` for all processed files, in
        sorted directory/file order. Empty (with these columns) when no file
        was processed.
    """
    per_model = []
    for root, dirs, files in os.walk('../results/tfe/per_model/'):
        # os.walk yields entries in arbitrary filesystem order; sort both
        # levels so the row order is reproducible across machines.
        dirs.sort()
        for result_file in sorted(files):
            if 'gru' in result_file or 'arima' in result_file:
                continue
            print(result_file)
            model_results = pd.read_csv(os.path.join(root, result_file))
            # Explicit copy: the helper may add columns to the frame it gets.
            model_results = model_results[model_results.data == 'aus'].copy()
            per_model.append(get_tfe(model_results))

    wanted = ['error', 'tfe', 'eblc']
    if not per_model:
        # Without any result file the column selection below would fail with
        # an opaque KeyError; return an empty, correctly shaped frame instead.
        return pd.DataFrame(columns=wanted)
    return pd.concat(per_model)[wanted]


def get_feature_diff(features):
    """Express every feature as the difference to the first (reference) row.

    All columns except the last three are cast to float and the values of the
    first row are subtracted from every row. The first row is then relabelled
    ``'PMC'`` and two copies of the rows labelled ``'RAW'`` are prepended,
    relabelled ``'SZ'`` and ``'SWING'``, so each compressor gets a reference
    point.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain a ``compression`` column among its last three columns.
        Assumes the first row is the ``'RAW'`` one — TODO confirm against the
        feature CSV layout. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The relabelled reference copies (indexed 0, 1) followed by the
        differenced rows with their original index.
    """
    num_columns = features.columns[:-3]
    # astype returns a new frame, so the caller's frame is left untouched.
    features = features.astype({col: float for col in num_columns})
    reference = features.iloc[0][num_columns].astype(float)
    features[num_columns] = features[num_columns] - reference

    raw_feat = features[features.compression == 'RAW']
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    raw_feat = pd.concat([raw_feat, raw_feat], ignore_index=True)
    # Positional write, consistent with the positional read of the reference
    # row above (the original mixed iloc[0] with the label-based .at[0]).
    features.iloc[0, features.columns.get_loc('compression')] = 'PMC'
    raw_feat.at[0, 'compression'] = 'SZ'
    raw_feat.at[1, 'compression'] = 'SWING'
    return pd.concat([raw_feat, features])


def get_feature_diff_percentage(features):
    """Express every feature as absolute percentage change w.r.t. the first row.

    All columns except the last three are cast to float and replaced by
    ``|value - reference| / reference * 100``, where ``reference`` is the
    first row. The first row is then relabelled ``'PMC'`` and two copies of
    the rows labelled ``'RAW'`` are prepended, relabelled ``'SZ'`` and
    ``'SWING'``.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain a ``compression`` column among its last three columns.
        Assumes the first row is the ``'RAW'`` one — TODO confirm against the
        feature CSV layout. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The relabelled reference copies (indexed 0, 1) followed by the
        transformed rows with their original index. A reference value of 0
        yields ``inf``/``NaN`` in that column.
    """
    num_columns = features.columns[:-3]
    # astype returns a new frame, so the caller's frame is left untouched.
    features = features.astype({col: float for col in num_columns})
    reference = features.iloc[0][num_columns].astype(float)
    features[num_columns] = np.abs(features[num_columns] - reference) / reference * 100

    raw_feat = features[features.compression == 'RAW']
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    raw_feat = pd.concat([raw_feat, raw_feat], ignore_index=True)
    # Positional write, consistent with the positional read of the reference
    # row above (the original mixed iloc[0] with the label-based .at[0]).
    features.iloc[0, features.columns.get_loc('compression')] = 'PMC'
    raw_feat.at[0, 'compression'] = 'SZ'
    raw_feat.at[1, 'compression'] = 'SWING'
    return pd.concat([raw_feat, features])

def get_features_diff_solar(features):
    """Solar-layout variant: feature differences to the first (reference) row.

    The feature columns are ``features.columns[2:-1]`` (the first two and the
    last column are left alone). They are cast to float and the values of the
    first row are subtracted from every row. The first row is then relabelled
    ``'PMC'`` and two copies of the rows labelled ``'RAW'`` are prepended,
    relabelled ``'SZ'`` and ``'SWING'``.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain a ``compression`` column outside ``columns[2:-1]``.
        Assumes the first row is the ``'RAW'`` one — TODO confirm against the
        solar feature CSV layout. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The relabelled reference copies (indexed 0, 1) followed by the
        differenced rows with their original index.
    """
    num_columns = features.columns[2:-1]
    # astype returns a new frame, so the caller's frame is left untouched.
    features = features.astype({col: float for col in num_columns})
    reference = features.iloc[0][num_columns].astype(float)
    features[num_columns] = features[num_columns] - reference

    raw_feat = features[features.compression == 'RAW']
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    raw_feat = pd.concat([raw_feat, raw_feat], ignore_index=True)
    # Positional write, consistent with the positional read of the reference
    # row above (the original mixed iloc[0] with the label-based .at[0]).
    features.iloc[0, features.columns.get_loc('compression')] = 'PMC'
    raw_feat.at[0, 'compression'] = 'SZ'
    raw_feat.at[1, 'compression'] = 'SWING'
    return pd.concat([raw_feat, features])

def get_features_diff_percentage_solar(features):
    """Solar-layout variant: absolute percentage change w.r.t. the first row.

    The feature columns are ``features.columns[2:-1]`` (the first two and the
    last column are left alone). They are cast to float and replaced by
    ``|value - reference| / reference * 100``, where ``reference`` is the
    first row. The first row is then relabelled ``'PMC'`` and two copies of
    the rows labelled ``'RAW'`` are prepended, relabelled ``'SZ'`` and
    ``'SWING'``.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain a ``compression`` column outside ``columns[2:-1]``.
        Assumes the first row is the ``'RAW'`` one — TODO confirm against the
        solar feature CSV layout. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The relabelled reference copies (indexed 0, 1) followed by the
        transformed rows with their original index. A reference value of 0
        yields ``inf``/``NaN`` in that column.
    """
    num_columns = features.columns[2:-1]
    # astype returns a new frame, so the caller's frame is left untouched.
    features = features.astype({col: float for col in num_columns})
    reference = features.iloc[0][num_columns].astype(float)
    features[num_columns] = np.abs(features[num_columns] - reference) / reference * 100

    raw_feat = features[features.compression == 'RAW']
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    raw_feat = pd.concat([raw_feat, raw_feat], ignore_index=True)
    # Positional write, consistent with the positional read of the reference
    # row above (the original mixed iloc[0] with the label-based .at[0]).
    features.iloc[0, features.columns.get_loc('compression')] = 'PMC'
    raw_feat.at[0, 'compression'] = 'SZ'
    raw_feat.at[1, 'compression'] = 'SWING'
    return pd.concat([raw_feat, features])

def get_baselines_well(features):
    """Replace the ``'RAW'`` rows by one labelled copy per compressor.

    All columns except the last three are cast to float. The rows labelled
    ``'RAW'`` are replicated three times and relabelled ``'PMC'``, ``'SZ'``
    and ``'SWING'``; the original ``'RAW'`` rows are removed from the result.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain a ``compression`` column among its last three columns.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The relabelled baseline copies (indexed from 0) followed by the
        non-RAW rows with their original index.
    """
    num_columns = features.columns[:-3]
    # astype returns a new frame, so the caller's frame is left untouched.
    features = features.astype({col: float for col in num_columns})

    is_raw = features.compression == 'RAW'
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    raw_feat = pd.concat([features[is_raw]] * 3, ignore_index=True)
    raw_feat.at[0, 'compression'] = 'PMC'
    raw_feat.at[1, 'compression'] = 'SZ'
    raw_feat.at[2, 'compression'] = 'SWING'
    # The original called features.drop(...) and discarded the result, so the
    # RAW rows were never actually removed.
    return pd.concat([raw_feat, features[~is_raw]])


def get_baselines_well_solar(features):
    """Solar-layout variant: replace the ``'RAW'`` rows by one copy per compressor.

    The feature columns are ``features.columns[2:-1]`` and are cast to float.
    The rows labelled ``'RAW'`` are replicated three times and relabelled
    ``'PMC'``, ``'SZ'`` and ``'SWING'``; the original ``'RAW'`` rows are
    removed from the result.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain a ``compression`` column outside ``columns[2:-1]``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The relabelled baseline copies (indexed from 0) followed by the
        non-RAW rows with their original index.
    """
    num_columns = features.columns[2:-1]
    # astype returns a new frame, so the caller's frame is left untouched.
    features = features.astype({col: float for col in num_columns})

    is_raw = features.compression == 'RAW'
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    raw_feat = pd.concat([features[is_raw]] * 3, ignore_index=True)
    raw_feat.at[0, 'compression'] = 'PMC'
    raw_feat.at[1, 'compression'] = 'SZ'
    raw_feat.at[2, 'compression'] = 'SWING'
    # The original called features.drop(...) and discarded the result, so the
    # RAW rows were never actually removed.
    return pd.concat([raw_feat, features[~is_raw]])



def compute_features():
    """Join the raw time-series features with TE/TFE for every dataset.

    For each dataset named in ``../results/tfe/models_results.csv`` the file
    ``../results/features/test_<dataset>_features.csv`` is loaded, its RAW
    baseline is replicated per compressor (``get_baselines_well`` or the
    solar variant) and the result is merged with the NRMSE rows of the model
    results on ``(compression, eb)``.

    The combined table is written to
    ``../results/features/all_datasets_features.csv``. Returns nothing.
    """
    all_models_results = pd.read_csv('../results/tfe/models_results.csv')
    datasets = all_models_results.data.str.lower().unique()
    per_dataset = []
    for dataset in datasets:
        print(dataset)
        features = pd.read_csv(f'../results/features/test_{dataset}_features.csv')
        # Error bounds >= 1.0 are scaled by 0.01 (presumably they are stored
        # as percentages in some files — TODO confirm).
        features['eb'] = np.where(features.eb >= 1.0, features.eb * 0.01, features.eb)
        features['eblc'] = features.eblc.str.upper()
        features = features.rename(columns={'eblc': 'compression'})
        prepare = get_baselines_well_solar if 'solar' in dataset else get_baselines_well
        features = prepare(features).set_index(['compression', 'eb'])

        selected = (all_models_results.data == dataset.upper()) & (all_models_results.te_metric == 'nrmse')
        model_results = all_models_results.loc[selected, ['eb', 'te', 'TFE', 'compression']]
        joined_df = features.merge(model_results.reset_index(drop=True), on=['compression', 'eb']).drop_duplicates()
        joined_df['data'] = dataset
        per_dataset.append(joined_df)

    # Concatenate once: growing a frame inside the loop is quadratic and
    # starts from an empty frame, whose handling in concat is deprecated.
    all_datasets_features = pd.concat(per_dataset) if per_dataset else pd.DataFrame()
    all_datasets_features.to_csv('../results/features/all_datasets_features.csv', index=False)


def compute_features_diff_percentage():
    """Correlate percentage feature changes with TE/TFE over all datasets.

    For each dataset named in ``../results/tfe/models_results.csv`` the file
    ``../results/features/test_<dataset>_features.csv`` is loaded, converted
    to absolute percentage changes (``get_feature_diff_percentage`` or the
    solar variant) and merged with the NRMSE rows of the model results on
    ``(compression, eb)``.

    The combined table is written to
    ``../results/features/all_datasets_features_diff_percentage.csv``.

    Returns
    -------
    pandas.DataFrame
        Spearman correlation matrix of the numeric columns of the combined
        table.
    """
    all_models_results = pd.read_csv('../results/tfe/models_results.csv')
    datasets = all_models_results.data.str.lower().unique()
    per_dataset = []
    for dataset in datasets:
        print(dataset)
        features = pd.read_csv(f'../results/features/test_{dataset}_features.csv')
        # Error bounds >= 1.0 are scaled by 0.01 (presumably they are stored
        # as percentages in some files — TODO confirm).
        features['eb'] = np.where(features.eb >= 1.0, features.eb * 0.01, features.eb)
        features['eblc'] = features.eblc.str.upper()
        features = features.rename(columns={'eblc': 'compression'})
        prepare = get_features_diff_percentage_solar if 'solar' in dataset else get_feature_diff_percentage
        features = prepare(features).set_index(['compression', 'eb'])

        selected = (all_models_results.data == dataset.upper()) & (all_models_results.te_metric == 'nrmse')
        model_results = all_models_results.loc[selected, ['eb', 'te', 'TFE', 'compression']]
        joined_df = features.merge(model_results.reset_index(drop=True), on=['compression', 'eb']).drop_duplicates()
        joined_df['data'] = dataset
        per_dataset.append(joined_df)

    # Concatenate once: growing a frame inside the loop is quadratic and
    # starts from an empty frame, whose handling in concat is deprecated.
    all_datasets_features = pd.concat(per_dataset) if per_dataset else pd.DataFrame()
    all_datasets_features.to_csv('../results/features/all_datasets_features_diff_percentage.csv', index=False)
    # The table holds string columns (compression, data); pandas >= 2.0 raises
    # on them instead of dropping them, so select the numeric ones explicitly.
    return all_datasets_features.select_dtypes(include='number').corr('spearman')


def compute_features_tfe_corr():
    """Correlate feature differences with TE/TFE over all datasets.

    For each dataset named in ``../results/tfe/models_results.csv`` the file
    ``../results/features/test_<dataset>_features.csv`` is loaded, converted
    to differences from the reference row (``get_feature_diff`` or the solar
    variant) and merged with the NRMSE rows of the model results on
    ``(compression, eb)``.

    The combined table is written to
    ``../results/features/all_datasets_features_diff.csv``.

    Returns
    -------
    pandas.DataFrame
        Spearman correlation matrix of the numeric columns of the combined
        table.
    """
    all_models_results = pd.read_csv('../results/tfe/models_results.csv')
    datasets = all_models_results.data.str.lower().unique()
    per_dataset = []
    for dataset in datasets:
        print(dataset)
        features = pd.read_csv(f'../results/features/test_{dataset}_features.csv')
        # Error bounds >= 1.0 are scaled by 0.01 (presumably they are stored
        # as percentages in some files — TODO confirm).
        features['eb'] = np.where(features.eb >= 1.0, features.eb * 0.01, features.eb)
        features['eblc'] = features.eblc.str.upper()
        features = features.rename(columns={'eblc': 'compression'})
        prepare = get_features_diff_solar if 'solar' in dataset else get_feature_diff
        features = prepare(features).set_index(['compression', 'eb'])

        selected = (all_models_results.data == dataset.upper()) & (all_models_results.te_metric == 'nrmse')
        model_results = all_models_results.loc[selected, ['eb', 'te', 'TFE', 'compression']]
        joined_df = features.merge(model_results.reset_index(drop=True), on=['compression', 'eb']).drop_duplicates()
        joined_df['data'] = dataset
        per_dataset.append(joined_df)

    # Concatenate once: growing a frame inside the loop is quadratic and
    # starts from an empty frame, whose handling in concat is deprecated.
    all_datasets_features = pd.concat(per_dataset) if per_dataset else pd.DataFrame()
    all_datasets_features.to_csv('../results/features/all_datasets_features_diff.csv', index=False)
    # The table holds string columns (compression, data); pandas >= 2.0 raises
    # on them instead of dropping them, so select the numeric ones explicitly.
    return all_datasets_features.select_dtypes(include='number').corr('spearman')


def compute_features_tfe_corr_xg():
    """Correlate feature differences with TE/TFE using ``models_results_xg.csv``.

    This function was defined twice with identical bodies; the two
    definitions are merged into this single one.

    For each dataset named in ``../results/tfe/models_results_xg.csv`` the
    file ``../results/features/test_<dataset>_features.csv`` is loaded,
    converted to differences from the reference row (``get_feature_diff`` or
    the solar variant) and merged with the NRMSE rows of the model results
    (including the ``model`` column) on ``(compression, eb)``.

    The combined table is written to
    ``../results/features/all_datasets_features_diff_xg.csv``.

    Returns
    -------
    pandas.DataFrame
        Spearman correlation matrix of the numeric columns of the combined
        table.
    """
    all_models_results = pd.read_csv('../results/tfe/models_results_xg.csv')
    datasets = all_models_results.data.str.lower().unique()
    per_dataset = []
    for dataset in datasets:
        print(dataset)
        features = pd.read_csv(f'../results/features/test_{dataset}_features.csv')
        # Error bounds >= 1.0 are scaled by 0.01 (presumably they are stored
        # as percentages in some files — TODO confirm).
        features['eb'] = np.where(features.eb >= 1.0, features.eb * 0.01, features.eb)
        features['eblc'] = features.eblc.str.upper()
        features = features.rename(columns={'eblc': 'compression'})
        prepare = get_features_diff_solar if 'solar' in dataset else get_feature_diff
        features = prepare(features).set_index(['compression', 'eb'])

        selected = (all_models_results.data == dataset.upper()) & (all_models_results.te_metric == 'nrmse')
        model_results = all_models_results.loc[selected, ['eb', 'te', 'TFE', 'compression', 'model']]
        joined_df = features.merge(model_results.reset_index(drop=True), on=['compression', 'eb']).drop_duplicates()
        joined_df['data'] = dataset
        per_dataset.append(joined_df)

    # Concatenate once: growing a frame inside the loop is quadratic and
    # starts from an empty frame, whose handling in concat is deprecated.
    all_datasets_features = pd.concat(per_dataset) if per_dataset else pd.DataFrame()
    all_datasets_features.to_csv('../results/features/all_datasets_features_diff_xg.csv', index=False)
    # The table holds string columns (compression, model, data); pandas >= 2.0
    # raises on them instead of dropping them, so select the numeric ones.
    return all_datasets_features.select_dtypes(include='number').corr('spearman')

def compute_features_with_xgboost():
    """Join the raw time-series features with TE/TFE using ``models_results_xg.csv``.

    For each dataset named in ``../results/tfe/models_results_xg.csv`` the
    file ``../results/features/test_<dataset>_features.csv`` is loaded, its
    RAW baseline is replicated per compressor (``get_baselines_well`` or the
    solar variant) and the result is merged with the NRMSE rows of the model
    results (including the ``model`` column) on ``(compression, eb)``.

    The combined table is written to
    ``../results/features/all_datasets_features_xg.csv``. Returns nothing.
    """
    all_models_results = pd.read_csv('../results/tfe/models_results_xg.csv')
    datasets = all_models_results.data.str.lower().unique()
    per_dataset = []
    for dataset in datasets:
        print(dataset)
        features = pd.read_csv(f'../results/features/test_{dataset}_features.csv')
        # Error bounds >= 1.0 are scaled by 0.01 (presumably they are stored
        # as percentages in some files — TODO confirm).
        features['eb'] = np.where(features.eb >= 1.0, features.eb * 0.01, features.eb)
        features['eblc'] = features.eblc.str.upper()
        features = features.rename(columns={'eblc': 'compression'})
        prepare = get_baselines_well_solar if 'solar' in dataset else get_baselines_well
        features = prepare(features).set_index(['compression', 'eb'])

        selected = (all_models_results.data == dataset.upper()) & (all_models_results.te_metric == 'nrmse')
        model_results = all_models_results.loc[selected, ['eb', 'te', 'TFE', 'compression', 'model']]
        joined_df = features.merge(model_results.reset_index(drop=True), on=['compression', 'eb']).drop_duplicates()
        joined_df['data'] = dataset
        per_dataset.append(joined_df)

    # Concatenate once: growing a frame inside the loop is quadratic and
    # starts from an empty frame, whose handling in concat is deprecated.
    all_datasets_features = pd.concat(per_dataset) if per_dataset else pd.DataFrame()
    all_datasets_features.to_csv('../results/features/all_datasets_features_xg.csv', index=False)


def compute_features_tfe_corr_percentage_xg():
    """Correlate percentage feature changes with TE/TFE using ``models_results_xg.csv``.

    For each dataset named in ``../results/tfe/models_results_xg.csv`` the
    file ``../results/features/test_<dataset>_features.csv`` is loaded,
    converted to absolute percentage changes (``get_feature_diff_percentage``
    or the solar variant) and merged with the NRMSE rows of the model results
    (including the ``model`` column) on ``(compression, eb)``.

    The combined table is written to
    ``../results/features/all_datasets_features_diff_percentage_xg.csv``.

    Returns
    -------
    pandas.DataFrame
        Spearman correlation matrix of the numeric columns of the combined
        table.
    """
    all_models_results = pd.read_csv('../results/tfe/models_results_xg.csv')
    datasets = all_models_results.data.str.lower().unique()
    per_dataset = []
    for dataset in datasets:
        print(dataset)
        features = pd.read_csv(f'../results/features/test_{dataset}_features.csv')
        # Error bounds >= 1.0 are scaled by 0.01 (presumably they are stored
        # as percentages in some files — TODO confirm).
        features['eb'] = np.where(features.eb >= 1.0, features.eb * 0.01, features.eb)
        features['eblc'] = features.eblc.str.upper()
        features = features.rename(columns={'eblc': 'compression'})
        prepare = get_features_diff_percentage_solar if 'solar' in dataset else get_feature_diff_percentage
        features = prepare(features).set_index(['compression', 'eb'])

        selected = (all_models_results.data == dataset.upper()) & (all_models_results.te_metric == 'nrmse')
        model_results = all_models_results.loc[selected, ['eb', 'te', 'TFE', 'compression', 'model']]
        joined_df = features.merge(model_results.reset_index(drop=True), on=['compression', 'eb']).drop_duplicates()
        joined_df['data'] = dataset
        per_dataset.append(joined_df)

    # Concatenate once: growing a frame inside the loop is quadratic and
    # starts from an empty frame, whose handling in concat is deprecated.
    # (The leftover debugging print('HERE') was removed.)
    all_datasets_features = pd.concat(per_dataset) if per_dataset else pd.DataFrame()
    all_datasets_features.to_csv('../results/features/all_datasets_features_diff_percentage_xg.csv', index=False)
    # The table holds string columns (compression, model, data); pandas >= 2.0
    # raises on them instead of dropping them, so select the numeric ones.
    return all_datasets_features.select_dtypes(include='number').corr('spearman')


In [15]:
# Spearman matrix of percentage feature changes vs. te/TFE built from
# models_results_xg.csv; also rewrites all_datasets_features_diff_percentage_xg.csv.
compute_features_tfe_corr_percentage_xg()

ettm2
ettm1
wind
solar
aus
weather
HERE


Unnamed: 0,eb,mean,var,max_kl_shift,time_kl_shift,max_level_shift,time_level_shift,max_var_shift,time_var_shift,x_acf1,...,spike,linearity,curvature,e_acf1,e_acf10,seasonal_strength,peak,trough,te,TFE
eb,1.0,0.542635,0.494557,0.661785,0.482561,0.695958,0.573828,0.703739,0.636672,0.493228,...,0.716068,-0.49786,0.264061,0.424354,0.483554,0.632189,0.612749,0.429762,0.727003,0.65581
mean,0.542635,1.0,0.426626,0.385309,0.36048,0.525433,0.447289,0.599639,0.506109,0.445917,...,0.513696,-0.269915,0.13611,0.46614,0.471813,0.419886,0.281957,0.24091,0.558408,0.511321
var,0.494557,0.426626,1.0,0.772369,0.487313,0.783457,0.350305,0.800345,0.417379,0.492772,...,0.758431,-0.113946,0.532203,0.511994,0.685682,0.775206,0.609064,0.545791,0.903226,0.818079
max_kl_shift,0.661785,0.385309,0.772369,1.0,0.543547,0.768497,0.543518,0.79662,0.585103,0.583852,...,0.799629,-0.101453,0.246888,0.524481,0.638121,0.770638,0.604655,0.508287,0.876767,0.79949
time_kl_shift,0.482561,0.36048,0.487313,0.543547,1.0,0.533698,0.52556,0.517123,0.483016,0.458436,...,0.607981,-0.316844,0.217502,0.411216,0.473276,0.436749,0.394712,0.247148,0.544511,0.497782
max_level_shift,0.695958,0.525433,0.783457,0.768497,0.533698,1.0,0.568559,0.851351,0.574226,0.566476,...,0.790292,-0.266938,0.345168,0.539835,0.630206,0.708914,0.670183,0.492543,0.882211,0.824095
time_level_shift,0.573828,0.447289,0.350305,0.543518,0.52556,0.568559,1.0,0.534408,0.714192,0.40392,...,0.558253,-0.332235,-0.074574,0.266062,0.256066,0.377325,0.388226,0.259235,0.528359,0.520698
max_var_shift,0.703739,0.599639,0.800345,0.79662,0.517123,0.851351,0.534408,1.0,0.589173,0.561205,...,0.802359,-0.240911,0.327265,0.554393,0.673773,0.766732,0.67366,0.496529,0.908661,0.839041
time_var_shift,0.636672,0.506109,0.417379,0.585103,0.483016,0.574226,0.714192,0.589173,1.0,0.397094,...,0.613441,-0.222858,-0.079434,0.288559,0.309122,0.489233,0.422746,0.342709,0.600685,0.515821
x_acf1,0.493228,0.445917,0.492772,0.583852,0.458436,0.566476,0.40392,0.561205,0.397094,1.0,...,0.627564,-0.388935,0.294158,0.885969,0.70996,0.303629,0.285998,0.080832,0.561608,0.612179


In [16]:
# Writes ../results/features/all_datasets_features_xg.csv; returns nothing, so the
# only cell output is the dataset names printed while looping.
compute_features_with_xgboost()

ettm2
ettm1
wind
solar
aus
weather


In [18]:
# Spearman matrix of feature differences vs. te/TFE built from
# models_results_xg.csv; also rewrites all_datasets_features_diff_xg.csv.
compute_features_tfe_corr_xg()

ettm2
ettm1
wind
solar
aus
weather


Unnamed: 0,eb,mean,var,max_kl_shift,time_kl_shift,max_level_shift,time_level_shift,max_var_shift,time_var_shift,x_acf1,...,spike,linearity,curvature,e_acf1,e_acf10,seasonal_strength,peak,trough,te,TFE
eb,1.0,-0.408551,-0.289032,0.551681,-0.060784,0.108595,-0.044212,0.186682,0.065579,-0.192129,...,0.340081,-0.041176,0.146757,-0.135709,-0.104297,-0.449649,0.202057,0.057897,0.727003,0.65581
mean,-0.408551,1.0,0.058416,-0.262006,-0.10775,0.091838,-0.055521,-0.133257,-0.059369,0.393393,...,-0.249716,-0.084645,-0.036176,0.37928,0.313434,0.228252,0.02575,0.008387,-0.438359,-0.454449
var,-0.289032,0.058416,1.0,-0.419322,0.172827,-0.422675,0.158233,-0.302041,0.099713,-0.185176,...,-0.249734,0.031119,-0.122105,0.102731,-0.161749,0.290595,0.074633,-0.350232,-0.494624,-0.39274
max_kl_shift,0.551681,-0.262006,-0.419322,1.0,-0.090835,0.280378,-0.074885,0.275936,-0.108011,-0.234855,...,0.370137,0.038251,0.221519,-0.361782,-0.145654,-0.563691,-0.054525,0.219359,0.817835,0.753394
time_kl_shift,-0.060784,-0.10775,0.172827,-0.090835,1.0,-0.184651,0.260477,-0.019377,0.217405,-0.006474,...,-0.039661,0.107841,-0.108872,0.0811,0.072864,0.242369,-0.163083,-0.032257,-0.154501,-0.125208
max_level_shift,0.108595,0.091838,-0.422675,0.280378,-0.184651,1.0,-0.213531,0.427182,-0.089341,0.089414,...,0.283563,-0.261201,0.219932,0.016234,0.096097,-0.32245,0.121724,0.078032,0.262524,0.223344
time_level_shift,-0.044212,-0.055521,0.158233,-0.074885,0.260477,-0.213531,1.0,-0.0421,0.322538,-0.026018,...,-0.017123,-0.042134,0.092409,0.067869,-0.026259,0.111473,-0.084858,0.043751,-0.077646,-0.061312
max_var_shift,0.186682,-0.133257,-0.302041,0.275936,-0.019377,0.427182,-0.0421,1.0,0.049324,-0.055945,...,0.296542,0.111659,0.076549,-0.01131,0.001725,-0.120095,0.028725,0.070212,0.231297,0.216684
time_var_shift,0.065579,-0.059369,0.099713,-0.108011,0.217405,-0.089341,0.322538,0.049324,1.0,0.039966,...,-0.062334,-0.097815,0.144056,0.094405,0.067411,0.158885,-0.08471,-0.013771,-0.140894,-0.129979
x_acf1,-0.192129,0.393393,-0.185176,-0.234855,-0.006474,0.089414,-0.026018,-0.055945,0.039966,1.0,...,-0.091945,-0.115281,0.01053,0.747641,0.811712,0.124065,-0.179402,0.333589,-0.181223,-0.324587


In [None]:
# Writes ../results/features/all_datasets_features.csv (this cell has no
# execution count, i.e. it was not run in the saved session).
compute_features()

In [6]:
# Spearman matrix of feature differences vs. te/TFE built from models_results.csv;
# also rewrites ../results/features/all_datasets_features_diff.csv.
compute_features_tfe_corr()

ettm2
ettm1
wind
solar
weather
aus


Unnamed: 0,eb,mean,var,max_kl_shift,time_kl_shift,max_level_shift,time_level_shift,max_var_shift,time_var_shift,x_acf1,...,spike,linearity,curvature,e_acf1,e_acf10,seasonal_strength,peak,trough,te,TFE
eb,1.0,-0.359034,-0.243797,0.601491,-0.092623,0.111844,0.012655,0.206705,0.096136,-0.198447,...,0.396365,-0.025559,0.17994,-0.123872,-0.117021,-0.492407,0.147704,0.058067,0.755653,0.660665
mean,-0.359034,1.0,0.114055,-0.267071,-0.112175,0.074808,-0.11126,-0.182641,-0.08848,0.359082,...,-0.356634,-0.117547,-0.061338,0.35521,0.295988,0.238303,0.101138,0.002329,-0.44375,-0.448635
var,-0.243797,0.114055,1.0,-0.482291,0.173313,-0.368105,0.159886,-0.291138,0.127246,-0.125419,...,-0.182867,-0.037653,-0.112462,0.209516,-0.062643,0.298927,0.095142,-0.35777,-0.544091,-0.471729
max_kl_shift,0.601491,-0.267071,-0.482291,1.0,-0.056249,0.267627,-0.095061,0.322158,-0.080377,-0.227111,...,0.402394,0.091374,0.186993,-0.35702,-0.145459,-0.542245,-0.049538,0.195874,0.818096,0.766695
time_kl_shift,-0.092623,-0.112175,0.173313,-0.056249,1.0,-0.164164,0.294844,-0.012175,0.226509,0.004957,...,-0.026763,0.087307,-0.067795,0.094092,0.082439,0.211817,-0.211141,0.022521,-0.113032,-0.106192
max_level_shift,0.111844,0.074808,-0.368105,0.267627,-0.164164,1.0,-0.266466,0.400592,-0.121363,0.040686,...,0.233039,-0.213934,0.181885,-0.013215,0.050439,-0.285418,0.153042,0.022053,0.230332,0.231704
time_level_shift,0.012655,-0.11126,0.159886,-0.095061,0.294844,-0.266466,1.0,-0.039405,0.355796,-0.049701,...,-0.016593,-0.022319,0.084425,0.044529,-0.037906,0.142394,-0.0756,0.031703,-0.094383,-0.085664
max_var_shift,0.206705,-0.182641,-0.291138,0.322158,-0.012175,0.400592,-0.039405,1.0,0.037879,-0.094117,...,0.308884,0.147684,0.066448,-0.033382,-0.032224,-0.1256,0.001061,0.05416,0.25957,0.25237
time_var_shift,0.096136,-0.08848,0.127246,-0.080377,0.226509,-0.121363,0.355796,0.037879,1.0,0.00583,...,-0.051957,-0.110798,0.160414,0.076564,0.053022,0.148567,-0.099034,-0.027053,-0.135018,-0.133507
x_acf1,-0.198447,0.359082,-0.125419,-0.227111,0.004957,0.040686,-0.049701,-0.094117,0.00583,1.0,...,-0.214675,-0.098622,-0.005033,0.72563,0.795162,0.147952,-0.142212,0.35555,-0.175202,-0.305901


In [200]:
# Rank the time-series features by the absolute value of their Spearman
# correlation with TFE and store the ranking as a CSV.
features_corr = compute_features_tfe_corr()
# Keep only the TFE column of the correlation matrix, as magnitudes.
features_corr = features_corr[['TFE']].abs()
# Remove the 'eb' row and the trivial TFE-vs-TFE row.
# NOTE(review): the 'te' row is not removed, so it is ranked alongside the
# features — confirm this is intended.
features_corr.drop(['eb', 'TFE'], inplace=True)
r = features_corr.sort_values(by='TFE', ascending=False)
r.index.name = 'ts_feature'
r.to_csv('../results/features/spearman_correlation.csv')

ettm2
ettm1
wind
solar
weather
aus


In [204]:
# Rebuilds ../results/features/all_datasets_features.csv; returns nothing, so the
# only cell output is the dataset names printed while looping.
compute_features()

ettm2
ettm1
wind
solar
weather
aus


In [151]:
# Print, for each target in `features_of_interest`, the ten features with the
# largest absolute correlation to that target.
# NOTE(review): compute_features_tfe_corr as defined in this notebook takes no
# arguments, so this two-argument call raises TypeError on a fresh run; the
# saved output presumably comes from an earlier version of the function.
# NOTE(review): the current function's result keeps only 'eb', 'te', 'TFE' from
# the model results, so a 'forecasting error' column is not expected — confirm.
corr = compute_features_tfe_corr('test_ettm1', 'ETTM1')
# corr = corr[['TFE', 'te', 'compression ratio']]
features_of_interest = ['TFE', 'forecasting error']

for feature in features_of_interest:
    # for eblc in corr.compression.unique():
    print(f"Correlations with {feature}:")
    # Take the single column of the correlation matrix for this target
    corr_feature = corr[[feature]] # .xs(key=feature, level=1, axis=1)[feature]

    # Drop the targets and other non-feature rows (including the trivial
    # self-correlation); missing labels are ignored
    corr_feature = corr_feature.drop(index=['eb', 'TFE', 'forecasting error', 'te', 'compression ratio'], errors='ignore')
    # Rank by magnitude, regardless of the sign of the correlation
    corr_feature[feature] = np.abs(corr_feature[feature].values)
    # Sort the correlations in descending order to get most correlated features first
    sorted_corr = corr_feature.sort_values(by=feature, ascending=False)

    # Display the ten most correlated features
    print(sorted_corr.head(10))
    print("-----")

Correlations with TFE:
                        TFE
seasonal_strength  0.916137
var                0.869453
max_kl_shift       0.851600
flat_spots         0.798266
spike              0.701500
diff2x_pacf5       0.694868
nonlinearity       0.691763
diff1_acf1         0.671335
diff2_acf10        0.638921
alpha              0.638266
-----
Correlations with forecasting error:
                   forecasting error
seasonal_strength           0.735805
var                         0.731819
max_kl_shift                0.702418
flat_spots                  0.687810
nonlinearity                0.528525
spike                       0.504212
diff2x_pacf5                0.486277
mean                        0.474727
diff1_acf1                  0.467007
alpha                       0.460499
-----


In [147]:
# Shows whatever `corr_feature` currently holds in the kernel (it is assigned
# inside the loop of the top-correlated-features cell).
# NOTE(review): the saved output has signed values, so it presumably predates
# the abs() step of that loop — depends on execution order.
corr_feature

Unnamed: 0,TFE
mean,-0.929065
var,-0.933789
max_kl_shift,0.759507
time_kl_shift,-0.229471
max_level_shift,0.165456
time_level_shift,-0.430137
max_var_shift,0.596724
time_var_shift,-0.13729
x_acf1,-0.468551
x_acf10,-0.444785


In [122]:
# Sort the `feature` column of `corr` by its 'TFE' values, largest first.
# NOTE(review): relies on `corr` and `feature` left in the kernel by another
# cell; the saved output is indexed by compression, which presumably comes
# from a grouped correlation that no cell in this notebook assigns to `corr`.
corr[[feature]].sort_values(by='TFE', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,TFE
compression,Unnamed: 1_level_1,Unnamed: 2_level_1
PMC,TFE,1.000000
SZ,TFE,1.000000
SWING,TFE,1.000000
PMC,max_kl_shift,0.986813
SWING,CO_FirstMin_ac,0.984505
SWING,...,...
SWING,nperiods,
SWING,seasonal_period,
SZ,IN_AutoMutualInfoStats_40_gaussian_fmmi,
SZ,nperiods,


In [113]:
# Spearman correlation between 'error' and 'tfe' for the 'aus' dataset, computed
# separately for each value of 'eblc' (the names of the processed files are
# printed by the function).
aus_results = compute_rho_tfe_te_cr_aus()
aus_results.groupby('eblc').corr(method='spearman')

dlinear_results.csv
informer_results.csv
nbeats_results.csv
transformer_results.csv


Unnamed: 0_level_0,Unnamed: 1_level_0,error,tfe
eblc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
baseline,error,,
baseline,tfe,,
pmc,error,1.0,0.970638
pmc,tfe,0.970638,1.0
swing,error,1.0,0.980232
swing,tfe,0.980232,1.0
sz,error,1.0,0.973927
sz,tfe,0.973927,1.0


In [44]:
# Each call display()s its own per-compressor correlation table; only the return
# value of the last call is rendered as the cell result.
compute_rho_eb_te_cr('aus')
compute_rho_tfe_te_cr('WIND')
compute_rho_tfe_te_cr('SOLAR')

Unnamed: 0_level_0,Unnamed: 1_level_0,eb,compression ratio,TFE,te
compression,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PMC,eb,1.0,1.0,0.902292,1.0
PMC,compression ratio,1.0,1.0,0.902292,1.0
PMC,TFE,0.902292,0.902292,1.0,0.902292
PMC,te,1.0,1.0,0.902292,1.0
SWING,eb,1.0,1.0,0.929307,1.0
SWING,compression ratio,1.0,1.0,0.929307,1.0
SWING,TFE,0.929307,0.929307,1.0,0.929307
SWING,te,1.0,1.0,0.929307,1.0
SZ,eb,1.0,1.0,0.996319,0.995604
SZ,compression ratio,1.0,1.0,0.996319,0.995604


Unnamed: 0,compression ratio_TFE,compression ratio_te,TFE_te,eblc
correlation,0.9022916,1.0,0.9022916,PMC
pvalue,1.515094e-26,0.0,1.515094e-26,PMC
correlation,0.9293069,1.0,0.9293069,SWING
pvalue,3.95755e-31,0.0,3.95755e-31,SWING
correlation,0.9963187,0.9956044,0.9919331,SZ
pvalue,2.738345e-74,1.124893e-71,9.791566999999998e-63,SZ


In [50]:
# Displays the per-compressor Spearman matrix for ../results/tfe/pprocessed_weather.csv.
compute_rho_tfe_te_cr_pweather()

Unnamed: 0_level_0,Unnamed: 1_level_0,eb,TFE,te,cr
compression,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PMC,eb,1.0,0.954628,1.0,1.0
PMC,TFE,0.954628,1.0,0.954628,0.954628
PMC,te,1.0,0.954628,1.0,1.0
PMC,cr,1.0,0.954628,1.0,1.0
SWING,eb,1.0,0.98181,1.0,1.0
SWING,TFE,0.98181,1.0,0.98181,0.98181
SWING,te,1.0,0.98181,1.0,1.0
SWING,cr,1.0,0.98181,1.0,1.0
SZ,eb,1.0,0.973918,1.0,0.994505
SZ,TFE,0.973918,1.0,0.973918,0.969973


Unnamed: 0,compression ratio_TFE,compression ratio_te,TFE_te,eblc
correlation,0.5753655,1.0,0.5753655,PMC
pvalue,3.507506e-06,0.0,3.507506e-06,PMC
correlation,0.980232,1.0,0.980232,SWING
pvalue,1.1076830000000001e-39,0.0,1.1076830000000001e-39,SWING
correlation,0.9602217,1.0,0.9602217,SZ
pvalue,1.359577e-31,0.0,1.359577e-31,SZ


In [19]:
# Per-compressor Spearman correlations for SOLAR (the function excludes GRU rows
# whenever the dataset name contains 'SOLAR').
compute_rho_tfe_te_cr('SOLAR')

Unnamed: 0_level_0,Unnamed: 1_level_0,eb,compression ratio,TFE,te
compression,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PMC,eb,1.0,1.0,0.575365,1.0
PMC,compression ratio,1.0,1.0,0.575365,1.0
PMC,TFE,0.575365,0.575365,1.0,0.575365
PMC,te,1.0,1.0,0.575365,1.0
SWING,eb,1.0,1.0,0.980232,1.0
SWING,compression ratio,1.0,1.0,0.980232,1.0
SWING,TFE,0.980232,0.980232,1.0,0.980232
SWING,te,1.0,1.0,0.980232,1.0
SZ,eb,1.0,1.0,0.960222,1.0
SZ,compression ratio,1.0,1.0,0.960222,1.0


In [66]:
# Ordinary least squares fit of compression ratio on transformation error
# (cr ~ te), one model per compressor, with the full summary displayed.
# NOTE(review): `joined` is not assigned in any cell of this notebook; it
# presumably is the frame returned by compute_rho_eb_te_cr(...) — confirm and
# assign it explicitly so the cell works after a kernel restart.
import statsmodels.formula.api as smf
for comp in ['pmc', 'swing', 'sz']:
    model = smf.ols(formula='cr ~ te', data=joined[(joined['compression'] == comp)]).fit()
    display(model.summary())

pmc


  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,cr,R-squared:,0.336
Model:,OLS,Adj. R-squared:,0.275
Method:,Least Squares,F-statistic:,5.559
Date:,"Tue, 12 Sep 2023",Prob (F-statistic):,0.038
Time:,18:11:54,Log-Likelihood:,-115.76
No. Observations:,13,AIC:,235.5
Df Residuals:,11,BIC:,236.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-968.9107,948.828,-1.021,0.329,-3057.268,1119.446
te,2.652e+04,1.12e+04,2.358,0.038,1763.935,5.13e+04

0,1,2,3
Omnibus:,13.955,Durbin-Watson:,0.744
Prob(Omnibus):,0.001,Jarque-Bera (JB):,9.446
Skew:,1.557,Prob(JB):,0.00889
Kurtosis:,5.781,Cond. No.,21.0


swing


  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,cr,R-squared:,0.595
Model:,OLS,Adj. R-squared:,0.559
Method:,Least Squares,F-statistic:,16.19
Date:,"Tue, 12 Sep 2023",Prob (F-statistic):,0.002
Time:,18:11:54,Log-Likelihood:,-104.87
No. Observations:,13,AIC:,213.7
Df Residuals:,11,BIC:,214.9
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-620.9022,378.243,-1.642,0.129,-1453.410,211.605
te,1.184e+04,2943.105,4.024,0.002,5365.485,1.83e+04

0,1,2,3
Omnibus:,4.894,Durbin-Watson:,0.781
Prob(Omnibus):,0.087,Jarque-Bera (JB):,1.939
Skew:,0.803,Prob(JB):,0.379
Kurtosis:,3.999,Cond. No.,12.8


sz


  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,cr,R-squared:,0.492
Model:,OLS,Adj. R-squared:,0.446
Method:,Least Squares,F-statistic:,10.66
Date:,"Tue, 12 Sep 2023",Prob (F-statistic):,0.00753
Time:,18:11:54,Log-Likelihood:,-69.281
No. Observations:,13,AIC:,142.6
Df Residuals:,11,BIC:,143.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,34.4219,20.900,1.647,0.128,-11.579,80.423
te,369.5957,113.191,3.265,0.008,120.464,618.728

0,1,2,3
Omnibus:,15.387,Durbin-Watson:,2.568
Prob(Omnibus):,0.0,Jarque-Bera (JB):,12.916
Skew:,1.413,Prob(JB):,0.00157
Kurtosis:,6.982,Cond. No.,7.65


In [4]:
# Load the six raw datasets (parquet) used by the descriptive-statistics cells below.
ettm1 = pd.read_parquet('../data/raw/ETTm1/ETTm1.parquet')
ettm2 = pd.read_parquet('../data/raw/ETTm2/ETTm2.parquet')
solar = pd.read_parquet('../data/raw/Solar/solar.parquet')
weather = pd.read_parquet('../data/raw/Weather/weather.parquet')
wind = pd.read_parquet('../data/raw/Wind/wind.parquet')
aus = pd.read_parquet('../data/raw/AUSElecDem/aus_electrical_demand.parquet')


In [38]:
# Summary statistics of ETTm1's OT column, followed by three relative spreads in
# percent: (Q3 - mean) / mean, (mean - Q1) / Q1 and (Q3 - Q1) / mean.
d = ettm1.OT.describe()
display(d)
(d['75%'] - d['mean'])/d['mean']*100,(d['mean'] - d['25%'])/d['25%']*100, (d['75%'] - d['25%'])/d['mean']*100

count    69680.000000
mean        13.320642
std          8.564817
min         -4.221000
25%          6.964000
50%         11.396000
75%         18.079000
max         46.007000
Name: OT, dtype: float64

(35.72169515321236, 91.2785911420722, 83.44192907854226)

In [39]:
# Summary statistics of ETTm2's OT column, followed by three relative spreads in
# percent: (Q3 - mean) / mean, (mean - Q1) / Q1 and (Q3 - Q1) / mean.
d = ettm2.OT.describe()
display(d)
(d['75%'] - d['mean'])/d['mean']*100,(d['mean'] - d['25%'])/d['25%']*100, (d['75%'] - d['25%'])/d['mean']*100

count    69680.000000
mean        26.609798
std         11.886537
min         -2.646500
25%         16.469500
50%         26.577000
75%         35.585999
max         58.876999
Name: OT, dtype: float64

(33.73268732907382, 61.57016968969957, 71.84007423591866)

In [48]:
# Solar has many columns: describe each column, then describe those summaries
# across columns and take the 'mean' row (iloc[1]) of the mean/25%/75% columns,
# i.e. the across-column averages of the per-column mean, Q1 and Q3.
d = solar.describe().T.describe()
d = d[['mean', '25%', '75%']].iloc[1]
# Same three relative spreads (in percent) as for the other datasets. The
# averaged Q1 is 0 in the saved output, so the middle ratio divides by zero
# and evaluates to inf.
(d['75%'] - d['mean'])/d['mean']*100,(d['mean'] - d['25%'])/d['25%']*100, (d['75%'] - d['25%'])/d['mean']*100

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
count,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0
mean,52560.0,6.352652,9.203678,0.0,0.0,0.0,12.200821,34.176642
std,0.0,2.532767,3.472893,0.0,0.0,0.0,5.531961,12.30006
min,52560.0,1.606337,2.350481,0.0,0.0,0.0,3.05,8.8
25%,52560.0,5.460884,7.929383,0.0,0.0,0.0,10.15,29.4
50%,52560.0,5.746929,8.485564,0.0,0.0,0.0,11.0,32.150002
75%,52560.0,6.028389,8.85968,0.0,0.0,0.0,11.5,33.5
max,52560.0,19.131531,25.238678,0.0,0.0,0.0,43.299999,88.900002


mean     6.352652
25%      0.000000
75%     12.200821
Name: mean, dtype: float64

  """


(92.05869928344535, inf, 192.05869928344538)

In [54]:
# Summary statistics of the weather OT column, followed by three relative spreads
# in percent: (Q3 - mean) / mean, (mean - Q1) / Q1 and (Q3 - Q1) / mean.
d = weather.OT.describe()
display(d)
(d['75%'] - d['mean'])/d['mean']*100,(d['mean'] - d['25%'])/d['25%']*100, (d['75%'] - d['25%'])/d['mean']*100

count    52704.000000
mean       427.664764
std         18.751526
min        305.500000
25%        415.500000
50%        423.200012
75%        437.100006
max        524.200012
Name: OT, dtype: float64

(2.206223772576002, 2.927741132201414, 5.050686402374701)

In [55]:
# Summary statistics of the aus `y` column, followed by three relative spreads in
# percent: (Q3 - mean) / mean, (mean - Q1) / Q1 and (Q3 - Q1) / mean.
d = aus.y.describe()
display(d)
(d['75%'] - d['mean'])/d['mean']*100,(d['mean'] - d['25%'])/d['25%']*100, (d['75%'] - d['25%'])/d['mean']*100

count    230736.000000
mean       6740.515625
std        1361.918335
min        3498.385254
25%        5751.761108
50%        6783.570068
75%        7658.944702
max       12865.795898
Name: y, dtype: float64

(13.62550179013104, 17.190465632479622, 28.294327909817728)

In [56]:
# Summary statistics of the wind 'active power' column, followed by three relative
# spreads in percent: (Q3 - mean) / mean, (mean - Q1) / Q1 and (Q3 - Q1) / mean.
d = wind['active power'].describe()
display(d)
(d['75%'] - d['mean'])/d['mean']*100,(d['mean'] - d['25%'])/d['25%']*100, (d['75%'] - d['25%'])/d['mean']*100

count    432000.000000
mean        363.690948
std         329.669708
min         -68.110573
25%         107.959009
50%         270.256577
75%         549.859192
max        2030.670776
Name: active power, dtype: float64

(51.188583104152116, 236.8787387737519, 121.50431143892241)

In [26]:
from scipy.stats import skew, kurtosis

def compute_skew_kurtosis(data):
    """Print the skewness and the excess (Fisher) kurtosis of ``data``.

    Both statistics are computed with ``scipy.stats`` and written to stdout
    on two lines, ``Skewness: <value>`` then ``Kurtosis: <value>``.
    Nothing is returned.
    """
    # Compute both statistics first, then report them in a fixed order.
    statistics = (
        ("Skewness", skew(data)),
        ("Kurtosis", kurtosis(data, fisher=True)),
    )
    for label, value in statistics:
        print(f"{label}: {value}")

In [28]:
# Skewness and excess kurtosis of ETTm1's OT column.
compute_skew_kurtosis(ettm1.OT)

Skewness: 0.9667847497500424
Kurtosis: 0.7709062803549038


In [29]:
# Skewness and excess kurtosis of ETTm2's OT column.
compute_skew_kurtosis(ettm2.OT)

Skewness: 0.09930423318815919
Kurtosis: -0.811803464088662
