In [1]:
# Edit this list to choose the data streams to analyse.
# The token after the last underscore (e.g. "2341") is read character by
# character as the order of the tasks in the stream.

datasets = ["sine_rw_10_2341"]

In [2]:
import pandas as pd
import pickle
import numpy as np
import statsmodels.stats.api as sms
import scipy
import os

In [3]:
import os

# Move the working directory one level up from the notebook's directory so the
# relative "performance/..." paths used below resolve.
# The starting directory is remembered the first time this cell runs: running
# the cell again would otherwise climb one more level on every execution and
# silently break every relative path.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(_NOTEBOOK_START_DIR))

In [4]:
# True  -> compare the ensemble against every single learner (test_ensemble).
# False -> pairwise comparison between the single learners (test).
ensemble = False

# Single learners, identified by their result directory name.
models = ["cpnn_clstm_50hs", "single_clstm_50hs", "multiple_clstm_50hs"]
if ensemble:
    # The ensemble comparison needs the ensemble's own results too. Deriving
    # the list from the flag avoids having to (un)comment a second assignment,
    # which previously left no ensemble entry in `models` when ensemble=True.
    models = ["ensemble_clstm_50hs_5ws"] + models

# Significance level used by the normality check and the hypothesis tests.
alpha = 0.05

In [5]:
# Result directory name of each model -> short label used as column name in the
# generated tables.
models_dict = dict(
    cpnn_clstm_50hs="cPNN",
    single_clstm_50hs="cLSTM",
    multiple_clstm_50hs="cLSTMs",
    ensemble_clstm_50hs_5ws="ensemble",
)

In [6]:
# Reverse lookup: table label -> result directory name of the model.
models_dict_inverse = {label: model for model, label in models_dict.items()}

In [7]:
# Task digit (as found in the last token of a dataset name) -> readable task name.
tasks_dict = {
    str(position): name
    for position, name in enumerate(("SINE1+", "SINE2+", "SINE1-", "SINE2-"), start=1)
}

In [8]:
def compute_performance(dataset, models, metric="accuracy"):
    """Load the stored results of each model and average them over batch windows.

    For every model, ``performance/{dataset}/{model}/test_then_train.pkl`` is
    unpickled and the entry for ``metric`` is averaged along its third axis
    over four windows: the first 50 positions, the first 100, everything after
    the first 100, and the whole axis.

    Parameters
    ----------
    dataset : str
        Name of the dataset directory under ``performance/``.
    models : iterable of str
        Names of the model directories to load.
    metric : str, default "accuracy"
        Key of the metric to read from each pickle.

    Returns
    -------
    dict
        ``{model: {metric: {window_label: ndarray}}}`` where every array has
        the metric's third axis averaged away. Models whose results cannot be
        loaded or summarised are reported on stdout and left out.

    Notes
    -----
    The stored metric must have at least three axes. Downstream code treats
    axis 1 as the task; axis 0 is presumably the run/repetition -- TODO confirm.
    """
    # (label, slice applied to the third axis) for every reported window.
    windows = (
        ("[1,50]", slice(None, 50)),
        ("[1,100]", slice(None, 100)),
        ("(100,)", slice(100, None)),
        ("[1,)", slice(None)),
    )

    perf_dict = {}
    for model in models:
        try:
            # NOTE: unpickling executes arbitrary code; only load trusted result files.
            with open(f"performance/{dataset}/{model}/test_then_train.pkl", "rb") as f:
                perf = pickle.load(f)
            # Only the requested metric is converted, so an unrelated metric
            # with an awkward shape cannot cause this model to be dropped.
            values = np.asarray(perf[metric])
            summary = {
                label: np.mean(values[:, :, window], axis=2)
                for label, window in windows
            }
        except Exception as e:
            # Best effort: report the problem and carry on with the other models.
            print(e)
            print(model, "not present")
            continue
        # Registered only on success, so no partial entry has to be cleaned up.
        perf_dict[model] = {metric: summary}

    return perf_dict

In [9]:
def calculate_confint(perf):
    """Return the t-based confidence interval of the mean and its midpoint.

    The interval comes from ``statsmodels``' ``DescrStatsW.tconfint_mean``
    called with its default settings.

    Returns
    -------
    tuple
        ``(bounds, midpoint)``: the (lower, upper) bounds rounded to three
        decimals, and the unrounded average of the two bounds.
    """
    interval = sms.DescrStatsW(perf).tconfint_mean()
    lower, upper = interval[0], interval[1]
    midpoint = (lower + upper) / 2
    rounded_bounds = np.round(interval, 3)
    return rounded_bounds, midpoint

In [10]:
def check_normality(data, significance=None):
    """Tell whether ``data`` passes the Shapiro-Wilk normality test.

    Parameters
    ----------
    data : array-like
        Sample to test.
    significance : float, optional
        Significance level. Defaults to the notebook-level ``alpha``.

    Returns
    -------
    bool
        True when the test's p-value is strictly greater than the
        significance level (normality is not rejected), False otherwise.
    """
    if significance is None:
        significance = alpha
    # Compare the exact p-value: rounding it first (as was done before) turns
    # e.g. 0.0503 into 0.05 and flips the decision right at the threshold.
    pvalue = scipy.stats.shapiro(data).pvalue
    return bool(pvalue > significance)

In [11]:
def _split_by_task(metric_windows):
    """Regroup ``{window: array}`` into one ``{window: array[:, task]}`` dict per task (axis 1)."""
    first_window = next(iter(metric_windows))
    n_tasks = metric_windows[first_window].shape[1]
    return [
        {window: values[:, t] for window, values in metric_windows.items()}
        for t in range(n_tasks)
    ]


def test(dataset, models, metric="accuracy"):
    """Compare every pair of models on each task and batch window.

    For each pair, the model with the higher mean is tested against the other
    one with a one-sided ("greater") test: Welch's t-test when both samples
    pass ``check_normality``, the Wilcoxon test otherwise.

    Parameters
    ----------
    dataset : str
        Dataset name. The token after its last underscore gives the task
        order, one ``tasks_dict`` key per character.
    models : list of str
        Model directory names (keys of ``models_dict``). Models whose results
        could not be loaded are skipped.
    metric : str, default "accuracy"
        Metric to load and compare.

    Returns
    -------
    (pandas.DataFrame, dict)
        The table with one row per (task, batch window): a "mean / std" cell
        per model followed, for each pair, by the ``_normality``, ``_test``,
        ``_p`` and ``_GREATER`` columns; and ``perf_batches``, mapping each
        model to a per-task list of ``{window: values}`` dicts.

    Raises
    ------
    ValueError
        If fewer than two of the requested models have loadable results.
    """
    import itertools

    # The requested metric is forwarded; it used to be silently ignored, so the
    # loader always read "accuracy" and any other metric failed with a KeyError.
    perf = compute_performance(dataset, models, metric)
    tasks = dataset.split("_")[-1]
    perf_batches = {model: _split_by_task(perf[model][metric]) for model in perf}

    available = [m for m in models if m in perf_batches]
    if len(available) < 2:
        raise ValueError(
            f"at least two models with loadable results are required, got {available}"
        )

    test_dfs = []
    # All unordered pairs, in the order (0,1), (0,2), (1,2), ... This also
    # covers the two-model case, which previously tried to unpack the model
    # names themselves instead of a pair.
    for m1, m2 in itertools.combinations(available, 2):
        prefix = f"{models_dict[m1]}-{models_dict[m2]}"
        rows = []
        for t in range(len(perf_batches[m1])):
            for b in perf_batches[m2][t]:
                row = {
                    "task": tasks_dict[tasks[t]],
                    "batches": b,
                }
                mean = {}
                for model in (m1, m2):
                    sample = perf_batches[model][t][b]
                    _, mean[model] = calculate_confint(sample)
                    row[models_dict[model]] = f"{np.round(np.mean(sample), 3)} / {np.round(np.std(sample), 3)}"

                # On a tie the two roles must still be different models;
                # max()/min() over the dict both returned the first one.
                best, worst = (m1, m2) if mean[m1] >= mean[m2] else (m2, m1)
                best_sample = perf_batches[best][t][b]
                worst_sample = perf_batches[worst][t][b]

                # NOTE(review): ttest_ind treats the samples as independent while
                # wilcoxon is a paired test -- confirm the pairing is intended.
                if check_normality(best_sample) and check_normality(worst_sample):
                    is_normal, test_name = True, "ttest"
                    p = scipy.stats.ttest_ind(best_sample, worst_sample, alternative="greater", equal_var=False).pvalue
                else:
                    is_normal, test_name = False, "wilcoxon"
                    p = scipy.stats.wilcoxon(best_sample, worst_sample, alternative="greater").pvalue

                row[f"{prefix}_normality"] = is_normal
                row[f"{prefix}_test"] = test_name
                # Rounded for reporting only; the decision uses the exact value so
                # that e.g. p=0.0497 is not rounded up to 0.05 and rejected.
                row[f"{prefix}_p"] = np.round(p, 3)
                row[f"{prefix}_GREATER"] = models_dict[best] if p < alpha else "-"
                rows.append(row)
        # Built in one go: DataFrame.append was removed in pandas 2.0.
        test_dfs.append(pd.DataFrame(rows))

    test_df_final = test_dfs[0]
    for df in test_dfs[1:]:
        # Per-model columns already present in the merged table are not repeated.
        duplicated = [
            c for c in test_df_final.columns.intersection(df.columns)
            if c not in ["task", "batches"]
        ]
        test_df_final = test_df_final.merge(
            df.drop(columns=duplicated), on=["task", "batches"], how="inner"
        )

    initial_columns = ["task", "batches"] + [models_dict[m] for m in available]
    test_df_final = test_df_final[initial_columns + [c for c in test_df_final if c not in initial_columns]]
    return test_df_final, perf_batches

In [12]:
def test_ensemble(dataset, models, metric="accuracy"):
    """Test whether the ensemble outperforms each single learner.

    For every task and batch window the ensemble is compared with each
    non-ensemble model through a one-sided ("greater") test: Welch's t-test
    when both samples pass ``check_normality``, the Wilcoxon test otherwise.

    Parameters
    ----------
    dataset : str
        Dataset name. The token after its last underscore gives the task
        order, one ``tasks_dict`` key per character.
    models : list of str
        Model directory names; the first one containing "ensemble" is the
        ensemble, the others are the single learners.
    metric : str, default "accuracy"
        Metric to load and compare.

    Returns
    -------
    (pandas.DataFrame, dict, dict)
        The table with one row per (task, batch window); ``perf_batches``,
        mapping each loaded model to a per-task list of ``{window: values}``
        dicts; and ``tasks_pos``, mapping each task name to its position.

    Raises
    ------
    ValueError
        If no model name in ``models`` contains "ensemble".
    """
    # The requested metric is forwarded; it used to be silently ignored.
    perf = compute_performance(dataset, models, metric)
    tasks = dataset.split("_")[-1]
    tasks_pos = {}

    ensemble_model = next((m for m in models if "ensemble" in m), None)
    if ensemble_model is None:
        raise ValueError("test_ensemble requires a model whose name contains 'ensemble'")
    models = [m for m in models if "ensemble" not in m]

    # model -> list (one entry per task, axis 1) of {window: values[:, task]}.
    perf_batches = {}
    for model in perf:
        windows = perf[model][metric]
        n_tasks = windows[next(iter(windows))].shape[1]
        perf_batches[model] = [
            {b: values[:, t] for b, values in windows.items()}
            for t in range(n_tasks)
        ]

    rows = []
    for t in range(len(perf_batches[models[0]])):
        task_name = tasks_dict[tasks[t]]
        tasks_pos[task_name] = t
        for b in perf_batches[models[0]][t]:
            row = {
                "task": task_name,
                "batches": b,
            }
            for model in models + [ensemble_model]:
                sample = perf_batches[model][t][b]
                row[models_dict[model]] = f"{np.round(np.mean(sample), 3)} / {np.round(np.std(sample), 3)}"

            ensemble_sample = perf_batches[ensemble_model][t][b]
            for model in models:
                label = models_dict[model]
                single_sample = perf_batches[model][t][b]
                # NOTE(review): ttest_ind treats the samples as independent while
                # wilcoxon is a paired test -- confirm the pairing is intended.
                if check_normality(ensemble_sample) and check_normality(single_sample):
                    is_normal, test_name = True, "ttest"
                    p = scipy.stats.ttest_ind(ensemble_sample, single_sample, alternative="greater", equal_var=False).pvalue
                else:
                    is_normal, test_name = False, "wilcoxon"
                    p = scipy.stats.wilcoxon(ensemble_sample, single_sample, alternative="greater").pvalue
                row[label + "_normality"] = is_normal
                row[label + "_test"] = test_name
                # Rounded for reporting only; the decision uses the exact p-value.
                row[label + "_p"] = np.round(p, 3)
                row["GREATER_THAN_" + label] = bool(p < alpha)
            rows.append(row)

    labels = [models_dict[m] for m in models]
    columns = (
        ["task", "batches", models_dict[ensemble_model]]
        + labels
        + ["GREATER_THAN_" + label for label in labels]
        + [label + suffix for label in labels for suffix in ("_normality", "_test", "_p")]
    )
    # Built in one go (DataFrame.append was removed in pandas 2.0), with the
    # same final column order as before.
    test_df = pd.DataFrame(rows, columns=columns)
    return test_df, perf_batches, tasks_pos

In [15]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including the ones scipy may emit for small or degenerate samples.
warnings.filterwarnings('ignore')

# Output file prefixes depend on the kind of comparison being run.
if not ensemble:
    file_name = "test"
    file_name_accuracy = "accuracy"
    models = [m for m in models if "ensemble" not in m]
else:
    file_name = "test_ensemble"
    file_name_accuracy = "accuracy_ensemble"

for d in datasets:
    print(d)
    if not ensemble:
        test_df, perf_batches = test(d, models)
    else:
        test_df, perf_batches, tasks_pos = test_ensemble(d, models)

    path = f"performance/{d}/_prob_test"
    os.makedirs(path, exist_ok=True)

    # 1) Complete table, including normality flags, test names and p-values.
    test_df.to_csv(os.path.join(path, f"{file_name}_complete_{d}.csv"), index=False)
    test_df.to_excel(os.path.join(path, f"{file_name}_complete_{d}.xlsx"), index=False)

    # 2) Same table without the statistical details. Columns are matched by
    #    suffix: a substring match would also drop any column that merely
    #    contains "_p" or "_test" somewhere in a model label.
    test_df = test_df.drop(columns=[c for c in test_df.columns if c.endswith(("_normality", "_test", "_p"))])
    test_df.to_csv(os.path.join(path, f"{file_name}_{d}.csv"), index=False)
    test_df.to_excel(os.path.join(path, f"{file_name}_{d}.xlsx"), index=False)

    # 3) Accuracy-only summary.
    if not ensemble:
        test_df = test_df[["task", "batches"] + [models_dict[m] for m in models]]
    else:
        single_labels = [models_dict[m] for m in models if "ensemble" not in m]
        # Numeric mean parsed from each "mean / std" cell.
        single_means = pd.DataFrame({
            label: test_df[label].apply(lambda cell: float(cell.split(" / ")[0]))
            for label in single_labels
        })
        best_labels = single_means.idxmax(axis=1)
        test_df["best single learner name"] = best_labels
        # Take the cell of the numerically best learner. The previous row-wise
        # max() compared the "mean / std" strings lexicographically and could
        # disagree with the name chosen from the numeric means.
        test_df["best single learner"] = [
            test_df.at[row, label] for row, label in best_labels.items()
        ]
        test_df["best single learner normality"] = [
            check_normality(
                perf_batches[models_dict_inverse[r["best single learner name"]]][tasks_pos[r["task"]]][r["batches"]]
            )
            for _, r in test_df.iterrows()
        ]
        test_df = test_df[["task", "batches", "ensemble", "best single learner", "best single learner name", "best single learner normality"]]
    test_df.to_csv(os.path.join(path, f"{file_name_accuracy}_{d}.csv"), index=False)
    test_df.to_excel(os.path.join(path, f"{file_name_accuracy}_{d}.xlsx"), index=False)

sine_rw_10_2341


In [14]:
# Show the accuracy summary left in `test_df` by the export loop, i.e. the one
# computed for the last dataset that was processed.
test_df

Unnamed: 0,task,batches,cPNN,cLSTM,cLSTMs
0,SINE2+,"[1,50]",0.733 / 0.007,0.743 / 0.003,0.735 / 0.002
1,SINE2+,"[1,100]",0.759 / 0.005,0.764 / 0.002,0.756 / 0.0
2,SINE2+,"(100,)",0.828 / 0.012,0.83 / 0.005,0.838 / 0.001
3,SINE2+,"[1,)",0.811 / 0.008,0.813 / 0.004,0.817 / 0.001
4,SINE1-,"[1,50]",0.912 / 0.017,0.88 / 0.022,0.903 / 0.006
5,SINE1-,"[1,100]",0.933 / 0.015,0.926 / 0.012,0.934 / 0.003
6,SINE1-,"(100,)",0.972 / 0.004,0.982 / 0.001,0.975 / 0.001
7,SINE1-,"[1,)",0.962 / 0.007,0.968 / 0.004,0.964 / 0.0
8,SINE2-,"[1,50]",0.827 / 0.005,0.79 / 0.002,0.739 / 0.004
9,SINE2-,"[1,100]",0.853 / 0.004,0.823 / 0.005,0.768 / 0.003
