In [1]:
import pandas as pd
import os
import seaborn as sns
from config import *
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay

In [2]:
result_dir='algorithm_classification_results'
seeds=[200,400,600,800,1000]
visualizations_dir='algorithm_classification_visualizations'
train_on_seed=True
difference=False

seed_mapping={index:seed for index,seed in enumerate(seeds)}

In [3]:
all_algorithms=['DE','PSO','ES']
all_stats=['min','max','mean','std']
all_dimensions=[3,5,10,20]

In [4]:
def get_global_file_name(dimension,algorithms, iteration_start,iteration_end, train_on_seed,seed, difference,stat=None):
    file_name=f'dim_{dimension}_{"_".join(algorithms)}_it_{iteration_start}-{iteration_end}_instance_count_100_{"train" if train_on_seed else "test"}_on_seed_{seed}{"_differenced" if difference else ""}'
    if stat is None:
        return os.path.join(result_dir,file_name)
    else:
        return os.path.join(result_dir,f'stat_{stat}_' + file_name)


def get_visualization_output_name_without_seed(dimension,algorithms, iteration_start,iteration_end, train_on_seed, difference):
    return os.path.join(visualizations_dir,f'dim_{dimension}_{algorithms}_it_{iteration_start}-{iteration_end}_instance_count_100_{"train" if train_on_seed else "test"}_on_seed{"_differenced" if difference else ""}')

In [5]:
def get_seed_accuracies_df(dimension,algorithm, iteration_start, iteration_end, train_on_seed, difference, seeds, stat=None):
    seed_accuracies=[]
    test_df_all=pd.DataFrame()
    for train_seed in seeds:
        test_df=pd.DataFrame()
        for fold in range(0,10):

            global_file_name=get_global_file_name(dimension,algorithm, iteration_start,iteration_end, train_on_seed,train_seed, difference, stat)
            file_location=global_file_name+f'_fold_{fold}_test_preds.csv'

            if not os.path.isfile(file_location):
                print('Not found', file_location)
                continue
            #print(file_location)
            test_df_fold=pd.read_csv(file_location, index_col=[0], compression='zip').rename(columns={'seed':'test_seed'}) 
            test_df_fold['fold']=fold
            test_df_fold['train_seed']=train_seed
            test_df=pd.concat([test_df,test_df_fold])
            test_df_all=pd.concat([test_df_all,test_df_fold])



        for test_seed in seeds:
            for fold in range(0,10):
                seed_preds_df=test_df.query('test_seed==@test_seed and fold==@fold')
                seed_accuracies+=[(accuracy_score(seed_preds_df['y'], seed_preds_df['preds']), train_seed, test_seed, fold)]


    if len(seed_accuracies)>0:
        seed_accuracy_df=pd.DataFrame(seed_accuracies)
        seed_accuracy_df.columns=['accuracy','train_seed','test_seed','fold']
        return seed_accuracy_df, test_df_all
    return None

In [6]:
def plot_accuracies_per_statistics_per_dimension(all_stats,difference=False, train_on_seed=True):
    
    results_per_iteration=[]
    all_accuracies=pd.DataFrame()
    for stat in all_stats:
        for dimension in [3,5,10,20]: 

            for end_iteration in [2,4,9,19,29]:
                seed_accuracy_df,_ = get_seed_accuracies_df(dimension,all_algorithms, 0, end_iteration, train_on_seed, difference, seeds,stat)
                seed_accuracy_df['feature calculation budget (iterations)']=end_iteration+1
                seed_accuracy_df['dimension']=dimension
                seed_accuracy_df['statistic']=stat
                all_accuracies=pd.concat([all_accuracies,seed_accuracy_df])
                        
                        
    fig, axes = plt.subplots(int(len(all_dimensions)/2),2, figsize=(8,8), sharex=True, sharey=True)
    for index,dimension in enumerate(all_dimensions):


        accuracies=all_accuracies.query('dimension==@dimension')
        accuracies=accuracies.groupby(['dimension','statistic','feature calculation budget (iterations)']).median()
        ax=axes[int(index/2), index%2] if type(axes) is np.ndarray else axes
        ax.set_title(f'Dimension: {dimension}')
        sns.lineplot(x="feature calculation budget (iterations)", y="accuracy",
                 hue="statistic", ax=ax,style="statistic",markers=True,dashes=False,
                 data=accuracies,palette=[light_blue,dark_cyan,lime, grey], hue_order=['mean','std','max','min'])


        if index != 0:
            ax.get_legend().remove()
    plt.tight_layout()
    plt.savefig(os.path.join(visualizations_dir,f'{"-".join(all_stats)}_accuracies_per_statistic_per_dimension_{"train" if train_on_seed else "test"}_on_seed{"_differenced" if difference else ""}.pdf'))
    plt.show()
    return all_accuracies

In [7]:
x_y_accuracies=plot_accuracies_per_statistics_per_dimension(['min','max','mean','std'], difference=True)

Not found algorithm_classification_results/stat_min_dim_3_DE_PSO_ES_it_0-2_instance_count_100_train_on_seed_200_differenced_fold_0_test_preds.csv
Not found algorithm_classification_results/stat_min_dim_3_DE_PSO_ES_it_0-2_instance_count_100_train_on_seed_200_differenced_fold_1_test_preds.csv
Not found algorithm_classification_results/stat_min_dim_3_DE_PSO_ES_it_0-2_instance_count_100_train_on_seed_200_differenced_fold_2_test_preds.csv
Not found algorithm_classification_results/stat_min_dim_3_DE_PSO_ES_it_0-2_instance_count_100_train_on_seed_200_differenced_fold_3_test_preds.csv
Not found algorithm_classification_results/stat_min_dim_3_DE_PSO_ES_it_0-2_instance_count_100_train_on_seed_200_differenced_fold_4_test_preds.csv
Not found algorithm_classification_results/stat_min_dim_3_DE_PSO_ES_it_0-2_instance_count_100_train_on_seed_200_differenced_fold_5_test_preds.csv
Not found algorithm_classification_results/stat_min_dim_3_DE_PSO_ES_it_0-2_instance_count_100_train_on_seed_200_differenced_

UndefinedVariableError: name 'test_seed' is not defined

In [None]:
x_y_accuracies=plot_accuracies_per_statistics_per_dimension(['min','max','mean','std'], difference=False)

In [None]:
x_y_accuracies

In [None]:
#Accuracy per statistics across all settings
results_per_iteration=[]
all_algorithms=['DE','ES','CMAES','PSO']
all_stats=['min','max','mean','std']
all_accuracies=pd.DataFrame()
for stat in all_stats:
    

    for dimension in [3,5,10,20]: #
        for algorithm in all_algorithms:
            for train_on_seed_1 in [True]:
                for difference_1 in [False]:
                    for end_iteration in [0,2,4,9,19,29,39,49]:
                        seed_accuracy_df,_ = get_seed_accuracies_df(dimension,algorithm, 0, end_iteration, train_on_seed_1, difference_1, seeds,stat)
                        seed_accuracy_df['feature calculation budget (iterations)']=end_iteration+1
                        seed_accuracy_df['algorithm']=algorithm
                        seed_accuracy_df['dimension']=dimension
                        seed_accuracy_df['stat']=stat
                        all_accuracies=pd.concat([all_accuracies,seed_accuracy_df])
                        
all_accuracies['Evaluation on trajectories from training seed']=all_accuracies.apply(lambda row: row['train_seed']==row['test_seed'], axis=1)
fig, axes = plt.subplots(int(len(all_stats)/2),2, figsize=(8,8), sharex=True, sharey=True)
for index, stat in enumerate(all_stats):
    accuracies=all_accuracies.query('stat==@stat')
    ax=axes[int(index/2), index%2] if type(axes) is np.ndarray else axes
    ax.set_title(f'Statistic: {stat}')
    sns.violinplot(data=accuracies, x="feature calculation budget (iterations)", y="accuracy", hue='Evaluation on trajectories from training seed', split=True, palette=color_palette[1:3], ax=ax)
    if index != 0:
        ax.get_legend().remove()
plt.tight_layout()

plt.savefig(os.path.join(visualizations_dir,f'accuracies_per_statistic.pdf'))
plt.show()