In [1]:
import pandas as pd
import os
import seaborn as sns
from config import *
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay

In [2]:
# Base directory name of the stored classification results and the directory
# that receives the generated figures.
result_dir = 'algorithm_classification_ela_results'
visualizations_dir = 'algorithm_classification_ela_visualizations'

# Seeds used to locate result files; each seed is paired with its position in the list.
seeds = [200, 400, 600, 800, 1000]
seed_mapping = dict(enumerate(seeds))

# Default flags that select which result files are addressed by the path helpers.
train_on_seed = True
difference = False

In [3]:
# Make sure the figure output directory exists before any plot is saved;
# exist_ok=True keeps re-running this cell from raising when it is already there.
os.makedirs(visualizations_dir,exist_ok=True)

In [4]:
def get_global_file_name(dimension,algorithms, iteration_start,iteration_end, train_on_seed,seed, difference,stat=None, normalize_y=False):
    """Build the path prefix of a result file for one experiment configuration.

    The prefix lives in the directory ``result_dir + '_normalize_<normalize_y>'``
    (``result_dir`` is a module-level global) and encodes the dimension,
    algorithms, iteration range, train/test mode, seed and the optional
    ``_differenced`` marker. When ``stat`` is given, the file name is
    additionally prefixed with ``stat_<stat>_``.

    Returns
    -------
    str
        The path prefix; callers append their own suffix (e.g. fold and extension).
    """
    seed_role = "train" if train_on_seed else "test"
    differenced_suffix = "_differenced" if difference else ""
    base_name = (
        f'dim_{dimension}_{algorithms}_it_{iteration_start}-{iteration_end}'
        f'_instance_count_100_{seed_role}_on_seed_{seed}{differenced_suffix}'
    )
    if stat is not None:
        base_name = f'stat_{stat}_' + base_name
    return os.path.join(result_dir + f'_normalize_{normalize_y}', base_name)


def get_visualization_output_name_without_seed(dimension,algorithms, iteration_start,iteration_end, train_on_seed, difference):
    """Build the seed-independent output path prefix for a figure.

    The name encodes the dimension, algorithms, iteration range, train/test
    mode and the optional ``_differenced`` marker, and is placed inside the
    module-level ``visualizations_dir``. No file extension is appended.
    """
    seed_role = "train" if train_on_seed else "test"
    differenced_suffix = "_differenced" if difference else ""
    output_name = (
        f'dim_{dimension}_{algorithms}_it_{iteration_start}-{iteration_end}'
        f'_instance_count_100_{seed_role}_on_seed{differenced_suffix}'
    )
    return os.path.join(visualizations_dir, output_name)

In [5]:
def get_seed_accuracies_df(dimension,algorithm, iteration_start, iteration_end, train_on_seed, difference, seeds, stat=None, normalize_y=False):
    """Load per-fold test predictions and compute accuracy per (train seed, test seed, fold).

    For every seed in ``seeds`` and every fold 0-9 the zip-compressed CSV
    ``<get_global_file_name(...)>_fold_<fold>_test_preds.csv`` is read. Its
    ``seed`` column is renamed to ``test_seed`` and the columns ``fold`` and
    ``train_seed`` are added. Missing files are reported with a printed
    ``Not found`` message and skipped.

    Parameters
    ----------
    dimension, algorithm, iteration_start, iteration_end, train_on_seed, difference, stat, normalize_y
        Forwarded unchanged to ``get_global_file_name`` to locate the files.
    seeds : list
        Iterated twice: as train seeds (to locate files) and as test seeds
        (to split the loaded predictions), so it must be re-iterable.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame) or None
        ``(seed_accuracy_df, test_df_all)``. ``seed_accuracy_df`` has the
        columns ``accuracy``, ``train_seed``, ``test_seed`` and ``fold``;
        ``test_df_all`` is the concatenation of every loaded prediction file.
        ``None`` is returned when no accuracy could be computed.
    """
    seed_accuracies=[]
    all_fold_frames=[]
    for train_seed in seeds:
        # The path prefix does not depend on the fold, so build it once per seed.
        global_file_name=get_global_file_name(dimension,algorithm, iteration_start,iteration_end, train_on_seed,train_seed, difference, stat, normalize_y)

        seed_fold_frames=[]
        for fold in range(0,10):
            file_location=global_file_name+f'_fold_{fold}_test_preds.csv'

            if not os.path.isfile(file_location):
                print('Not found', file_location)
                continue
            test_df_fold=pd.read_csv(file_location, index_col=[0], compression='zip').rename(columns={'seed':'test_seed'})
            test_df_fold['fold']=fold
            test_df_fold['train_seed']=train_seed
            seed_fold_frames.append(test_df_fold)

        if not seed_fold_frames:
            # Nothing was loaded for this train seed. Querying a column-less
            # frame would raise, so skip it (the misses were already reported).
            continue

        all_fold_frames.extend(seed_fold_frames)
        # Concatenate once instead of growing a frame inside the fold loop.
        test_df=pd.concat(seed_fold_frames)

        for test_seed in seeds:
            for fold in range(0,10):
                seed_preds_df=test_df.query('test_seed==@test_seed and fold==@fold')
                if seed_preds_df.empty:
                    # A missing fold file (or absent test seed) leaves nothing to score.
                    continue
                seed_accuracies.append((accuracy_score(seed_preds_df['y'], seed_preds_df['preds']), train_seed, test_seed, fold))

    if len(seed_accuracies)>0:
        seed_accuracy_df=pd.DataFrame(seed_accuracies, columns=['accuracy','train_seed','test_seed','fold'])
        test_df_all=pd.concat(all_fold_frames)
        return seed_accuracy_df, test_df_all
    return None

In [6]:
def accuracy_subplots(all_accuracies, all_groupings, group_name, file_name):
    """Draw one split violin plot of accuracy per grouping on a two-column grid and save it.

    Parameters
    ----------
    all_accuracies : pandas.DataFrame
        Must contain the column named by ``group_name`` plus the columns
        ``"accuracy"``, ``"feature calculation budget (iterations)"`` and
        ``"Evaluation on trajectories from training seed"``.
    all_groupings : sequence
        Values of ``group_name``; each one gets its own subplot.
    group_name : str
        Column used to select the rows of each subplot; also shown in the title.
    file_name : str
        Name of the output file, written inside the module-level ``visualizations_dir``.
    """
    # NOTE(review): the x/hue column names used below differ from the columns
    # created in the cell that builds `all_accuracies` ('feature calculation
    # budget', 'Train seed==Test seed') -- confirm which frame this is meant for.
    n_cols = 2
    # Ceiling division with a floor of one row: a single grouping or an odd
    # number of groupings must still get enough axes.
    n_rows = max(1, (len(all_groupings) + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(8,8), sharex=True, sharey=True, squeeze=False)
    for grouping_index, grouping in enumerate(all_groupings):
        grouping_accuracies=all_accuracies[all_accuracies[group_name]==grouping]
        # squeeze=False guarantees a 2-D array of axes, so index it directly.
        row_index, col_index = divmod(grouping_index, n_cols)
        ax=axes[row_index, col_index]
        ax.set_title(f'{group_name}: {grouping}')
        # `color_palette` is expected to come from the notebook's config import.
        sns.violinplot(data=grouping_accuracies, x="feature calculation budget (iterations)", y="accuracy", hue='Evaluation on trajectories from training seed', split=True, palette=color_palette[1:3], ax=ax)
        if grouping_index != 0:
            # Keep the legend on the first subplot only; tolerate axes without one.
            legend = ax.get_legend()
            if legend is not None:
                legend.remove()
    plt.tight_layout()

    plt.savefig(os.path.join(visualizations_dir,file_name))
    plt.show()

In [7]:
results_per_iteration=[]
all_algorithms='DE_PSO_ES'
all_dimensions=[5]
train_on_seed=True

# Gather one accuracy frame per (dimension, budget, difference, normalize_y)
# configuration and concatenate once at the end instead of growing a frame
# inside the loop.
accuracy_frames=[]
for dimension in all_dimensions:

    for end_iteration in [2,4,9,19,29]: #[0,2,4,9,19,29]
        for difference in [False,True]:
            # The differenced variant is skipped for a single-iteration budget
            # (only relevant when 0 is re-enabled in the list above).
            if end_iteration==0 and difference:
                continue
            for normalize_y in [False,True]:

                seed_results = get_seed_accuracies_df(dimension,all_algorithms, 0, end_iteration, train_on_seed, difference, seeds, normalize_y=normalize_y)
                if seed_results is None:
                    # get_seed_accuracies_df returns None when nothing could be scored.
                    print('No results for', dimension, end_iteration, difference, normalize_y)
                    continue
                seed_accuracy_df,_ = seed_results
                # Iterations 0..end_iteration are used, hence end_iteration+1 iterations.
                seed_accuracy_df['feature calculation budget']=end_iteration+1
                seed_accuracy_df['dimension']=dimension
                seed_accuracy_df['difference']=difference
                seed_accuracy_df['normalize_y']=normalize_y
                accuracy_frames.append(seed_accuracy_df)

if accuracy_frames:
    all_accuracies=pd.concat(accuracy_frames)
else:
    # Keep the expected schema so the column operations below still work.
    all_accuracies=pd.DataFrame(columns=['accuracy','train_seed','test_seed','fold','feature calculation budget','dimension','difference','normalize_y'])

# Flag rows evaluated on the training seed (inverted when train_on_seed is False),
# computed column-wise rather than with a row-wise apply.
same_seed=all_accuracies['train_seed']==all_accuracies['test_seed']
all_accuracies['Train seed==Test seed']=same_seed if train_on_seed else ~same_seed
print(all_accuracies)
#accuracy_subplots(all_accuracies,all_dimensions,'dimension',f'accuracy_by_dimension_{"train" if train_on_seed else "test"}_on_seed{"_differenced" if difference else ""}.pdf')

     accuracy  train_seed  test_seed  fold  feature calculation budget  \
0    1.000000         200        200     0                           3   
1    0.998889         200        200     1                           3   
2    1.000000         200        200     2                           3   
3    1.000000         200        200     3                           3   
4    1.000000         200        200     4                           3   
..        ...         ...        ...   ...                         ...   
245  0.913333        1000       1000     5                          30   
246  1.000000        1000       1000     6                          30   
247  1.000000        1000       1000     7                          30   
248  0.973333        1000       1000     8                          30   
249  0.882000        1000       1000     9                          30   

     dimension  difference  normalize_y  Train seed==Test seed  
0            5       False        False       

In [None]:
# 2x2 grid of split violin plots: rows vary `normalize_y`, columns vary `difference`.
sns.set(font_scale=1.2)
fig, axes = plt.subplots(2,2, figsize=(8,8), sharex=True, sharey=True, squeeze=False)
for normalize_index, normalize_y in enumerate([False,True]):
    for difference_index, difference in enumerate([False,True]):
        # squeeze=False always yields a 2-D array of axes, so index it directly.
        ax = axes[normalize_index, difference_index]
        ax.set_title(f'Difference: {difference}\n Scale y: {normalize_y}')

        grouping_accuracies = all_accuracies.query('difference==@difference and normalize_y==@normalize_y')
        sns.violinplot(
            data=grouping_accuracies,
            x="feature calculation budget",
            y="accuracy",
            hue='Train seed==Test seed',
            split=True,
            palette=color_palette[1:3],
            ax=ax,
        )

        # Only the top-right panel keeps its legend (at seaborn's default position);
        # every other panel drops it.
        is_top_right = normalize_index==0 and difference_index==1
        if not is_top_right:
            ax.get_legend().remove()
plt.tight_layout()

output_path = os.path.join(visualizations_dir,'algorithm_classification_results.pdf')
plt.savefig(output_path)
plt.show()