In [245]:
import os
import gc
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict

In [246]:
def parse_fname(fname):
    """Extract run parameters from a result file name.

    Only the text before the first ``'-'`` is used; it is split on ``'_'``.
    The first field is the channel name and the last two fields are the
    maximum message length and the seed. The second field is the error
    probability, except when there are exactly three fields, in which case
    the error probability is taken to be 0.

    Args:
        fname: File name such as ``'symmetric_0.05_10_42-results.json'``.

    Returns:
        Tuple ``(channel, error_prob, max_len, seed)`` with types
        ``(str, float, int, int)``.

    Raises:
        ValueError: If ``fname`` contains no ``'-'`` or a numeric field
            cannot be converted.
    """
    stem = fname[:fname.index('-')]
    fields = stem.split('_')

    # Exactly three fields means the name carries no error probability.
    if len(fields) == 3:
        error_prob = 0.
    else:
        error_prob = float(fields[1])

    return fields[0], error_prob, int(fields[-2]), int(fields[-1])

In [247]:
def load_data(input_dir):
    """Load training histories (CSV) and evaluation results (JSON) from a run directory.

    Every file in ``input_dir`` whose name ends in ``csv`` is read as a
    training history with at least ``phase`` and ``accuracy`` columns; every
    file whose name ends in ``json`` is read as an evaluation summary laid
    out as ``{'train'|'test': {'evaluation': {condition: {measure: value}}}}``.
    Channel, error probability and maximum message length come from the file
    name via ``parse_fname``.

    Validation rows are labelled with a ``noise`` column: rows with phase
    ``'val'`` become ``'noise'``; rows with phase ``'val (nn)'`` (or ``'val'``
    again when the error probability is 0) become ``'no noise'``. Validation
    accuracy is divided by 100; training rows are left unscaled.

    Runs whose channel is ``'baseline'`` are not kept under that name. Their
    rows are replicated for every other channel seen in the JSON files (and,
    for the evaluation results, for every non-baseline noise condition of the
    same dataset).

    Args:
        input_dir: Directory containing the ``csv`` and ``json`` files.

    Returns:
        Tuple ``(history_train, history_val, results_train, results_test)``
        of DataFrames. The two history frames are empty DataFrames when no
        CSV data is available.

    Raises:
        KeyError: If no JSON file supplied ``'train'``/``'test'`` results or
            the ``'KLD_test_train'`` measure is missing from them.
    """
    history_train = defaultdict(list)
    history_val = defaultdict(list)
    results = defaultdict(lambda: defaultdict(list))
    channels = set()

    for fname in os.listdir(path=input_dir):
        fpath = os.path.join(input_dir, fname)

        if fname.endswith('csv'):
            channel, error_prob, max_len, _seed = parse_fname(fname)
            df = pd.read_csv(fpath)
            key = (max_len, channel, error_prob)

            # Copy so that the columns added further down are not written
            # into a slice of ``df``.
            df_train = df[df.phase == 'train'].copy()
            df_noise = df[df.phase == 'val']
            # Without noise there is no separate 'val (nn)' phase, so the
            # plain validation rows serve as the 'no noise' condition too.
            no_noise_phase = 'val (nn)' if error_prob != 0. else 'val'
            df_no_noise = df[df.phase == no_noise_phase]

            for frame, label in ((df_noise, 'noise'), (df_no_noise, 'no noise')):
                # A scalar label is broadcast to the frame's own length, so
                # the two conditions may have different numbers of rows.
                frame = frame.assign(noise=label).reset_index(drop=True)
                frame['accuracy'] = frame['accuracy'] / 100
                history_val[key].append(frame)
            history_train[key].append(df_train)

        elif fname.endswith('json'):
            channel, error_prob, max_len, _seed = parse_fname(fname)
            channels.add(channel)
            with open(fpath) as file:
                fdata = json.load(file)

            for dataset_key in ('train', 'test'):
                evaluation = fdata[dataset_key]['evaluation']
                for condition_key, measures in evaluation.items():
                    columns = results[dataset_key]
                    columns['max_len'].append(max_len)
                    columns['channel'].append(channel)
                    columns['error_prob'].append(error_prob)
                    columns['noise'].append(condition_key)
                    for key, val in measures.items():
                        columns[key].append(val)

    channels.discard('baseline')

    # history: give every channel its own copy of the baseline runs. Sharing
    # the same DataFrame objects between channels would make the labelling
    # below overwrite the channel name of all of them with the last one.
    for key in [k for k in history_val if k[1] == 'baseline']:
        max_len, _, error_prob = key
        baseline_val = history_val.pop(key)
        baseline_train = history_train.pop(key, [])
        for c in channels:
            new_key = (max_len, c, error_prob)
            history_val[new_key] = [df.copy() for df in baseline_val]
            history_train[new_key] = [df.copy() for df in baseline_train]

    for history in (history_val, history_train):
        for (max_len, channel, error_prob), frames in history.items():
            for df in frames:
                df['max_len'] = max_len
                df['channel'] = channel
                df['error_prob'] = error_prob

    val_frames = [df for frames in history_val.values() for df in frames]
    train_frames = [df for frames in history_train.values() for df in frames]
    # pd.concat raises on an empty list; return empty frames instead.
    history_val = pd.concat(val_frames, ignore_index=True) if val_frames else pd.DataFrame()
    history_train = pd.concat(train_frames, ignore_index=True) if train_frames else pd.DataFrame()

    # results: export to DataFrame and handle baseline results
    result_dfs = {key: pd.DataFrame(dictionary) for key, dictionary in results.items()}

    for dataset_key, df_all in result_dfs.items():
        is_baseline = df_all['channel'] == 'baseline'
        baseline_rows = df_all[is_baseline]
        noise_keys = [k for k in df_all['noise'].unique() if k != 'baseline']

        # Built afresh for each dataset so that baseline rows of one dataset
        # never end up in the results of another.
        expanded = [
            baseline_rows.assign(channel=channel, noise=noise_key)
            for channel in channels
            for noise_key in noise_keys
        ]
        result_dfs[dataset_key] = pd.concat(
            expanded + [df_all[~is_baseline]], ignore_index=True)

    print('df columns:', list(result_dfs['test'].columns))
    print(
        '% empty messages:',
        100 * result_dfs['test']['KLD_test_train'].isna().sum() / len(result_dfs['test']),
        100 * result_dfs['train']['KLD_test_train'].isna().sum() / len(result_dfs['train']),
    )

    return history_train, history_val, result_dfs['train'], result_dfs['test']

In [248]:
def get_long_data(history_val, metrics, dataset):
    """Reshape the history rows of one dataset into long format.

    Args:
        history_val: DataFrame with a ``dataset`` column, the identifier
            columns ``epoch``, ``max_len``, ``channel``, ``error_prob`` and
            ``noise``, and one column per metric.
        metrics: Column name(s) to unpivot.
        dataset: Value of the ``dataset`` column to keep.

    Returns:
        DataFrame with the identifier columns plus ``metric`` (the original
        column name) and ``value``, on a fresh integer index. Missing values
        are kept.
    """
    identifier_columns = ['epoch', 'max_len', 'channel', 'error_prob', 'noise']
    selected = history_val[history_val.dataset == dataset]
    return selected.melt(
        id_vars=identifier_columns,
        value_vars=metrics,
        var_name='metric',
        value_name='value',
        ignore_index=True,
    )

In [249]:
def close_plot(plot):
    """Close the figure behind ``plot`` and run a garbage collection pass.

    Args:
        plot: Object exposing its matplotlib figure as ``figure`` (or the
            older ``fig`` attribute), e.g. the grid returned by
            ``sns.relplot``. If neither attribute is available, the current
            pyplot figure is closed instead.
    """
    # Close the figure that was passed in rather than whichever figure
    # happens to be current.
    fig = getattr(plot, 'figure', None)
    if fig is None:
        fig = getattr(plot, 'fig', None)

    if fig is None:
        plt.close()
    else:
        plt.close(fig)
    gc.collect()

In [250]:
def plot_perchannel(data, out_dir, y_ranges, y_ticks, savename, big):
    """Draw one grid of line plots per channel and save each grid as a PDF.

    ``data`` is melted to long format with ``max_len``, ``channel``,
    ``error_prob`` and ``noise`` as identifier columns. For every channel a
    grid is drawn with one row per metric (``accuracy``,
    ``accuracy_symbol_removal``, ``redundancy``, ``topsim``) and one column
    per ``max_len``, plotting the metric value against the error probability
    with a separate line style per ``noise`` condition. Only the
    ``'symmetric'`` channel's figure gets a legend.

    Args:
        data: Anything ``pd.DataFrame`` accepts; must contain the identifier
            columns and the metric columns. All non-identifier columns must
            be convertible to float.
        out_dir: Directory the PDFs are written to (created if missing).
        y_ranges: Mapping metric -> ``(ymin, ymax)``; metrics not listed use
            ``(0, 1)``.
        y_ticks: Mapping metric -> list of y tick positions; metrics not
            listed use ``[0, 0.2, 0.4, 0.6, 0.8, 1.0]``.
        savename: File name stem; files are saved as
            ``<savename>_<channel>.pdf``.
        big: If true, all ``max_len`` values are plotted and ``savename`` is
            prefixed with ``all_``; otherwise only ``max_len`` 2 and 4.
    """
    long = pd.melt(
        pd.DataFrame(data),
        id_vars='max_len channel error_prob noise'.split(),
        value_vars=None, var_name='metric', value_name='value', ignore_index=True)
    # Sort while max_len is still in its original dtype, then turn it into a
    # string so that it is treated as a categorical facet/hue variable.
    long = long.sort_values('max_len')
    long['max_len'] = long['max_len'].astype(str)
    long['value'] = long['value'].astype(float)
    long['error_prob'] = long['error_prob'].astype(float)

    channels = pd.unique(long['channel'])

    if big:
        savename = f"all_{savename}"
    else:
        long = long[long['max_len'].isin(['2', '4'])]
    col_names = ['accuracy', 'accuracy_symbol_removal', 'redundancy', 'topsim']
    df_metrics = long[long.metric.isin(col_names)]

    value_x_tick = [0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30]
    default_y_ticks = [0, 0.2, 0.4, 0.6, 0.8, 1.0]

    # savefig does not create missing directories.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for channel in channels:
        save_as = f"{savename}_{channel}"

        df = df_metrics.loc[df_metrics.channel == channel]
        sns.set_style("whitegrid")

        # Arguments shared by both variants; only the legend handling differs.
        relplot_kwargs = dict(
            x="error_prob", y='value', row="metric", row_order=col_names,
            col="max_len", hue="max_len", style="noise", style_order=["noise", "no noise"],
            kind='line', marker='o', markersize=8,
            facet_kws={"margin_titles": True, 'sharey': False},
        )
        if channel == 'symmetric':
            with sns.plotting_context(rc={"legend.fontsize": 20}):
                plot = sns.relplot(df, legend="brief", **relplot_kwargs)
        else:
            plot = sns.relplot(df, legend=False, **relplot_kwargs)

        try:
            for i, metric in enumerate(col_names):
                ticks = y_ticks.get(metric, default_y_ticks)
                for ax in plot.axes[i]:  # Each row contains multiple columns
                    ax.set_ylim(*y_ranges.get(metric, (0, 1)))  # Set y-axis range
                    ax.set_yticks(ticks)  # Set y-ticks
                    ax.set_yticklabels(ticks, size='25')

            # Show x tick labels on every panel, not only on the bottom row.
            for ax in plot.axes.flatten():
                ax.tick_params(labelbottom=True)

            plot.set(xticks=value_x_tick)
            (plot
            .set_axis_labels("Error Probability", "Value", size='25')
            .set_titles(col_template="max len {col_name}", row_template="{row_name}", size='25')
            .set_xticklabels(value_x_tick, size='20')
            .tight_layout())
            # Leave room above the panels for the figure title.
            plot.fig.subplots_adjust(top=0.94)
            plot.fig.suptitle(f'{channel}', fontsize='30')
            plot.savefig(
                os.path.join(out_dir, f"{save_as}.pdf"),
                format='pdf',
                dpi=None,
                pad_inches=0.01,
                bbox_inches='tight',
            )
        finally:
            # Release the figure even when formatting or saving fails.
            close_plot(plot)

In [251]:
# VISA experiment: directory the run files are loaded from, and directory
# the generated plots are written to.
input_folder = "ancm/runs/03_05/visa/"
output_folder = "ancm/results/03_05/visa"

In [252]:
# Path of a 'processed' subfolder of the input directory.
# NOTE(review): not referenced by any of the cells visible here.
processed_data_path = os.path.join(input_folder, 'processed')

# load_data returns (history_train, history_val, results_train, results_test);
# the second frame holds the validation history. It also prints the result
# columns and the percentage of rows whose 'KLD_test_train' value is missing.
visa_history_train, visa_history_test, visa_results_train, visa_results_test = load_data(input_folder)

# Make sure the output directory exists before any plot is saved.
os.makedirs(output_folder, exist_ok=True)

df columns: ['max_len', 'channel', 'error_prob', 'noise', 'samples', 'samples_per_target_obj', 'samples_per_cat', 'unique_msg', 'unique_samples', 'unique_target_objs', 'unique_target_objs_per_msg', 'unique_samples_per_target_obj', 'unique_samples_cat', 'unique_cat', 'unique_samples_per_target_cat', 'unique_cats_per_msg', 'average_length', 'actual_vocab_size', 'accuracy', 'accuracy_symbol_removal', 'max_rep', 'redundancy', 'topsim', 'entropy_msg', 'entropy_msg_as_a_whole', 'entropy_max', 'entropy_min', 'entropy_min_cat', 'entropy_input', 'mutual_info_msg_input', 'variation_of_info_msg_input', 'proficiency_msg_input', 'redundancy_msg_input', 'entropy_category', 'mutual_info_msg_category', 'variation_of_info_msg_category', 'proficiency_msg_category', 'redundancy_msg_category', 'KLD_train_test', 'KLD_test_train']
% empty messages: 0.0 0.0


In [253]:
# Y tick positions per plotted metric for the VISA figures.
y_ticks = {
    "accuracy": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    "accuracy_symbol_removal": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
    "redundancy": [0.0, 0.1, 0.2, 0.3, 0.4],
    "topsim": [0.1, 0.2, 0.3, 0.4],
}

# Y-axis limits per metric: each axis spans its first to its last tick.
y_ranges = {metric: (ticks[0], ticks[-1]) for metric, ticks in y_ticks.items()}

In [254]:
# VISA test-set figures: first with every max_len, then restricted to the
# max_len values 2 and 4.
plot_perchannel(
    data=visa_results_test, out_dir=output_folder,
    y_ranges=y_ranges, y_ticks=y_ticks,
    savename="visa_test", big=True,
)
plot_perchannel(
    data=visa_results_test, out_dir=output_folder,
    y_ranges=y_ranges, y_ticks=y_ticks,
    savename="visa_test", big=False,
)
# Train-set figures are disabled. To enable them, call plot_perchannel with
# visa_results_train and savename="visa_train" (the `big` argument is required).

In [255]:
# Obverter experiment: directory the run files are loaded from, and
# directory intended for the generated plots.
input_folder_obv = "ancm/runs/03_05/obverter_10/"
output_folder_obv = "ancm/results/03_05/obverter_10"

In [256]:
# Path of a 'processed' subfolder of the obverter input directory; this
# rebinds the name set earlier for the VISA run.
# NOTE(review): not referenced by any of the cells visible here.
processed_data_path = os.path.join(input_folder_obv, 'processed')

# load_data returns (history_train, history_val, results_train, results_test);
# the second frame holds the validation history. It also prints the result
# columns and the percentage of rows whose 'KLD_test_train' value is missing.
obv_history_train, obv_history_test, obv_results_train, obv_results_test = load_data(input_folder_obv)

# Make sure the obverter output directory exists.
os.makedirs(output_folder_obv, exist_ok=True)

df columns: ['max_len', 'channel', 'error_prob', 'noise', 'samples', 'samples_per_target_obj', 'unique_msg', 'unique_samples', 'unique_target_objs', 'unique_target_objs_per_msg', 'unique_samples_per_target_obj', 'average_length', 'actual_vocab_size', 'accuracy', 'accuracy_symbol_removal', 'max_rep', 'redundancy', 'topsim', 'entropy_msg', 'entropy_msg_as_a_whole', 'entropy_max', 'entropy_min', 'entropy_attr', 'entropy_shape', 'mutual_info_msg_shape', 'variation_of_info_msg_shape', 'proficiency_msg_shape', 'redundancy_msg_shape', 'entropy_color', 'mutual_info_msg_color', 'variation_of_info_msg_color', 'proficiency_msg_color', 'redundancy_msg_color', 'entropy_xpos', 'mutual_info_msg_xpos', 'variation_of_info_msg_xpos', 'proficiency_msg_xpos', 'redundancy_msg_xpos', 'entropy_ypos', 'mutual_info_msg_ypos', 'variation_of_info_msg_ypos', 'proficiency_msg_ypos', 'redundancy_msg_ypos', 'entropy_rotation', 'mutual_info_msg_rotation', 'variation_of_info_msg_rotation', 'proficiency_msg_rotation', 

In [257]:
# Y tick positions per plotted metric for the obverter figures
# (replaces the values set for the VISA figures).
y_ticks = {
    "accuracy": [0.5, 0.6, 0.7, 0.8, 0.9],
    "accuracy_symbol_removal": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
    "redundancy": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
    "topsim": [0.0, 0.1, 0.2, 0.3],
}

# Y-axis limits per metric: each axis spans its first to its last tick.
y_ranges = {metric: (ticks[0], ticks[-1]) for metric, ticks in y_ticks.items()}

In [258]:
# Obverter test-set figures: first with every max_len, then restricted to
# the max_len values 2 and 4. They are written to the obverter results
# folder (output_folder_obv), not to the VISA one.
plot_perchannel(obv_results_test, output_folder_obv, y_ranges, y_ticks, savename="obv_test", big=True)
plot_perchannel(obv_results_test, output_folder_obv, y_ranges, y_ticks, savename="obv_test", big=False)
# Train-set figures are disabled. To enable them, call plot_perchannel with
# obv_results_train and savename="obv_train" (the `big` argument is required).