In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import inspect
import pickle
import os
import glob

In [6]:
def construct_graphs(output_dir):
    """Plot every accuracy metric against total MSE for one experiment directory.

    Reads ``{output_dir}/raw.csv`` and ``{output_dir}/metadata.pkl``, keeps the four
    mechanisms being compared, drops rows whose ``eps_cmp`` value occurs too rarely
    ("jitter"), and writes one scatter plot per (early-stopping level, metric) pair to
    ``{output_dir}/{level}/{metric}_mse.png``. ``level`` is ``all`` (every row, coloured by
    return-set size) or a zero-based early-stopping level.

    The CSV must provide the columns ``method``, ``early_stopping_level``, ``eps_cmp``,
    ``return_size``, ``mse``, ``raw_acc``, ``top_5_acc``, ``precision`` and ``recall``; the
    metadata must provide the keys ``number_of_nn_within_radius`` and ``epsilons``.

    Args:
        output_dir: Directory that holds ``raw.csv`` and ``metadata.pkl``. The PNG files are
            written into sub-directories of it, which are created when missing.

    Raises:
        IndexError: If the rows with ``early_stopping_level == 1`` contain fewer distinct
            ``eps_cmp`` values than ``metadata['epsilons']`` has entries, so no jitter
            threshold can be derived.
    """
    print(output_dir)
    raw_df = pd.read_csv(f'{output_dir}/raw.csv')

    # NOTE(review): pickle.load can execute arbitrary code stored in the file; only point
    # this function at experiment directories from a trusted source.
    with open(f'{output_dir}/metadata.pkl', 'rb') as f:
        metadata = pickle.load(f)

    nn_counts = metadata['number_of_nn_within_radius']
    # len() rather than truthiness: the container may be an array, whose truth value is ambiguous.
    mean_nn_count = sum(nn_counts) / len(nn_counts) if len(nn_counts) else float('nan')
    print("Number of relevant nodes in precision:", mean_nn_count)

    # Keep only the compared mechanisms and give them their display names.
    # .copy() detaches the selection from the frame read from disk, so the column
    # assignments below do not trigger SettingWithCopyWarning.
    raw_df = raw_df[raw_df['method'].isin(['DP-TT-CMP', 'LAPLACE', 'L-SRR', 'SM'])].copy()
    raw_df['method'] = raw_df['method'].replace({'DP-TT-CMP': 'DP-TT', 'LAPLACE': 'Laplace', 'SM': 'Square Mechanism'})
    raw_df['Mechanism'] = raw_df['method']

    # Remove jitter: the threshold is the occurrence count of the n-th most frequent eps_cmp
    # value among level-1 rows, with n = number of configured epsilons. Every eps_cmp value
    # that occurs less often than that in the whole frame is dropped.
    level_one_counts = raw_df.loc[raw_df['early_stopping_level'] == 1, 'eps_cmp'].value_counts()
    n_epsilons = len(metadata['epsilons'])
    if n_epsilons > len(level_one_counts):
        raise IndexError(
            f'expected at least {n_epsilons} distinct eps_cmp values at early_stopping_level 1, '
            f'found {len(level_one_counts)}'
        )
    jitter_threshold = int(level_one_counts.iloc[n_epsilons - 1])
    eps_counts = raw_df['eps_cmp'].value_counts()
    frequent_values_index = eps_counts[eps_counts >= jitter_threshold].index
    raw_df = raw_df[raw_df['eps_cmp'].isin(frequent_values_index)].copy()

    print("Jitter threshold:", jitter_threshold)

    # Replace each row's return size by the rounded mean return size of its early-stopping
    # level, so the legend shows one value per level. The column is made float first because
    # writing a fractional mean into an integer column is deprecated in recent pandas.
    one_based_levels = (1, 3, 5, 9)
    raw_df['return_size'] = raw_df['return_size'].astype(float)
    for one_based_level in one_based_levels:
        at_level = raw_df['early_stopping_level'] == one_based_level
        raw_df.loc[at_level, 'return_size'] = round(raw_df.loc[at_level, 'return_size'].mean(), 2)
    raw_df['Return Set Size'] = raw_df['return_size']
    raw_df['early_stopping_level'] -= 1  # levels are zero-based from here on

    # Metric column -> axis label. Per the original note, precision and recall consider the
    # top K values relevant.
    metric_name = {
        'raw_acc': 'Raw Accuracy',
        'top_5_acc': 'Top 5 Accuracy',
        'precision': 'Precision',
        'recall': 'Recall',
    }

    for early_stopping_level in ['all', *(level - 1 for level in one_based_levels)]:
        if early_stopping_level == 'all':
            # Combined view: colour the points by return-set size to tell the levels apart.
            level_df = raw_df
            hue_kwargs = {'hue': 'Return Set Size', 'palette': 'deep'}
        else:
            level_df = raw_df[raw_df['early_stopping_level'] == early_stopping_level]
            hue_kwargs = {}

        level_dir = f'{output_dir}/{early_stopping_level}'
        os.makedirs(level_dir, exist_ok=True)

        for metric, metric_label in metric_name.items():
            fig, ax = plt.subplots()
            sns.scatterplot(
                data=level_df, x='mse', y=metric,
                style='Mechanism', markers=True, alpha=0.75,
                ax=ax, **hue_kwargs,
            )
            # All four metrics lie in [0, 1]; the fixed range keeps the plots comparable.
            ax.set(xlabel='Total MSE', ylabel=metric_label, ylim=(-0.1, 1.1))

            fig.savefig(f'{level_dir}/{metric}_mse.png', bbox_inches='tight', dpi=400)
            # Release the figure explicitly; 20 figures are created per call.
            plt.close(fig)


    # # precision-recall curves
    # os.makedirs(f'{output_dir}/precision_recall/', exist_ok=True)
    # files = glob.glob(f'{output_dir}/precision_recall/*')
    # for f in files:
    #     os.remove(f)

    # for eps in raw_df['base_eps'].unique():
    #     tmp_df = raw_df[raw_df['base_eps'] == eps].groupby(['Mechanism', 'early_stopping_level'])
    #     mean_df = tmp_df[['precision', 'recall']].mean()

    #     # plot for both DP-TT and LDP
    #     plot = sns.lineplot(
    #         data=mean_df, x='recall', y='precision', 
    #         style='Mechanism', markers=True, alpha=0.75,
    #         # palette='deep',
    #     )

    #     # graph params
    #     plot.set(
    #         ylim=(-0.1, 1.1),
    #         xlim=(-0.1, 1.1),
    #         ylabel='Precision',
    #         xlabel='Recall',
    #         # title=f'Precision vs. recall for epsilon = {round(eps, 2)}'
    #     )

    #     # move legend
    #     sns.move_legend(plot, "upper left", bbox_to_anchor=(1, 1))

    #     # save file
    #     # with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    #     #     display(tmp_df["eps_cmp"].unique())
    #     plot.get_figure().savefig(f'{output_dir}/precision_recall/eps_{round(tmp_df["eps_cmp"].unique().median(), 2)}_std_{round(tmp_df["eps_cmp"].unique().std(), 2)}.png', bbox_inches='tight', dpi=400)
    #     plt.close()
    #     plot.get_figure().clf()

In [8]:
# Generate the metric-vs-MSE plots for the experiment stored under graphs/densities/100000;
# the PNG files are written into sub-directories of that same directory.
construct_graphs('graphs/densities/100000')

graphs/densities/100000
Number of relevant nodes in precision: 31.538
Jitter threshold: 897
