In [1]:
import gc
import os
import warnings
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from plotnine import *

# Silence every warning for the rest of the session: 'ignore' with no further
# filter arguments applies to all warning categories and modules.
# NOTE(review): this also hides any pandas / plotnine / matplotlib warnings the
# cells below may trigger — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [9]:
# Sample size of the run being visualised; it selects the `n<sample_size>`
# sub-folder used for both inputs and outputs.
sample_size = 24        # 56

# out/plots/n<sample_size> and out/calculations/n<sample_size>
plots_dir, calculations_dir = (
    os.path.join('out', kind, f'n{sample_size}')
    for kind in ('plots', 'calculations')
)

# Make sure both folders exist before anything is read from or written to them.
os.makedirs(plots_dir, exist_ok=True)
os.makedirs(calculations_dir, exist_ok=True)

# File name inside `calculations_dir` -> human-readable metric name.
# The insertion order is the order in which the plotting loops visit the metrics.
metrics = {
    'sb.bin':                    'Stereotypical bias',
    'acc_equality_diff.bin':     'Accuracy equality difference',
    'acc_equality_ratio.bin':    'Accuracy equality ratio',
    'disp_impact.bin':           'Disparate impact',
    'equal_opp_diff.bin':        'Equal opportunity difference',
    'equal_opp_ratio.bin':       'Equal opportunity ratio',
    'neg_pred_parity_diff.bin':  'Negative predictive parity difference',
    'neg_pred_parity_ratio.bin': 'Negative predictive parity ratio',
    'pos_pred_parity_diff.bin':  'Positive predictive parity difference',
    'pred_equality_diff.bin':    'Predictive equality difference',
    'pred_equality_ratio.bin':   'Predictive equality ratio',
    'pos_pred_parity_ratio.bin': 'Positive predictive parity ratio',
    'stat_parity.bin':           'Statistical parity',
}


In [10]:
# Read the raw 'gr' and 'ir' arrays from `calculations_dir`. np.fromfile is used
# with its default dtype and the values are then narrowed to float16; each array
# becomes a single-column DataFrame named after its file stem.
with open(path.join(calculations_dir, 'gr.bin'), mode='rb') as gr_file:
    gr = pd.Series(np.fromfile(gr_file), name='gr').astype(np.float16).to_frame()

with open(path.join(calculations_dir, 'ir.bin'), mode='rb') as ir_file:
    ir = pd.Series(np.fromfile(ir_file), name='ir').astype(np.float16).to_frame()

In [11]:
def get_plot(df, metric):
    """Assemble the faceted histogram for one metric column.

    Parameters
    ----------
    df : DataFrame
        Must contain the column named by ``metric`` plus the ``typeVal``,
        ``ir`` and ``gr`` columns used for fill colour and faceting.
    metric : str
        Column drawn on the x axis; also interpolated into the plot title.

    Returns
    -------
    The plotnine plot object; nothing is drawn or saved here.
    """
    def relabel_sentinel(breaks):
        # A tick located exactly at -1.5 is displayed as the text 'NaN'.
        return [tick if tick != -1.5 else 'NaN' for tick in breaks]

    plot = ggplot(df, aes(x=metric, fill='typeVal'))

    # Added in this exact order, one after the other.
    components = (
        # Bar height = bin count divided by the summed count.
        geom_histogram(aes(y=after_stat('count / np.sum(count)')), bins=50),
        scale_fill_manual(values={'NaN': '#ca0020', 'not NaN': '#404040'}, guide=False),
        # Rows follow 'ir', columns follow 'gr'.
        facet_grid('ir~gr', labeller='label_both'),
        scale_x_continuous(labels=relabel_sentinel),
        theme_minimal(),
        theme(figure_size=(16, 8)),
        ggtitle(f'{metric} for selected IR & GR'),
        ylab('Probability'),
    )
    for component in components:
        plot += component
    return plot

# Only rows whose 'gr' / 'ir' value is one of these are kept by the plotting loops.
gr_selected = [0.125, 0.5, 0.875]
ir_selected = [0.125, 0.5, 0.875]

In [None]:
# For every metric: load its values, keep the selected (ir, gr) combinations,
# draw the faceted histogram with get_plot and save it under `plots_dir`.
for metric_file, metric_name in metrics.items():
    with open(path.join(calculations_dir, metric_file), 'rb') as f:
        metric = pd.DataFrame(np.fromfile(f).astype(np.float16), columns=[metric_name])

    # One concat is enough: the three single-column frames are aligned on their index.
    df = pd.concat([gr, ir, metric], axis=1)
    df = df.loc[df.ir.isin(ir_selected) & df.gr.isin(gr_selected)]
    # NaN values are mapped to the sentinel -1.5 so they get a histogram bar of
    # their own (get_plot relabels the tick at -1.5 as 'NaN'). replace() is used
    # without inplace so that a fresh frame is returned and the column
    # assignment below never writes into a slice of the unfiltered data.
    df = df.replace(np.nan, -1.5)
    df['typeVal'] = np.where(df[metric_name] == -1.5, 'NaN', 'not NaN')

    fig = get_plot(df, metric_name)
    print(fig)
    # TODO name
    fig.save(path.join(plots_dir, f'{metric_name}_hist.png'), width=16, height=8, dpi=300)

    # Free the per-metric frames before the next iteration.
    del metric
    del df
    gc.collect()

## Same histograms with matplotlib

In [None]:
# Same histograms as the plotnine version, drawn with matplotlib: one figure per
# metric with all selected (ir, gr) combinations overlaid on a single axis.
for metric_file, metric_name in metrics.items():
    with open(path.join(calculations_dir, metric_file), 'rb') as f:
        metric = pd.DataFrame(np.fromfile(f).astype(np.float16), columns=[metric_name])

    # One concat is enough: the three single-column frames are aligned on their index.
    df = pd.concat([gr, ir, metric], axis=1)
    df = df.loc[df.ir.isin(ir_selected) & df.gr.isin(gr_selected)]
    # NaN values become the sentinel -1.5 so they show up as their own bar.
    # replace() without inplace returns a fresh frame instead of mutating a
    # slice of the unfiltered data.
    df = df.replace(np.nan, -1.5)

    fig, ax = plt.subplots(1, 1, figsize=(16, 8))
    for ir_val in ir_selected:
        for gr_val in gr_selected:
            values = df.loc[(df.ir == ir_val) & (df.gr == gr_val), metric_name]
            if values.empty:
                # Nothing to draw for this combination (and no 1/0 below).
                continue
            # Equal weights summing to 1 turn the bar heights into fractions of
            # this series, which is what the 'Probability' axis label promises.
            weights = np.full(len(values), 1.0 / len(values))
            ax.hist(values, bins=100, weights=weights, alpha=0.5,
                    label=f'ir={ir_val}, gr={gr_val}')
    ax.set_title(f'{metric_name} for selected IR & GR')
    ax.set_xlabel(metric_name)
    ax.set_ylabel('Probability')
    ax.legend()
    fig.savefig(path.join(plots_dir, f'{metric_name}_matplotlib.png'), dpi=300)

    # Display the figure now, then release it so the loop does not keep one
    # open 16x8 figure per metric alive.
    plt.show()
    plt.close(fig)

    # Free the per-metric frames before the next iteration.
    del metric
    del df
    gc.collect()