# Accuracy equality and statistical parity histograms for IR = 1/4 and GR = 1/28

An investigation into the shape of these histograms (a.k.a. the "Meowth" Pokémon shape).

### setup

In [5]:
import gc
import os
import warnings
from datetime import datetime
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Silence every warning for the rest of the session (no category, message or
# module filter is given, so nothing is exempt).
warnings.filterwarnings('ignore')

In [10]:
# Sample size; it selects the per-size sub-directories below.
sample_size = 56

# Where figures are written and where the calculation files are read from.
plots_dir = os.path.join('out', 'plots', f'n{sample_size}', 'histograms', 'meow')
calculations_dir = os.path.join('out', 'calculations', f'n{sample_size}')

for output_dir in (plots_dir, calculations_dir):
    os.makedirs(output_dir, exist_ok=True)

# Metric file name -> name shown on the plots.
metrics = {
    'acc_equality_diff.bin': 'Accuracy equality',
    'stat_parity.bin': 'Statistical parity',
}

# Start from the default style, then override the font sizes only.
plt.style.use('default')

SMALL_SIZE = 14
MEDIUM_SIZE = 14
BIGGER_SIZE = 15

plt.rc('font', size=SMALL_SIZE)                              # default text size
plt.rc('axes', titlesize=SMALL_SIZE, labelsize=MEDIUM_SIZE)  # axes title, x and y labels
plt.rc(('xtick', 'ytick'), labelsize=SMALL_SIZE)             # tick labels on both axes
plt.rc('legend', fontsize=SMALL_SIZE)                        # legend entries
plt.rc('figure', titlesize=BIGGER_SIZE)                      # figure title

In [3]:
def _read_ratio_column(file_name, column):
    """Read a raw binary file from ``calculations_dir`` into a one-column float16 frame."""
    with open(os.path.join(calculations_dir, file_name), 'rb') as source:
        return pd.DataFrame({column: np.fromfile(source).astype(np.float16)})


# Ratio columns that are later placed side by side with each metric column.
gr = _read_ratio_column('gr.bin', 'gr')
ir = _read_ratio_column('ir.bin', 'ir')

In [16]:
def plot_histograms(metric_info, grs, irs, gr_labels, ir_labels, bins_n):
    """Draw a grid of histograms of one metric, one panel per (IR, GR) pair.

    The metric values are read with ``np.fromfile`` from ``m_file`` inside the
    module-level ``calculations_dir`` and placed column-wise next to the
    module-level ``gr`` and ``ir`` frames (assumes all three are row-aligned —
    TODO confirm against the code that writes the files).

    Rows of the grid follow ``irs`` and columns follow ``grs``. Every panel
    consists of a wide axis with the histogram of the defined values (black)
    and a narrow axis with a single red bar for the share of NaN values. Both
    are normalised by the number of rows selected for that (IR, GR) pair, NaN
    rows included. A pair with no rows at all is left as an empty panel.

    Parameters
    ----------
    metric_info : tuple
        ``(file_name, display_name)``; the display name is used as the column
        name and as the figure title.
    grs, irs : sequence of float
        Ratios to select. They are cast to the dtype of the stored ratio
        columns before the exact-equality match.
    gr_labels, ir_labels : sequence of str
        Texts for the column titles (``GR = ...``) and the row labels
        (``IR = ...``), indexed like ``grs`` / ``irs``.
    bins_n
        Passed to ``np.histogram`` as ``bins``.

    Returns
    -------
    matplotlib.figure.Figure
        The figure; the caller is responsible for saving and closing it.
    """
    m_file, m_name = metric_info

    with open(path.join(calculations_dir, m_file), 'rb') as f:
        df = pd.concat([gr, ir, pd.DataFrame(np.fromfile(f), columns=[m_name])], axis=1)

    # Ratios are matched by exact equality, so compare in the dtype the ratio
    # columns are stored in; ratios given as plain floats then still match.
    grs = np.asarray(grs).astype(df.gr.dtype)
    irs = np.asarray(irs).astype(df.ir.dtype)

    # filter to get only results for selected ratios
    df = df.loc[df.ir.isin(irs) & df.gr.isin(grs)]

    def panel_key(i, g, suffix=''):
        # The separator keeps keys unique once an index reaches two digits
        # (without it, (1, 11) and (11, 1) would both become 'a111').
        return f'a{i}_{g}{suffix}'

    # Each row holds, per GR, a histogram axis followed by its narrow NaN axis.
    mosaic = [
        [panel_key(i, g, suffix) for g in range(len(grs)) for suffix in ('', 'n')]
        for i in range(len(irs))
    ]

    fig, axs = plt.subplot_mosaic(mosaic,
                                  width_ratios=[50, 1] * len(grs),
                                  sharex=False, sharey=True,
                                  layout='constrained',
                                  figsize=(20, 20),
                                  gridspec_kw={'wspace': 0.1, 'hspace': 0.1})
    fig.suptitle(f'{m_name}')

    last_row = len(irs) - 1
    for i, ir_val in enumerate(irs):
        for g, gr_val in enumerate(grs):
            ax = axs[panel_key(i, g)]
            ax_nan = axs[panel_key(i, g, 'n')]

            # separate nans and numbers
            values = df.loc[(df.ir == ir_val) & (df.gr == gr_val), m_name]
            total = values.shape[0]

            if total:
                is_nan = values.isna()
                nan_prob = is_nan.sum() / total

                # Bin only the defined values but divide by the total row
                # count, so the bars and the NaN bar together add up to 1.
                binned, edges = np.histogram(values[~is_nan], bins=bins_n)
                ax.hist(edges[:-1], edges, weights=binned / total, fc='black', ec='black')
            else:
                # No rows for this ratio pair: nothing to normalise by, so
                # leave the histogram axis empty instead of dividing by zero.
                nan_prob = 0.0

            ax.spines[['top', 'right']].set_visible(False)

            # plot nans - without drawing the full axis frame
            ax_nan.bar(0, nan_prob, fc='red', ec='red', width=0.1, lw=0)
            ax_nan.spines[['top', 'left']].set_visible(False)

            # styling
            if g == 0:
                ax.set_ylabel(f'IR = {ir_labels[i]}')
            if i == 0:
                ax.set_title(f'GR = {gr_labels[g]}')
            if i == last_row:
                ax_nan.set_xticks([0], ['Undef.'])
            else:
                # only the bottom row keeps its x tick labels
                ax.set_xticklabels([])
                ax_nan.set_xticks([0], [''])

    # Drop the frame before returning to release its memory.
    del df
    gc.collect()

    return fig

## Exploration

### Histograms with various bin densities

In [None]:
# NOTE(review): with empty labels the panels are captioned just 'GR = ' and
# 'IR = ' - confirm that this is intended.
ratios_labels = ['', '']

grs = np.float16([1. / 28, 27. / 28])
irs = np.float16([1. / 4, 3. / 4])

# Same data, increasingly fine binning.
for BINS in (257, 321, 513, 601):
    for metric_info in {'stat_parity.bin': 'Statistical parity', }.items():
        fig = plot_histograms(metric_info, grs, irs, ratios_labels, ratios_labels, BINS)
        # plots_dir already ends in 'meow' and is the directory created during
        # setup; a further nested 'meow' directory is never created, so saving
        # into it would fail.
        fig.savefig(path.join(plots_dir, f'meow_histogram_b{BINS}_{metric_info[1]}.svg'), dpi=300)
        plt.close(fig)
# Most frequent statistical parity value(s) for IR = 1/4, GR = 1/28.
m_file, m_name = ('stat_parity.bin', 'Statistical parity')

with open(path.join(calculations_dir, m_file), 'rb') as f:
    df = pd.concat([gr, ir, pd.DataFrame(np.fromfile(f), columns=[m_name])], axis=1)

# filter to get only results for selected ratios
df = df.loc[(df.ir == np.float16(1 / 4)) & (df.gr == np.float16(1 / 28))]

# Printed explicitly: a bare expression that is not the last statement of a
# cell is evaluated and then thrown away.
print(df[m_name].mode())
# Frequency table of the statistical parity values for IR = 3/4, GR = 1/28.
m_file, m_name = 'stat_parity.bin', 'Statistical parity'

with open(path.join(calculations_dir, m_file), 'rb') as f:
    df = pd.concat(
        [gr, ir, pd.DataFrame({m_name: np.fromfile(f)})],
        axis=1,
    )

# keep only the rows computed for the selected ratios
df = df[(df['ir'] == np.float16(3 / 4)) & (df['gr'] == np.float16(1 / 28))]

df[m_name].value_counts()

### Histograms of adjacent ratios

How other, dissimilar histogram shapes "transform" into this specific one.

In [None]:
gr_ratios = [1/56, 2./56, 3/56, 4/56, 5/56]
gr_labels = ['1/56', '2/56', '3/56', '4/56', '5/56']
grs = np.float16(gr_ratios)

BINS = 109

# One set of figures per IR spacing: neighbouring rows differ by 1/56, 2/56 or
# 3/56, with IR = 1/4 always in the middle. The labels are written here already
# reversed so that they line up with the reversed ratios built inside the loop.
for ir_step, (ir_ratios, ir_labels) in enumerate([
    [[10/56, 11/56, 12/56, 13/56, 1./4, 15/56, 16/56, 17/56, 18/56],
     ['10/56', '11/56', '12/56', '13/56', '1/4', '15/56', '16/56', '17/56', '18/56'][::-1]],
    [[6/56, 8/56, 10/56, 12/56, 1./4, 16/56, 18/56, 20/56, 22/56],
     ['6/56', '8/56', '10/56', '12/56', '1/4', '16/56', '18/56', '20/56', '22/56'][::-1]],
    [[2/56, 5/56, 8/56, 11/56, 1./4, 17/56, 20/56, 23/56, 26/56],
     ['2/56', '5/56', '8/56', '11/56', '1/4', '17/56', '20/56', '23/56', '26/56'][::-1]],
], start=1):
    irs = np.float16(ir_ratios[::-1])
    for metric_info in metrics.items():
        fig = plot_histograms(metric_info, grs, irs, gr_labels, ir_labels, bins_n=BINS)
        # The metric name and the IR step keep the six figures apart: a
        # second-resolution timestamp alone can repeat, letting one figure
        # overwrite another, and does not say which figure is which.
        fig.savefig(path.join(plots_dir,
                              f'histogram_b{BINS}_{metric_info[1]}_step{ir_step}_'
                              f'{datetime.now().strftime("%H%M%S")}.svg'),
                    dpi=300)
        plt.close(fig)

In [None]:
# Most frequent value(s) and the full frequency table of statistical parity
# for IR = 1/4, GR = 1/28.
m_file, m_name = 'stat_parity.bin', 'Statistical parity'

with open(path.join(calculations_dir, m_file), 'rb') as f:
    df = pd.concat(
        [gr, ir, pd.DataFrame({m_name: np.fromfile(f)})],
        axis=1,
    )

# keep only the rows computed for the selected ratios
df = df[(df['ir'] == np.float16(1/4)) & (df['gr'] == np.float16(1/28))]

print(df[m_name].mode())

df[m_name].value_counts()