In [1]:
import pandas as pd
import numpy as np
import os
from analysis import *
import glob
import string
import plotly.graph_objects as go
# All relative paths below are resolved inside Checkpoints/. The guard keeps the
# cell idempotent: re-running it in the same kernel no longer tries to descend
# into Checkpoints/Checkpoints/.
if os.path.basename(os.getcwd()) != 'Checkpoints':
    os.chdir('Checkpoints/')

In [2]:
# Legend label / marker colour pairs, in plotting order. The plotting cells find
# a model's position in `plotted_models` and use the colour at the same position
# in `colors`, so the two lists must stay aligned. Some colours are repeated
# (e.g. '#e6194B' is used for both 'DPN' and 'DPN_Magface').
_MODEL_PALETTE = [
    ('DPN', '#e6194B'),
    ('ReXNet', '#3cb44b'),
    ('EseVoVNet', '#ffe119'),
    ('TNT', '#f58231'),
    ('Inception', '#42d4f4'),
    ('HRNet', '#f032e6'),
    ('MobileNet', '#fabed4'),
    ('VGG19', '#469990'),
    ('ResNet-RS', '#aaffc3'),
    ('DenseNet', '#000075'),
    ('DPN_Magface', '#e6194B'),
    ('DPN_Cosface', '#9a6324'),
    ('SMAC', '#dcbeff'),
    ('Coat', '#aaffc3'),
    ('Cait', '#ffe119'),
]
colors = [hex_code for _label, hex_code in _MODEL_PALETTE]
plotted_models = [label for label, _hex_code in _MODEL_PALETTE]

In [3]:
# Finished Phase 1B models (helper star-imported from `analysis`); each entry is
# used below as a substring to look for in a file path.
models = get_finished_models_Phase1B()

# Per-identity rank CSVs from both experiment folders. glob is called without
# recursive=True, so `**` behaves like `*` and matches a single directory level.
rank_files = (
    glob.glob('Phase1B/**/*_rank_by_id_val.csv')
    + glob.glob('timm_explore_few_epochs/**/*_rank_by_id_val.csv')
)
# Keep only the files whose path mentions one of the finished models.
rank_files_Phase1 = [path for path in rank_files if any(name in path for name in models)]
kacc_files = glob.glob('**/**/*_kacc_val.csv')
metadata = pd.read_csv('val_identities_gender-expression_seed_222.csv')

# Epoch columns analysed throughout the notebook: 19, 39, 59, 79 and 99.
epochs = [f'epoch_{i}' for i in (19, 39, 59, 79, 99)]

# Three passes over the same files with different `analyze_rank_files` flags
# (helper from `analysis`): default, ratio=True, and ratio=True with error=True.
acc_df, acc_disp_df, rank_df = analyze_rank_files(
    rank_files_Phase1, metadata, epochs=epochs)
_, acc_disp_ratio_df, rank_ratio_df = analyze_rank_files(
    rank_files_Phase1, metadata, ratio=True, epochs=epochs)
err_df, error_ratio_df, _ = analyze_rank_files(
    rank_files_Phase1, metadata, ratio=True, error=True, epochs=epochs)

# StdAcc

In [4]:
def stdacc_from_rank_func(df, epoch_columns):
    """Return |std(male) - std(female)| of a per-epoch rank score.

    For every 'gender_expression' group and every column in `epoch_columns`,
    the score is the mean, over thresholds i in range(max value of the column),
    of the fraction of rows whose value is <= i. Each group therefore yields
    one score per epoch column; the standard deviation is taken across those
    epoch scores, and the absolute male/female difference is returned.

    Raises KeyError if `df` has no 'male' or no 'female' group.
    """
    def _mean_fraction_within_threshold(ranks):
        fractions = []
        for threshold in range(ranks.max()):
            fractions.append((ranks <= threshold).mean())
        return np.mean(fractions)

    per_group = {
        group: rows[epoch_columns].apply(_mean_fraction_within_threshold)
        for group, rows in df.groupby('gender_expression')
    }
    male_std = per_group['male'].std()
    female_std = per_group['female'].std()
    return abs(male_std - female_std)
    
def stdacc_rank_files_np(files, metadata, epochs=None): 
    """Build a table of StdAcc disparities, one row per usable rank file.

    Parameters
    ----------
    files : iterable of str
        Paths of rank CSV files. Files that cannot be read are skipped with a
        warning; files containing none of the requested epoch columns are
        skipped silently.
    metadata : pandas.DataFrame
        Merged with every rank table on their shared columns. The merged table
        must contain the 'gender_expression' column used by
        `stdacc_from_rank_func`.
    epochs : list of str, optional
        Epoch column names to use. Defaults to 'epoch_0' ... 'epoch_99'.

    Returns
    -------
    pandas.DataFrame
        One column per requested epoch, indexed by the experiment name (first
        element of `get_name_details(path)`). `stdacc_from_rank_func` returns a
        single scalar per file, so every column of a row holds that same value.
    """
    import warnings

    if epochs is None:
        epochs = ['epoch_'+str(e) for e in range(100)]

    stdacc_df = pd.DataFrame(columns=epochs)

    for f in files:
        try:
            df = pd.read_csv(f)
        except Exception as exc:
            # Best-effort: skip unreadable files, but say so, and let
            # KeyboardInterrupt / SystemExit propagate.
            warnings.warn(f"Skipping unreadable rank file {f!r}: {exc}")
            continue
        # Keep the caller's epoch order instead of an arbitrary set order.
        epoch_columns = [e for e in dict.fromkeys(epochs) if e in df.columns]
        if not epoch_columns:
            continue
        df = metadata.merge(df)
        stdacc = stdacc_from_rank_func(df, epoch_columns)

        experiment = get_name_details(f)[0]
        stdacc_df.loc[experiment] = stdacc
    return stdacc_df

# StdAcc table for the selected Phase 1B rank files at the chosen epochs.
stdacc_df = stdacc_rank_files_np(files=rank_files_Phase1, metadata=metadata, epochs=epochs)



In [10]:
# Reshape the StdAcc table with `prepare` (helper from `analysis`); the result
# is later merged with err_df on ['index', 'epoch'] and read via a 'Metric' column.
# NOTE: this cell overwrites its own input, so running it twice in one kernel
# feeds an already-prepared frame back into `prepare`.
stdacc_df = prepare(stdacc_df)
# Append '.csv' to the keys - presumably so they match err_df's 'index' values; verify.
stdacc_df['index'] = stdacc_df['index']+'.csv'

In [14]:
import plotly
# Made explicit: `px` was otherwise only in scope if `from analysis import *` exported it.
import plotly.express as px

# --- Data: error vs. StdAcc disparity at epoch 99 ---------------------------
# Both inputs carry a 'Metric' column, so the merge suffixes them with
# _x (from err_df) and _y (from stdacc_df).
df = err_df.merge(stdacc_df, on=['index','epoch'])
df = df[df['epoch'] == 99]
df = df.rename(columns={'Metric_x':'Error', 'Metric_y':'Disparity'})
df = df.dropna()
# Keep only runs with Error < 0.3 and Disparity < 2.0.
df = df[df['Error']<0.3]
df = df[df['Disparity']<2.0]
# .assign builds a new frame, avoiding a SettingWithCopyWarning on the filtered slice.
df = df.assign(model=df['index'].apply(lambda x: get_name_details(x)[1]))
df = df.sort_values(by=['model'])

# --- Base scatter of every remaining run -------------------------------------
fig = px.scatter(df, x='Error',
                 y='Disparity',
                 template="simple_white",
                 width=1000, height=500)

fig.update_layout(
    yaxis_title="StdAcc",
    xaxis_range=[0,.31],
    font=dict(
        family="Times New Roman",
        size=19,
        color="Black"
    )
)
# Red horizontal reference line at zero disparity.
fig.add_shape(type='line',
              x0=0, y0=0, x1=1, y1=0,
              line=dict(color='Red'),
              xref='x', yref='y')

# `preparePareto` comes from `analysis`; presumably it returns the Pareto-optimal
# (Error, Disparity) points in drawing order - verify against the helper.
p = np.array(preparePareto(df[['Error','Disparity']], False, False).dropna())
# Join consecutive points with dashed grey segments.
for start, end in zip(p[:-1], p[1:]):
    fig.add_shape(type='line',
                  x0=start[0], y0=start[1], x1=end[0], y1=end[1],
                  line=dict(color='gray', width=4, dash='dash'),
                  xref='x', yref='y')

fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=-.29,
    xanchor="right",
    x=.75,
    font=dict(
        family="Times New Roman",
        size=16,
        color="Black"
    )
    )
)

# --- Highlight the runs that coincide with the points in `p` -----------------
pareto_df = pd.merge(df, pd.DataFrame(p), right_on=[0,1], left_on=["Error","Disparity"])
model_prefixes = [label.lower()[:3] for label in plotted_models]
# groupby iterates the models in sorted order, so trace and legend order are
# reproducible across kernels (iterating a set of strings is not).
for model, model_rows in pareto_df.groupby('model'):
    # Match the model to its legend label / colour by the first three characters;
    # raises ValueError if no entry of `plotted_models` shares that prefix.
    i = model_prefixes.index(model.lower()[:3])
    fig.add_trace(go.Scatter(
        x=model_rows['Error'],
        y=model_rows['Disparity'],
        mode='markers',
        name=plotted_models[i],
        marker=dict(color=colors[i], size=15),
    ))

fig.show()
plotly.io.write_image(fig, 'app_RQ1_main_full_stdacc.png', format='png')

In [12]:
# Distinct models among the highlighted rows of the plot above.
set(pareto_df['model'].unique())

{'coat_lite_small_CosFace_AdamW_0.001_cosine',
 'dpn107_ArcFace_SGD_0.1_cosine',
 'dpn107_CosFace_SGD_0.1_cosine',
 'rexnet_200_CosFace_SGD_rank_by',
 'rexnet_200_MagFace_SGD_0.1_cosine',
 'vgg19_bn_MagFace_AdamW_rank_by'}

# BPC

In [4]:
def bpc_bias_from_rank_func(df, epoch_columns):
    """Return the per-epoch absolute male/female gap of a rank score.

    For every 'gender_expression' group and every column in `epoch_columns`,
    the score is the mean, over thresholds i in range(max value of the column),
    of the fraction of rows whose value is <= i. The result is a Series indexed
    by epoch column holding |score(male) - score(female)|.

    Raises KeyError if `df` has no 'male' or no 'female' group.
    """
    def _score(ranks):
        upper = ranks.max()
        return np.mean([(ranks <= threshold).mean() for threshold in range(upper)])

    scores = {}
    for group, rows in df.groupby('gender_expression'):
        scores[group] = rows[epoch_columns].apply(_score)
    gap = scores['male'] - scores['female']
    return abs(gap)
    
def bpc_bias_rank_files_np(files, metadata, epochs=None): 
    """Build a table of per-epoch male/female score gaps, one row per usable rank file.

    Parameters
    ----------
    files : iterable of str
        Paths of rank CSV files. Files that cannot be read are skipped with a
        warning; files containing none of the requested epoch columns are
        skipped silently.
    metadata : pandas.DataFrame
        Merged with every rank table on their shared columns. The merged table
        must contain the 'gender_expression' column used by
        `bpc_bias_from_rank_func`.
    epochs : list of str, optional
        Epoch column names to use. Defaults to 'epoch_0' ... 'epoch_99'.

    Returns
    -------
    pandas.DataFrame
        One column per requested epoch, indexed by the experiment name (first
        element of `get_name_details(path)`); each row holds the Series
        returned by `bpc_bias_from_rank_func` for that file.
    """
    import warnings

    if epochs is None:
        epochs = ['epoch_'+str(e) for e in range(100)]

    bpc_bias_df = pd.DataFrame(columns=epochs)

    for f in files:
        try:
            df = pd.read_csv(f)
        except Exception as exc:
            # Best-effort: skip unreadable files, but say so, and let
            # KeyboardInterrupt / SystemExit propagate.
            warnings.warn(f"Skipping unreadable rank file {f!r}: {exc}")
            continue
        # Keep the caller's epoch order instead of an arbitrary set order.
        epoch_columns = [e for e in dict.fromkeys(epochs) if e in df.columns]
        if not epoch_columns:
            continue
        df = metadata.merge(df)
        bias = bpc_bias_from_rank_func(df, epoch_columns)

        experiment = get_name_details(f)[0]
        bpc_bias_df.loc[experiment] = bias
    return bpc_bias_df

# Per-epoch male/female score gap for the selected Phase 1B rank files.
bpc_bias_df = bpc_bias_rank_files_np(files=rank_files_Phase1, metadata=metadata, epochs=epochs)

def bpc_tpr_from_rank_func(df, epoch_columns):
    """Return a per-epoch rank score computed over all rows of `df`.

    For every column in `epoch_columns`, the score is the mean, over thresholds
    i in range(max value of the column), of the fraction of rows whose value is
    <= i. The result is a Series indexed by epoch column; no grouping by
    gender expression is applied here.
    """
    def _score(ranks):
        fractions = [(ranks <= threshold).mean() for threshold in range(ranks.max())]
        return np.mean(fractions)

    selected = df[epoch_columns]
    return selected.apply(_score)
    
def bpc_tpr_rank_files_np(files, metadata, epochs=None): 
    """Build a table of per-epoch overall rank scores, one row per usable rank file.

    Parameters
    ----------
    files : iterable of str
        Paths of rank CSV files. Files that cannot be read are skipped with a
        warning; files containing none of the requested epoch columns are
        skipped silently.
    metadata : pandas.DataFrame
        Merged with every rank table on their shared columns before scoring.
    epochs : list of str, optional
        Epoch column names to use. Defaults to 'epoch_0' ... 'epoch_99'.

    Returns
    -------
    pandas.DataFrame
        One column per requested epoch, indexed by the experiment name (first
        element of `get_name_details(path)`); each row holds the Series
        returned by `bpc_tpr_from_rank_func` for that file.
    """
    import warnings

    if epochs is None:
        epochs = ['epoch_'+str(e) for e in range(100)]

    bpc_tpr_df = pd.DataFrame(columns=epochs)

    for f in files:
        try:
            df = pd.read_csv(f)
        except Exception as exc:
            # Best-effort: skip unreadable files, but say so, and let
            # KeyboardInterrupt / SystemExit propagate.
            warnings.warn(f"Skipping unreadable rank file {f!r}: {exc}")
            continue
        # Keep the caller's epoch order instead of an arbitrary set order.
        epoch_columns = [e for e in dict.fromkeys(epochs) if e in df.columns]
        if not epoch_columns:
            continue
        df = metadata.merge(df)
        tpr = bpc_tpr_from_rank_func(df, epoch_columns)

        experiment = get_name_details(f)[0]
        bpc_tpr_df.loc[experiment] = tpr
    return bpc_tpr_df

# Per-epoch overall score for the selected Phase 1B rank files.
bpc_tpr_df = bpc_tpr_rank_files_np(files=rank_files_Phase1, metadata=metadata, epochs=epochs)

In [5]:
# Run that every other experiment is compared against.
baseline_run = 'inception_resnet_v2_ArcFace_SGD_0.1_cosine_rank_by_id_val'

# Row-wise relative change with respect to the baseline row, for bias and score.
# NOTE(review): each change is divided by the run's own value (x), not by the
# baseline's value - confirm this matches the intended BPC definition.
bias_rel_change = bpc_bias_df.apply(
    lambda row: (row - bpc_bias_df.loc[baseline_run]) / row, axis=1)
tpr_rel_change = bpc_tpr_df.apply(
    lambda row: (row - bpc_tpr_df.loc[baseline_run]) / row, axis=1)

bpc_df = bias_rel_change - tpr_rel_change

In [6]:
# Reshape the BPC table with `prepare` (helper from `analysis`); the result is
# later merged with err_df on ['index', 'epoch'].
# NOTE: this cell overwrites its own input, so running it twice in one kernel
# feeds an already-prepared frame back into `prepare` and flips the sign again.
bpc_df = prepare(bpc_df)
# Append '.csv' to the keys - presumably so they match err_df's 'index' values; verify.
bpc_df['index'] = bpc_df['index']+'.csv'
# Flip the sign: the plot below shows this column under the axis title "-BPC".
bpc_df['Metric'] = -bpc_df['Metric'] 

In [18]:
import plotly
# Made explicit: `px` was otherwise only in scope if `from analysis import *` exported it.
import plotly.express as px

# --- Data: error vs. negated BPC at epoch 99 --------------------------------
# Both inputs carry a 'Metric' column, so the merge suffixes them with
# _x (from err_df) and _y (from bpc_df).
df = err_df.merge(bpc_df, on=['index','epoch'])
df = df[df['epoch'] == 99]
df = df.rename(columns={'Metric_x':'Error', 'Metric_y':'Disparity'})
df = df.dropna()
# Keep only runs with Error < 0.3 and Disparity < 2.0.
df = df[df['Error']<0.3]
df = df[df['Disparity']<2.0]
# .assign builds a new frame, avoiding a SettingWithCopyWarning on the filtered slice.
df = df.assign(model=df['index'].apply(lambda x: get_name_details(x)[1]))
df = df.sort_values(by=['model'])

# --- Base scatter of every remaining run -------------------------------------
fig = px.scatter(df, x='Error',
                 y='Disparity',
                 template="simple_white",
                 width=1000, height=500)

fig.update_layout(
    yaxis_title="-BPC",
    xaxis_range=[0,.31],
    font=dict(
        family="Times New Roman",
        size=19,
        color="Black"
    )
)
# Red horizontal reference line at y = 0.
fig.add_shape(type='line',
              x0=0, y0=0, x1=1, y1=0,
              line=dict(color='Red'),
              xref='x', yref='y')

# `preparePareto` comes from `analysis`; presumably it returns the Pareto-optimal
# (Error, Disparity) points in drawing order - verify against the helper.
p = np.array(preparePareto(df[['Error','Disparity']], False, False).dropna())
# Join consecutive points with dashed grey segments.
for start, end in zip(p[:-1], p[1:]):
    fig.add_shape(type='line',
                  x0=start[0], y0=start[1], x1=end[0], y1=end[1],
                  line=dict(color='gray', width=4, dash='dash'),
                  xref='x', yref='y')

fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=-.29,
    xanchor="right",
    x=.75,
    font=dict(
        family="Times New Roman",
        size=16,
        color="Black"
    )
    )
)

# --- Highlight the runs that coincide with the points in `p` -----------------
pareto_df = pd.merge(df, pd.DataFrame(p), right_on=[0,1], left_on=["Error","Disparity"])
model_prefixes = [label.lower()[:3] for label in plotted_models]
# groupby iterates the models in sorted order, so trace and legend order are
# reproducible across kernels (iterating a set of strings is not).
for model, model_rows in pareto_df.groupby('model'):
    # Match the model to its legend label / colour by the first three characters;
    # raises ValueError if no entry of `plotted_models` shares that prefix.
    i = model_prefixes.index(model.lower()[:3])
    fig.add_trace(go.Scatter(
        x=model_rows['Error'],
        y=model_rows['Disparity'],
        mode='markers',
        name=plotted_models[i],
        marker=dict(color=colors[i], size=15),
    ))

fig.show()
plotly.io.write_image(fig, 'app_RQ1_main_full_BPC.png', format='png')