# The Effect of Hyperparameters: Analysis
(_This message is copied from the Slack conversation._)

- We want to understand how the hyperparameters (HPs) impact the fairness measures.
- At a minimum, we can report the HPs of the converged models:
  - which performed best across all the models;
  - for each combination of {head, optimizer}, which models performed best.
- The most straightforward analysis might be to run a regression (or ANOVA) over all the models from Phase1B, e.g.:
  - `accuracy_at_epoch_n ~ head + opt + lr`
  - `fairness_at_epoch_n ~ head + opt + lr`


In [9]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import os
from analysis import *
import glob
import plotly.express as px
import statsmodels.api as sm
from statsmodels.formula.api import ols
from stargazer.stargazer import Stargazer
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Result of get_finished_models_Phase1B() (provided by the wildcard `analysis` import).
# The next cell treats its elements as substrings to look for in config file paths.
final_models = get_finished_models_Phase1B()
# Metadata table passed to analyze_rank_files below; the path is relative to the
# notebook's working directory.
# NOTE(review): presumably one row per validation identity with a gender-expression
# label (inferred from the file name only) — confirm against the CSV.
metadata = pd.read_csv('val_identities_gender-expression_seed_222.csv')

In [3]:
def _configs_for_final_models(pattern):
    """Return the paths matching ``pattern`` that contain any entry of ``final_models``.

    NOTE(review): ``glob.glob`` is called without ``recursive=True``, so ``**``
    behaves like ``*`` and matches exactly one directory level. Kept as-is so the
    set of selected files does not change.
    """
    return [
        path
        for path in glob.glob(pattern)
        if any(model in path for model in final_models)
    ]


# YAML configs whose path mentions a finished model, one list per config tree.
default_params = _configs_for_final_models('../configs/**/*.yaml')
hp_same_lr = _configs_for_final_models('../configs_multi/**/*.yaml')
hp_unified_lr = _configs_for_final_models('../configs_unified_lr/**/*.yaml')

# Files named *_rank_by_id_val.csv from both result directories (same one-level
# `**` behaviour as above); the first directory's matches come first.
rank_files = (
    glob.glob('timm_explore_few_epochs/**/*_rank_by_id_val.csv')
    + glob.glob('Phase1B/**/*_rank_by_id_val.csv')
)

# Epoch indices to analyse and the matching 'epoch_<n>' labels handed to
# analyze_rank_files.
epochs = [19, 39, 59, 79, 99]
epoch_columns = [f'epoch_{epoch}' for epoch in epochs]

In [4]:
def _rank_analysis(**options):
    """Call ``analyze_rank_files`` on the shared inputs, forwarding extra keyword options."""
    return analyze_rank_files(rank_files, metadata, epochs=epoch_columns, **options)


# Three passes over the same rank files: default, ratio=True, and ratio=True with
# error=True. Each call returns three frames; the ones not needed go to `_`.
acc_df, acc_disp_df, rank_df = _rank_analysis()
_, acc_disp_ratio_df, rank_ratio_df = _rank_analysis(ratio=True)
err_df, error_ratio_df, _ = _rank_analysis(ratio=True, error=True)

# Combine the accuracy frame with each disparity frame. The right-hand tuple is
# built from the un-merged frames before any name is rebound, and the merges run
# in the listed order.
acc_disp_df, rank_df, acc_disp_ratio_df, rank_ratio_df = (
    merge(acc_df, frame)
    for frame in (acc_disp_df, rank_df, acc_disp_ratio_df, rank_ratio_df)
)

# The error-based frame is merged with err_df instead, and its 'Accuracy' column
# is relabelled 'Error'.
error_ratio_df = merge(err_df, error_ratio_df)
error_ratio_df = error_ratio_df.rename(columns={'Accuracy': 'Error'})

In [5]:
# Models which didn't converge: every row of `acc_df` whose 'Metric' value is below
# the threshold contributes its 'index' value (the model identifier used by
# drop_models in the next cell).
# NOTE(review): 0.25 is presumably an accuracy cut-off — confirm the scale of 'Metric'.
CONVERGENCE_THRESHOLD = 0.25
non_converged_models = (
    acc_df.loc[acc_df['Metric'] < CONVERGENCE_THRESHOLD, 'index']
    # De-duplicate in first-seen order; list(set(...)) produced an arbitrary order
    # that could differ from one kernel session to the next.
    .drop_duplicates()
    .tolist()
)

In [6]:
# Pass all seven result frames plus the non-converged list through drop_models and
# rebind the results to the same names, in the same order.
# NOTE(review): assumes drop_models returns the frames in input order — confirm.
(
    acc_df,
    acc_disp_df,
    rank_df,
    acc_disp_ratio_df,
    rank_ratio_df,
    err_df,
    error_ratio_df,
) = drop_models(
    [
        acc_df,
        acc_disp_df,
        rank_df,
        acc_disp_ratio_df,
        rank_ratio_df,
        err_df,
        error_ratio_df,
    ],
    non_converged_models,
)

# For each head what architectures are Pareto optimal?

In [16]:
# One get_pareto_hps result per fairness metric, keyed by metric name. The
# error-ratio frame uses col='Error' because its accuracy column was renamed.
# NOTE(review): these calls carry no head/optimizer argument and are identical to
# the calls in the other Pareto cells, whose tables have different columns; the
# execution counts are also out of order. How get_pareto_hps grouped by head for
# this run is not visible here — confirm before re-running.
hp_configs = {
    'acc_disp': get_pareto_hps(acc_disp_df),
    'acc_disp_ratio': get_pareto_hps(acc_disp_ratio_df),
    'rank': get_pareto_hps(rank_df),
    'rank_ratio': get_pareto_hps(rank_ratio_df),
    'error_ratio': get_pareto_hps(error_ratio_df, col='Error'),
}

# Rows are metrics, columns are heads; printed as a markdown table.
print(
    pd.DataFrame.from_dict(
        hp_configs,
        orient='index',
        columns=['ArcFace', 'CosFace', 'MagFace'],
    ).to_markdown()
)

|                | ArcFace             | CosFace            | MagFace             |
|:---------------|:--------------------|:-------------------|:--------------------|
| acc_disp       | dpn107              | rexnet_200         | gluon_xception65    |
|                |                     |                    | rexnet_200          |
| acc_disp_ratio | dpn107              | rexnet_200         | gluon_xception65    |
|                |                     |                    | rexnet_200          |
| rank           | dpn107              | cspdarknet53       | gluon_xception65    |
|                | ig_resnext101_32x8d | dla102x2           | rexnet_200          |
|                | rexnet_200          | dpn107             |                     |
|                | xception65          | gluon_inception_v3 |                     |
|                |                     | rexnet_200         |                     |
|                |                     | tnt_s_patch16_224  |               

# For each opt what architectures are Pareto optimal?

In [14]:
# One get_pareto_hps result per fairness metric, keyed by metric name. The
# error-ratio frame uses col='Error' because its accuracy column was renamed.
# NOTE(review): these calls carry no optimizer argument and are identical to the
# calls in the other Pareto cells, whose tables have different columns; the
# execution counts are also out of order. How get_pareto_hps grouped by optimizer
# for this run is not visible here — confirm before re-running.
hp_configs = {
    'acc_disp': get_pareto_hps(acc_disp_df),
    'acc_disp_ratio': get_pareto_hps(acc_disp_ratio_df),
    'rank': get_pareto_hps(rank_df),
    'rank_ratio': get_pareto_hps(rank_ratio_df),
    'error_ratio': get_pareto_hps(error_ratio_df, col='Error'),
}

# Rows are metrics, columns are optimizers; printed as a markdown table.
print(
    pd.DataFrame.from_dict(
        hp_configs,
        orient='index',
        columns=['adamw', 'sgd'],
    ).to_markdown()
)

|                | adamw             | sgd                 |
|:---------------|:------------------|:--------------------|
| acc_disp       | dpn107            | rexnet_200          |
|                | gluon_xception65  |                     |
|                | tnt_s_patch16_224 |                     |
| acc_disp_ratio | dpn107            | rexnet_200          |
|                | gluon_xception65  |                     |
|                | tnt_s_patch16_224 |                     |
| rank           | cspdarknet53      | dla102x2            |
|                | dpn107            | gluon_inception_v3  |
|                | gluon_xception65  | inception_resnet_v2 |
|                | tnt_s_patch16_224 | resnetrs101         |
|                |                   | rexnet_200          |
| rank_ratio     | cspdarknet53      | dla102x2            |
|                | dpn107            | ese_vovnet39b       |
|                | gluon_xception65  | hrnet_w64           |
|                | tnt_s

# For each {head,opt} what architectures are Pareto optimal?

In [22]:
# Labels used to build the '<opt><head>' column names, optimizer-major.
pareto_opts = ['adamw', 'sgd']
pareto_heads = ['ArcFace', 'CosFace', 'MagFace']

# One get_pareto_hps result per fairness metric, keyed by metric name. The
# error-ratio frame uses col='Error' because its accuracy column was renamed.
# NOTE(review): these calls carry no head/optimizer argument and are identical to
# the calls in the other Pareto cells, whose tables have different columns. How
# get_pareto_hps grouped by {opt, head} for this run is not visible here —
# confirm before re-running.
hp_configs = {
    'acc_disp': get_pareto_hps(acc_disp_df),
    'acc_disp_ratio': get_pareto_hps(acc_disp_ratio_df),
    'rank': get_pareto_hps(rank_df),
    'rank_ratio': get_pareto_hps(rank_ratio_df),
    'error_ratio': get_pareto_hps(error_ratio_df, col='Error'),
}

df = pd.DataFrame.from_dict(
    hp_configs,
    orient='index',
    columns=[f'{opt}{head}' for opt in pareto_opts for head in pareto_heads],
)

# The six-column table is printed as two markdown tables, AdamW first and then
# SGD, separated by a blank line.
print(df[[f'adamw{head}' for head in pareto_heads]].to_markdown())
print()
print(df[[f'sgd{head}' for head in pareto_heads]].to_markdown())

|                | adamwArcFace        | adamwCosFace      | adamwMagFace        |
|:---------------|:--------------------|:------------------|:--------------------|
| acc_disp       | dpn107              | dpn107            | ese_vovnet39b       |
|                | ese_vovnet39b       | tnt_s_patch16_224 | gluon_xception65    |
|                | xception65          |                   |                     |
| acc_disp_ratio | dpn107              | dpn107            | ese_vovnet39b       |
|                | ese_vovnet39b       | tnt_s_patch16_224 | gluon_xception65    |
| rank           | dpn107              | cspdarknet53      | ese_vovnet39b       |
|                | ese_vovnet39b       | dpn107            | gluon_xception65    |
|                | ig_resnext101_32x8d | tnt_s_patch16_224 | legacy_senet154     |
|                | xception65          | xception65        | resnetrs101         |
| rank_ratio     | dpn107              | cspdarknet53      | ese_vovnet39b       |
|   

# Impact of Head and Opt on Accuracy

We see that both the head and the optimizer impact accuracy: CosFace and MagFace are better than ArcFace, and SGD is better than AdamW.

In [45]:
# Head/optimizer analysis of accuracy: the ANOVA table (head, opt, Residual) and the
# two Tukey HSD comparison tables in this cell's output come from this call.
# The return value is discarded by binding it to `_`.
_ = anova_hp_accuracy(acc_disp_df)

             df     sum_sq   mean_sq          F    PR(>F)
head        2.0   0.785199  0.392599   9.091594  0.000129
opt         2.0   1.020655  0.510327  11.817872  0.000009
Residual  585.0  25.261865  0.043183        NaN       NaN
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1  group2 meandiff p-adj   lower  upper  reject
-----------------------------------------------------
ArcFace CosFace   0.0911  0.001  0.0399 0.1424   True
ArcFace MagFace   0.0559 0.0318  0.0039 0.1079   True
CosFace MagFace  -0.0353 0.1995 -0.0836  0.013  False
-----------------------------------------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower  upper  reject
--------------------------------------------------
  adam  adamw  -0.0804 0.6585 -0.303 0.1422  False
  adam    sgd   0.0086    0.9 -0.214 0.2312  False
 adamw    sgd    0.089  0.001 0.0482 0.1298   True
--------------------------------------------------


# Impact of Head and Opt on Disparity

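For **statistical parity**, we see that both the head and the optimizer impact disparity. MagFace is better than CosFace, which is better than ArcFace. SGD is better than AdamW, which is better than Adam. (Note that in the Tukey HSD output below, CosFace vs. MagFace is the only head pair whose difference is significant.)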

In [48]:
# Model metadata table; the path is relative to the notebook's working directory.
# NOTE(review): `meta` is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
meta = pd.read_csv('../timm_model_metadata.csv')

# Head/optimizer analysis of the accuracy-disparity frame: the ANOVA table and the
# two Tukey HSD comparison tables in this cell's output come from this call.
# The return value is discarded by binding it to `_`.
_ = anova_hp_disp(acc_disp_df)

             df    sum_sq   mean_sq          F        PR(>F)
head        2.0  0.012555  0.006277   4.299561  1.400603e-02
opt         2.0  0.042935  0.021468  14.703575  5.883480e-07
Residual  585.0  0.854112  0.001460        NaN           NaN
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1  group2 meandiff p-adj   lower  upper  reject
-----------------------------------------------------
ArcFace CosFace   -0.009 0.0647 -0.0185 0.0004  False
ArcFace MagFace   0.0009    0.9 -0.0088 0.0105  False
CosFace MagFace   0.0099 0.0253   0.001 0.0188   True
-----------------------------------------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
  adam  adamw    -0.05 0.0114 -0.0907 -0.0092   True
  adam    sgd  -0.0635  0.001 -0.1042 -0.0228   True
 adamw    sgd  -0.0135  0.001  -0.021 -0.0061   True
----------------------------------------------------


For the **ratio of accuracies**, we see that only the optimizer impacts disparity. Both Adam and AdamW are better than SGD.

In [49]:
# Head/optimizer analysis of the accuracy-ratio frame: the ANOVA table and the two
# Tukey HSD comparison tables in this cell's output come from this call.
# The return value is discarded by binding it to `_`.
_ = anova_hp_disp(acc_disp_ratio_df)

             df    sum_sq   mean_sq          F        PR(>F)
head        2.0  0.028170  0.014085   1.403141  2.466498e-01
opt         2.0  0.401753  0.200877  20.010931  3.924120e-09
Residual  585.0  5.872433  0.010038        NaN           NaN
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1  group2 meandiff p-adj   lower  upper  reject
-----------------------------------------------------
ArcFace CosFace  -0.0118 0.5085 -0.0368 0.0132  False
ArcFace MagFace   0.0038    0.9 -0.0216 0.0292  False
CosFace MagFace   0.0157 0.2645 -0.0079 0.0392  False
-----------------------------------------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
  adam  adamw  -0.0896 0.1172 -0.1958  0.0165  False
  adam    sgd  -0.1382 0.0066 -0.2444  -0.032   True
 adamw    sgd  -0.0485  0.001  -0.068 -0.0291   True
----------------------------------------------------


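For **rank disparity**, we see that only the optimizer impacts disparity. Both Adam and AdamW are better than SGD. (Note that in the Tukey HSD output below, AdamW vs. SGD is the only optimizer pair whose difference is significant.)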

In [50]:
# Head/optimizer analysis of the rank-disparity frame: the ANOVA table and the two
# Tukey HSD comparison tables in this cell's output come from this call.
# The return value is discarded by binding it to `_`.
_ = anova_hp_disp(rank_df)

             df       sum_sq    mean_sq          F    PR(>F)
head        2.0    24.466701  12.233351   1.586037  0.205614
opt         2.0   160.108387  80.054193  10.378918  0.000037
Residual  585.0  4512.195143   7.713154        NaN       NaN
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1  group2 meandiff p-adj   lower  upper  reject
-----------------------------------------------------
ArcFace CosFace    0.509 0.1871 -0.1737 1.1918  False
ArcFace MagFace   0.3073 0.5481 -0.3861 1.0006  False
CosFace MagFace  -0.2018 0.7225 -0.8453 0.4418  False
-----------------------------------------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
  adam  adamw  -0.6468  0.849 -3.5956  2.3019  False
  adam    sgd  -1.6434 0.3919 -4.5926  1.3057  False
 adamw    sgd  -0.9966  0.001 -1.5373 -0.4559   True
----------------------------------------------------


For the **ratio of ranks**, we see that only the head impacts disparity. MagFace is better than ArcFace and CosFace.

In [51]:
# Head/optimizer analysis of the rank-ratio frame: the ANOVA table and the two
# Tukey HSD comparison tables in this cell's output come from this call.
# The return value is discarded by binding it to `_`.
_ = anova_hp_disp(rank_ratio_df)

             df     sum_sq   mean_sq          F        PR(>F)
head        2.0   0.654412  0.327206  18.190434  2.167263e-08
opt         2.0   0.092198  0.046099   2.562782  7.795529e-02
Residual  585.0  10.522859  0.017988        NaN           NaN
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1  group2 meandiff p-adj   lower  upper  reject
-----------------------------------------------------
ArcFace CosFace  -0.0321 0.0542 -0.0646 0.0004  False
ArcFace MagFace   0.0462 0.0031  0.0132 0.0793   True
CosFace MagFace   0.0783  0.001  0.0476  0.109   True
-----------------------------------------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
  adam  adamw  -0.0953 0.2787 -0.2419 0.0512  False
  adam    sgd  -0.0947 0.2835 -0.2413 0.0519  False
 adamw    sgd   0.0006    0.9 -0.0262 0.0275  False
---------------------------------------------------


For **ratio of errors**, we see that both the head and optimizer impact disparity. Adam is better than both AdamW and SGD. CosFace is better than both ArcFace and MagFace.

In [52]:
# Head/optimizer analysis of the error-ratio frame: the ANOVA table and the two
# Tukey HSD comparison tables in this cell's output come from this call.
# col='Error' is passed because this frame's 'Accuracy' column was renamed to 'Error'.
# The return value is discarded by binding it to `_`.
_ = anova_hp_disp(error_ratio_df, col='Error')

             df    sum_sq   mean_sq          F        PR(>F)
head        2.0  0.518772  0.259386  23.276316  1.876281e-10
opt         2.0  0.104933  0.052466   4.708133  9.366246e-03
Residual  585.0  6.519102  0.011144        NaN           NaN
 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1  group2 meandiff p-adj   lower   upper  reject
------------------------------------------------------
ArcFace CosFace   0.0718  0.001  0.0461  0.0975   True
ArcFace MagFace   0.0253 0.0602 -0.0008  0.0514  False
CosFace MagFace  -0.0465  0.001 -0.0707 -0.0223   True
------------------------------------------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
  adam  adamw  -0.1803  0.001 -0.2958 -0.0649   True
  adam    sgd  -0.1668 0.0021 -0.2822 -0.0513   True
 adamw    sgd   0.0136 0.2896 -0.0076  0.0347  False
----------------------------------------------------
