In [1]:
import pandas as pd
import numpy as np
import os
from analysis import *

In [None]:
def stat_parity_func(df, epoch_columns):
    """Return the per-epoch statistical-parity gap between 'male' and 'female' rows.

    For every group of ``df['gender_expression']`` the column sums of
    ``epoch_columns`` are divided by the number of rows in that group; the
    result is the 'male' rate minus the 'female' rate (a Series indexed by
    ``epoch_columns``).

    Raises ``KeyError`` if either the 'male' or the 'female' group is absent.
    """
    rate_by_group = {
        group: rows[epoch_columns].sum(axis=0) / rows.shape[0]
        for group, rows in df.groupby('gender_expression')
    }
    return rate_by_group['male'] - rate_by_group['female']

def acc_func(df, epoch_columns):
    """Return, for each column in ``epoch_columns``, its sum divided by the row count of ``df``.

    With boolean "prediction is correct" columns this is the per-epoch accuracy.
    The result is a Series indexed by ``epoch_columns``.
    """
    hits_per_epoch = df[epoch_columns].sum(axis=0)
    n_rows = df.shape[0]
    return hits_per_epoch / n_rows

In [None]:
def get_name_details(f):
    """Split a file name of the form ``<prefix>_<model>_<head>_<opt>.<ext>`` into its parts.

    The directory and extension are stripped, the first '_'-separated token is
    discarded, and the remainder is returned as
    ``(experiment, model, head, opt)`` where ``head`` and ``opt`` are the last
    two tokens and ``model`` is everything before them (it may itself contain
    underscores).

    Raises ``IndexError`` when fewer than two tokens remain after dropping the prefix.
    """
    stem = os.path.splitext(os.path.basename(f))[0]
    # Drop the leading token, keep the rest as the experiment name.
    parts = stem.split('_')[1:]
    experiment = '_'.join(parts)
    head, opt = parts[-2], parts[-1]
    model = '_'.join(parts[:-2])
    return experiment, model, head, opt

In [None]:
import plotly.express as px

def analyze_files(files):
    """Build per-experiment accuracy and accuracy-disparity tables from prediction CSVs.

    For each path in ``files`` the CSV is read, merged with the notebook-global
    ``metadata`` frame, and every ``epoch_<i>`` column is turned into a boolean
    "value equals the row's 'label'" mask. ``acc_func`` and ``stat_parity_func``
    are then evaluated on those masks and stored as one row per experiment.

    The row label is the second '/'-separated component of the file's directory
    (``os.path.dirname(f).split('/')[1]``), so paths need at least two directory
    levels. Both returned frames are created with the fixed columns
    ``epoch_0`` .. ``epoch_99``.

    Returns ``(acc_df, acc_disp_df)``.
    """
    all_epoch_names = ['epoch_'+str(e) for e in range(100)]
    acc_df = pd.DataFrame(columns=all_epoch_names)
    acc_disp_df = pd.DataFrame(columns=list(all_epoch_names))
    for path in files:
        preds = pd.read_csv(path)
        # every column except 'ids' is counted as one epoch of predictions
        num_epochs = len(preds.drop('ids',axis=1).columns)
        merged = metadata.merge(preds)
        epoch_columns = ['epoch_'+str(e) for e in range(num_epochs)]
        merged[epoch_columns] = merged[epoch_columns].apply(lambda col: col == merged['label'])
        acc = acc_func(merged, epoch_columns)
        experiment = os.path.dirname(path).split('/')[1]
        acc_df.loc[experiment] = acc
        acc_disp_df.loc[experiment] = stat_parity_func(merged, epoch_columns)
    return acc_df, acc_disp_df

def plot_df(df):
    """Draw one line per row of ``df`` (value against epoch number) with plotly express.

    ``df`` is expected to have ``epoch_<n>`` columns; its index becomes the
    line colour. The figure is shown and nothing is returned.
    """
    # wide -> long: one record per (index, epoch) pair
    long_df = pd.melt(df.reset_index(), id_vars='index').rename(columns={'variable':'epoch'})
    # 'epoch_17' -> 17 so the x axis is numeric
    long_df['epoch'] = long_df['epoch'].apply(lambda label: int(label.split('_')[1]))

    fig = px.line(long_df, x='epoch', y='value', color='index')
    fig.show()

In [None]:
# Load the identity metadata and one experiment's per-epoch predictions, then join them.
# `metadata` is also read as a global by analyze_files(), so this cell has to run first.
# NOTE(review): both paths are relative to the notebook's working directory.
metadata = pd.read_csv('../../../../rhea/FR-NAS/Checkpoints/test_identities_gender-expression_seed_222.csv')
df = pd.read_csv('fbnetv3_g_CosFace_Adam/fbnetv3_g_CosFace_Adam_kacc.csv')
# merge() without `on=` joins on the columns the two frames have in common
df = metadata.merge(df)

In [None]:
# Column labels of the current `df` with 'ids' removed (every other column is kept).
epochs = df.drop('ids',axis=1).columns

In [None]:
# Number of epoch_<i> columns to score; hard-coded for this particular run.
num_epochs = 45
# Overwrite each epoch_<i> column with a boolean mask: True where the stored value equals the row's 'label'.
# NOTE: not idempotent — running this cell a second time compares the booleans against 'label'.
df[['epoch_'+str(e) for e in range(num_epochs)]] = df[['epoch_'+str(e) for e in range(num_epochs)]].apply(lambda x: x == df['label'])

In [None]:
# For each gender_expression group: sum of every epoch_<i> column divided by the group's row count,
# stored as one NumPy array (length num_epochs) per group.
data = {}
for g,g_df in df.groupby('gender_expression'):
    data[g] = np.array([g_df[e].sum() for e in ['epoch_'+str(e) for e in range(num_epochs)]])/g_df.shape[0]

In [None]:
# Load the tab-separated log stored in the same experiment folder as the predictions.
save = pd.read_csv('fbnetv3_g_CosFace_Adam/timm_from-scratch.csv', sep='\t')

In [None]:
# Recomputed 'male' per-epoch rate as a percentage, rounded to 3 decimals.
np.round(data['male']*100,3)

In [None]:
# Compare the logged 'Acc k female' column with the recomputed female percentages.
# This is an exact float comparison against a NumPy array, so (when the lengths match)
# the result is an element-wise boolean array rather than a single True/False.
list(save['Acc k female']) == np.round(data['female']*100,3)

In [None]:
# Display the loaded log for visual inspection.
save

# Use this to clean up folders when training stops before a checkpoint has been made

In [None]:
def clean_up_dir(d, root='./Phase1B/'):
    """Report whether experiment folder ``d`` contains a checkpoint from epoch >= 100.

    Despite the name, nothing on disk is modified: the actual clean-up steps are
    kept below only as commented-out code.

    Parameters
    ----------
    d : str
        Name of the experiment folder inside ``root``.
    root : str, optional
        Directory that holds the experiment folders. Defaults to './Phase1B/'.

    Returns
    -------
    bool
        True when a file whose name starts with "Checkpoint" carries an epoch
        number >= 100; False otherwise (hidden or empty folder name, no
        checkpoint files, or no parsable epoch number).

    Notes
    -----
    The epoch number is read from the last eight characters of the file name
    only (e.g. '_100.pth'), so numbers longer than four digits would be cut off.
    """
    # skip folders like '.ipynb_checkpoints' '.git' etc (and empty names)
    if not d or d.startswith('.'):
        return False

    name = "Checkpoint"

    potential_checkpoints = [chckpt for chckpt in os.listdir(os.path.join(root, d)) if chckpt.startswith(name)]
    print('Found checkpoints for this model:', len(potential_checkpoints))
    if len(potential_checkpoints) > 6:
        print(d)

    # Collect every numeric token found in the tail of each checkpoint name into
    # ONE flat list. A name that yields no number (or several) therefore cannot
    # produce a ragged nested list.
    epoch_numbers = [
        int(token)
        for chckpt in potential_checkpoints
        for token in chckpt[-8:].replace('.', '_').split('_')
        if token.isdigit()
    ]

    # nothing was saved (or nothing parsable): training has to start all over
    if not epoch_numbers:
        return False

    max_epoch = max(epoch_numbers)

    # if the max_epoch is 100, then training is done for this routine and we can return True
    if max_epoch >= 100:
        return True

#         ## now remove all things that were saved in the log files after the max_epoch
#         files = [f for f in os.listdir('./Phase1B/'+d) if not f.startswith('.')]
#         # make adjustments to _multi.csv and _kacc.csv files
#         for f in [f for f in files if '_kacc' in f or '_multi' in f]:
#             df = pd.read_csv('/'.join(['.',d,f]))
#             df = df[['ids']+['epoch_'+str(i) for i in range(max_epoch)]]
#             df.to_csv('/'.join(['.',d,f]), index=False)
#         # make adjustments to _multi.csv and _kacc.csv files
#         for f in [f for f in files if 'timm_from-scratch' in f]:
#             df = pd.read_csv('/'.join(['.',d,f]), sep='\t')
#             df = df[df.epoch < max_epoch]
#             df.to_csv('/'.join(['.',d,f]), sep='\t', index=False)

    # (disabled) when nothing was saved: move the files up a directory and then delete manually
#         for f in files:
#             os.rename('/'.join(['.',d,f]), f)
    return False

In [None]:
import os
import numpy as np
# Take only the first os.walk() entry: the immediate sub-directories of ./Phase1B/.
root, dirs, files = next(os.walk(r'./Phase1B/'))
not_finished = []
finished = []
# Sort every experiment folder by the boolean returned from clean_up_dir():
# True -> finished, False -> not_finished.
for d in dirs:
    if clean_up_dir(d):
        finished += [d]
    else:
        not_finished += [d]
print(finished)
print(not_finished)

In [None]:
# Write the commands for unfinished experiments into a new shell script.
# remove .ipynb_checkpoints (and any other dot-folder)
not_finished = [x for x in not_finished if x[0]!='.']
# Keep every line of the source script that mentions an unfinished folder name.
# NOTE: this is a plain substring test, so a folder name that is a prefix of another
# (e.g. 'vgg19' vs 'vgg19_bn') also selects the longer one's commands.
with open('../phase1biii.sh') as f:
    cmds = [cmd for cmd in f if any([x in cmd for x in not_finished])]
# NOTE(review): input is 'phase1biii.sh' but output is 'phase1bii_finish.sh' — confirm the names are intended.
with open('../phase1bii_finish.sh', 'w') as f:
    f.writelines(cmds)

In [48]:
# get_finished_models_Phase1B is not defined in this notebook; presumably it comes from
# `from analysis import *` — TODO confirm. The cells below iterate over its result.
finished_models = get_finished_models_Phase1B()

In [37]:
# which experiments don't have completed rank files
# Relies on names not defined in this notebook (glob, find_yaml_folder, analyze_rank_files_np) —
# presumably provided by `from analysis import *`; TODO confirm — and on the globals
# `finished_models` and `metadata` set by earlier cells.
not_done = []
for model in finished_models:
    # every config file of this model, from both config trees
    for x in glob.glob('../configs/'+model+'/*') + glob.glob('../configs_multi/'+model+'/*') :
        # if a rank_by_id_val file exists
        if glob.glob(find_yaml_folder(x)+'/*_rank_by_id_val*'):
            r = glob.glob(find_yaml_folder(x)+'/*_rank_by_id_val*')[0]
            # only the checkpoint epochs 19/39/59/79/99 are requested
            acc_df, acc_disp_df, _ = analyze_rank_files_np([r], metadata, epochs=['epoch_'+str(i) for i  in [19,39,59,79,99]])
            # but it isn't complete
            # (non-zero NaN count in the first row of acc_df)
            # NOTE(review): `[0]` on the Series is positional only if its index is not integer-labelled — confirm.
            if acc_df.isna().sum(axis=1)[0]:
                not_done += [x]
#                 print(x)
        # rank_by_id file doesn't even exist 
        else:
            not_done += [x]
#             print(x)

# Print a re-run command for every incomplete experiment that lives under a 'Phase1B' folder
# and already has an 'Epoch_100' .pth checkpoint.
for y in not_done:
    experiment_folder = find_yaml_folder(y)
    if 'Phase1B' in experiment_folder:
#         if not [x for x in glob.glob(experiment_folder+'/*.pth') if 'Epoch_100' in x]:
#             print(y)
#             print(glob.glob(experiment_folder+'/*.pth'))
#             print()
        if [x for x in glob.glob(experiment_folder+'/*.pth') if 'Epoch_100' in x]:            
            print('python src/fairness_test_timm.py --config_path '+y.replace('../',''))

python src/fairness_test_timm.py --config_path configs_multi/inception_v4/config_inception_v4_CosFace_AdamW.yaml
python src/fairness_test_timm.py --config_path configs_multi/legacy_senet154/config_legacy_senet154_ArcFace_SGD.yaml
python src/fairness_test_timm.py --config_path configs_multi/rexnet_200/config_rexnet_200_CosFace_AdamW.yaml
python src/fairness_test_timm.py --config_path configs_multi/selecsls60b/config_selecsls60b_MagFace_AdamW.yaml
python src/fairness_test_timm.py --config_path configs_multi/tnt_s_patch16_224/config_tnt_s_patch16_224_ArcFace_SGD.yaml
python src/fairness_test_timm.py --config_path configs_multi/vgg19/config_vgg19_ArcFace_SGD.yaml
python src/fairness_test_timm.py --config_path configs_multi/vgg19/config_vgg19_ArcFace_AdamW.yaml
python src/fairness_test_timm.py --config_path configs_multi/vgg19_bn/config_vgg19_bn_ArcFace_AdamW.yaml
python src/fairness_test_timm.py --config_path configs_multi/vgg19_bn/config_vgg19_bn_ArcFace_SGD.yaml
python src/fairness_test_

In [47]:
# List the entries under ../configs_unified_lr/<model>/.
# NOTE: `model` is not set in this cell; it is whatever value an earlier cell left in the kernel.
glob.glob('../configs_unified_lr/'+model+'/*')

['../configs_unified_lr/resnetrs101',
 '../configs_unified_lr/tf_efficientnet_b7_ns',
 '../configs_unified_lr/jx_nest_base',
 '../configs_unified_lr/xcit_medium_24_p8_224_dist',
 '../configs_unified_lr/twins_svt_large',
 '../configs_unified_lr/xception',
 '../configs_unified_lr/selecsls60b',
 '../configs_unified_lr/cspdarknet53',
 '../configs_unified_lr/convit_base',
 '../configs_unified_lr/swin_base_patch4_window7_224',
 '../configs_unified_lr/coat_lite_small',
 '../configs_unified_lr/dpn107',
 '../configs_unified_lr/ghostnet_100',
 '../configs_unified_lr/vit_large_patch16_224',
 '../configs_unified_lr/xcit_large_24_p8_384_dist',
 '../configs_unified_lr/gluon_xception65',
 '../configs_unified_lr/vgg19_bn',
 '../configs_unified_lr/resmlp_big_24_224_in22ft1k',
 '../configs_unified_lr/ig_resnext101_32x8d',
 '../configs_unified_lr/xception65',
 '../configs_unified_lr/visformer_small',
 '../configs_unified_lr/fbnetv3_g',
 '../configs_unified_lr/vgg19',
 '../configs_unified_lr/inception_v4'

In [43]:
# which experiments don't have completed rank files
# Same procedure as the earlier configs/configs_multi cell, but over ../configs_unified_lr/
# and printing the config path instead of a command. It overwrites `not_done`.
# Relies on glob, find_yaml_folder and analyze_rank_files_np, none of which are defined in
# this notebook (presumably from `from analysis import *` — TODO confirm).
not_done = []
for model in finished_models:
    for x in glob.glob('../configs_unified_lr/'+model+'/*') :
        # if a rank_by_id_val file exists
        if glob.glob(find_yaml_folder(x)+'/*_rank_by_id_val*'):
            r = glob.glob(find_yaml_folder(x)+'/*_rank_by_id_val*')[0]
            # only the checkpoint epochs 19/39/59/79/99 are requested
            acc_df, acc_disp_df, _ = analyze_rank_files_np([r], metadata, epochs=['epoch_'+str(i) for i  in [19,39,59,79,99]])
            # but it isn't complete
            # (non-zero NaN count in the first row of acc_df)
            if acc_df.isna().sum(axis=1)[0]:
                not_done += [x]
#                 print(x)
        # rank_by_id file doesn't even exist 
        else:
            not_done += [x]
#             print(x)

# Print the config path of every incomplete experiment under a 'Phase1B' folder
# that already has an 'Epoch_100' .pth checkpoint.
for y in not_done:
    experiment_folder = find_yaml_folder(y)
    if 'Phase1B' in experiment_folder:
#         if not [x for x in glob.glob(experiment_folder+'/*.pth') if 'Epoch_100' in x]:
#             print(y)
        if [x for x in glob.glob(experiment_folder+'/*.pth') if 'Epoch_100' in x]:            
#             print('python src/fairness_test_timm.py --config_path '+y.replace('../',''))
            print(y)

../configs_unified_lr/coat_lite_small/config_coat_lite_small_MagFace_AdamW_0.001_cosine.yaml
../configs_unified_lr/coat_lite_small/config_coat_lite_small_ArcFace_SGD_0.1_cosine.yaml
../configs_unified_lr/coat_lite_small/config_coat_lite_small_CosFace_SGD_0.1_cosine.yaml
../configs_unified_lr/coat_lite_small/config_coat_lite_small_ArcFace_AdamW_0.001_cosine.yaml
../configs_unified_lr/cspdarknet53/config_cspdarknet53_CosFace_SGD_0.1_cosine.yaml
../configs_unified_lr/cspdarknet53/config_cspdarknet53_CosFace_AdamW_0.001_cosine.yaml
../configs_unified_lr/dpn107/config_dpn107_MagFace_SGD_0.1_cosine.yaml
../configs_unified_lr/dpn107/config_dpn107_MagFace_AdamW_0.001_cosine.yaml
../configs_unified_lr/ese_vovnet39b/config_ese_vovnet39b_CosFace_AdamW_0.001_cosine.yaml
../configs_unified_lr/ese_vovnet39b/config_ese_vovnet39b_CosFace_SGD_0.1_cosine.yaml
../configs_unified_lr/ese_vovnet39b/config_ese_vovnet39b_ArcFace_AdamW_0.001_cosine.yaml
../configs_unified_lr/fbnetv3_g/config_fbnetv3_g_ArcFace_

In [None]:
# Count how many commands in phase1biv.sh (for finished models) already have an 'Epoch_101' checkpoint.
with open('../phase1biv.sh') as f:
    cmds = [x.strip() for x in f.readlines()]
        
# Last space-separated token of each command line that mentions a finished model
# (substring match); presumably the --config_path value — verify against the script.
yamls = [x.split(' ')[-1] for x in cmds if any([y in x for y in finished_models])]

c=0
for yaml in yamls:
    yaml_folder = find_yaml_folder(yaml)
    # a falsy result from find_yaml_folder is skipped
    if yaml_folder:
        if any([x for x in glob.glob(os.path.join(yaml_folder,'*.pth')) if 'Epoch_101' in x]):
            c+=1
print(c)

In [None]:
# Total number of selected commands, for comparison with the count printed above.
len(yamls)

# Some models don't converge, so we stop them early
The code below fixes their data by padding the logs and checkpoints out to the full 101 epochs.

In [1]:
def fix_by_row(f, total_rows=101):
    """Pad a tab-separated per-epoch log to ``total_rows`` rows by repeating its last row.

    The file is rewritten in place. After padding, the 'epoch' column is
    overwritten with ``0 .. total_rows-1``.

    Parameters
    ----------
    f : str or path-like
        Tab-separated file with one row per epoch.
    total_rows : int, optional
        Number of rows the file should have afterwards. Defaults to 101.

    Raises
    ------
    ValueError
        If the file has no data rows (nothing to repeat) or already has more
        than ``total_rows`` rows.
    """
    df = pd.read_csv(f, sep='\t')
    n_rows = df.shape[0]
    if n_rows == 0:
        raise ValueError('cannot pad %r: it contains no rows to repeat' % (f,))
    if n_rows > total_rows:
        raise ValueError('%r already has %d rows, more than the requested %d'
                         % (f, n_rows, total_rows))
    if n_rows < total_rows:
        # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
        padding = df.iloc[[-1] * (total_rows - n_rows)]
        df = pd.concat([df, padding], ignore_index=True)
    df['epoch'] = range(total_rows)
    df.to_csv(f, sep='\t', index=False)
    
def fix_by_column(f, total_epochs=101):
    """Pad a per-epoch CSV to ``total_epochs`` epoch columns by repeating the latest epoch.

    The file is rewritten in place. Columns named ``epoch_<n>`` are located by
    name, so the latest epoch is found even when it is not the last column.
    The missing columns ``epoch_<max+1>`` .. ``epoch_<total_epochs-1>`` are
    appended at the end, each a copy of the latest existing epoch column.

    Parameters
    ----------
    f : str or path-like
        Comma-separated file with ``epoch_<n>`` columns.
    total_epochs : int, optional
        Number of epoch columns (0-based) the file should have. Defaults to 101.

    Raises
    ------
    ValueError
        If the file has no ``epoch_<n>`` column.
    """
    df = pd.read_csv(f)

    # (epoch number, column label) for every column that looks like 'epoch_<digits>'
    epoch_cols = []
    for col in df.columns:
        prefix, _, suffix = str(col).partition('_')
        if prefix == 'epoch' and suffix.isdigit():
            epoch_cols.append((int(suffix), col))
    if not epoch_cols:
        raise ValueError('%r has no epoch_<n> columns to extend' % (f,))

    max_epoch, last_col = max(epoch_cols)
    missing = range(max_epoch + 1, total_epochs)
    if len(missing):
        # Build all padding columns at once instead of inserting them one by one.
        last_values = df[last_col]
        padding = pd.DataFrame({'epoch_' + str(i): last_values for i in missing}, index=df.index)
        df = pd.concat([df, padding], axis=1)
    df.to_csv(f, index=False)

In [2]:
import shutil
def copy_latest_epoch(yaml_folder, milestones=(20, 40, 60, 80, 100, 101)):
    """Copy the latest ``*Epoch_<n>.pth`` checkpoint to every later milestone epoch.

    Only file names (not the directory part of the path) are parsed and
    rewritten, and ``.pth`` files whose name does not end in
    ``Epoch_<digits>.pth`` are ignored.

    Parameters
    ----------
    yaml_folder : str
        Folder containing the ``.pth`` checkpoints.
    milestones : iterable of int, optional
        Epoch numbers for which a checkpoint file should exist. A copy is made
        for every milestone greater than the latest available epoch.

    Raises
    ------
    ValueError
        If the folder contains no ``*Epoch_<n>.pth`` file.
    """
    # epoch number -> (full path, file-name prefix before 'Epoch_')
    checkpoints = {}
    for path in glob.glob(yaml_folder + '/*.pth'):
        stem = os.path.basename(path)[:-len('.pth')]
        prefix, marker, tag = stem.rpartition('Epoch_')
        if marker and tag.isdigit():
            checkpoints.setdefault(int(tag), (path, prefix))
    if not checkpoints:
        raise ValueError('no *Epoch_<n>.pth checkpoint found in %r' % (yaml_folder,))

    latest = max(checkpoints)
    e_file, prefix = checkpoints[latest]
    folder = os.path.dirname(e_file)
    for new_e in milestones:
        if latest < new_e:
            new_e_file = os.path.join(folder, prefix + 'Epoch_' + str(new_e) + '.pth')
            shutil.copy(e_file, new_e_file)

In [49]:
# Apply the early-stop fixes to one experiment, selected by hand through `yaml`.
# WARNING: this rewrites the CSV logs in place and creates extra .pth files.
yaml = 'tnt_s_patch16_224_ArcFace_SGD_0.1_cosine.yaml'
yaml_folder = find_yaml_folder(yaml)

# '*scratch*' logs are padded row-wise; every other CSV is padded column-wise.
# Files with 'Copy' in their path are left alone.
by_row = [x for x in glob.glob(yaml_folder+'/*scratch*.csv') if 'Copy' not in x]
by_column = [x for x in glob.glob(yaml_folder+'/*.csv') if 'Copy' not in x]
# the second glob also matched the row-wise logs, so take them out again
for g in by_row:
    by_column.remove(g)

for f in by_row:
    fix_by_row(f)
for f in by_column:
    fix_by_column(f)
copy_latest_epoch(yaml_folder)

In [62]:
# Load one experiment's kacc_default CSV for a manual column-order fix (this rebinds the global `df`).
df = pd.read_csv('Phase1B/twins_svt_large_MagFace_SGD_0.1_cosine/twins_svt_large_MagFace_SGD_0.1_cosine_kacc_default.csv')

In [63]:
# Rotate the column order by one: the first column is moved to the end.
# NOTE: not idempotent — every re-run of this cell rotates the columns again.
cols = list(df.columns)
cols = cols[1:] + [cols[0]]
df = df[cols]
df

Unnamed: 0,ids,epoch_0,epoch_1,epoch_2,epoch_3,epoch_4,epoch_5,epoch_6,epoch_7,epoch_8,...,epoch_91,epoch_92,epoch_93,epoch_94,epoch_95,epoch_96,epoch_97,epoch_98,epoch_99,epoch_100
0,0,265,59,59,59,59,99,787,59,787,...,787,787,787,787,787,787,787,787,787,787
1,1,569,59,59,59,59,99,787,59,787,...,787,787,787,787,787,787,787,787,787,787
2,2,406,59,59,59,59,99,787,59,787,...,787,787,787,787,787,787,787,787,787,787
3,3,763,59,59,59,59,59,787,59,787,...,787,787,787,787,787,787,787,787,787,787
4,4,278,59,59,59,59,99,787,59,787,...,787,787,787,787,787,787,787,787,787,787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15267,15267,617,59,59,787,59,787,59,59,59,...,59,59,59,59,59,59,59,59,59,59
15268,15268,449,59,59,787,59,787,59,59,59,...,59,59,59,59,59,59,59,59,59,59
15269,15269,807,59,59,787,59,787,59,59,59,...,59,59,59,59,59,59,59,59,59,59
15270,15270,172,59,59,787,59,787,59,59,59,...,59,59,59,59,59,59,59,59,59,59


In [64]:
# Overwrite the same CSV that was loaded above with the re-ordered columns (index not written).
df.to_csv('Phase1B/twins_svt_large_MagFace_SGD_0.1_cosine/twins_svt_large_MagFace_SGD_0.1_cosine_kacc_default.csv',index=False)

In [8]:
# Show the experiment folder currently held in `yaml_folder` (set by whichever cell ran last).
yaml_folder

'/cmlscratch/sdooley1/merge_timm/FR-NAS/Checkpoints/Phase1B/convit_base_MagFace_SGD_0.1_cosine'

In [None]:
# Write one fairness-test command per model into ../forMicah.sh.
# NOTE: `final_models` is not defined anywhere in this notebook view — it must already exist in the kernel.
with open('../forMicah.sh','w') as f:
    for m in final_models:
        # first .yaml found for the model; raises IndexError if the folder has none
        yaml = glob.glob('../configs/'+m+'/*.yaml')[0]
        # the leading '../' is stripped from the config path before it is written
        print('python src/fairness_test_random_timm.py --seed 231 --config_path '+yaml.replace('../',''),file=f)