In this notebook, I present the CDA finetuning debiasing results, in terms of pronoun score, for 6 neopronouns individually:
* dee
* dij
* vij
* nij
* zhij
* zem

In [1]:
import json
import matplotlib.pyplot as plt
import os

import pandas as pd

## scores on all specific neopronoun test sets before debiasing

In [5]:
def get_pronoun_results(model_name):
    with open(f'data/train_logs/{model_name}') as json_file:
        logs = json.load(json_file)
    
    dev_f1s = [epoch['sl_f1'] for epoch in logs['dev_eval']]
    
    best_epoch = dev_f1s.index(max(dev_f1s))
    best_epoch_results = logs['dev_eval'][best_epoch]
    if "finetune" in model_name:
        best_epoch = logs['epochs'] - 10 + best_epoch
        
    return [model_name.replace('.json',''),
            logs['epochs'], 
            logs['learning-rate'], 
            logs['bert-learning-rate'], 
            logs['seed'], 
            best_epoch + 1, 
            round(logs['dee_test_head.jsonlines_pronoun_score'],2),
            round(logs['dij_test_head.jsonlines_pronoun_score'],2),
            round(logs['vij_test_head.jsonlines_pronoun_score'],2),
            round(logs['nij_test_head.jsonlines_pronoun_score'],2),
            round(logs['zem_test_head.jsonlines_pronoun_score'],2),
            round(logs['zhij_test_head.jsonlines_pronoun_score'],2)
           ]

pronoun_col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'best_epoch', 'dee', 'dij', 'vij', 'nij', 'zem', 'zhij']
def get_pronoun_overview(setting):
    files = [file for file in os.listdir("data/train_logs/") if "xlm" in file and setting in file.lower()]
    results = []
    for file in files:
            results.append(get_pronoun_results(file))
    df = pd.DataFrame(results, columns=pronoun_col_names)
    return df

In [6]:
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_regular_248')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dee,dij,vij,nij,zem,zhij
0,xlm_regular_248,20,0.0005,3e-05,248,18,41.55,48.06,42.74,45.89,35.66,84.55


## Inspect DEV results, to find the best epoch, in order to store those weights

In [7]:
def get_training_results(model_name):
    with open(f'data/train_logs/{model_name}') as json_file:
        logs = json.load(json_file)
    dev_f1s = [epoch['sl_f1'] for epoch in logs['dev_eval']]
    
    best_epoch = dev_f1s.index(max(dev_f1s))
    best_epoch_results = logs['dev_eval'][best_epoch]
    best_epoch = logs['epochs'] - 10 + best_epoch
        
    return [model_name.replace('.json',''),
            logs['epochs'], 
            logs['learning-rate'], 
            logs['bert-learning-rate'], 
            logs['seed'], 
            best_epoch + 1, 
            round(best_epoch_results['wl_p'],4) * 100,
            round(best_epoch_results['wl_r'],4) * 100,
            round(best_epoch_results['wl_f1'],4) * 100,
            round(best_epoch_results['sl_p'],4) * 100,
            round(best_epoch_results['sl_r'],4) * 100,
            round(best_epoch_results['sl_f1'],4) * 100,
           ]

In [8]:
col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'best_epoch', 'wl P', 'wl R', 'wl F1','P', 'R', 'F1']
def get_training_overview(setting):
    files = [file for file in os.listdir("data/train_logs/") if "xlm" in file and setting in file.lower()]
    results = []
    for file in files:
        results.append(get_training_results(file))
    df = pd.DataFrame(results, columns=col_names)
    return df

#### dij

In [16]:
#0.625% of the training data
regular_dev = get_training_overview('xlm_dij_1_p')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_dij_1_percent_14,28,0.0005,3e-05,248,28,50.31,59.67,54.59,47.64,55.52,51.28
1,xlm_dij_1_percent_13,28,0.0005,3e-05,248,28,50.19,57.98,53.81,47.48,53.99,50.53
2,xlm_dij_1_percent_11,28,0.0005,3e-05,248,28,46.89,63.93,54.1,44.35,59.51,50.82
3,xlm_dij_1_percent_12,28,0.0005,3e-05,248,28,46.89,63.93,54.1,44.35,59.51,50.82
4,xlm_dij_1_percent_10,28,0.0005,3e-05,248,28,48.06,63.61,54.75,45.34,58.98,51.27


In [17]:
# 1.25% of the training data
regular_dev = get_training_overview('xlm_dij_2')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_dij_248,28,0.0005,3e-05,248,20,50.99,64.22,56.84,48.1,59.55,53.22
1,xlm_dij_2_percent_16,28,0.0005,3e-05,248,28,49.87,63.69,55.94,47.15,59.12,52.46
2,xlm_dij_2_percent_15,28,0.0005,3e-05,248,28,49.01,63.01,55.13,46.25,58.55,51.68
3,xlm_dij_2_percent_18,28,0.0005,3e-05,248,25,50.08,61.12,55.06,47.39,56.95,51.73
4,xlm_dij_2_percent_19,28,0.0005,3e-05,248,22,54.63,57.98,56.25,51.76,53.98,52.85
5,xlm_dij_2_percent_17,28,0.0005,3e-05,248,28,51.54,61.69,56.16,48.78,57.38,52.73


In [18]:
# 2.5% of the training data
regular_dev = get_training_overview('xlm_dij_3')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_dij_3_percent_20,28,0.0005,3e-05,248,25,50.72,62.74,56.09,48.24,58.55,52.89
1,xlm_dij_3_percent_23,28,0.0005,3e-05,248,23,53.3,60.61,56.72,50.3,56.31,53.14
2,xlm_dij_3_percent_21,28,0.0005,3e-05,248,25,51.93,62.08,56.56,48.98,57.42,52.86
3,xlm_dij_3_percent_24,28,0.0005,3e-05,248,27,50.52,61.06,55.3,47.83,56.99,52.01
4,xlm_dij_3_percent_22,28,0.0005,3e-05,248,26,50.21,62.96,55.87,47.59,58.62,52.54


In [19]:
# 10% of the training data
regular_dev = get_training_overview('xlm_dij_10')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_dij_10_percent_4,28,0.0005,3e-05,248,28,51.05,63.98,56.79,48.13,59.29,53.13
1,xlm_dij_10_percent_1,28,0.0005,3e-05,248,23,50.7,63.89,56.53,48.04,59.51,53.16
2,xlm_dij_10_percent_2,28,0.0005,3e-05,248,20,53.41,60.69,56.82,50.45,56.49,53.3
3,xlm_dij_10_percent_3,28,0.0005,3e-05,248,23,53.3,60.61,56.72,50.3,56.31,53.14
4,xlm_dij_10_percent_0,28,0.0005,3e-05,248,28,50.64,64.46,56.72,47.85,59.8,53.16


In [20]:
# full training data
regular_dev = get_training_overview('xlm_dij_248')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_dij_248,28,0.0005,3e-05,248,20,50.99,64.22,56.84,48.1,59.55,53.22


#### zem

In [21]:
#0.625% of the training data
regular_dev = get_training_overview('xlm_zem_0')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_zem_0_percent_2,28,0.0005,3e-05,248,20,50.32,53.77,51.99,47.62,49.99,48.77
1,xlm_zem_0_percent_4,28,0.0005,3e-05,248,24,51.52,52.39,51.95,48.74,48.77,48.76
2,xlm_zem_0_percent_0,28,0.0005,3e-05,248,28,48.87,56.05,52.22,46.13,51.99,48.88
3,xlm_zem_0_percent_1,28,0.0005,3e-05,248,28,49.22,55.0,51.95,46.66,51.19,48.82
4,xlm_zem_0_percent_3,28,0.0005,3e-05,248,28,49.95,56.55,53.04,47.11,52.29,49.57


In [23]:
# 1.25% of the training data
regular_dev = get_training_overview('xlm_zem_1_')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_zem_1_percent_6,28,0.0005,3e-05,248,23,50.9,52.86,51.87,48.21,49.12,48.66
1,xlm_zem_1_percent_5,28,0.0005,3e-05,248,28,47.27,58.74,52.39,44.68,54.59,49.14
2,xlm_zem_1_percent_9,28,0.0005,3e-05,248,28,50.09,55.66,52.73,47.0,51.37,49.09
3,xlm_zem_1_percent_8,28,0.0005,3e-05,248,28,49.04,57.71,53.02,46.42,53.46,49.69
4,xlm_zem_1_percent_7,28,0.0005,3e-05,248,24,49.76,55.94,52.67,47.16,52.03,49.47


In [25]:
# 2.5% of the training data
regular_dev = get_training_overview('xlm_zem_2_per')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_zem_2_percent_11,28,0.0005,3e-05,248,27,48.41,60.06,53.61,45.59,55.66,50.13
1,xlm_zem_2_percent_10,28,0.0005,3e-05,248,27,49.45,59.05,53.83,46.63,54.76,50.37
2,xlm_zem_2_percent_14,28,0.0005,3e-05,248,28,48.51,60.33,53.78,45.73,55.91,50.31
3,xlm_zem_2_percent_12,28,0.0005,3e-05,248,27,51.69,55.85,53.69,48.88,51.92,50.36
4,xlm_zem_2_percent_13,28,0.0005,3e-05,248,27,50.99,58.3,54.4,48.28,54.27,51.1


In [26]:
# 10% of the training data
regular_dev = get_training_overview('xlm_zem_10')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_zem_10_percent_2,28,0.0005,3e-05,248,28,49.51,63.65,55.7,46.58,59.05,52.08
1,xlm_zem_10_percent_0,28,0.0005,3e-05,248,27,49.45,62.31,55.14,46.76,57.85,51.71
2,xlm_zem_10_percent_1,28,0.0005,3e-05,248,26,49.57,63.32,55.61,46.69,58.78,52.04
3,xlm_zem_10_percent_4,28,0.0005,3e-05,248,27,50.23,61.73,55.39,47.29,57.07,51.72
4,xlm_zem_10_percent_3,28,0.0005,3e-05,248,28,51.29,61.32,55.86,48.26,56.8,52.18


In [27]:
# full debiasing set
regular_dev = get_training_overview('xlm_zem_248')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_zem_248,28,0.0005,3e-05,248,20,51.12,63.37,56.59,48.24,58.79,53.0


#### vij

In [28]:
# 10% debiasing data
regular_dev = get_training_overview('xlm_vij_10')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_vij_10_percent_4,28,0.0005,3e-05,248,27,50.44,63.15,56.09,47.56,58.41,52.43
1,xlm_vij_10_percent_1,28,0.0005,3e-05,248,28,49.89,64.66,56.32,46.93,60.01,52.67
2,xlm_vij_10_percent_3,28,0.0005,3e-05,248,28,50.75,64.24,56.7,47.79,59.47,52.99
3,xlm_vij_10_percent_2,28,0.0005,3e-05,248,26,50.67,63.86,56.51,47.81,59.26,52.92
4,xlm_vij_10_percent_0,28,0.0005,3e-05,248,26,50.59,64.01,56.52,47.78,59.41,52.97


In [30]:
# 2.5% debiasing data
regular_dev = get_training_overview('xlm_vij_2_pe')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_vij_2_percent_6,28,0.0005,3e-05,248,25,48.7,63.35,55.07,45.83,58.67,51.46
1,xlm_vij_2_percent_8,28,0.0005,3e-05,248,28,50.19,60.89,55.02,47.43,56.55,51.59
2,xlm_vij_2_percent_7,28,0.0005,3e-05,248,28,48.95,62.42,54.87,46.08,57.78,51.27
3,xlm_vij_2_percent_9,28,0.0005,3e-05,248,28,51.21,61.27,55.79,48.41,56.82,52.28
4,xlm_vij_2_percent_5,28,0.0005,3e-05,248,26,48.35,61.24,54.04,45.37,56.73,50.42


In [31]:
# 1.25% debiasing data
regular_dev = get_training_overview('xlm_vij_1_')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_vij_1_percent_12,28,0.0005,3e-05,248,28,46.68,64.48,54.16,43.87,59.69,50.57
1,xlm_vij_1_percent_13,28,0.0005,3e-05,248,28,49.62,60.18,54.39,46.87,55.9,50.99
2,xlm_vij_1_percent_11,28,0.0005,3e-05,248,24,48.62,57.68,52.76,45.82,53.57,49.39
3,xlm_vij_1_percent_14,28,0.0005,3e-05,248,26,47.13,63.28,54.02,44.21,58.58,50.39
4,xlm_vij_1_percent_10,28,0.0005,3e-05,248,28,48.87,58.33,53.18,46.12,54.15,49.82


In [32]:
# 0.625% debiasing data
regular_dev = get_training_overview('xlm_vij_0_')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_vij_0_percent_15,28,0.0005,3e-05,248,25,50.26,54.07,52.1,47.37,50.22,48.75
1,xlm_vij_0_percent_18,28,0.0005,3e-05,248,28,50.01,55.12,52.45,47.3,51.23,49.19
2,xlm_vij_0_percent_19,28,0.0005,3e-05,248,28,48.74,58.23,53.06,45.88,53.8,49.52
3,xlm_vij_0_percent_16,28,0.0005,3e-05,248,28,45.44,62.39,52.58,42.85,57.83,49.23
4,xlm_vij_0_percent_17,28,0.0005,3e-05,248,22,49.11,55.72,52.21,46.47,51.86,49.02


In [33]:
# full debiasing set
regular_dev = get_training_overview('xlm_vij_248')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_vij_248,28,0.0005,3e-05,248,20,51.45,63.97,57.03,48.46,59.31,53.34


#### nij

In [34]:
# 0.625% of the debiasing data
regular_dev = get_training_overview('xlm_nij_0_')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_nij_0_percent_20,28,0.0005,3e-05,248,28,50.36,60.21,54.85,47.66,55.94,51.47
1,xlm_nij_0_percent_24,28,0.0005,3e-05,248,28,47.58,59.81,53.0,45.0,55.65,49.76
2,xlm_nij_0_percent_23,28,0.0005,3e-05,248,28,48.98,58.49,53.31,46.33,54.29,50.0
3,xlm_nij_0_percent_22,28,0.0005,3e-05,248,28,49.74,59.18,54.05,46.96,54.86,50.6
4,xlm_nij_0_percent_21,28,0.0005,3e-05,248,28,51.28,58.28,54.56,48.21,54.14,51.0


In [35]:
# 1.25% of debiasing data
regular_dev = get_training_overview('xlm_nij_1_')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_nij_1_percent_14,28,0.0005,3e-05,248,23,48.24,64.05,55.03,45.42,59.47,51.51
1,xlm_nij_1_percent_12,28,0.0005,3e-05,248,24,48.37,64.89,55.43,45.7,60.33,52.0
2,xlm_nij_1_percent_10,28,0.0005,3e-05,248,24,49.59,61.53,54.92,46.89,57.19,51.53
3,xlm_nij_1_percent_13,28,0.0005,3e-05,248,28,49.75,63.48,55.78,47.06,58.97,52.35
4,xlm_nij_1_percent_11,28,0.0005,3e-05,248,24,50.29,59.72,54.6,47.47,55.58,51.2


In [36]:
# 2.5% debiasing data
regular_dev = get_training_overview('xlm_nij_2_')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_nij_2_percent_17,28,0.0005,3e-05,248,28,48.45,63.66,55.03,45.78,58.88,51.51
1,xlm_nij_2_percent_16,28,0.0005,3e-05,248,28,50.62,62.2,55.81,47.78,57.55,52.22
2,xlm_nij_2_percent_18,28,0.0005,3e-05,248,28,51.56,62.23,56.39,48.63,57.64,52.75
3,xlm_nij_2_percent_19,28,0.0005,3e-05,248,27,52.55,59.58,55.85,49.79,55.54,52.51
4,xlm_nij_2_percent_15,28,0.0005,3e-05,248,28,48.35,65.61,55.67,45.76,60.86,52.24


In [38]:
# 10% debiasing set
regular_dev = get_training_overview('xlm_nij_10_')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_nij_10_percent_8,28,0.0005,3e-05,248,28,51.08,63.74,56.71,48.19,59.17,53.12
1,xlm_nij_10_percent_7,28,0.0005,3e-05,248,22,50.94,61.81,55.85,48.03,57.2,52.21
2,xlm_nij_10_percent_9,28,0.0005,3e-05,248,27,50.52,63.99,56.47,47.77,59.47,52.98
3,xlm_nij_10_percent_6,28,0.0005,3e-05,248,25,50.58,63.44,56.29,47.8,58.89,52.77
4,xlm_nij_10_percent_5,28,0.0005,3e-05,248,28,50.47,63.35,56.18,47.87,58.96,52.84


In [39]:
# full debiasing set
regular_dev = get_training_overview('xlm_nij_248')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_nij_248,28,0.0005,3e-05,248,20,51.13,63.82,56.78,48.25,59.19,53.17
1,xlm_nij_248_2,28,0.0005,3e-05,248,20,51.13,63.82,56.78,48.25,59.19,53.17


#### dee

In [45]:
# 0.625% debiasing data
regular_dev = get_training_overview('xlm_dee_finetune_0')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_dee_finetune_0p_0,28,0.0005,3e-05,248,28,50.13,61.08,55.07,47.41,56.78,51.67
1,xlm_dee_finetune_0p_2,28,0.0005,3e-05,248,28,47.16,61.87,53.52,44.59,57.52,50.23
2,xlm_dee_finetune_0p_4,28,0.0005,3e-05,248,28,52.01,53.41,52.7,49.18,49.78,49.48
3,xlm_dee_finetune_0p_1,28,0.0005,3e-05,248,28,49.78,57.78,53.49,47.14,53.93,50.31
4,xlm_dee_finetune_0p_3,28,0.0005,3e-05,248,28,50.89,60.84,55.42,48.1,56.29,51.88


In [46]:
# 1.25% debiasing data
regular_dev = get_training_overview('xlm_dee_finetune_1')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_dee_finetune_1p_2,28,0.0005,3e-05,248,28,49.6,62.62,55.35,46.94,58.26,51.99
1,xlm_dee_finetune_1p_3,28,0.0005,3e-05,248,28,52.04,61.18,56.24,49.34,56.86,52.83
2,xlm_dee_finetune_1p_0,28,0.0005,3e-05,248,28,49.85,61.79,55.18,47.26,57.57,51.91
3,xlm_dee_finetune_1p_1,28,0.0005,3e-05,248,22,50.41,61.78,55.52,47.65,57.32,52.04
4,xlm_dee_finetune_1p_4,28,0.0005,3e-05,248,24,48.44,63.39,54.92,45.74,58.81,51.46


In [47]:
# 2.5% debiasing data
regular_dev = get_training_overview('xlm_dee_finetune_2')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_dee_finetune_2p_4,28,0.0005,3e-05,248,23,50.07,61.12,55.05,47.38,56.92,51.71
1,xlm_dee_finetune_2p_1,28,0.0005,3e-05,248,26,50.2,63.73,56.16,47.44,59.11,52.64
2,xlm_dee_finetune_2p_2,28,0.0005,3e-05,248,27,49.94,63.15,55.78,47.4,58.77,52.48
3,xlm_dee_finetune_2p_0,28,0.0005,3e-05,248,28,50.8,62.19,55.92,48.21,57.91,52.61
4,xlm_dee_finetune_2p_3,28,0.0005,3e-05,248,21,50.08,63.64,56.05,47.36,59.2,52.62


In [48]:
# 10% debiasing data
regular_dev = get_training_overview('xlm_dee_10_')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_dee_10_percent_1,28,0.0005,3e-05,248,26,50.21,64.95,56.64,47.4,60.33,53.09
1,xlm_dee_10_percent_0,28,0.0005,3e-05,248,28,50.68,64.47,56.75,47.93,59.88,53.24
2,xlm_dee_10_percent_3,28,0.0005,3e-05,248,23,52.9,60.86,56.6,49.95,56.58,53.06
3,xlm_dee_10_percent_2,28,0.0005,3e-05,248,28,50.41,64.52,56.6,47.63,60.04,53.12
4,xlm_dee_10_percent_4,28,0.0005,3e-05,248,28,51.06,63.98,56.79,48.18,59.29,53.16


In [49]:
# full debiasing set
regular_dev = get_training_overview('xlm_dee_248')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_dee_248,28,0.0005,3e-05,248,20,51.16,64.03,56.88,48.28,59.42,53.27


#### zhij

In [50]:
# 10% debiasing data
regular_dev = get_training_overview('xlm_zhij_10_percent')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_zhij_10_percent_8,28,0.0005,3e-05,248,23,51.01,63.14,56.43,48.27,58.74,52.99
1,xlm_zhij_10_percent_7,28,0.0005,3e-05,248,28,50.14,63.8,56.15,47.32,59.01,52.52
2,xlm_zhij_10_percent_0,28,0.0005,3e-05,248,28,51.06,63.66,56.67,48.25,59.12,53.13
3,xlm_zhij_10_percent_6,28,0.0005,3e-05,248,24,52.08,62.42,56.78,49.18,58.01,53.23
4,xlm_zhij_10_percent_9,28,0.0005,3e-05,248,25,51.04,63.86,56.74,48.26,59.35,53.23


In [51]:
# 2.5% debiasing data
regular_dev = get_training_overview('xlm_zhij_2_percent')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_zhij_2_percent_10,28,0.0005,3e-05,248,27,51.25,62.77,56.43,48.34,58.3,52.85
1,xlm_zhij_2_percent_11,28,0.0005,3e-05,248,24,49.6,63.61,55.74,46.7,58.98,52.13
2,xlm_zhij_2_percent_14,28,0.0005,3e-05,248,21,50.86,61.89,55.83,48.11,57.46,52.37
3,xlm_zhij_2_percent_12,28,0.0005,3e-05,248,27,52.62,59.66,55.92,49.79,55.6,52.53
4,xlm_zhij_2_percent_13,28,0.0005,3e-05,248,28,50.86,63.15,56.34,48.16,58.74,52.93


In [52]:
# 1.25% debiasing data
regular_dev = get_training_overview('xlm_zhij_1_percent')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_zhij_1_percent_18,28,0.0005,3e-05,248,26,49.15,62.47,55.02,46.43,58.04,51.59
1,xlm_zhij_1_percent_15,28,0.0005,3e-05,248,24,48.82,62.87,54.96,46.03,58.39,51.48
2,xlm_zhij_1_percent_17,28,0.0005,3e-05,248,28,50.17,63.0,55.86,47.42,58.45,52.36
3,xlm_zhij_1_percent_19,28,0.0005,3e-05,248,23,53.13,59.42,56.1,50.31,55.27,52.67
4,xlm_zhij_1_percent_16,28,0.0005,3e-05,248,28,49.36,63.81,55.66,46.58,59.15,52.12


In [53]:
# 0.625% debiasing data
regular_dev = get_training_overview('xlm_zhij_0_percent')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_zhij_0_percent_22,28,0.0005,3e-05,248,28,49.98,60.43,54.71,47.21,56.08,51.26
1,xlm_zhij_0_percent_21,28,0.0005,3e-05,248,19,52.24,57.25,54.63,49.55,53.44,51.42
2,xlm_zhij_0_percent_20,28,0.0005,3e-05,248,27,50.52,61.74,55.57,47.76,57.33,52.11
3,xlm_zhij_0_percent_23,28,0.0005,3e-05,248,25,49.94,60.91,54.88,47.21,56.45,51.41
4,xlm_zhij_0_percent_24,28,0.0005,3e-05,248,19,52.04,57.41,54.59,49.37,53.62,51.41


In [54]:
# full debiasing set
regular_dev = get_training_overview('xlm_zhij_248')
regular_dev

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_zhij_248,28,0.0005,3e-05,248,20,51.16,64.44,57.04,48.25,59.85,53.43


## Testset evaluation after debiasing, in terms of pronoun score

In [55]:
def get_pronoun_results(model_name, pronoun):
    with open(f'data/train_logs/{model_name}') as json_file:
        logs = json.load(json_file)
    
    dev_f1s = [epoch['sl_f1'] for epoch in logs['dev_eval']]
    
    best_epoch = dev_f1s.index(max(dev_f1s))
    best_epoch_results = logs['dev_eval'][best_epoch]
    if "finetune" in model_name:
        best_epoch = logs['epochs'] - 10 + best_epoch
        
    return [model_name.replace('.json',''),
            logs['epochs'], 
            logs['learning-rate'], 
            logs['bert-learning-rate'], 
            logs['seed'], 
            best_epoch + 1, 
            round(logs[f'{pronoun}_test_head.jsonlines_pronoun_score'],2),
           ]


def get_pronoun_overview(setting, pronoun):
    pronoun_col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'best_epoch', pronoun]
    files = [file for file in os.listdir("data/train_logs/") if "xlm" in file and setting in file.lower()]
    results = []
    for file in files:
        try:
            results.append(get_pronoun_results(file, pronoun))
        except:
            continue
    df = pd.DataFrame(results, columns=pronoun_col_names)
    return df

### dij

In [56]:
# 0.625% debiasing data 
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_dij_1_p', 'dij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dij
0,xlm_dij_1_percent_14,28,0.0005,3e-05,248,10,76.54
1,xlm_dij_1_percent_13,28,0.0005,3e-05,248,10,66.72
2,xlm_dij_1_percent_11,28,0.0005,3e-05,248,10,81.14
3,xlm_dij_1_percent_12,28,0.0005,3e-05,248,10,81.14
4,xlm_dij_1_percent_10,28,0.0005,3e-05,248,10,87.65


In [55]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,10.0,78.64
std,0.0,0.0,0.0,0.0,0.0,7.75
min,28.0,0.0,0.0,248.0,10.0,66.72
25%,28.0,0.0,0.0,248.0,10.0,76.54
50%,28.0,0.0,0.0,248.0,10.0,81.14
75%,28.0,0.0,0.0,248.0,10.0,81.14
max,28.0,0.0,0.0,248.0,10.0,87.65


In [57]:
# 1.25% debiasing data 
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_dij_2_p', 'dij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dij
0,xlm_dij_2_percent_16,28,0.0005,3e-05,248,10,89.46
1,xlm_dij_2_percent_15,28,0.0005,3e-05,248,10,86.72
2,xlm_dij_2_percent_18,28,0.0005,3e-05,248,7,86.25
3,xlm_dij_2_percent_19,28,0.0005,3e-05,248,4,88.73
4,xlm_dij_2_percent_17,28,0.0005,3e-05,248,10,89.66


In [74]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,8.2,88.16
std,0.0,0.0,0.0,0.0,2.68,1.58
min,28.0,0.0,0.0,248.0,4.0,86.25
25%,28.0,0.0,0.0,248.0,7.0,86.72
50%,28.0,0.0,0.0,248.0,10.0,88.73
75%,28.0,0.0,0.0,248.0,10.0,89.46
max,28.0,0.0,0.0,248.0,10.0,89.66


In [58]:
# 2.5% debiasing data 
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_dij_3_p', 'dij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dij
0,xlm_dij_3_percent_20,28,0.0005,3e-05,248,7,88.01
1,xlm_dij_3_percent_23,28,0.0005,3e-05,248,5,89.56
2,xlm_dij_3_percent_21,28,0.0005,3e-05,248,7,88.73
3,xlm_dij_3_percent_24,28,0.0005,3e-05,248,9,86.87
4,xlm_dij_3_percent_22,28,0.0005,3e-05,248,8,89.25


In [103]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,7.2,88.48
std,0.0,0.0,0.0,0.0,1.48,1.08
min,28.0,0.0,0.0,248.0,5.0,86.87
25%,28.0,0.0,0.0,248.0,7.0,88.01
50%,28.0,0.0,0.0,248.0,7.0,88.73
75%,28.0,0.0,0.0,248.0,8.0,89.25
max,28.0,0.0,0.0,248.0,9.0,89.56


In [59]:
# 10% debiasing data
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_dij_10_p', 'dij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dij
0,xlm_dij_10_percent_4,28,0.0005,3e-05,248,10,90.28
1,xlm_dij_10_percent_1,28,0.0005,3e-05,248,5,89.97
2,xlm_dij_10_percent_2,28,0.0005,3e-05,248,2,89.77
3,xlm_dij_10_percent_3,28,0.0005,3e-05,248,5,89.56
4,xlm_dij_10_percent_0,28,0.0005,3e-05,248,10,90.75


In [111]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,6.4,90.07
std,0.0,0.0,0.0,0.0,3.51,0.47
min,28.0,0.0,0.0,248.0,2.0,89.56
25%,28.0,0.0,0.0,248.0,5.0,89.77
50%,28.0,0.0,0.0,248.0,5.0,89.97
75%,28.0,0.0,0.0,248.0,10.0,90.28
max,28.0,0.0,0.0,248.0,10.0,90.75


In [60]:
# full debiasing set
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_dij_248', 'dij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dij
0,xlm_dij_248,28,0.0005,3e-05,248,2,91.11


### zem

In [61]:
# 0.625% debiasing data
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_zem_0_', 'zem')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zem
0,xlm_zem_0_percent_2,28,0.0005,3e-05,248,2,36.59
1,xlm_zem_0_percent_4,28,0.0005,3e-05,248,6,36.38
2,xlm_zem_0_percent_0,28,0.0005,3e-05,248,10,44.7
3,xlm_zem_0_percent_1,28,0.0005,3e-05,248,10,37.62
4,xlm_zem_0_percent_3,28,0.0005,3e-05,248,10,56.18


In [109]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zem
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,7.6,42.29
std,0.0,0.0,0.0,0.0,3.58,8.48
min,28.0,0.0,0.0,248.0,2.0,36.38
25%,28.0,0.0,0.0,248.0,6.0,36.59
50%,28.0,0.0,0.0,248.0,10.0,37.62
75%,28.0,0.0,0.0,248.0,10.0,44.7
max,28.0,0.0,0.0,248.0,10.0,56.18


In [62]:
# 1.25% debiasing data 
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_zem_1_', 'zem')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zem
0,xlm_zem_1_percent_6,28,0.0005,3e-05,248,5,37.67
1,xlm_zem_1_percent_5,28,0.0005,3e-05,248,10,48.73
2,xlm_zem_1_percent_9,28,0.0005,3e-05,248,10,55.35
3,xlm_zem_1_percent_8,28,0.0005,3e-05,248,10,53.28
4,xlm_zem_1_percent_7,28,0.0005,3e-05,248,6,61.03


In [116]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zem
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,8.2,51.21
std,0.0,0.0,0.0,0.0,2.49,8.77
min,28.0,0.0,0.0,248.0,5.0,37.67
25%,28.0,0.0,0.0,248.0,6.0,48.73
50%,28.0,0.0,0.0,248.0,10.0,53.28
75%,28.0,0.0,0.0,248.0,10.0,55.35
max,28.0,0.0,0.0,248.0,10.0,61.03


In [63]:
# 2.5% debiasing data 
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_zem_2_', 'zem')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zem
0,xlm_zem_2_percent_11,28,0.0005,3e-05,248,9,67.03
1,xlm_zem_2_percent_10,28,0.0005,3e-05,248,9,71.68
2,xlm_zem_2_percent_14,28,0.0005,3e-05,248,10,68.99
3,xlm_zem_2_percent_12,28,0.0005,3e-05,248,9,72.56
4,xlm_zem_2_percent_13,28,0.0005,3e-05,248,9,67.13


In [8]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zem
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,9.2,69.48
std,0.0,0.0,0.0,0.0,0.45,2.55
min,28.0,0.0,0.0,248.0,9.0,67.03
25%,28.0,0.0,0.0,248.0,9.0,67.13
50%,28.0,0.0,0.0,248.0,9.0,68.99
75%,28.0,0.0,0.0,248.0,9.0,71.68
max,28.0,0.0,0.0,248.0,10.0,72.56


In [64]:
# 10% debiasing data 
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_zem_10_', 'zem')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zem
0,xlm_zem_10_percent_2,28,0.0005,3e-05,248,10,87.24
1,xlm_zem_10_percent_0,28,0.0005,3e-05,248,9,79.38
2,xlm_zem_10_percent_1,28,0.0005,3e-05,248,8,80.98
3,xlm_zem_10_percent_4,28,0.0005,3e-05,248,9,78.04
4,xlm_zem_10_percent_3,28,0.0005,3e-05,248,10,85.06


In [127]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zem
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,9.2,82.14
std,0.0,0.0,0.0,0.0,0.84,3.88
min,28.0,0.0,0.0,248.0,8.0,78.04
25%,28.0,0.0,0.0,248.0,9.0,79.38
50%,28.0,0.0,0.0,248.0,9.0,80.98
75%,28.0,0.0,0.0,248.0,10.0,85.06
max,28.0,0.0,0.0,248.0,10.0,87.24


In [65]:
# full debiasing data
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_zem_248', 'zem')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zem
0,xlm_zem_248,28,0.0005,3e-05,248,2,88.99


### vij

In [66]:
# 0.625% debiasing data 
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_vij_0_', 'vij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,vij
0,xlm_vij_0_percent_15,28,0.0005,3e-05,248,7,46.98
1,xlm_vij_0_percent_18,28,0.0005,3e-05,248,10,61.45
2,xlm_vij_0_percent_19,28,0.0005,3e-05,248,10,60.26
3,xlm_vij_0_percent_16,28,0.0005,3e-05,248,10,71.21
4,xlm_vij_0_percent_17,28,0.0005,3e-05,248,4,45.99


In [10]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,vij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,8.2,57.18
std,0.0,0.0,0.0,0.0,2.68,10.65
min,28.0,0.0,0.0,248.0,4.0,45.99
25%,28.0,0.0,0.0,248.0,7.0,46.98
50%,28.0,0.0,0.0,248.0,10.0,60.26
75%,28.0,0.0,0.0,248.0,10.0,61.45
max,28.0,0.0,0.0,248.0,10.0,71.21


In [67]:
# 1.25% debiasing data 
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_vij_1_', 'vij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,vij
0,xlm_vij_1_percent_12,28,0.0005,3e-05,248,10,80.21
1,xlm_vij_1_percent_13,28,0.0005,3e-05,248,10,65.12
2,xlm_vij_1_percent_11,28,0.0005,3e-05,248,6,63.72
3,xlm_vij_1_percent_14,28,0.0005,3e-05,248,8,75.66
4,xlm_vij_1_percent_10,28,0.0005,3e-05,248,10,57.47


In [150]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,vij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,8.8,68.44
std,0.0,0.0,0.0,0.0,1.79,9.28
min,28.0,0.0,0.0,248.0,6.0,57.47
25%,28.0,0.0,0.0,248.0,8.0,63.72
50%,28.0,0.0,0.0,248.0,10.0,65.12
75%,28.0,0.0,0.0,248.0,10.0,75.66
max,28.0,0.0,0.0,248.0,10.0,80.21


In [68]:
# 2.5% debiasing data 
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_vij_2_', 'vij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,vij
0,xlm_vij_2_percent_6,28,0.0005,3e-05,248,7,84.19
1,xlm_vij_2_percent_8,28,0.0005,3e-05,248,10,86.46
2,xlm_vij_2_percent_7,28,0.0005,3e-05,248,10,82.48
3,xlm_vij_2_percent_9,28,0.0005,3e-05,248,10,86.3
4,xlm_vij_2_percent_5,28,0.0005,3e-05,248,8,75.81


In [152]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,vij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,9.0,83.05
std,0.0,0.0,0.0,0.0,1.41,4.37
min,28.0,0.0,0.0,248.0,7.0,75.81
25%,28.0,0.0,0.0,248.0,8.0,82.48
50%,28.0,0.0,0.0,248.0,10.0,84.19
75%,28.0,0.0,0.0,248.0,10.0,86.3
max,28.0,0.0,0.0,248.0,10.0,86.46


In [69]:
# 10% debiasing data 
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_vij_10_', 'vij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,vij
0,xlm_vij_10_percent_4,28,0.0005,3e-05,248,9,89.1
1,xlm_vij_10_percent_1,28,0.0005,3e-05,248,10,89.1
2,xlm_vij_10_percent_3,28,0.0005,3e-05,248,10,90.13
3,xlm_vij_10_percent_2,28,0.0005,3e-05,248,8,89.82
4,xlm_vij_10_percent_0,28,0.0005,3e-05,248,8,89.46


In [154]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,vij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,9.0,89.52
std,0.0,0.0,0.0,0.0,1.0,0.45
min,28.0,0.0,0.0,248.0,8.0,89.1
25%,28.0,0.0,0.0,248.0,8.0,89.1
50%,28.0,0.0,0.0,248.0,9.0,89.46
75%,28.0,0.0,0.0,248.0,10.0,89.82
max,28.0,0.0,0.0,248.0,10.0,90.13


In [70]:
# full dataset
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_vij_248', 'vij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,vij
0,xlm_vij_248,28,0.0005,3e-05,248,2,90.59


### nij

In [71]:
# 0.125% debiasing data 
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_nij_0_', 'nij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,nij
0,xlm_nij_0_percent_20,28,0.0005,3e-05,248,10,75.61
1,xlm_nij_0_percent_24,28,0.0005,3e-05,248,10,64.81
2,xlm_nij_0_percent_23,28,0.0005,3e-05,248,10,56.59
3,xlm_nij_0_percent_22,28,0.0005,3e-05,248,10,70.39
4,xlm_nij_0_percent_21,28,0.0005,3e-05,248,10,75.14


In [187]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,nij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,10.0,68.51
std,0.0,0.0,0.0,0.0,0.0,7.96
min,28.0,0.0,0.0,248.0,10.0,56.59
25%,28.0,0.0,0.0,248.0,10.0,64.81
50%,28.0,0.0,0.0,248.0,10.0,70.39
75%,28.0,0.0,0.0,248.0,10.0,75.14
max,28.0,0.0,0.0,248.0,10.0,75.61


In [72]:
# 1.25% debiasing data 
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_nij_1_', 'nij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,nij
0,xlm_nij_1_percent_14,28,0.0005,3e-05,248,5,85.27
1,xlm_nij_1_percent_12,28,0.0005,3e-05,248,6,87.49
2,xlm_nij_1_percent_10,28,0.0005,3e-05,248,6,78.66
3,xlm_nij_1_percent_13,28,0.0005,3e-05,248,10,86.67
4,xlm_nij_1_percent_11,28,0.0005,3e-05,248,6,78.55


In [180]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,nij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,6.6,83.33
std,0.0,0.0,0.0,0.0,1.95,4.38
min,28.0,0.0,0.0,248.0,5.0,78.55
25%,28.0,0.0,0.0,248.0,6.0,78.66
50%,28.0,0.0,0.0,248.0,6.0,85.27
75%,28.0,0.0,0.0,248.0,6.0,86.67
max,28.0,0.0,0.0,248.0,10.0,87.49


In [73]:
# 2.5% debiasing data 
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_nij_2_', 'nij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,nij
0,xlm_nij_2_percent_17,28,0.0005,3e-05,248,10,86.82
1,xlm_nij_2_percent_16,28,0.0005,3e-05,248,10,85.63
2,xlm_nij_2_percent_18,28,0.0005,3e-05,248,10,88.22
3,xlm_nij_2_percent_19,28,0.0005,3e-05,248,9,87.8
4,xlm_nij_2_percent_15,28,0.0005,3e-05,248,10,88.68


In [182]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,nij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,9.8,87.43
std,0.0,0.0,0.0,0.0,0.45,1.22
min,28.0,0.0,0.0,248.0,9.0,85.63
25%,28.0,0.0,0.0,248.0,10.0,86.82
50%,28.0,0.0,0.0,248.0,10.0,87.8
75%,28.0,0.0,0.0,248.0,10.0,88.22
max,28.0,0.0,0.0,248.0,10.0,88.68


In [74]:
# 10% debiasing data 
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_nij_10_', 'nij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,nij
0,xlm_nij_10_percent_8,28,0.0005,3e-05,248,10,89.72
1,xlm_nij_10_percent_7,28,0.0005,3e-05,248,4,87.91
2,xlm_nij_10_percent_9,28,0.0005,3e-05,248,9,89.25
3,xlm_nij_10_percent_6,28,0.0005,3e-05,248,7,89.46
4,xlm_nij_10_percent_5,28,0.0005,3e-05,248,10,89.66


In [184]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,nij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,8.0,89.2
std,0.0,0.0,0.0,0.0,2.55,0.74
min,28.0,0.0,0.0,248.0,4.0,87.91
25%,28.0,0.0,0.0,248.0,7.0,89.25
50%,28.0,0.0,0.0,248.0,9.0,89.46
75%,28.0,0.0,0.0,248.0,10.0,89.66
max,28.0,0.0,0.0,248.0,10.0,89.72


In [75]:
# full debiasing set
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_nij_248', 'nij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,nij
0,xlm_nij_248,28,0.0005,3e-05,248,2,75.61
1,xlm_nij_248_2,28,0.0005,3e-05,248,2,90.85


### dee

In [84]:
# 0.625% of debiasing data
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_dee_finetune_0', 'dee')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dee
0,xlm_dee_finetune_0p_0,28,0.0005,3e-05,248,28,86.82
1,xlm_dee_finetune_0p_2,28,0.0005,3e-05,248,28,72.4
2,xlm_dee_finetune_0p_4,28,0.0005,3e-05,248,28,50.34
3,xlm_dee_finetune_0p_1,28,0.0005,3e-05,248,28,63.88
4,xlm_dee_finetune_0p_3,28,0.0005,3e-05,248,28,81.4


In [79]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dee
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,26.0,86.62
std,0.0,0.0,0.0,0.0,2.83,1.91
min,28.0,0.0,0.0,248.0,22.0,84.34
25%,28.0,0.0,0.0,248.0,24.0,85.27
50%,28.0,0.0,0.0,248.0,28.0,86.72
75%,28.0,0.0,0.0,248.0,28.0,87.55
max,28.0,0.0,0.0,248.0,28.0,89.2


In [85]:
# 1.25% of debiasing data
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_dee_finetune_1', 'dee')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dee
0,xlm_dee_finetune_1p_2,28,0.0005,3e-05,248,28,84.34
1,xlm_dee_finetune_1p_3,28,0.0005,3e-05,248,28,89.2
2,xlm_dee_finetune_1p_0,28,0.0005,3e-05,248,28,87.55
3,xlm_dee_finetune_1p_1,28,0.0005,3e-05,248,22,85.27
4,xlm_dee_finetune_1p_4,28,0.0005,3e-05,248,24,86.72


In [81]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dee
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,26.0,86.62
std,0.0,0.0,0.0,0.0,2.83,1.91
min,28.0,0.0,0.0,248.0,22.0,84.34
25%,28.0,0.0,0.0,248.0,24.0,85.27
50%,28.0,0.0,0.0,248.0,28.0,86.72
75%,28.0,0.0,0.0,248.0,28.0,87.55
max,28.0,0.0,0.0,248.0,28.0,89.2


In [86]:
# 2.5% of debiasing data
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_dee_finetune_2', 'dee')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dee
0,xlm_dee_finetune_2p_4,28,0.0005,3e-05,248,23,85.32
1,xlm_dee_finetune_2p_1,28,0.0005,3e-05,248,26,89.35
2,xlm_dee_finetune_2p_2,28,0.0005,3e-05,248,27,89.82
3,xlm_dee_finetune_2p_0,28,0.0005,3e-05,248,28,88.27
4,xlm_dee_finetune_2p_3,28,0.0005,3e-05,248,21,88.63


In [83]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dee
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,25.0,88.28
std,0.0,0.0,0.0,0.0,2.92,1.76
min,28.0,0.0,0.0,248.0,21.0,85.32
25%,28.0,0.0,0.0,248.0,23.0,88.27
50%,28.0,0.0,0.0,248.0,26.0,88.63
75%,28.0,0.0,0.0,248.0,27.0,89.35
max,28.0,0.0,0.0,248.0,28.0,89.82


In [87]:
# 10% of debiasing data

gn_combo_full_pronoun_df = get_pronoun_overview('xlm_dee_10_', 'dee')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dee
0,xlm_dee_10_percent_1,28,0.0005,3e-05,248,8,90.96
1,xlm_dee_10_percent_0,28,0.0005,3e-05,248,10,90.85
2,xlm_dee_10_percent_3,28,0.0005,3e-05,248,5,89.61
3,xlm_dee_10_percent_2,28,0.0005,3e-05,248,10,90.44
4,xlm_dee_10_percent_4,28,0.0005,3e-05,248,10,90.28


In [18]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dee
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,8.6,90.43
std,0.0,0.0,0.0,0.0,2.19,0.54
min,28.0,0.0,0.0,248.0,5.0,89.61
25%,28.0,0.0,0.0,248.0,8.0,90.28
50%,28.0,0.0,0.0,248.0,10.0,90.44
75%,28.0,0.0,0.0,248.0,10.0,90.85
max,28.0,0.0,0.0,248.0,10.0,90.96


In [88]:
# full debiasing data set

gn_combo_full_pronoun_df = get_pronoun_overview('xlm_dee_248', 'dee')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,dee
0,xlm_dee_248,28,0.0005,3e-05,248,2,91.01


### zhij

In [89]:
# 0.625% of debiasing data

gn_combo_full_pronoun_df = get_pronoun_overview('xlm_zhij_0_', 'zhij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zhij
0,xlm_zhij_0_percent_22,28,0.0005,3e-05,248,10,87.13
1,xlm_zhij_0_percent_21,28,0.0005,3e-05,248,1,84.6
2,xlm_zhij_0_percent_20,28,0.0005,3e-05,248,9,88.68
3,xlm_zhij_0_percent_23,28,0.0005,3e-05,248,7,86.51
4,xlm_zhij_0_percent_24,28,0.0005,3e-05,248,1,84.7


In [41]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zhij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,5.6,86.32
std,0.0,0.0,0.0,0.0,4.34,1.72
min,28.0,0.0,0.0,248.0,1.0,84.6
25%,28.0,0.0,0.0,248.0,1.0,84.7
50%,28.0,0.0,0.0,248.0,7.0,86.51
75%,28.0,0.0,0.0,248.0,9.0,87.13
max,28.0,0.0,0.0,248.0,10.0,88.68


In [90]:
# 1.25% of debiasing data

gn_combo_full_pronoun_df = get_pronoun_overview('xlm_zhij_1_', 'zhij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zhij
0,xlm_zhij_1_percent_18,28,0.0005,3e-05,248,8,89.77
1,xlm_zhij_1_percent_15,28,0.0005,3e-05,248,6,87.91
2,xlm_zhij_1_percent_17,28,0.0005,3e-05,248,10,90.39
3,xlm_zhij_1_percent_19,28,0.0005,3e-05,248,5,90.59
4,xlm_zhij_1_percent_16,28,0.0005,3e-05,248,10,90.13


In [44]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zhij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,7.8,89.76
std,0.0,0.0,0.0,0.0,2.28,1.08
min,28.0,0.0,0.0,248.0,5.0,87.91
25%,28.0,0.0,0.0,248.0,6.0,89.77
50%,28.0,0.0,0.0,248.0,8.0,90.13
75%,28.0,0.0,0.0,248.0,10.0,90.39
max,28.0,0.0,0.0,248.0,10.0,90.59


In [91]:
# 2.5% of debiasing data

gn_combo_full_pronoun_df = get_pronoun_overview('xlm_zhij_2_', 'zhij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zhij
0,xlm_zhij_2_percent_10,28,0.0005,3e-05,248,9,90.23
1,xlm_zhij_2_percent_11,28,0.0005,3e-05,248,6,90.34
2,xlm_zhij_2_percent_14,28,0.0005,3e-05,248,3,89.46
3,xlm_zhij_2_percent_12,28,0.0005,3e-05,248,9,89.25
4,xlm_zhij_2_percent_13,28,0.0005,3e-05,248,10,90.7


In [34]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zhij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,7.4,90.0
std,0.0,0.0,0.0,0.0,2.88,0.61
min,28.0,0.0,0.0,248.0,3.0,89.25
25%,28.0,0.0,0.0,248.0,6.0,89.46
50%,28.0,0.0,0.0,248.0,9.0,90.23
75%,28.0,0.0,0.0,248.0,9.0,90.34
max,28.0,0.0,0.0,248.0,10.0,90.7


In [92]:
# 10% of debiasing data

gn_combo_full_pronoun_df = get_pronoun_overview('xlm_zhij_10_', 'zhij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zhij
0,xlm_zhij_10_percent_8,28,0.0005,3e-05,248,5,91.16
1,xlm_zhij_10_percent_7,28,0.0005,3e-05,248,10,89.61
2,xlm_zhij_10_percent_0,28,0.0005,3e-05,248,10,90.9
3,xlm_zhij_10_percent_6,28,0.0005,3e-05,248,6,90.85
4,xlm_zhij_10_percent_9,28,0.0005,3e-05,248,7,90.39


In [36]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zhij
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,7.6,90.58
std,0.0,0.0,0.0,0.0,2.3,0.61
min,28.0,0.0,0.0,248.0,5.0,89.61
25%,28.0,0.0,0.0,248.0,6.0,90.39
50%,28.0,0.0,0.0,248.0,7.0,90.85
75%,28.0,0.0,0.0,248.0,10.0,90.9
max,28.0,0.0,0.0,248.0,10.0,91.16


In [93]:
# full debiasing set
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_zhij_248', 'zhij')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,zhij
0,xlm_zhij_248,28,0.0005,3e-05,248,2,91.11
