In this document, I evaluate the debiasing performances of two debiasing techniques:
* CDA
* Delexicalisation

I evaluate the effectiveness of both techniques in terms of LEA score and pronoun score. 

I evaluate the techniques by full retraining, and by fine-tuning. Additionally, I evaluate CDA with a subset of the data.

Finally, I evaluate the performance of the debiased models on the original data, to investigate whether any knowledge is lost through debiasing.

In [11]:
%cd /hpc/uu_cs_nlpsoc/gvanboven/wl-coref

/hpc/uu_cs_nlpsoc/gvanboven/wl-coref


In [2]:
import json
import matplotlib.pyplot as plt
import os

import pandas as pd

## training results overview on the dev set
The results are investigated in order to only keep the weights of the best DEV epochs.

In [3]:
def get_training_results(model_name):
    with open(f'data/train_logs/{model_name}') as json_file:
        logs = json.load(json_file)
        #print(logs[])
    dev_f1s = [epoch['sl_f1'] for epoch in logs['dev_eval']]
    
    best_epoch = dev_f1s.index(max(dev_f1s))
    best_epoch_results = logs['dev_eval'][best_epoch]
    if "fine" in model_name or "p_" in model_name :
        best_epoch = logs['epochs'] - 10 + best_epoch
        
    return [model_name.replace('.json',''),
            logs['epochs'], 
            logs['learning-rate'], 
            logs['bert-learning-rate'], 
            logs['seed'], 
            best_epoch + 1, 
            round(best_epoch_results['sl_p'],4) * 100,
            round(best_epoch_results['sl_r'],4) * 100,
            round(best_epoch_results['sl_f1'],4) * 100,
           ]

In [4]:
col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'best_epoch', 'P', 'R', 'F1']
def get_training_overview(setting):
    files = [file for file in os.listdir("data/train_logs/") if "xlm" in file and setting in file.lower()]
    results = []
    for file in files:
        results.append(get_training_results(file))
    df = pd.DataFrame(results, columns=col_names)
    return df

Delex, finetuning

In [48]:
delex_fine_dev = get_training_overview('delex_fine')
delex_fine_dev

['xlm_delex_fine_1234.json', 'xlm_delex_fine_123.json', 'xlm_delex_fine_2023.json', 'xlm_delex_fine_248.json', 'xlm_delex_fine_2020.json']


Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,P,R,F1
0,xlm_delex_fine_1234,25,0.0005,3e-05,1234,21,51.49,58.04,54.57
1,xlm_delex_fine_123,28,0.0005,3e-05,123,26,50.77,60.85,55.35
2,xlm_delex_fine_2023,27,0.0005,3e-05,2023,23,50.47,60.45,55.01
3,xlm_delex_fine_248,28,0.0005,3e-05,248,26,51.76,57.13,54.31
4,xlm_delex_fine_2020,25,0.0005,3e-05,2020,22,52.97,57.71,55.24


Delex, full retraining

In [44]:
delex_full_dev = get_training_overview('delex_full')
delex_full_dev

['xlm_delex_full_2020.json', 'xlm_delex_full_248.json', 'xlm_delex_full_1234.json', 'xlm_delex_full_123.json', 'xlm_delex_full_2023.json']


Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,P,R,F1
0,xlm_delex_full_2020,20,0.0005,3e-05,2020,15,51.32,59.18,54.97
1,xlm_delex_full_248,20,0.0005,3e-05,248,14,55.02,56.0,55.51
2,xlm_delex_full_1234,20,0.0005,3e-05,1234,16,51.4,59.58,55.19
3,xlm_delex_full_123,20,0.0005,3e-05,123,16,51.53,57.91,54.54
4,xlm_delex_full_2023,20,0.0005,3e-05,2023,15,54.49,56.26,55.36


CDA, full retraining

In [22]:
gn_full_com_dev = get_training_overview('xlm_gn_comb_full')
gn_full_com_dev

['xlm_gn_comb_full_1234.json', 'xlm_gn_comb_full_123.json', 'xlm_gn_comb_full_2023.json', 'xlm_gn_comb_full_248.json', 'xlm_gn_comb_full_2020.json']


Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,P,R,F1
0,xlm_gn_comb_full_1234,20,0.0005,3e-05,1234,18,50.93,58.86,54.61
1,xlm_gn_comb_full_123,20,0.0005,3e-05,123,13,53.32,54.88,54.09
2,xlm_gn_comb_full_2023,20,0.0005,3e-05,2023,17,53.15,54.91,54.01
3,xlm_gn_comb_full_248,20,0.0005,3e-05,248,18,49.77,59.22,54.08
4,xlm_gn_comb_full_2020,20,0.0005,3e-05,2020,19,48.18,61.86,54.17


CDA, finetuning

In [16]:
gn_fine_com_dev = get_training_overview('xlm_gn_comb_fine')
gn_fine_com_dev

['xlm_gn_comb_fine_123.json', 'xlm_gn_comb_fine_248.json', 'xlm_gn_comb_fine_2020.json', 'xlm_gn_comb_fine_1234.json', 'xlm_gn_comb_fine_2023.json']


Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,P,R,F1
0,xlm_gn_comb_fine_123,28,0.0005,3e-05,123,23,50.24,57.48,53.62
1,xlm_gn_comb_fine_248,28,0.0005,3e-05,248,26,51.2,58.06,54.42
2,xlm_gn_comb_fine_2020,25,0.0005,3e-05,2020,25,50.14,59.93,54.6
3,xlm_gn_comb_fine_1234,25,0.0005,3e-05,1234,23,50.0,57.9,53.66
4,xlm_gn_comb_fine_2023,27,0.0005,3e-05,2023,23,49.83,58.78,53.94


CDA finetuning, with 10 percent of the data

In [56]:
gi_gn_10_dev = get_training_overview('xlm_gn_comb_10p_')
gi_gn_10_dev

['xlm_gn_comb_10p_2.json', 'xlm_gn_comb_10p_3.json', 'xlm_gn_comb_10p_1.json', 'xlm_gn_comb_10p_248.json', 'xlm_gn_comb_10p_4.json']


Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,P,R,F1
0,xlm_gn_comb_10p_2,28,0.0005,3e-05,248,26,47.56,58.79,52.58
1,xlm_gn_comb_10p_3,28,0.0005,3e-05,248,28,48.42,58.25,52.88
2,xlm_gn_comb_10p_1,28,0.0005,3e-05,248,26,47.56,58.79,52.58
3,xlm_gn_comb_10p_248,28,0.0005,3e-05,248,26,46.86,58.66,52.1
4,xlm_gn_comb_10p_4,28,0.0005,3e-05,248,27,47.45,58.13,52.25


CDA finetuning, with 5 percent of the data

In [65]:
gi_gn_5_dev = get_training_overview('xlm_gn_comb_5p')
gi_gn_5_dev

['xlm_gn_comb_5p_0.json', 'xlm_gn_comb_5p_1.json', 'xlm_gn_comb_5p_3.json', 'xlm_gn_comb_5p_4.json', 'xlm_gn_comb_5p_2.json']


Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,P,R,F1
0,xlm_gn_comb_5p_0,28,0.0005,3e-05,248,28,48.14,57.12,52.24
1,xlm_gn_comb_5p_1,28,0.0005,3e-05,248,24,48.1,57.69,52.46
2,xlm_gn_comb_5p_3,28,0.0005,3e-05,248,28,48.14,57.02,52.2
3,xlm_gn_comb_5p_4,28,0.0005,3e-05,248,28,49.15,57.19,52.87
4,xlm_gn_comb_5p_2,28,0.0005,3e-05,248,24,47.76,58.54,52.6


CDA finetuning, with 2 percent of the data

In [5]:
gi_gn_2_dev = get_training_overview('xlm_gn_comb_2p')
gi_gn_2_dev

['xlm_gn_comb_2p_0.json', 'xlm_gn_comb_2p_1.json', 'xlm_gn_comb_2p_4.json', 'xlm_gn_comb_2p_2.json', 'xlm_gn_comb_2p_3.json']


Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,P,R,F1
0,xlm_gn_comb_2p_0,28,0.0005,3e-05,248,28,47.64,57.51,52.11
1,xlm_gn_comb_2p_1,28,0.0005,3e-05,248,24,46.32,58.32,51.63
2,xlm_gn_comb_2p_4,28,0.0005,3e-05,248,21,47.85,55.97,51.59
3,xlm_gn_comb_2p_2,28,0.0005,3e-05,248,21,47.06,57.16,51.62
4,xlm_gn_comb_2p_3,28,0.0005,3e-05,248,27,47.2,57.11,51.68


CDA finetuning, with 1 percent of the data

In [18]:
gi_gn_1_dev = get_training_overview('xlm_gn_comb_1p')
gi_gn_1_dev

['xlm_gn_comb_1p_4.json', 'xlm_gn_comb_1p_1.json', 'xlm_gn_comb_1p_0.json', 'xlm_gn_comb_1p_3.json', 'xlm_gn_comb_1p_2.json']


Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,P,R,F1
0,xlm_gn_comb_1p_4,28,0.0005,3e-05,248,23,45.68,57.44,50.89
1,xlm_gn_comb_1p_1,28,0.0005,3e-05,248,26,46.61,57.86,51.63
2,xlm_gn_comb_1p_0,28,0.0005,3e-05,248,28,47.04,56.42,51.31
3,xlm_gn_comb_1p_3,28,0.0005,3e-05,248,28,47.73,56.03,51.55
4,xlm_gn_comb_1p_2,28,0.0005,3e-05,248,28,45.65,58.4,51.24


## test set results overview, in terms of LEA score

In [5]:
def get_pronoun_set_results(model_name):
    with open(f'data/train_logs/{model_name}') as json_file:
        logs = json.load(json_file)
    

        
    return [model_name.replace('.json',''),
            logs['epochs'], 
            logs['learning-rate'], 
            logs['bert-learning-rate'], 
            logs['seed'], 
            round(logs['hij_test_head.jsonlines_eval'][0]['sl_p'],4)* 100,
            round(logs['hij_test_head.jsonlines_eval'][0]['sl_r'],4)* 100,
            round(logs['hij_test_head.jsonlines_eval'][0]['sl_f1'],4)* 100,
            round(logs['zij_test_head.jsonlines_eval'][0]['sl_p'],4)* 100,
            round(logs['zij_test_head.jsonlines_eval'][0]['sl_r'],4)* 100,
            round(logs['zij_test_head.jsonlines_eval'][0]['sl_f1'],4)* 100,
            round(logs['hen_test_head.jsonlines_eval'][0]['sl_p'],4)* 100,
            round(logs['hen_test_head.jsonlines_eval'][0]['sl_r'],4)* 100,
            round(logs['hen_test_head.jsonlines_eval'][0]['sl_f1'],4)* 100,
            round(logs['die_test_head.jsonlines_eval'][0]['sl_p'],4)* 100,
            round(logs['die_test_head.jsonlines_eval'][0]['sl_r'],4)* 100,
            round(logs['die_test_head.jsonlines_eval'][0]['sl_f1'],4)* 100,
           ]

pronoun_sets_col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'hij p', 'hij r', \
                          'hij f1', 'zij p', 'zij r', 'zij f1', 'hen p', 'hen r', 'hen f1', 'die p', 'die r', 'die f1'] 
def get_pronoun_set_overview(setting):
    files = [file for file in os.listdir("data/train_logs/") if "xlm" in file and setting in file.lower()]
    results = []
    for file in files:
        try:
            results.append(get_pronoun_set_results(file))
        except:
            continue
    df = pd.DataFrame(results, columns=pronoun_sets_col_names)
    return df

Delex, full retraining

In [13]:
test_delex_full_df = get_pronoun_set_overview('xlm_delex_full')
test_delex_full_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
0,xlm_delex_full_2020,20,0.0005,3e-05,2020,50.46,54.57,52.43,50.77,55.67,53.11,49.87,52.92,51.35,49.13,52.32,50.68
1,xlm_delex_full_248,20,0.0005,3e-05,248,55.04,52.44,53.71,55.2,52.44,53.78,53.62,46.59,49.86,53.25,48.07,50.53
2,xlm_delex_full_1234,20,0.0005,3e-05,1234,52.42,55.24,53.79,52.28,55.78,53.97,50.4,52.26,51.31,50.69,52.44,51.55
3,xlm_delex_full_123,20,0.0005,3e-05,123,51.97,53.99,52.96,51.94,53.93,52.92,50.87,51.29,51.08,50.49,51.32,50.9
4,xlm_delex_full_2023,20,0.0005,3e-05,2023,54.45,50.32,52.3,54.86,50.83,52.77,52.94,46.96,49.77,52.89,47.03,49.79


In [14]:
test_delex_full_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.0,0.0,0.0,1129.6,52.87,53.31,53.04,53.01,53.73,53.31,51.54,50.0,50.67,51.29,50.24,50.69
std,0.0,0.0,0.0,920.91,1.87,1.97,0.7,1.93,2.13,0.53,1.64,3.01,0.79,1.74,2.52,0.64
min,20.0,0.0,0.0,123.0,50.46,50.32,52.3,50.77,50.83,52.77,49.87,46.59,49.77,49.13,47.03,49.79
25%,20.0,0.0,0.0,248.0,51.97,52.44,52.43,51.94,52.44,52.92,50.4,46.96,49.86,50.49,48.07,50.53
50%,20.0,0.0,0.0,1234.0,52.42,53.99,52.96,52.28,53.93,53.11,50.87,51.29,51.08,50.69,51.32,50.68
75%,20.0,0.0,0.0,2020.0,54.45,54.57,53.71,54.86,55.67,53.78,52.94,52.26,51.31,52.89,52.32,50.9
max,20.0,0.0,0.0,2023.0,55.04,55.24,53.79,55.2,55.78,53.97,53.62,52.92,51.35,53.25,52.44,51.55


Delex, fine-tuning

In [15]:
test_delex_fine_df = get_pronoun_set_overview('xlm_delex_fine')
test_delex_fine_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
0,xlm_delex_fine_1234,25,0.0005,3e-05,1234,51.69,54.81,53.2,51.52,54.85,53.14,49.7,49.31,49.51,49.62,49.78,49.7
1,xlm_delex_fine_123,28,0.0005,3e-05,123,51.78,58.33,54.86,51.15,58.32,54.5,50.05,54.1,52.0,49.78,52.75,51.22
2,xlm_delex_fine_2023,27,0.0005,3e-05,2023,50.43,58.04,53.97,50.52,58.02,54.01,48.95,53.49,51.12,48.8,52.7,50.67
3,xlm_delex_fine_248,28,0.0005,3e-05,248,51.97,53.74,52.84,51.53,53.45,52.47,49.87,49.39,49.63,49.64,48.14,48.88
4,xlm_delex_fine_2020,25,0.0005,3e-05,2020,52.41,55.31,53.82,52.32,54.79,53.53,51.08,49.53,50.29,50.61,49.16,49.87


In [16]:
test_delex_fine_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,26.6,0.0,0.0,1129.6,51.66,56.05,53.74,51.41,55.89,53.53,49.93,51.16,50.51,49.69,50.51,50.07
std,1.52,0.0,0.0,920.91,0.74,2.04,0.78,0.65,2.16,0.78,0.77,2.41,1.05,0.64,2.11,0.9
min,25.0,0.0,0.0,123.0,50.43,53.74,52.84,50.52,53.45,52.47,48.95,49.31,49.51,48.8,48.14,48.88
25%,25.0,0.0,0.0,248.0,51.69,54.81,53.2,51.15,54.79,53.14,49.7,49.39,49.63,49.62,49.16,49.7
50%,27.0,0.0,0.0,1234.0,51.78,55.31,53.82,51.52,54.85,53.53,49.87,49.53,50.29,49.64,49.78,49.87
75%,28.0,0.0,0.0,2020.0,51.97,58.04,53.97,51.53,58.02,54.01,50.05,53.49,51.12,49.78,52.7,50.67
max,28.0,0.0,0.0,2023.0,52.41,58.33,54.86,52.32,58.32,54.5,51.08,54.1,52.0,50.61,52.75,51.22


CDA, full retraining

In [19]:
gn_comb_full_pronoun_df = get_pronoun_set_overview('xlm_gn_comb_full')
gn_comb_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
0,xlm_gn_comb_full_1234,20,0.0005,3e-05,1234,54.5,55.67,55.08,54.45,56.01,55.22,54.33,55.55,54.93,54.37,55.71,55.03
1,xlm_gn_comb_full_123,20,0.0005,3e-05,123,56.78,51.57,54.05,56.68,51.7,54.07,56.64,51.95,54.2,56.47,51.91,54.1
2,xlm_gn_comb_full_2023,20,0.0005,3e-05,2023,56.93,51.54,54.1,56.71,51.5,53.98,56.57,51.74,54.05,56.45,51.71,53.98
3,xlm_gn_comb_full_248,20,0.0005,3e-05,248,53.4,55.66,54.51,53.27,55.96,54.58,53.05,55.81,54.4,52.84,55.97,54.36
4,xlm_gn_comb_full_2020,20,0.0005,3e-05,2020,50.75,58.79,54.48,50.78,58.8,54.5,50.71,58.67,54.4,50.33,58.7,54.2


In [20]:
gn_comb_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.0,0.0,0.0,1129.6,54.47,54.65,54.44,54.38,54.79,54.47,54.26,54.74,54.4,54.09,54.8,54.33
std,0.0,0.0,0.0,920.91,2.57,3.1,0.41,2.5,3.13,0.49,2.5,2.92,0.33,2.6,2.97,0.41
min,20.0,0.0,0.0,123.0,50.75,51.54,54.05,50.78,51.5,53.98,50.71,51.74,54.05,50.33,51.71,53.98
25%,20.0,0.0,0.0,248.0,53.4,51.57,54.1,53.27,51.7,54.07,53.05,51.95,54.2,52.84,51.91,54.1
50%,20.0,0.0,0.0,1234.0,54.5,55.66,54.48,54.45,55.96,54.5,54.33,55.55,54.4,54.37,55.71,54.2
75%,20.0,0.0,0.0,2020.0,56.78,55.67,54.51,56.68,56.01,54.58,56.57,55.81,54.4,56.45,55.97,54.36
max,20.0,0.0,0.0,2023.0,56.93,58.79,55.08,56.71,58.8,55.22,56.64,58.67,54.93,56.47,58.7,55.03


CDA, fine-tuning

In [17]:
gn_comb_fine_pronoun_df = get_pronoun_set_overview('xlm_gn_comb_fine')
gn_comb_fine_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
0,xlm_gn_comb_fine_123,28,0.0005,3e-05,123,53.7,55.33,54.5,53.74,54.62,54.18,53.67,55.03,54.34,53.47,54.83,54.14
1,xlm_gn_comb_fine_248,28,0.0005,3e-05,248,55.04,54.25,54.64,55.05,54.18,54.61,55.14,54.17,54.65,54.88,54.18,54.53
2,xlm_gn_comb_fine_2020,25,0.0005,3e-05,2020,53.21,57.42,55.23,53.28,57.43,55.28,53.15,57.31,55.15,53.13,57.27,55.12
3,xlm_gn_comb_fine_1234,25,0.0005,3e-05,1234,53.85,55.87,54.84,53.76,55.71,54.72,53.78,55.76,54.75,53.56,55.5,54.51
4,xlm_gn_comb_fine_2023,27,0.0005,3e-05,2023,52.49,54.82,53.63,52.57,54.67,53.6,52.59,54.62,53.59,52.42,54.67,53.52


In [18]:
gn_comb_fine_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,26.6,0.0,0.0,1129.6,53.66,55.54,54.57,53.68,55.32,54.48,53.67,55.38,54.5,53.49,55.29,54.36
std,1.52,0.0,0.0,920.91,0.94,1.21,0.59,0.91,1.31,0.63,0.95,1.23,0.58,0.9,1.2,0.59
min,25.0,0.0,0.0,123.0,52.49,54.25,53.63,52.57,54.18,53.6,52.59,54.17,53.59,52.42,54.18,53.52
25%,25.0,0.0,0.0,248.0,53.21,54.82,54.5,53.28,54.62,54.18,53.15,54.62,54.34,53.13,54.67,54.14
50%,27.0,0.0,0.0,1234.0,53.7,55.33,54.64,53.74,54.67,54.61,53.67,55.03,54.65,53.47,54.83,54.51
75%,28.0,0.0,0.0,2020.0,53.85,55.87,54.84,53.76,55.71,54.72,53.78,55.76,54.75,53.56,55.5,54.53
max,28.0,0.0,0.0,2023.0,55.04,57.42,55.23,55.05,57.43,55.28,55.14,57.31,55.15,54.88,57.27,55.12


CDA, fine-tuning with 10% of the documents

In [8]:
test_10_df = get_pronoun_set_overview('xlm_gn_comb_10p')
test_10_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
0,xlm_gn_comb_10p_2,28,0.0005,3e-05,248,49.64,56.35,52.79,49.74,55.49,52.46,50.23,54.2,52.14,49.89,54.46,52.07
1,xlm_gn_comb_10p_3,28,0.0005,3e-05,248,51.29,55.87,53.48,52.06,54.71,53.35,52.12,53.67,52.88,50.92,53.9,52.37
2,xlm_gn_comb_10p_1,28,0.0005,3e-05,248,49.64,56.35,52.79,49.74,55.49,52.46,50.23,54.2,52.14,49.89,54.46,52.07
3,xlm_gn_comb_10p_248,28,0.0005,3e-05,248,49.49,56.8,52.9,49.53,55.99,52.56,50.0,55.14,52.44,49.41,55.35,52.21
4,xlm_gn_comb_10p_4,28,0.0005,3e-05,248,49.65,55.57,52.44,49.74,55.31,52.38,49.52,54.35,51.83,49.81,54.24,51.93


In [9]:
test_10_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,49.94,56.19,52.88,50.16,55.4,52.64,50.42,54.31,52.29,49.98,54.48,52.13
std,0.0,0.0,0.0,0.0,0.76,0.48,0.38,1.06,0.46,0.4,0.99,0.53,0.4,0.56,0.54,0.17
min,28.0,0.0,0.0,248.0,49.49,55.57,52.44,49.53,54.71,52.38,49.52,53.67,51.83,49.41,53.9,51.93
25%,28.0,0.0,0.0,248.0,49.64,55.87,52.79,49.74,55.31,52.46,50.0,54.2,52.14,49.81,54.24,52.07
50%,28.0,0.0,0.0,248.0,49.64,56.35,52.79,49.74,55.49,52.46,50.23,54.2,52.14,49.89,54.46,52.07
75%,28.0,0.0,0.0,248.0,49.65,56.35,52.9,49.74,55.49,52.56,50.23,54.35,52.44,49.89,54.46,52.21
max,28.0,0.0,0.0,248.0,51.29,56.8,53.48,52.06,55.99,53.35,52.12,55.14,52.88,50.92,55.35,52.37


CDA, fine-tuning with 5% of the documents

In [10]:
test_5_df = get_pronoun_set_overview('xlm_gn_comb_5p')
test_5_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
0,xlm_gn_comb_5p_0,28,0.0005,3e-05,248,50.65,55.35,52.9,50.88,54.49,52.62,50.89,53.5,52.16,50.51,53.82,52.11
1,xlm_gn_comb_5p_1,28,0.0005,3e-05,248,50.14,55.21,52.55,50.19,54.93,52.45,50.62,53.73,52.13,50.21,53.56,51.83
2,xlm_gn_comb_5p_3,28,0.0005,3e-05,248,50.54,53.64,52.04,50.95,52.64,51.78,51.05,51.92,51.48,50.69,51.47,51.08
3,xlm_gn_comb_5p_4,28,0.0005,3e-05,248,51.58,54.94,53.21,51.34,53.74,52.51,51.63,52.9,52.25,51.56,52.78,52.16
4,xlm_gn_comb_5p_2,28,0.0005,3e-05,248,50.57,55.7,53.01,50.79,54.77,52.71,50.93,53.39,52.13,50.55,53.91,52.18


In [11]:
test_5_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,50.7,54.97,52.74,50.83,54.11,52.41,51.02,53.09,52.03,50.7,53.11,51.87
std,0.0,0.0,0.0,0.0,0.53,0.79,0.46,0.41,0.94,0.37,0.37,0.72,0.31,0.51,1.02,0.46
min,28.0,0.0,0.0,248.0,50.14,53.64,52.04,50.19,52.64,51.78,50.62,51.92,51.48,50.21,51.47,51.08
25%,28.0,0.0,0.0,248.0,50.54,54.94,52.55,50.79,53.74,52.45,50.89,52.9,52.13,50.51,52.78,51.83
50%,28.0,0.0,0.0,248.0,50.57,55.21,52.9,50.88,54.49,52.51,50.93,53.39,52.13,50.55,53.56,52.11
75%,28.0,0.0,0.0,248.0,50.65,55.35,53.01,50.95,54.77,52.62,51.05,53.5,52.16,50.69,53.82,52.16
max,28.0,0.0,0.0,248.0,51.58,55.7,53.21,51.34,54.93,52.71,51.63,53.73,52.25,51.56,53.91,52.18


CDA, fine-tuning with 2% of the documents

In [26]:
test_2_df = get_pronoun_set_overview('xlm_gn_comb_2p')
test_2_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
0,xlm_gn_comb_2p_0,28,0.0005,3e-05,248,49.95,54.5,52.13,50.64,53.54,52.05,50.75,52.53,51.62,50.42,52.18,51.28
1,xlm_gn_comb_2p_1,28,0.0005,3e-05,248,48.23,55.9,51.78,48.64,55.29,51.75,48.99,53.62,51.2,47.98,53.36,50.53
2,xlm_gn_comb_2p_4,28,0.0005,3e-05,248,50.3,53.42,51.81,50.4,52.46,51.41,50.7,50.73,50.71,50.39,50.81,50.6
3,xlm_gn_comb_2p_2,28,0.0005,3e-05,248,49.1,54.73,51.76,49.46,54.3,51.77,49.67,52.8,51.18,49.26,52.71,50.93
4,xlm_gn_comb_2p_3,28,0.0005,3e-05,248,49.41,55.24,52.16,49.73,54.54,52.02,49.74,53.01,51.32,49.24,52.99,51.05


In [27]:
test_2_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,49.4,54.76,51.93,49.77,54.03,51.8,49.97,52.54,51.21,49.46,52.41,50.88
std,0.0,0.0,0.0,0.0,0.8,0.92,0.2,0.8,1.08,0.26,0.75,1.09,0.33,1.01,0.99,0.31
min,28.0,0.0,0.0,248.0,48.23,53.42,51.76,48.64,52.46,51.41,48.99,50.73,50.71,47.98,50.81,50.53
25%,28.0,0.0,0.0,248.0,49.1,54.5,51.78,49.46,53.54,51.75,49.67,52.53,51.18,49.24,52.18,50.6
50%,28.0,0.0,0.0,248.0,49.41,54.73,51.81,49.73,54.3,51.77,49.74,52.8,51.2,49.26,52.71,50.93
75%,28.0,0.0,0.0,248.0,49.95,55.24,52.13,50.4,54.54,52.02,50.7,53.01,51.32,50.39,52.99,51.05
max,28.0,0.0,0.0,248.0,50.3,55.9,52.16,50.64,55.29,52.05,50.75,53.62,51.62,50.42,53.36,51.28


CDA, fine-tuning with 1% of the documents

In [28]:
test_1_df = get_pronoun_set_overview('xlm_gn_comb_1p')
test_1_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
0,xlm_gn_comb_1p_4,28,0.0005,3e-05,248,48.51,54.91,51.51,48.18,54.38,51.09,48.39,52.5,50.36,48.34,52.19,50.19
1,xlm_gn_comb_1p_1,28,0.0005,3e-05,248,49.24,54.96,51.94,49.56,54.29,51.82,49.49,53.1,51.23,49.25,52.57,50.86
2,xlm_gn_comb_1p_0,28,0.0005,3e-05,248,49.27,53.96,51.51,49.65,53.21,51.37,49.76,51.9,50.8,49.23,51.71,50.44
3,xlm_gn_comb_1p_3,28,0.0005,3e-05,248,50.13,53.77,51.89,50.36,53.46,51.86,50.52,52.03,51.27,49.53,51.48,50.49
4,xlm_gn_comb_1p_2,28,0.0005,3e-05,248,47.48,55.98,51.38,48.02,55.44,51.46,48.23,53.73,50.83,47.36,52.83,49.94


In [29]:
test_1_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,48.93,54.72,51.65,49.15,54.16,51.52,49.28,52.65,50.9,48.74,52.16,50.38
std,0.0,0.0,0.0,0.0,0.99,0.89,0.25,1.01,0.88,0.32,0.96,0.76,0.37,0.89,0.57,0.34
min,28.0,0.0,0.0,248.0,47.48,53.77,51.38,48.02,53.21,51.09,48.23,51.9,50.36,47.36,51.48,49.94
25%,28.0,0.0,0.0,248.0,48.51,53.96,51.51,48.18,53.46,51.37,48.39,52.03,50.8,48.34,51.71,50.19
50%,28.0,0.0,0.0,248.0,49.24,54.91,51.51,49.56,54.29,51.46,49.49,52.5,50.83,49.23,52.19,50.44
75%,28.0,0.0,0.0,248.0,49.27,54.96,51.89,49.65,54.38,51.82,49.76,53.1,51.23,49.25,52.57,50.49
max,28.0,0.0,0.0,248.0,50.13,55.98,51.94,50.36,55.44,51.86,50.52,53.73,51.27,49.53,52.83,50.86


Executing the same test, but then on data where names are not anonymised and gendered terms are not rewritten:

In [6]:
def get_pronoun_set_results(model_name):
    with open(f'data/train_logs/{model_name}') as json_file:
        logs = json.load(json_file)
    

        
    return [model_name.replace('.json',''),
            logs['epochs'], 
            logs['learning-rate'], 
            logs['bert-learning-rate'], 
            logs['seed'], 
            round(logs['hen_pron_only_test_head.jsonlines_eval'][0]['sl_p'],4)* 100,
            round(logs['hen_pron_only_test_head.jsonlines_eval'][0]['sl_r'],4)* 100,
            round(logs['hen_pron_only_test_head.jsonlines_eval'][0]['sl_f1'],4)* 100
           ]

pronoun_sets_col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'p', 'r', 'f1'] 
def get_pronoun_set_overview(setting):
    files = [file for file in os.listdir("data/train_logs/") if "xlm" in file and setting in file.lower()]
    results = []
    for file in files:
        try:
            results.append(get_pronoun_set_results(file))
        except:
            continue
    df = pd.DataFrame(results, columns=pronoun_sets_col_names)
    #df = df.style.highlight_max(color = 'lightgreen', axis = 0)
    return df

Evaluating on the hen-test set, after debiasing with 5% of the data

In [9]:
test_2_df = get_pronoun_set_overview('xlm_gn_comb_5p')
test_2_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,p,r,f1
0,xlm_gn_comb_5p_0,28,0.0005,3e-05,248,50.68,56.22,53.31
1,xlm_gn_comb_5p_1,28,0.0005,3e-05,248,51.02,56.13,53.46
2,xlm_gn_comb_5p_3,28,0.0005,3e-05,248,51.1,55.17,53.06
3,xlm_gn_comb_5p_4,28,0.0005,3e-05,248,51.81,55.41,53.55
4,xlm_gn_comb_5p_2,28,0.0005,3e-05,248,50.29,57.12,53.49


In [10]:
test_2_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,p,r,f1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,50.98,56.01,53.37
std,0.0,0.0,0.0,0.0,0.56,0.77,0.2
min,28.0,0.0,0.0,248.0,50.29,55.17,53.06
25%,28.0,0.0,0.0,248.0,50.68,55.41,53.31
50%,28.0,0.0,0.0,248.0,51.02,56.13,53.46
75%,28.0,0.0,0.0,248.0,51.1,56.22,53.49
max,28.0,0.0,0.0,248.0,51.81,57.12,53.55


## Test set performance in terms of pronoun scores

In [14]:
def get_pronoun_results(model_name):
    with open(f'data/train_logs/{model_name}') as json_file:
        logs = json.load(json_file)
    
    dev_f1s = [epoch['sl_f1'] for epoch in logs['dev_eval']]
    
    best_epoch = dev_f1s.index(max(dev_f1s))
    best_epoch_results = logs['dev_eval'][best_epoch]
    if "finetune" in model_name:
        best_epoch = logs['epochs'] - 10 + best_epoch
        
    return [model_name.replace('.json',''),
            logs['epochs'], 
            logs['learning-rate'], 
            logs['bert-learning-rate'], 
            logs['seed'], 
            best_epoch + 1, 
            round(logs['hij_test_head.jsonlines_pronoun_score'],2),
            round(logs['zij_test_head.jsonlines_pronoun_score'],2),
            round(logs['hen_test_head.jsonlines_pronoun_score'],2),
            round(logs['die_test_head.jsonlines_pronoun_score'],2),
           ]

pronoun_col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'best_epoch', 'hij', 'zij', 'hen', 'die']
def get_pronoun_overview(setting):
    files = [file for file in os.listdir("data/train_logs/") if "xlm" in file and setting in file.lower()]
    results = []
    for file in files:
        try:
            results.append(get_pronoun_results(file))
        except:
            continue
    df = pd.DataFrame(results, columns=pronoun_col_names)
    return df

Delex, fine-tuning

In [15]:
delex_fine_pronoun_df = get_pronoun_overview('xlm_delex_fine')
delex_fine_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
0,xlm_delex_fine_1234,25,0.0005,3e-05,1234,6,87.32,87.48,69.22,60.42
1,xlm_delex_fine_123,28,0.0005,3e-05,123,8,90.16,89.72,74.25,56.81
2,xlm_delex_fine_2023,27,0.0005,3e-05,2023,6,90.16,89.78,74.74,55.17
3,xlm_delex_fine_248,28,0.0005,3e-05,248,8,89.56,88.41,75.67,56.53
4,xlm_delex_fine_2020,25,0.0005,3e-05,2020,7,89.23,88.41,70.69,56.92


In [16]:
delex_fine_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,26.6,0.0,0.0,1129.6,7.0,89.29,88.76,72.91,57.17
std,1.52,0.0,0.0,920.91,1.0,1.17,0.98,2.8,1.95
min,25.0,0.0,0.0,123.0,6.0,87.32,87.48,69.22,55.17
25%,25.0,0.0,0.0,248.0,6.0,89.23,88.41,70.69,56.53
50%,27.0,0.0,0.0,1234.0,7.0,89.56,88.41,74.25,56.81
75%,28.0,0.0,0.0,2020.0,8.0,90.16,89.72,74.74,56.92
max,28.0,0.0,0.0,2023.0,8.0,90.16,89.78,75.67,60.42


Delex, full retraining

In [17]:
delex_full_pronoun_df = get_pronoun_overview('xlm_delex_full')
delex_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
0,xlm_delex_full_2020,20,0.0005,3e-05,2020,15,71.57,81.9,74.25,61.34
1,xlm_delex_full_248,20,0.0005,3e-05,248,14,80.7,83.21,64.24,57.79
2,xlm_delex_full_1234,20,0.0005,3e-05,1234,16,80.43,86.17,75.18,67.96
3,xlm_delex_full_123,20,0.0005,3e-05,123,16,78.13,83.16,75.45,66.98
4,xlm_delex_full_2023,20,0.0005,3e-05,2023,15,71.68,79.5,68.62,55.39


In [18]:
delex_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.0,0.0,0.0,1129.6,15.2,76.5,82.79,71.55,61.89
std,0.0,0.0,0.0,920.91,0.84,4.56,2.42,4.94,5.53
min,20.0,0.0,0.0,123.0,14.0,71.57,79.5,64.24,55.39
25%,20.0,0.0,0.0,248.0,15.0,71.68,81.9,68.62,57.79
50%,20.0,0.0,0.0,1234.0,15.0,78.13,83.16,74.25,61.34
75%,20.0,0.0,0.0,2020.0,16.0,80.43,83.21,75.18,66.98
max,20.0,0.0,0.0,2023.0,16.0,80.7,86.17,75.45,67.96


CDA, full retraining

In [19]:
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_gn_comb_full')
gn_combo_full_pronoun_df.sort_values('seed')

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
1,xlm_gn_comb_full_123,20,0.0005,3e-05,123,13,85.46,88.19,88.35,88.41
3,xlm_gn_comb_full_248,20,0.0005,3e-05,248,18,85.68,88.96,88.46,89.61
0,xlm_gn_comb_full_1234,20,0.0005,3e-05,1234,18,87.48,89.83,89.45,89.83
4,xlm_gn_comb_full_2020,20,0.0005,3e-05,2020,19,89.45,90.21,90.1,89.72
2,xlm_gn_comb_full_2023,20,0.0005,3e-05,2023,17,86.33,88.19,88.74,89.28


In [33]:
gn_combo_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.0,0.0,0.0,1129.6,17.0,86.88,89.08,89.02,89.37
std,0.0,0.0,0.0,920.91,2.35,1.64,0.93,0.74,0.57
min,20.0,0.0,0.0,123.0,13.0,85.46,88.19,88.35,88.41
25%,20.0,0.0,0.0,248.0,17.0,85.68,88.19,88.46,89.28
50%,20.0,0.0,0.0,1234.0,18.0,86.33,88.96,88.74,89.61
75%,20.0,0.0,0.0,2020.0,18.0,87.48,89.83,89.45,89.72
max,20.0,0.0,0.0,2023.0,19.0,89.45,90.21,90.1,89.83


CDA, fine-tuning

In [20]:
gn_combo_fine_pronoun_df = get_pronoun_overview('xlm_gn_comb_fine')
gn_combo_fine_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
0,xlm_gn_comb_fine_123,28,0.0005,3e-05,123,5,89.83,90.21,89.45,89.12
1,xlm_gn_comb_fine_248,28,0.0005,3e-05,248,8,90.76,90.6,89.94,89.67
2,xlm_gn_comb_fine_2020,25,0.0005,3e-05,2020,10,90.87,91.09,90.81,90.38
3,xlm_gn_comb_fine_1234,25,0.0005,3e-05,1234,8,90.81,90.65,90.16,89.61
4,xlm_gn_comb_fine_2023,27,0.0005,3e-05,2023,6,90.32,90.43,90.43,89.23


In [21]:
gn_combo_fine_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,26.6,0.0,0.0,1129.6,7.4,90.52,90.6,90.16,89.6
std,1.52,0.0,0.0,920.91,1.95,0.44,0.33,0.51,0.5
min,25.0,0.0,0.0,123.0,5.0,89.83,90.21,89.45,89.12
25%,25.0,0.0,0.0,248.0,6.0,90.32,90.43,89.94,89.23
50%,27.0,0.0,0.0,1234.0,8.0,90.76,90.6,90.16,89.61
75%,28.0,0.0,0.0,2020.0,8.0,90.81,90.65,90.43,89.67
max,28.0,0.0,0.0,2023.0,10.0,90.87,91.09,90.81,90.38


CDA, fine-tuning with 10% of the data

In [22]:
gn_10_pronoun_df = get_pronoun_overview('xlm_gn_comb_10')
gn_10_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
0,xlm_gn_comb_10p_2,28,0.0005,3e-05,248,8,92.62,91.69,89.23,86.22
1,xlm_gn_comb_10p_3,28,0.0005,3e-05,248,10,92.24,90.81,87.64,85.24
2,xlm_gn_comb_10p_1,28,0.0005,3e-05,248,8,92.62,91.69,89.23,86.22
3,xlm_gn_comb_10p_248,28,0.0005,3e-05,248,8,92.29,91.14,87.92,83.93
4,xlm_gn_comb_10p_4,28,0.0005,3e-05,248,9,92.29,90.98,89.17,85.51


In [15]:
gn_10_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,8.6,92.41,91.26,88.64,85.42
std,0.0,0.0,0.0,0.0,0.89,0.19,0.41,0.79,0.94
min,28.0,0.0,0.0,248.0,8.0,92.24,90.81,87.64,83.93
25%,28.0,0.0,0.0,248.0,8.0,92.29,90.98,87.92,85.24
50%,28.0,0.0,0.0,248.0,8.0,92.29,91.14,89.17,85.51
75%,28.0,0.0,0.0,248.0,9.0,92.62,91.69,89.23,86.22
max,28.0,0.0,0.0,248.0,10.0,92.62,91.69,89.23,86.22


CDA, fine-tuning with 5% of the data

In [23]:
gn_5_pronoun_df = get_pronoun_overview('xlm_gn_comb_5p')
gn_5_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
0,xlm_gn_comb_5p_0,28,0.0005,3e-05,248,10,92.51,90.16,86.39,83.6
1,xlm_gn_comb_5p_1,28,0.0005,3e-05,248,6,91.74,90.76,86.88,83.0
2,xlm_gn_comb_5p_3,28,0.0005,3e-05,248,10,91.36,90.32,87.59,82.5
3,xlm_gn_comb_5p_4,28,0.0005,3e-05,248,10,92.45,91.25,88.74,85.35
4,xlm_gn_comb_5p_2,28,0.0005,3e-05,248,6,92.02,90.81,86.99,83.82


In [24]:
gn_5_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,8.4,92.02,90.66,87.32,83.65
std,0.0,0.0,0.0,0.0,2.19,0.48,0.43,0.9,1.08
min,28.0,0.0,0.0,248.0,6.0,91.36,90.16,86.39,82.5
25%,28.0,0.0,0.0,248.0,6.0,91.74,90.32,86.88,83.0
50%,28.0,0.0,0.0,248.0,10.0,92.02,90.76,86.99,83.6
75%,28.0,0.0,0.0,248.0,10.0,92.45,90.81,87.59,83.82
max,28.0,0.0,0.0,248.0,10.0,92.51,91.25,88.74,85.35


CDA, fine-tuning with 2% of the data

In [25]:
gn_2_pronoun_df = get_pronoun_overview('xlm_gn_comb_2p')
gn_2_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
0,xlm_gn_comb_2p_0,28,0.0005,3e-05,248,10,91.09,89.72,85.51,80.7
1,xlm_gn_comb_2p_1,28,0.0005,3e-05,248,6,92.24,90.49,85.24,79.17
2,xlm_gn_comb_2p_4,28,0.0005,3e-05,248,3,90.54,89.17,83.6,78.13
3,xlm_gn_comb_2p_2,28,0.0005,3e-05,248,3,91.42,90.05,85.13,79.88
4,xlm_gn_comb_2p_3,28,0.0005,3e-05,248,9,91.69,90.38,85.95,79.5


In [26]:
gn_2_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,6.2,91.4,89.96,85.09,79.48
std,0.0,0.0,0.0,0.0,3.27,0.64,0.54,0.89,0.94
min,28.0,0.0,0.0,248.0,3.0,90.54,89.17,83.6,78.13
25%,28.0,0.0,0.0,248.0,3.0,91.09,89.72,85.13,79.17
50%,28.0,0.0,0.0,248.0,6.0,91.42,90.05,85.24,79.5
75%,28.0,0.0,0.0,248.0,9.0,91.69,90.38,85.51,79.88
max,28.0,0.0,0.0,248.0,10.0,92.24,90.49,85.95,80.7


CDA, fine-tuning with 1% of the data

In [27]:
gn_1_pronoun_df = get_pronoun_overview('xlm_gn_comb_1p_')
gn_1_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
0,xlm_gn_comb_1p_4,28,0.0005,3e-05,248,5,90.38,89.34,83.38,77.26
1,xlm_gn_comb_1p_1,28,0.0005,3e-05,248,8,91.14,90.32,85.62,79.55
2,xlm_gn_comb_1p_0,28,0.0005,3e-05,248,10,91.63,90.1,85.78,80.59
3,xlm_gn_comb_1p_3,28,0.0005,3e-05,248,10,91.85,90.71,85.95,78.73
4,xlm_gn_comb_1p_2,28,0.0005,3e-05,248,10,91.8,90.76,84.86,76.05


In [28]:
gn_1_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,28.0,0.0,0.0,248.0,8.6,91.36,90.25,85.12,78.44
std,0.0,0.0,0.0,0.0,2.19,0.62,0.58,1.06,1.81
min,28.0,0.0,0.0,248.0,5.0,90.38,89.34,83.38,76.05
25%,28.0,0.0,0.0,248.0,8.0,91.14,90.1,84.86,77.26
50%,28.0,0.0,0.0,248.0,10.0,91.63,90.32,85.62,78.73
75%,28.0,0.0,0.0,248.0,10.0,91.8,90.71,85.78,79.55
max,28.0,0.0,0.0,248.0,10.0,91.85,90.76,85.95,80.59


## Performance of debiased models on regular data
I evaluate this to inspect whether any information is lost through debiasing

In [11]:
def get_regular_results(model_name):
    with open(f'data/train_logs/{model_name}') as json_file:
        logs = json.load(json_file)
        
    return [model_name.replace('.json',''),
            logs['epochs'], 
            logs['learning-rate'], 
            logs['bert-learning-rate'], 
            logs['seed'], 
            round(logs['regular_test_head.jsonlines_eval'][0]['sl_p'],4)* 100,
            round(logs['regular_test_head.jsonlines_eval'][0]['sl_r'],4)* 100,
            round(logs['regular_test_head.jsonlines_eval'][0]['sl_f1'],4)* 100,
           ]

regular_col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'p', 'r', 'f1'] 
def get_regular_overview(setting):
    files = [file for file in os.listdir("data/train_logs/") if "xlm" in file and setting in file.lower()]
    results = []
    for file in files:
        try:
            results.append(get_regular_results(file))
        except:
            continue
    df = pd.DataFrame(results, columns=regular_col_names)
    return df

CDA, full retraining

In [14]:
gn_combo_full_regular_df = get_regular_overview('xlm_gn_comb_full')
gn_combo_full_regular_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,p,r,f1
0,xlm_gn_comb_full_1234,20,0.0005,3e-05,1234,54.41,55.81,55.1
1,xlm_gn_comb_full_123,20,0.0005,3e-05,123,56.61,52.1,54.26
2,xlm_gn_comb_full_2023,20,0.0005,3e-05,2023,56.99,51.13,53.9
3,xlm_gn_comb_full_248,20,0.0005,3e-05,248,52.96,55.53,54.21
4,xlm_gn_comb_full_2020,20,0.0005,3e-05,2020,50.87,59.73,54.94


In [15]:
gn_combo_full_regular_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,p,r,f1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.0,0.0,0.0,1129.6,54.37,54.86,54.48
std,0.0,0.0,0.0,920.91,2.56,3.41,0.51
min,20.0,0.0,0.0,123.0,50.87,51.13,53.9
25%,20.0,0.0,0.0,248.0,52.96,52.1,54.21
50%,20.0,0.0,0.0,1234.0,54.41,55.53,54.26
75%,20.0,0.0,0.0,2020.0,56.61,55.81,54.94
max,20.0,0.0,0.0,2023.0,56.99,59.73,55.1


CDA, fine-tuning

In [20]:
gn_combo_fine_regular_df = get_regular_overview('xlm_gn_comb_fine')
gn_combo_fine_regular_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,p,r,f1
0,xlm_gn_comb_fine_123,28,0.0005,3e-05,123,54.33,55.97,55.14
1,xlm_gn_comb_fine_248,28,0.0005,3e-05,248,54.71,54.69,54.7
2,xlm_gn_comb_fine_2020,25,0.0005,3e-05,2020,53.76,57.95,55.77
3,xlm_gn_comb_fine_1234,25,0.0005,3e-05,1234,54.65,56.4,55.51
4,xlm_gn_comb_fine_2023,27,0.0005,3e-05,2023,53.43,56.11,54.74


In [21]:
gn_combo_fine_regular_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,p,r,f1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,26.6,0.0,0.0,1129.6,54.18,56.22,55.17
std,1.52,0.0,0.0,920.91,0.56,1.17,0.47
min,25.0,0.0,0.0,123.0,53.43,54.69,54.7
25%,25.0,0.0,0.0,248.0,53.76,55.97,54.74
50%,27.0,0.0,0.0,1234.0,54.33,56.11,55.14
75%,28.0,0.0,0.0,2020.0,54.65,56.4,55.51
max,28.0,0.0,0.0,2023.0,54.71,57.95,55.77


Delex, fine-tuning

In [36]:
delex_fine_regular_df = get_regular_overview('xlm_delex_fine')
delex_fine_regular_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,p,r,f1
0,xlm_delex_fine_1234,25,0.0005,3e-05,1234,51.72,55.73,53.65
1,xlm_delex_fine_123,28,0.0005,3e-05,123,51.94,59.57,55.49
2,xlm_delex_fine_2023,27,0.0005,3e-05,2023,50.77,59.43,54.76
3,xlm_delex_fine_248,28,0.0005,3e-05,248,52.24,54.58,53.38
4,xlm_delex_fine_2020,25,0.0005,3e-05,2020,53.35,55.98,54.63


In [37]:
delex_fine_regular_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,p,r,f1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,26.6,0.0,0.0,1129.6,52.0,57.06,54.38
std,1.52,0.0,0.0,920.91,0.93,2.29,0.86
min,25.0,0.0,0.0,123.0,50.77,54.58,53.38
25%,25.0,0.0,0.0,248.0,51.72,55.73,53.65
50%,27.0,0.0,0.0,1234.0,51.94,55.98,54.63
75%,28.0,0.0,0.0,2020.0,52.24,59.43,54.76
max,28.0,0.0,0.0,2023.0,53.35,59.57,55.49


Delex, full retraining

In [47]:
delex_full_regular_df = get_regular_overview('xlm_delex_full')
delex_full_regular_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,p,r,f1
0,xlm_delex_full_2020,20,0.0005,3e-05,2020,50.85,54.52,52.62
1,xlm_delex_full_248,20,0.0005,3e-05,248,54.89,52.05,53.43
2,xlm_delex_full_1234,20,0.0005,3e-05,1234,51.98,55.85,53.85
3,xlm_delex_full_123,20,0.0005,3e-05,123,52.1,53.58,52.83
4,xlm_delex_full_2023,20,0.0005,3e-05,2023,54.69,50.43,52.47


In [49]:
delex_full_regular_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,p,r,f1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.0,0.0,0.0,1129.6,52.9,53.29,53.04
std,0.0,0.0,0.0,920.91,1.79,2.11,0.58
min,20.0,0.0,0.0,123.0,50.85,50.43,52.47
25%,20.0,0.0,0.0,248.0,51.98,52.05,52.62
50%,20.0,0.0,0.0,1234.0,52.1,53.58,52.83
75%,20.0,0.0,0.0,2020.0,54.69,54.52,53.43
max,20.0,0.0,0.0,2023.0,54.89,55.85,53.85
