In this notebook, I evaluate the performance of the original and the debiased models on previously unseen pronouns, in terms of the LEA score and the pronoun score.

In [4]:
import json
import matplotlib.pyplot as plt
import os

import pandas as pd

## Neopronoun performance, in terms of the LEA score

In [3]:
def get_pronoun_set_results(model_name):
    """Load one training log and return its unseen-test precision/recall/F1 row.

    Parameters
    ----------
    model_name : str
        File name of a JSON log inside ``data/train_logs/`` (including the
        ``.json`` extension).

    Returns
    -------
    list
        ``[name, epochs, learning-rate, bert-learning-rate, seed, P, R, F1]``,
        where ``name`` is ``model_name`` with ``.json`` removed and P/R/F1 are
        the ``sl_p``/``sl_r``/``sl_f1`` values of the first
        ``unseen_test_head.jsonlines_eval`` entry, expressed as percentages
        rounded to two decimals.

    Raises
    ------
    OSError
        If the log file cannot be opened.
    ValueError
        If the file is not valid JSON.
    KeyError, IndexError
        If the log lacks one of the expected fields or has no
        ``unseen_test_head.jsonlines_eval`` entry.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(f'data/train_logs/{model_name}', encoding='utf-8') as json_file:
        logs = json.load(json_file)

    # Only the first evaluation entry is reported.
    unseen_eval = logs['unseen_test_head.jsonlines_eval'][0]

    def as_percent(score):
        # Scale first and round last: round(x, 4) * 100 reintroduces float
        # noise after rounding (e.g. 0.29 -> 28.999999999999996).
        return round(score * 100, 2)

    return [model_name.replace('.json', ''),
            logs['epochs'],
            logs['learning-rate'],
            logs['bert-learning-rate'],
            logs['seed'],
            as_percent(unseen_eval['sl_p']),
            as_percent(unseen_eval['sl_r']),
            as_percent(unseen_eval['sl_f1']),
           ]

# Column order matches the row layout returned by get_pronoun_set_results.
pronoun_sets_col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'P', 'R', 'F1']
def get_pronoun_set_overview(setting):
    """Collect the unseen-test P/R/F1 rows of every matching log into one DataFrame.

    A file in ``data/train_logs/`` matches when its name contains ``"xlm"``
    and, compared case-insensitively, contains ``setting``.

    Parameters
    ----------
    setting : str
        Substring identifying the training setting, e.g. ``'xlm_regular'``.

    Returns
    -------
    pandas.DataFrame
        One row per readable log, with columns ``pronoun_sets_col_names``.
        Rows are ordered by file name so the table is reproducible across
        runs and machines (``os.listdir`` order is arbitrary).

    Notes
    -----
    Logs that cannot be parsed or lack the expected fields are skipped with a
    warning rather than silently dropped. Any other exception (including
    ``KeyboardInterrupt``) propagates.
    """
    import warnings  # local import keeps this cell independent of the import cell

    wanted = setting.lower()
    files = sorted(
        file for file in os.listdir("data/train_logs/")
        if "xlm" in file and wanted in file.lower()
    )
    results = []
    for file in files:
        try:
            results.append(get_pronoun_set_results(file))
        except (OSError, ValueError, KeyError, IndexError, TypeError) as err:
            # Best-effort: an incomplete or unreadable log must not abort the
            # overview, but the reader should know a run is missing.
            warnings.warn(f"Skipping {file}: {type(err).__name__}: {err}")
            continue
    df = pd.DataFrame(results, columns=pronoun_sets_col_names)
    return df

Regular model

In [19]:
# Unseen-test P/R/F1 for every log whose file name contains 'xlm_regular'.
# NOTE(review): despite the 'gn_full' in its name, this frame holds the regular-model runs.
test_gn_full_df = get_pronoun_set_overview('xlm_regular')
test_gn_full_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,P,R,F1
0,xlm_regular_123,21,0.0005,3e-05,123,48.92,49.31,49.12
1,xlm_regular_2020,21,0.0005,3e-05,2020,53.97,42.75,47.71
2,xlm_regular_248,20,0.0005,3e-05,248,52.41,44.35,48.04
3,xlm_regular_1234,21,0.0005,3e-05,1234,54.75,41.8,47.41
4,xlm_regular_2023,21,0.0005,3e-05,2023,54.12,43.65,48.32


In [20]:
# Summary statistics over the numeric columns of test_gn_full_df, formatted to two
# decimals (which is why the small learning rates are displayed as zero).
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
test_gn_full_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,P,R,F1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.8,0.0,0.0,1129.6,52.83,44.37,48.12
std,0.45,0.0,0.0,920.91,2.35,2.92,0.66
min,20.0,0.0,0.0,123.0,48.92,41.8,47.41
25%,21.0,0.0,0.0,248.0,52.41,42.75,47.71
50%,21.0,0.0,0.0,1234.0,53.97,43.65,48.04
75%,21.0,0.0,0.0,2020.0,54.12,44.35,48.32
max,21.0,0.0,0.0,2023.0,54.75,49.31,49.12


Delex, full retraining

In [39]:
# Unseen-test P/R/F1 for every log whose file name contains 'xlm_delex_full'.
test_delex_full_df = get_pronoun_set_overview('xlm_delex_full')
test_delex_full_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,P,R,F1
0,xlm_delex_full_2020,20,0.0005,3e-05,2020,50.04,51.6,50.81
1,xlm_delex_full_248,20,0.0005,3e-05,248,54.43,48.18,51.12
2,xlm_delex_full_1234,20,0.0005,3e-05,1234,51.31,51.32,51.31
3,xlm_delex_full_123,20,0.0005,3e-05,123,51.06,49.57,50.3
4,xlm_delex_full_2023,20,0.0005,3e-05,2023,54.03,47.09,50.32


In [40]:
# Summary statistics over the numeric columns of test_delex_full_df, formatted to two decimals.
test_delex_full_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,P,R,F1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.0,0.0,0.0,1129.6,52.17,49.55,50.77
std,0.0,0.0,0.0,920.91,1.94,1.95,0.46
min,20.0,0.0,0.0,123.0,50.04,47.09,50.3
25%,20.0,0.0,0.0,248.0,51.06,48.18,50.32
50%,20.0,0.0,0.0,1234.0,51.31,49.57,50.81
75%,20.0,0.0,0.0,2020.0,54.03,51.32,51.12
max,20.0,0.0,0.0,2023.0,54.43,51.6,51.31


Delex, fine-tuning

In [37]:
# Unseen-test P/R/F1 for every log whose file name contains 'xlm_delex_fine'.
test_delex_fine_df = get_pronoun_set_overview('xlm_delex_fine')
test_delex_fine_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,P,R,F1
0,xlm_delex_fine_1234,25,0.0005,3e-05,1234,51.2,51.02,51.11
1,xlm_delex_fine_123,28,0.0005,3e-05,123,50.8,52.71,51.74
2,xlm_delex_fine_2023,27,0.0005,3e-05,2023,49.77,52.42,51.06
3,xlm_delex_fine_248,28,0.0005,3e-05,248,51.06,48.6,49.8
4,xlm_delex_fine_2020,25,0.0005,3e-05,2020,52.08,50.4,51.22


In [38]:
# Summary statistics over the numeric columns of test_delex_fine_df, formatted to two decimals.
test_delex_fine_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,P,R,F1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,26.6,0.0,0.0,1129.6,50.98,51.03,50.99
std,1.52,0.0,0.0,920.91,0.83,1.66,0.72
min,25.0,0.0,0.0,123.0,49.77,48.6,49.8
25%,25.0,0.0,0.0,248.0,50.8,50.4,51.06
50%,27.0,0.0,0.0,1234.0,51.06,51.02,51.11
75%,28.0,0.0,0.0,2020.0,51.2,52.42,51.22
max,28.0,0.0,0.0,2023.0,52.08,52.71,51.74


CDA, fine-tuning

In [33]:
# Unseen-test P/R/F1 for every log whose file name contains 'xlm_gn_comb_fine'.
gn_comb_fine_pronoun_df = get_pronoun_set_overview('xlm_gn_comb_fine')
gn_comb_fine_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,P,R,F1
0,xlm_gn_comb_fine_123,28,0.0005,3e-05,123,52.83,49.27,50.99
1,xlm_gn_comb_fine_248,28,0.0005,3e-05,248,53.81,49.44,51.54
2,xlm_gn_comb_fine_2020,25,0.0005,3e-05,2020,52.59,51.78,52.18
3,xlm_gn_comb_fine_1234,25,0.0005,3e-05,1234,53.69,50.78,52.19
4,xlm_gn_comb_fine_2023,27,0.0005,3e-05,2023,52.11,49.83,50.95


In [34]:
# Summary statistics over the numeric columns of gn_comb_fine_pronoun_df, formatted to two decimals.
gn_comb_fine_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,P,R,F1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,26.6,0.0,0.0,1129.6,53.01,50.22,51.57
std,1.52,0.0,0.0,920.91,0.73,1.05,0.61
min,25.0,0.0,0.0,123.0,52.11,49.27,50.95
25%,25.0,0.0,0.0,248.0,52.59,49.44,50.99
50%,27.0,0.0,0.0,1234.0,52.83,49.83,51.54
75%,28.0,0.0,0.0,2020.0,53.69,50.78,52.18
max,28.0,0.0,0.0,2023.0,53.81,51.78,52.19


CDA, full retraining

In [35]:
# Unseen-test P/R/F1 for every log whose file name contains 'xlm_gn_comb_full'.
gn_comb_full_pronoun_df = get_pronoun_set_overview('xlm_gn_comb_full')
gn_comb_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,P,R,F1
0,xlm_gn_comb_full_1234,20,0.0005,3e-05,1234,53.52,50.49,51.96
1,xlm_gn_comb_full_123,20,0.0005,3e-05,123,55.83,46.13,50.52
2,xlm_gn_comb_full_2023,20,0.0005,3e-05,2023,55.95,46.81,50.97
3,xlm_gn_comb_full_248,20,0.0005,3e-05,248,52.11,50.89,51.49
4,xlm_gn_comb_full_2020,20,0.0005,3e-05,2020,49.9,54.0,51.87


In [36]:
# Summary statistics over the numeric columns of gn_comb_full_pronoun_df, formatted to two decimals.
gn_comb_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,P,R,F1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.0,0.0,0.0,1129.6,53.46,49.66,51.36
std,0.0,0.0,0.0,920.91,2.56,3.23,0.61
min,20.0,0.0,0.0,123.0,49.9,46.13,50.52
25%,20.0,0.0,0.0,248.0,52.11,46.81,50.97
50%,20.0,0.0,0.0,1234.0,53.52,50.49,51.49
75%,20.0,0.0,0.0,2020.0,55.83,50.89,51.87
max,20.0,0.0,0.0,2023.0,55.95,54.0,51.96


## Neopronoun performance, in terms of pronoun score

In [2]:
def get_pronoun_results(model_name):
    """Load one training log and return its best epoch and unseen pronoun score.

    Parameters
    ----------
    model_name : str
        File name of a JSON log inside ``data/train_logs/`` (including the
        ``.json`` extension).

    Returns
    -------
    list
        ``[name, epochs, learning-rate, bert-learning-rate, seed, best_epoch,
        unseen]``, where ``name`` is ``model_name`` with ``.json`` removed,
        ``best_epoch`` is the 1-based position of the first ``dev_eval`` entry
        with the highest ``sl_f1``, and ``unseen`` is
        ``unseen_test_head.jsonlines_pronoun_score`` rounded to two decimals.

    Raises
    ------
    OSError
        If the log file cannot be opened.
    ValueError
        If the file is not valid JSON or ``dev_eval`` is empty.
    KeyError
        If the log lacks one of the expected fields.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(f'data/train_logs/{model_name}', encoding='utf-8') as json_file:
        logs = json.load(json_file)

    # 0-based index of the first entry that reaches the maximum sl_f1.
    dev_f1s = [epoch['sl_f1'] for epoch in logs['dev_eval']]
    best_epoch = dev_f1s.index(max(dev_f1s))
    if "finetune" in model_name:
        # Shift the index by (total epochs - 10); presumably 'dev_eval' then
        # covers only the last 10 epochs of a fine-tuning run. TODO confirm.
        # NOTE(review): file names such as 'xlm_delex_fine_1234.json' do not
        # contain "finetune", so this shift is not applied to them. Verify
        # whether that is intended.
        best_epoch = logs['epochs'] - 10 + best_epoch

    return [model_name.replace('.json', ''),
            logs['epochs'],
            logs['learning-rate'],
            logs['bert-learning-rate'],
            logs['seed'],
            best_epoch + 1,  # report epochs 1-based
            round(logs['unseen_test_head.jsonlines_pronoun_score'], 2),
           ]

# Column order matches the row layout returned by get_pronoun_results.
pronoun_col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'best_epoch', 'unseen']
def get_pronoun_overview(setting):
    """Collect the best-epoch / pronoun-score rows of every matching log into one DataFrame.

    A file in ``data/train_logs/`` matches when its name contains ``"xlm"``
    and, compared case-insensitively, contains ``setting``.

    Parameters
    ----------
    setting : str
        Substring identifying the training setting, e.g. ``'xlm_regular'``.

    Returns
    -------
    pandas.DataFrame
        One row per readable log, with columns ``pronoun_col_names``. Rows are
        ordered by file name so the table is reproducible across runs and
        machines (``os.listdir`` order is arbitrary).

    Notes
    -----
    Logs that cannot be parsed or lack the expected fields are skipped with a
    warning rather than silently dropped. Any other exception (including
    ``KeyboardInterrupt``) propagates.
    """
    import warnings  # local import keeps this cell independent of the import cell

    wanted = setting.lower()
    files = sorted(
        file for file in os.listdir("data/train_logs/")
        if "xlm" in file and wanted in file.lower()
    )
    results = []
    for file in files:
        try:
            results.append(get_pronoun_results(file))
        except (OSError, ValueError, KeyError, IndexError, TypeError) as err:
            # Best-effort: an incomplete or unreadable log must not abort the
            # overview, but the reader should know a run is missing.
            warnings.warn(f"Skipping {file}: {type(err).__name__}: {err}")
            continue
    df = pd.DataFrame(results, columns=pronoun_col_names)
    return df

Regular model

In [5]:
# Best epoch and unseen pronoun score for every log whose file name contains 'xlm_regular'.
# NOTE(review): despite the 'gn_full' in its name, this frame holds the regular-model runs.
gn_full_pronoun_df = get_pronoun_overview('xlm_regular')
gn_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,unseen
0,xlm_regular_123,21,0.0005,3e-05,123,18,49.41
1,xlm_regular_2020,21,0.0005,3e-05,2020,15,45.58
2,xlm_regular_248,20,0.0005,3e-05,248,18,48.84
3,xlm_regular_1234,21,0.0005,3e-05,1234,15,45.43
4,xlm_regular_2023,21,0.0005,3e-05,2023,17,44.13


In [22]:
# Summary statistics over the numeric columns of gn_full_pronoun_df, formatted to two
# decimals (which is why the small learning rates are displayed as zero).
gn_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,unseen
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.8,0.0,0.0,1129.6,16.6,46.68
std,0.45,0.0,0.0,920.91,1.52,2.31
min,20.0,0.0,0.0,123.0,15.0,44.13
25%,21.0,0.0,0.0,248.0,15.0,45.43
50%,21.0,0.0,0.0,1234.0,17.0,45.58
75%,21.0,0.0,0.0,2020.0,18.0,48.84
max,21.0,0.0,0.0,2023.0,18.0,49.41


Delex, fine-tuning

In [6]:
# Best epoch and unseen pronoun score for every log whose file name contains 'xlm_delex_fine'.
delex_fine_pronoun_df = get_pronoun_overview('xlm_delex_fine')
delex_fine_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,unseen
0,xlm_delex_fine_1234,25,0.0005,3e-05,1234,6,52.76
1,xlm_delex_fine_123,28,0.0005,3e-05,123,8,47.34
2,xlm_delex_fine_2023,27,0.0005,3e-05,2023,6,48.53
3,xlm_delex_fine_248,28,0.0005,3e-05,248,8,48.94
4,xlm_delex_fine_2020,25,0.0005,3e-05,2020,7,50.23


In [42]:
# Summary statistics over the numeric columns of delex_fine_pronoun_df, formatted to two decimals.
delex_fine_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,unseen
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,26.6,0.0,0.0,1129.6,7.0,49.56
std,1.52,0.0,0.0,920.91,1.0,2.07
min,25.0,0.0,0.0,123.0,6.0,47.34
25%,25.0,0.0,0.0,248.0,6.0,48.53
50%,27.0,0.0,0.0,1234.0,7.0,48.94
75%,28.0,0.0,0.0,2020.0,8.0,50.23
max,28.0,0.0,0.0,2023.0,8.0,52.76


Delex, full retraining

In [43]:
# Best epoch and unseen pronoun score for every log whose file name contains 'xlm_delex_full'.
delex_full_pronoun_df = get_pronoun_overview('xlm_delex_full')
delex_full_pronoun_df

['xlm_delex_full_2020.json', 'xlm_delex_full_248.json', 'xlm_delex_full_1234.json', 'xlm_delex_full_123.json', 'xlm_delex_full_2023.json']


Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,unseen
0,xlm_delex_full_2020,20,0.0005,3e-05,2020,15,48.42
1,xlm_delex_full_248,20,0.0005,3e-05,248,14,49.04
2,xlm_delex_full_1234,20,0.0005,3e-05,1234,16,49.82
3,xlm_delex_full_123,20,0.0005,3e-05,123,16,48.27
4,xlm_delex_full_2023,20,0.0005,3e-05,2023,15,44.6


In [44]:
# Summary statistics over the numeric columns of delex_full_pronoun_df, formatted to two decimals.
delex_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,unseen
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.0,0.0,0.0,1129.6,15.2,48.03
std,0.0,0.0,0.0,920.91,0.84,2.01
min,20.0,0.0,0.0,123.0,14.0,44.6
25%,20.0,0.0,0.0,248.0,15.0,48.27
50%,20.0,0.0,0.0,1234.0,15.0,48.42
75%,20.0,0.0,0.0,2020.0,16.0,49.04
max,20.0,0.0,0.0,2023.0,16.0,49.82


CDA, fine-tuning

In [7]:
# Best epoch and unseen pronoun score for every log whose file name contains 'xlm_gn_comb_fine'.
gn_combo_fine_pronoun_df = get_pronoun_overview('xlm_gn_comb_fine')
gn_combo_fine_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,unseen
0,xlm_gn_comb_fine_123,28,0.0005,3e-05,123,5,47.86
1,xlm_gn_comb_fine_248,28,0.0005,3e-05,248,8,53.54
2,xlm_gn_comb_fine_2020,25,0.0005,3e-05,2020,10,53.59
3,xlm_gn_comb_fine_1234,25,0.0005,3e-05,1234,8,57.78
4,xlm_gn_comb_fine_2023,27,0.0005,3e-05,2023,6,54.06


In [46]:
# Summary statistics over the numeric columns of gn_combo_fine_pronoun_df, formatted to two decimals.
gn_combo_fine_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,unseen
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,26.6,0.0,0.0,1129.6,7.4,53.37
std,1.52,0.0,0.0,920.91,1.95,3.55
min,25.0,0.0,0.0,123.0,5.0,47.86
25%,25.0,0.0,0.0,248.0,6.0,53.54
50%,27.0,0.0,0.0,1234.0,8.0,53.59
75%,28.0,0.0,0.0,2020.0,8.0,54.06
max,28.0,0.0,0.0,2023.0,10.0,57.78


CDA, full retraining

In [8]:
# Best epoch and unseen pronoun score for every log whose file name contains 'xlm_gn_comb_full'.
gn_combo_full_pronoun_df = get_pronoun_overview('xlm_gn_comb_full')
gn_combo_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,unseen
0,xlm_gn_comb_full_1234,20,0.0005,3e-05,1234,18,51.32
1,xlm_gn_comb_full_123,20,0.0005,3e-05,123,13,47.13
2,xlm_gn_comb_full_2023,20,0.0005,3e-05,2023,17,51.99
3,xlm_gn_comb_full_248,20,0.0005,3e-05,248,18,53.28
4,xlm_gn_comb_full_2020,20,0.0005,3e-05,2020,19,54.88
