In this notebook, I evaluate the performance of the regular model on:
* the regular data, in terms of LEA score
* the pronoun-specific data, in terms of LEA score
* the pronoun-specific data, in terms of pronouns score
* deformed test data, in which names are anonymised and nouns are rewritten, but the pronouns not swapped

In [2]:
import json
import matplotlib.pyplot as plt
import os

import pandas as pd

## Model dev performance overview
This is evaluated in order to only keep the weigths for the best epoch

In [6]:
def get_training_results(model_name):
    with open(f'data/train_logs/{model_name}') as json_file:
        logs = json.load(json_file)
    dev_f1s = [epoch['sl_f1'] for epoch in logs['dev_eval']]
    
    best_epoch = dev_f1s.index(max(dev_f1s))
    best_epoch_results = logs['dev_eval'][best_epoch]
    if "finetune" in model_name:
        best_epoch = logs['epochs'] - 10 + best_epoch
        
    return [model_name.replace('.json',''),
            logs['epochs'], 
            logs['learning-rate'], 
            logs['bert-learning-rate'], 
            logs['seed'], 
            best_epoch + 1, 
            round(best_epoch_results['wl_p'],4) * 100,
            round(best_epoch_results['wl_r'],4) * 100,
            round(best_epoch_results['wl_f1'],4) * 100,
            round(best_epoch_results['sl_p'],4) * 100,
            round(best_epoch_results['sl_r'],4) * 100,
            round(best_epoch_results['sl_f1'],4) * 100,
           ]

In [7]:
col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'best_epoch', 'wl P', 'wl R', 'wl F1','P', 'R', 'F1']
def get_training_overview(setting):
    files = [file for file in os.listdir("data/train_logs/") if "xlm" in file and setting in file.lower()]
    results = []
    print(files)
    for file in files:
        results.append(get_training_results(file))
    df = pd.DataFrame(results, columns=col_names)
    return df

Performance on the regular DEV data

In [8]:
regular_dev = get_training_overview('xlm_regular')
regular_dev

['xlm_regular_123.json', 'xlm_regular_2020.json', 'xlm_regular_248.json', 'xlm_regular_1234.json', 'xlm_regular_2023.json']


Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
0,xlm_regular_123,21,0.0005,3e-05,123,18,52.13,67.43,58.8,49.47,62.9,55.38
1,xlm_regular_2020,21,0.0005,3e-05,2020,15,58.01,59.45,58.72,55.14,55.54,55.34
2,xlm_regular_248,20,0.0005,3e-05,248,18,55.13,63.06,58.83,52.06,58.58,55.13
3,xlm_regular_1234,21,0.0005,3e-05,1234,15,57.45,59.44,58.43,54.5,55.54,55.02
4,xlm_regular_2023,21,0.0005,3e-05,2023,17,57.15,60.97,59.0,54.02,56.77,55.36


In [11]:
regular_dev.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,wl P,wl R,wl F1,P,R,F1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.8,0.0,0.0,1129.6,16.6,55.97,62.07,58.76,53.04,57.87,55.25
std,0.45,0.0,0.0,920.91,1.52,2.41,3.34,0.21,2.3,3.08,0.16
min,20.0,0.0,0.0,123.0,15.0,52.13,59.44,58.43,49.47,55.54,55.02
25%,21.0,0.0,0.0,248.0,15.0,55.13,59.45,58.72,52.06,55.54,55.13
50%,21.0,0.0,0.0,1234.0,17.0,57.15,60.97,58.8,54.02,56.77,55.34
75%,21.0,0.0,0.0,2020.0,18.0,57.45,63.06,58.83,54.5,58.58,55.36
max,21.0,0.0,0.0,2023.0,18.0,58.01,67.43,59.0,55.14,62.9,55.38


## Model TEST results overview in terms of LEA score

In [13]:
def get_test_results(model_name):
    with open(f'data/train_logs/{model_name}') as json_file:
        logs = json.load(json_file)
        #print(logs)
        
    test_logs = logs['regular_test_head.jsonlines_eval'][0]
        
    return [model_name.replace('.json',''),
            logs['epochs'], 
            logs['learning-rate'], 
            logs['bert-learning-rate'], 
            logs['seed'], 
            round(test_logs['sl_p'],4)*100,
            round(test_logs['sl_r'],4)*100,
            round(test_logs['sl_f1'],4)*100,
           ]

test_col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'P', 'R', 'F1']
def get_test_overview(setting):
    files = [file for file in os.listdir("data/train_logs/") if "xlm" in file and setting in file.lower()]
    results = []
    print(files)
    for file in files:
        results.append(get_test_results(file))
    df = pd.DataFrame(results, columns=test_col_names)
    return df

F1-score performance on the test set of the regular data

In [20]:
test_full_df = get_test_overview('xlm_regular')
test_full_df

['xlm_regular_123.json', 'xlm_regular_2020.json', 'xlm_regular_248.json', 'xlm_regular_1234.json', 'xlm_regular_2023.json']


Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,P,R,F1
0,xlm_regular_123,21,0.0005,3e-05,123,51.82,60.23,55.71
1,xlm_regular_2020,21,0.0005,3e-05,2020,57.44,53.89,55.61
2,xlm_regular_248,20,0.0005,3e-05,248,54.57,56.13,55.34
3,xlm_regular_1234,21,0.0005,3e-05,1234,56.54,53.49,54.98
4,xlm_regular_2023,21,0.0005,3e-05,2023,57.05,55.4,56.21


In [21]:
test_full_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,P,R,F1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.8,0.0,0.0,1129.6,55.48,55.83,55.57
std,0.45,0.0,0.0,920.91,2.33,2.69,0.46
min,20.0,0.0,0.0,123.0,51.82,53.49,54.98
25%,21.0,0.0,0.0,248.0,54.57,53.89,55.34
50%,21.0,0.0,0.0,1234.0,56.54,55.4,55.61
75%,21.0,0.0,0.0,2020.0,57.05,56.13,55.71
max,21.0,0.0,0.0,2023.0,57.44,60.23,56.21


## LEA scores on the pronoun specific test sets

In [14]:
def get_pronoun_set_results(model_name):
    with open(f'data/train_logs/{model_name}') as json_file:
        logs = json.load(json_file)
          
    return [model_name.replace('.json',''),
            logs['epochs'], 
            logs['learning-rate'], 
            logs['bert-learning-rate'], 
            logs['seed'], 
            round(logs['hij_test_head.jsonlines_eval'][0]['sl_p'],4)* 100,
            round(logs['hij_test_head.jsonlines_eval'][0]['sl_r'],4)* 100,
            round(logs['hij_test_head.jsonlines_eval'][0]['sl_f1'],4)* 100,
            round(logs['zij_test_head.jsonlines_eval'][0]['sl_p'],4)* 100,
            round(logs['zij_test_head.jsonlines_eval'][0]['sl_r'],4)* 100,
            round(logs['zij_test_head.jsonlines_eval'][0]['sl_f1'],4)* 100,
            round(logs['hen_test_head.jsonlines_eval'][0]['sl_p'],4)* 100,
            round(logs['hen_test_head.jsonlines_eval'][0]['sl_r'],4)* 100,
            round(logs['hen_test_head.jsonlines_eval'][0]['sl_f1'],4)* 100,
            round(logs['die_test_head.jsonlines_eval'][0]['sl_p'],4)* 100,
            round(logs['die_test_head.jsonlines_eval'][0]['sl_r'],4)* 100,
            round(logs['die_test_head.jsonlines_eval'][0]['sl_f1'],4)* 100,
           ]

pronoun_sets_col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'hij p', 'hij r', \
                          'hij f1', 'zij p', 'zij r', 'zij f1', 'hen p', 'hen r', 'hen f1', 'die p', 'die r', 'die f1'] 
def get_pronoun_set_overview(setting):
    files = [file for file in os.listdir("data/train_logs/") if "xlm" in file and setting in file.lower()]
    results = []
    for file in files:
            results.append(get_pronoun_set_results(file))
    df = pd.DataFrame(results, columns=pronoun_sets_col_names)
    return df

LEA scores on all pronoun specific test sets

In [15]:
test_full_pronoun_df = get_pronoun_set_overview('regular')
test_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
0,xlm_regular_123,21,0.0005,3e-05,123,49.28,54.35,51.69,49.34,53.04,51.12,49.05,50.88,49.95,48.01,48.47,48.24
1,xlm_regular_2020,21,0.0005,3e-05,2020,54.35,48.17,51.07,53.83,47.82,50.65,54.86,44.0,48.83,53.73,43.93,48.34
2,xlm_regular_248,20,0.0005,3e-05,248,53.23,49.17,51.12,52.86,48.45,50.56,52.91,46.14,49.29,53.02,45.73,49.1
3,xlm_regular_1234,21,0.0005,3e-05,1234,55.0,47.2,50.8,55.3,46.18,50.33,55.49,42.54,48.16,54.65,42.74,47.97
4,xlm_regular_2023,21,0.0005,3e-05,2023,54.3,49.43,51.75,54.57,48.18,51.17,54.16,45.52,49.47,53.36,43.84,48.14


In [21]:
test_full_pronoun_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,hij p,hij r,hij f1,zij p,zij r,zij f1,hen p,hen r,hen f1,die p,die r,die f1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.8,0.0,0.0,1129.6,53.23,49.66,51.29,53.18,48.73,50.77,53.29,45.82,49.14,52.55,44.94,48.36
std,0.45,0.0,0.0,920.91,2.3,2.76,0.42,2.33,2.56,0.37,2.56,3.16,0.68,2.61,2.24,0.44
min,20.0,0.0,0.0,123.0,49.28,47.2,50.8,49.34,46.18,50.33,49.05,42.54,48.16,48.01,42.74,47.97
25%,21.0,0.0,0.0,248.0,53.23,48.17,51.07,52.86,47.82,50.56,52.91,44.0,48.83,53.02,43.84,48.14
50%,21.0,0.0,0.0,1234.0,54.3,49.17,51.12,53.83,48.18,50.65,54.16,45.52,49.29,53.36,43.93,48.24
75%,21.0,0.0,0.0,2020.0,54.35,49.43,51.69,54.57,48.45,51.12,54.86,46.14,49.47,53.73,45.73,48.34
max,21.0,0.0,0.0,2023.0,55.0,54.35,51.75,55.3,53.04,51.17,55.49,50.88,49.95,54.65,48.47,49.1


## Pronoun scores on the pronoun-specifc test sets

In [3]:
def get_pronoun_results(model_name):
    with open(f'data/train_logs/{model_name}') as json_file:
        logs = json.load(json_file)
        
    dev_f1s = [epoch['sl_f1'] for epoch in logs['dev_eval']]
    
    best_epoch = dev_f1s.index(max(dev_f1s))
    best_epoch_results = logs['dev_eval'][best_epoch]
    if "finetune" in model_name:
        best_epoch = logs['epochs'] - 10 + best_epoch
        
    return [model_name.replace('.json',''),
            logs['epochs'], 
            logs['learning-rate'], 
            logs['bert-learning-rate'], 
            logs['seed'], 
            best_epoch + 1, 
            round(logs['hij_test_head.jsonlines_pronoun_score'],2),
            round(logs['zij_test_head.jsonlines_pronoun_score'],2),
            round(logs['hen_test_head.jsonlines_pronoun_score'],2),
            round(logs['die_test_head.jsonlines_pronoun_score'],2),
           ]

pronoun_col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'best_epoch', 'hij', 'zij', 'hen', 'die']
def get_pronoun_overview(setting):
    files = [file for file in os.listdir("data/train_logs/") if "xlm" in file and setting in file.lower()]
    results = []
    for file in files:
        try:
            results.append(get_pronoun_results(file))
        except:
            continue
    df = pd.DataFrame(results, columns=pronoun_col_names)
    return df

In [24]:
test_full_pronoun_df = get_pronoun_overview('xlm_regular')
test_full_pronoun_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
0,xlm_regular_123,21,0.0005,3e-05,123,18,89.61,88.41,78.62,50.03
1,xlm_regular_2020,21,0.0005,3e-05,2020,15,88.24,87.1,73.1,60.94
2,xlm_regular_248,20,0.0005,3e-05,248,18,88.19,86.66,78.79,65.77
3,xlm_regular_1234,21,0.0005,3e-05,1234,15,87.15,85.13,72.61,58.99
4,xlm_regular_2023,21,0.0005,3e-05,2023,17,88.63,85.95,76.11,51.72


In [25]:
test_full_pronoun_df.describe()

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,best_epoch,hij,zij,hen,die
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.8,0.0005,3e-05,1129.6,16.6,88.364,86.65,75.846,57.49
std,0.447214,0.0,3.788046e-21,920.914925,1.516575,0.886386,1.23497,2.934456,6.550866
min,20.0,0.0005,3e-05,123.0,15.0,87.15,85.13,72.61,50.03
25%,21.0,0.0005,3e-05,248.0,15.0,88.19,85.95,73.1,51.72
50%,21.0,0.0005,3e-05,1234.0,17.0,88.24,86.66,76.11,58.99
75%,21.0,0.0005,3e-05,2020.0,18.0,88.63,87.1,78.62,60.94
max,21.0,0.0005,3e-05,2023.0,18.0,89.61,88.41,78.79,65.77


# Evaluating model performance on deformed test sets
These testsets contain only the following transformations:
* name anonymisation
* noun rewriting
* name anonymisation + noun rewriting
In none of these three sets are pronouns rewritten.

In [32]:
def get_deform_results(model_name):
    with open(f'data/train_logs/{model_name}') as json_file:
        logs = json.load(json_file)
    
    return [model_name.replace('.json',''),
            logs['epochs'], 
            logs['learning-rate'], 
            logs['bert-learning-rate'], 
            logs['seed'], 
            round(logs['anon_test_head.jsonlines_eval'][0]['sl_p'],4) * 100,
            round(logs['anon_test_head.jsonlines_eval'][0]['sl_r'],4) * 100,
            round(logs['anon_test_head.jsonlines_eval'][0]['sl_f1'],4) * 100,
            round(logs['noun_test_head.jsonlines_eval'][0]['sl_p'],4) * 100,
            round(logs['noun_test_head.jsonlines_eval'][0]['sl_r'],4) * 100,
            round(logs['noun_test_head.jsonlines_eval'][0]['sl_f1'],4) * 100,
            round(logs['anon_noun_test_head.jsonlines_eval'][0]['sl_p'],4) * 100,
            round(logs['anon_noun_test_head.jsonlines_eval'][0]['sl_r'],4) * 100,
            round(logs['anon_noun_test_head.jsonlines_eval'][0]['sl_f1'],4) * 100,
           ]

deform_col_names = ['name', 'epochs', 'learning-rate', 'bert-learning-rate', 'seed', 'anon P', 'anon R', 'anon F1',  'noun P', 'noun R', 'noun F1',  'anon noun P', 'anon noun R', 'anon noun F1']
def get_deform_overview(setting):
    files = [file for file in os.listdir("data/train_logs/") if "xlm" in file and setting in file.lower()]
    results = []
    for file in files:
        try:
            results.append(get_deform_results(file))
        except:
            continue
    df = pd.DataFrame(results, columns=deform_col_names)
    #df = df.style.highlight_max(color = 'lightgreen', axis = 0)
    return df

In [36]:
test_deform_df = get_deform_overview('regular')
test_deform_df

Unnamed: 0,name,epochs,learning-rate,bert-learning-rate,seed,anon P,anon R,anon F1,noun P,noun R,noun F1,anon noun P,anon noun R,anon noun F1
0,xlm_regular_123,21,0.0005,3e-05,123,49.81,54.07,51.85,51.12,59.61,55.04,49.74,53.98,51.77
1,xlm_regular_2020,21,0.0005,3e-05,2020,55.13,48.5,51.6,56.94,53.13,54.97,54.47,48.09,51.08
2,xlm_regular_248,20,0.0005,3e-05,248,53.75,49.45,51.51,54.22,55.31,54.76,53.62,49.13,51.27
3,xlm_regular_1234,21,0.0005,3e-05,1234,55.92,47.59,51.42,56.2,53.25,54.69,55.42,47.18,50.97
4,xlm_regular_2023,21,0.0005,3e-05,2023,55.11,49.23,52.01,56.48,54.96,55.71,54.65,49.56,51.98


In [37]:
test_deform_df.describe().applymap(lambda x: f"{x:0.2f}")

Unnamed: 0,epochs,learning-rate,bert-learning-rate,seed,anon P,anon R,anon F1,noun P,noun R,noun F1,anon noun P,anon noun R,anon noun F1
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20.8,0.0,0.0,1129.6,53.94,49.77,51.68,54.99,55.25,55.03,53.58,49.59,51.41
std,0.45,0.0,0.0,920.91,2.44,2.51,0.25,2.4,2.63,0.4,2.24,2.62,0.44
min,20.0,0.0,0.0,123.0,49.81,47.59,51.42,51.12,53.13,54.69,49.74,47.18,50.97
25%,21.0,0.0,0.0,248.0,53.75,48.5,51.51,54.22,53.25,54.76,53.62,48.09,51.08
50%,21.0,0.0,0.0,1234.0,55.11,49.23,51.6,56.2,54.96,54.97,54.47,49.13,51.27
75%,21.0,0.0,0.0,2020.0,55.13,49.45,51.85,56.48,55.31,55.04,54.65,49.56,51.77
max,21.0,0.0,0.0,2023.0,55.92,54.07,52.01,56.94,59.61,55.71,55.42,53.98,51.98
