In [1]:
import os
import json
import pandas as pd
import glob
import yaml

In [2]:
def open_data_file(filepath):
    data = None
    with open(filepath, 'r') as f:
        if filepath.endswith('.json'):
            data = json.load(f)
        else:
            data = yaml.safe_load(f)
    return data

def load_data_dict_from_folder(finetuning_dir):
    data = {}
    results_path = os.path.join(finetuning_dir, 'all_results.json')
    hparams_path = os.path.join(finetuning_dir, 'config.yaml')
    if not os.path.exists(results_path) or not os.path.exists(hparams_path):
        return
    config = open_data_file(hparams_path)
    data['model_name_or_path'] = config['model_name_or_path']
    data['task_name'] = config['dataset_config_name']
    data['hparam_learning_rate'] = config['learning_rate']
    data['hparam_batch_size'] = config['per_device_train_batch_size']
    results = open_data_file(results_path)
    data['f1'] = results['predict_f1']
    data['eval_f1'] = results['eval_f1']
    return data

In [86]:
DATA_PATH = "/workspace/models/evaluate_lm_tese"
study_names_to_include = [
    "ju_ner_focus_v1_2",
    "ju_ner_focus_v1_1",
    "ju_cls_focus_v1",
    "cnj_cls_focus_v1",
    "cnj_ner_focus_v1",
    "cnj_ner_focus_v1_retry",
    #"ju_ner_focus_v2",
    #"ju_cls_focus_v2",
    #"cnj_cls_focus_v2",
    #"cnj_ner_focus_v2",
]

In [91]:
all_res = []
for study in study_names_to_include:
    study_dir = os.path.join(DATA_PATH, study)
    if not os.path.exists(study_dir):
        print(f'{study_dir} not exists')
        continue
    trials_list = [d for d in os.listdir(os.path.join(DATA_PATH, study)) if 'trial_' in d]
    for trial_id in trials_list:
        res = load_data_dict_from_folder(os.path.join(study_dir, trial_id))
        if res:
            res['trial_id'] = trial_id
            all_res.append(res)
df_res = pd.DataFrame(all_res)

/workspace/models/evaluate_lm_tese/ju_ner_focus_v1_2 not exists
/workspace/models/evaluate_lm_tese/ju_ner_focus_v1_1 not exists
/workspace/models/evaluate_lm_tese/ju_cls_focus_v1 not exists
/workspace/models/evaluate_lm_tese/cnj_cls_focus_v1 not exists


In [94]:
df_res.groupby('model_name_or_path')['f1'].count()

model_name_or_path
eduagarcia-temp/brwac_large_v1_2__checkpoint_last    19
eduagarcia-temp/brwac_v1_2__checkpoint_last           9
eduagarcia-temp/cnj_v1_2__checkpoint_last            16
eduagarcia/r_j_v2_checkpoint_12000                   24
neuralmind/bert-base-portuguese-cased                20
neuralmind/bert-large-portuguese-cased               12
projetocnj/roberta-base-juridico-v0.3                12
Name: f1, dtype: int64

In [95]:
df_res.groupby('task_name')['f1'].count()

task_name
ceia-entidades    39
datalawyer-ner    37
fgv-coarse        36
Name: f1, dtype: int64

In [96]:
df_res.groupby(['task_name'])['eval_f1'].describe().sort_values('std', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
task_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ceia-entidades,39.0,0.793723,0.132911,0.0,0.807405,0.820154,0.826939,0.839958
fgv-coarse,36.0,0.752656,0.017668,0.718442,0.743242,0.749269,0.760164,0.792774
datalawyer-ner,37.0,0.852961,0.017503,0.820538,0.839107,0.849091,0.865484,0.885525


In [97]:
df_res.groupby('model_name_or_path')['eval_f1'].mean().sort_values(ascending=False)

model_name_or_path
projetocnj/roberta-base-juridico-v0.3                0.821713
eduagarcia-temp/brwac_v1_2__checkpoint_last          0.814273
eduagarcia-temp/brwac_large_v1_2__checkpoint_last    0.807846
eduagarcia/r_j_v2_checkpoint_12000                   0.807450
neuralmind/bert-base-portuguese-cased                0.798372
eduagarcia-temp/cnj_v1_2__checkpoint_last            0.795539
neuralmind/bert-large-portuguese-cased               0.749782
Name: eval_f1, dtype: float64

In [98]:
df_res.groupby('model_name_or_path')['f1'].mean().sort_values(ascending=False)

model_name_or_path
projetocnj/roberta-base-juridico-v0.3                0.828656
eduagarcia/r_j_v2_checkpoint_12000                   0.820942
eduagarcia-temp/cnj_v1_2__checkpoint_last            0.817351
eduagarcia-temp/brwac_v1_2__checkpoint_last          0.816599
neuralmind/bert-base-portuguese-cased                0.813946
eduagarcia-temp/brwac_large_v1_2__checkpoint_last    0.811740
neuralmind/bert-large-portuguese-cased               0.758358
Name: f1, dtype: float64

In [100]:
pd.set_option('display.max_rows', 200)
df_res.groupby(['task_name', 'model_name_or_path']).max()[['f1']].sort_values(['task_name', 'f1'], ascending=False).head

<bound method NDFrame.head of                                                                         f1
task_name      model_name_or_path                                         
fgv-coarse     eduagarcia/r_j_v2_checkpoint_12000                 0.808250
               eduagarcia-temp/brwac_large_v1_2__checkpoint_last  0.808061
               projetocnj/roberta-base-juridico-v0.3              0.806596
               neuralmind/bert-large-portuguese-cased             0.806237
               eduagarcia-temp/cnj_v1_2__checkpoint_last          0.802121
               neuralmind/bert-base-portuguese-cased              0.795889
               eduagarcia-temp/brwac_v1_2__checkpoint_last        0.787975
datalawyer-ner neuralmind/bert-base-portuguese-cased              0.885339
               projetocnj/roberta-base-juridico-v0.3              0.878879
               neuralmind/bert-large-portuguese-cased             0.877336
               eduagarcia-temp/cnj_v1_2__checkpoint_last          0.86

In [11]:
task_to_consider = [
    'datalawyer-frases',
    'LeNER-Br',
    'ceia-frases',
    'rrip',
    'UlyssesNER-Br-PL-fine',
    'UlyssesNER-Br-PL-coarse',
    'UlyssesNER-Br-C-coarse',
    'UlyssesNER-Br-C-fine',
    'mapa_pt_coarse',
    'mapa_pt_fine'
]
filter_tasks_df = df_res[df_res['task_name'].isin(task_to_consider)]

In [12]:
filter_tasks_df.groupby('model_name_or_path')['eval_f1'].mean().sort_values(ascending=False)

model_name_or_path
eduagarcia-temp/brwac_large_v1_2__checkpoint_last    0.842606
neuralmind/bert-large-portuguese-cased               0.842200
eduagarcia-temp/brwac_v1_2__checkpoint_last          0.830269
neuralmind/bert-base-portuguese-cased                0.823985
eduagarcia/r_j_v2_checkpoint_12000                   0.821876
eduagarcia-temp/cnj_v1_2__checkpoint_last            0.815678
projetocnj/roberta-base-juridico-v0.3                0.813277
Name: eval_f1, dtype: float64

In [13]:
filter_tasks_df.groupby(['task_name', 'model_name_or_path']).mean()[['f1']].sort_values(['task_name', 'f1'], ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1
task_name,model_name_or_path,Unnamed: 2_level_1
rrip,eduagarcia/r_j_v2_checkpoint_12000,0.834518
rrip,projetocnj/roberta-base-juridico-v0.3,0.83216
rrip,neuralmind/bert-large-portuguese-cased,0.831162
rrip,eduagarcia-temp/cnj_v1_2__checkpoint_last,0.825999
rrip,neuralmind/bert-base-portuguese-cased,0.820164
rrip,eduagarcia-temp/brwac_large_v1_2__checkpoint_last,0.815074
rrip,eduagarcia-temp/brwac_v1_2__checkpoint_last,0.80259
mapa_pt_fine,neuralmind/bert-large-portuguese-cased,0.921605
mapa_pt_fine,eduagarcia-temp/brwac_large_v1_2__checkpoint_last,0.918709
mapa_pt_fine,neuralmind/bert-base-portuguese-cased,0.914871


In [14]:
filter_tasks_df.groupby(['task_name', 'model_name_or_path']).max()[['f1']].sort_values(['task_name', 'f1'], ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1
task_name,model_name_or_path,Unnamed: 2_level_1
rrip,eduagarcia/r_j_v2_checkpoint_12000,0.848454
rrip,neuralmind/bert-large-portuguese-cased,0.845761
rrip,projetocnj/roberta-base-juridico-v0.3,0.844371
rrip,eduagarcia-temp/cnj_v1_2__checkpoint_last,0.835208
rrip,neuralmind/bert-base-portuguese-cased,0.829627
rrip,eduagarcia-temp/brwac_large_v1_2__checkpoint_last,0.826734
rrip,eduagarcia-temp/brwac_v1_2__checkpoint_last,0.821618
mapa_pt_fine,eduagarcia-temp/brwac_v1_2__checkpoint_last,0.935897
mapa_pt_fine,neuralmind/bert-base-portuguese-cased,0.935567
mapa_pt_fine,eduagarcia-temp/brwac_large_v1_2__checkpoint_last,0.934866


In [15]:
filter_tasks_df.sort_values(['eval_f1'], ascending=False).groupby(['model_name_or_path', 'task_name']).head(1)

Unnamed: 0,model_name_or_path,task_name,hparam_learning_rate,hparam_batch_size,f1,eval_f1
294,neuralmind/bert-large-portuguese-cased,mapa_pt_fine,2.5e-05,16,0.925786,0.968661
363,eduagarcia-temp/brwac_v1_2__checkpoint_last,mapa_pt_fine,2.5e-05,32,0.923469,0.96648
199,neuralmind/bert-base-portuguese-cased,mapa_pt_fine,5e-05,16,0.911652,0.965909
240,eduagarcia-temp/brwac_large_v1_2__checkpoint_last,mapa_pt_fine,5e-05,16,0.891534,0.962751
198,eduagarcia/r_j_v2_checkpoint_12000,mapa_pt_fine,2.5e-05,16,0.896465,0.930748
312,projetocnj/roberta-base-juridico-v0.3,mapa_pt_fine,2.5e-05,16,0.802938,0.930636
243,eduagarcia-temp/cnj_v1_2__checkpoint_last,mapa_pt_fine,1e-05,16,0.791027,0.913165
21,eduagarcia-temp/brwac_large_v1_2__checkpoint_last,UlyssesNER-Br-PL-coarse,2.5e-05,32,0.894231,0.906115
88,projetocnj/roberta-base-juridico-v0.3,LeNER-Br,5e-05,16,0.911755,0.902003
98,eduagarcia-temp/brwac_large_v1_2__checkpoint_last,UlyssesNER-Br-PL-fine,2.5e-05,16,0.886914,0.900777


In [26]:
best_hparams_df = filter_tasks_df.sort_values('eval_f1', ascending=False).groupby(['model_name_or_path', 'task_name']).head(3).groupby(['model_name_or_path', 'task_name'])[['f1', 'eval_f1']].mean().reset_index()
best_hparams_df.sort_values('f1', ascending=False)

Unnamed: 0,model_name_or_path,task_name,f1,eval_f1
48,neuralmind/bert-base-portuguese-cased,mapa_pt_fine,0.925356,0.962962
58,neuralmind/bert-large-portuguese-cased,mapa_pt_fine,0.923265,0.96197
18,eduagarcia-temp/brwac_v1_2__checkpoint_last,mapa_pt_fine,0.921088,0.963803
8,eduagarcia-temp/brwac_large_v1_2__checkpoint_last,mapa_pt_fine,0.914395,0.96055
60,projetocnj/roberta-base-juridico-v0.3,LeNER-Br,0.912964,0.894165
30,eduagarcia/r_j_v2_checkpoint_12000,LeNER-Br,0.906193,0.897018
20,eduagarcia-temp/cnj_v1_2__checkpoint_last,LeNER-Br,0.896819,0.879509
38,eduagarcia/r_j_v2_checkpoint_12000,mapa_pt_fine,0.89353,0.919857
3,eduagarcia-temp/brwac_large_v1_2__checkpoint_last,UlyssesNER-Br-PL-coarse,0.893515,0.901497
0,eduagarcia-temp/brwac_large_v1_2__checkpoint_last,LeNER-Br,0.892277,0.888999


In [27]:
best_hparams_df.groupby(['task_name'])['f1'].describe().sort_values('std', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
task_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
mapa_pt_coarse,7.0,0.679646,0.061402,0.554976,0.670551,0.703342,0.718551,0.721001
mapa_pt_fine,7.0,0.888287,0.047713,0.819327,0.85729,0.914395,0.922177,0.925356
UlyssesNER-Br-C-fine,7.0,0.730523,0.029728,0.695546,0.711703,0.720454,0.749812,0.77463
UlyssesNER-Br-C-coarse,7.0,0.739857,0.027375,0.693843,0.726479,0.744604,0.754896,0.7778
UlyssesNER-Br-PL-coarse,7.0,0.868481,0.019076,0.839119,0.857152,0.870165,0.88113,0.893515
UlyssesNER-Br-PL-fine,7.0,0.848142,0.014511,0.832238,0.838288,0.844468,0.857498,0.868716
ceia-frases,7.0,0.805455,0.012367,0.78533,0.797103,0.810187,0.81545,0.817559
rrip,7.0,0.826846,0.011149,0.805858,0.823459,0.828472,0.833803,0.839072
LeNER-Br,7.0,0.895042,0.011013,0.883363,0.886839,0.892277,0.901506,0.912964
datalawyer-frases,7.0,0.837699,0.008517,0.828915,0.830067,0.835841,0.84493,0.849139


In [28]:
best_hparams_df.groupby(['model_name_or_path'])['f1'].describe().sort_values('mean', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
model_name_or_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
eduagarcia-temp/brwac_large_v1_2__checkpoint_last,10.0,0.829092,0.062566,0.721001,0.782654,0.826241,0.88611,0.914395
neuralmind/bert-large-portuguese-cased,10.0,0.82374,0.067619,0.703342,0.776416,0.836868,0.870102,0.923265
eduagarcia/r_j_v2_checkpoint_12000,10.0,0.82157,0.067274,0.71713,0.762686,0.84177,0.864908,0.906193
neuralmind/bert-base-portuguese-cased,10.0,0.816183,0.067871,0.719972,0.768884,0.827004,0.857492,0.925356
eduagarcia-temp/brwac_v1_2__checkpoint_last,10.0,0.813249,0.0781,0.698925,0.745429,0.817667,0.88062,0.921088
projetocnj/roberta-base-juridico-v0.3,10.0,0.796195,0.082578,0.642177,0.745012,0.824761,0.838916,0.912964
eduagarcia-temp/cnj_v1_2__checkpoint_last,10.0,0.783955,0.102317,0.554976,0.731741,0.824519,0.841007,0.896819


In [22]:
best_hparams_df[['model_name_or_path', 'task_name', 'f1']].sort_values(['task_name', 'f1'], ascending=False).set_index(['task_name', 'model_name_or_path'])

Unnamed: 0_level_0,Unnamed: 1_level_0,f1
task_name,model_name_or_path,Unnamed: 2_level_1
rrip,eduagarcia/r_j_v2_checkpoint_12000,0.839072
rrip,neuralmind/bert-large-portuguese-cased,0.837894
rrip,eduagarcia-temp/cnj_v1_2__checkpoint_last,0.829711
rrip,projetocnj/roberta-base-juridico-v0.3,0.828472
rrip,eduagarcia-temp/brwac_large_v1_2__checkpoint_last,0.823567
rrip,neuralmind/bert-base-portuguese-cased,0.823351
rrip,eduagarcia-temp/brwac_v1_2__checkpoint_last,0.805858
mapa_pt_fine,neuralmind/bert-base-portuguese-cased,0.925356
mapa_pt_fine,neuralmind/bert-large-portuguese-cased,0.923265
mapa_pt_fine,eduagarcia-temp/brwac_v1_2__checkpoint_last,0.921088


In [19]:
best_hparams_df.groupby('model_name_or_path')['f1'].mean().sort_values(ascending=False)

model_name_or_path
eduagarcia-temp/brwac_large_v1_2__checkpoint_last    0.829092
neuralmind/bert-large-portuguese-cased               0.823740
eduagarcia/r_j_v2_checkpoint_12000                   0.821570
neuralmind/bert-base-portuguese-cased                0.816183
eduagarcia-temp/brwac_v1_2__checkpoint_last          0.813249
projetocnj/roberta-base-juridico-v0.3                0.796195
eduagarcia-temp/cnj_v1_2__checkpoint_last            0.783955
Name: f1, dtype: float64