# Get config jsons from fine-tuning experiments and analyze

## Ny kjøring i 2024


In [1]:
import json, os, sys, io, shutil
from pathlib import Path
import pandas as pd
from datetime import date
from collections import Counter


In [2]:
# Get timestamps and tasks
configs_path = Path("/home/egil/gits_wsl/seq-label-github/configs/saga")
all_jsons = [f for f in configs_path.iterdir() if f.name.endswith("json")]
all_jsons = [(f.stem.split("_")[0], f.stem.split("_")[1], f) for f in all_jsons]
timestamps = sorted(list(set([j[0] for j in all_jsons])))
for t in timestamps:
    tasks = set([j[1] for j in all_jsons if j[0]==t])
    for task in tasks:
        print(f"{t}: {task}", len([j for j in all_jsons if j[0]==t and j[1]==task]))

03081014: elsa-intensity 30
03091023: elsa-intensity 60
03111215: ner2 12
03121434: elsa-polarity 36


In [3]:



name_filter = ["0115", "0117"]
name_filter = ["01191518_tsa-bin_NB-BERT_large_07-b"]
name_filter = ["03111215"] # "01170944" # 
jsons = [f[2] for f in all_jsons if any([f[0] in n for n in name_filter])]
jsons = [f[2] for f in all_jsons if name_filter[0] in f[2].name]

name_filter = "_".join(name_filter)
len(jsons)


12

In [4]:
jsons[0].read_text()

'{"timestamp": "03111215", "num_seeds": 1, "task": "ner2", "model_shortname": "norbert3-large", "machinery": "saga", "local_dataset": true, "args_dict": {"model_name_or_path": "ltg/norbert3-large", "dataset_name": "data/ner_2cat", "seed": 101, "per_device_train_batch_size": 64, "task_name": "03111215_ner2_norbert3-large", "output_dir": "/cluster/work/users/egilron/finetunes/03111215_ner2_norbert3-large", "overwrite_cache": true, "overwrite_output_dir": true, "do_train": true, "num_train_epochs": 12, "do_eval": true, "return_entity_level_metrics": false, "use_auth_token": false, "logging_strategy": "epoch", "save_strategy": "epoch", "evaluation_strategy": "epoch", "save_total_limit": 1, "load_best_model_at_end": true, "label_column_name": "ner_labels", "disable_tqdm": true, "report_to": null, "do_predict": true, "text_column_name": "tokens", "learning_rate": 1e-05, "trust_remote_code": true}, "best_epoch": 9, "train_epochs_val": [{"eval_loss": 0.01571115292608738, "eval_precision": 0.95

In [5]:
def copyover(file_l:list[Path], dest_folder:str):
    dest_folder = Path("history", dest_folder)
    dest_folder.mkdir(exist_ok=True, parents=True)
    for f in file_l:
        shutil.copy(f, dest_folder)


In [6]:

completed_paths, records = [], []
root_keys = ["timestamp",  "task",  "machinery",  "best_epoch" ]
args_keys = [ 'model_name_or_path', 'task_name', 'dataset_name', 'output_dir', 'seed' , 'per_device_train_batch_size',  'learning_rate' , 'num_train_epochs']
epoch_keys = ['eval_loss', 'eval_precision', 'eval_recall', 'eval_f1', 'eval_accuracy','epoch', 'step' ]
for j in jsons:
    jdata = json.loads(j.read_text())
    try:
        for epoch_data in jdata["train_epochs_val"]:
            record = {k:v for k,v in jdata.items() if k in root_keys}
            record["config-file"] = j.name
            record.update({k:v for k,v in jdata["args_dict"].items() if k in args_keys})
            record.update({k:v for k,v in epoch_data.items() if k in epoch_keys})
            records.append(record)
            completed_paths.append(j)
    except:
        print(j.name, "Seems not to have completed training")
df = pd.DataFrame.from_records(records)
print("Shape:", df.shape)

xc_path = Path("excels", name_filter+"_full-report.xlsx" )

df.to_excel(xc_path, index=False)
copyover(completed_paths, name_filter+"_"+configs_path.stem)
df[df["model_name_or_path"].isin(["ltg/norbert3-base"])].sort_values("eval_f1", ascending=False)[["timestamp","config-file" ,"model_name_or_path" , "dataset_name","output_dir","eval_f1"]].head(3)


03111215_ner2_nb-bert-large_04_101.json Seems not to have completed training
03111215_ner2_nb-bert_base_05_101.json Seems not to have completed training
03111215_ner2_nb-bert_base_02_101.json Seems not to have completed training
03111215_ner2_norbert3-large_03_101.json Seems not to have completed training
Shape: (96, 20)


Unnamed: 0,timestamp,config-file,model_name_or_path,dataset_name,output_dir,eval_f1


Report hyperparameters
cols = ["timestamp",   "machinery", 'model_name_or_path', 'task_name', 'dataset_name', 'seed' , 'per_device_train_batch_size',  'learning_rate' , 'num_train_epochs']
records = []
for col in cols:
    record = {col: df[col].unique()}

In [7]:
# df.at[1080,"output_dir" ]

In [8]:
for i, row in df[df["output_dir"].isin(['/cluster/work/projects/ec30/egilron/tsa-hf/01191518_tsa-bin_NorBERT_3_base'])].sort_values("eval_f1", ascending=False)[["timestamp","config-file" ,"model_name_or_path" , "dataset_name","output_dir","eval_f1"]].head(7).iterrows():
    print(row.output_dir)

In [9]:
cols = ["timestamp",   "machinery", 'model_name_or_path','dataset_name', 'seed' , 'per_device_train_batch_size',  'learning_rate' , 'num_train_epochs']
records = {}
longest = 0
for col in cols:
    records[col]= df[col].unique()
    longest = max(longest, len(records[col]))
df_vars = pd.DataFrame(columns=cols)
for i in range(longest):
    for col in cols:
        try:
            df_vars.at[i, col] =records[col][i]
            if col=="learning_rate":
                df_vars.at[i, col] =f"{records[col][i]}"

        except:
            df_vars.at[i, col] =""
df_vars
xc_path = Path("excels", name_filter+"_search-space.xlsx" )
df_vars.to_excel(xc_path, index=False)


In [10]:
# Best epochs only
df_bests = df.loc[ df["epoch"] ==df["best_epoch"]].copy()
df_bests["learning_rate"] = df_bests["learning_rate"].apply(lambda x: f"{x:.0e}")
report = df_bests[['machinery','timestamp', 'task', "config-file", 'num_train_epochs', 'best_epoch', 
       'model_name_or_path', 'seed',
       'per_device_train_batch_size',  
      'learning_rate', 'eval_loss', 'eval_precision',
       'eval_recall', 'eval_f1', 'eval_accuracy',  ]].sort_values("eval_f1", ascending=False)
xc_path = Path("excels", name_filter+"_best-report.xlsx" )
report.to_excel(xc_path, index=False)
report.head()

Unnamed: 0,machinery,timestamp,task,config-file,num_train_epochs,best_epoch,model_name_or_path,seed,per_device_train_batch_size,learning_rate,eval_loss,eval_precision,eval_recall,eval_f1,eval_accuracy
84,saga,3111215,ner2,03111215_ner2_nb-bert_base_08_101.json,12,10,NbAiLab/nb-bert-base,101,64,1e-05,0.012571,0.964323,0.96686,0.96559,0.997974
0,saga,3111215,ner2,03111215_ner2_norbert3-large_06_101.json,12,9,ltg/norbert3-large,101,64,1e-05,0.015711,0.954498,0.971068,0.962712,0.997737
48,saga,3111215,ner2,03111215_ner2_norbert3-large_00_101.json,12,5,ltg/norbert3-large,101,32,1e-05,0.015272,0.957292,0.96686,0.962052,0.997737
36,saga,3111215,ner2,03111215_ner2_norbert3-large_09_101.json,12,7,ltg/norbert3-large,101,64,5e-05,0.015519,0.960867,0.955813,0.958333,0.997678
72,saga,3111215,ner2,03111215_ner2_nb-bert_base_11_101.json,12,11,NbAiLab/nb-bert-base,101,64,5e-05,0.018261,0.954997,0.960021,0.957503,0.997427


In [11]:
list({1:11}.values())[0]
from pathlib import Path
Path("ja.json").name

'ja.json'