In [1]:
import pandas as pd 

In [2]:
import wandb
# Open a W&B API client and list the runs of the "wuzhengx/fine_tuning" project;
# the cells below iterate over `runs` and filter them by run name.
api = wandb.Api()
runs = api.runs("wuzhengx/fine_tuning")
# Maps each task name to the history key that is read as its evaluation metric.
eval_metrics = {
    "sst3":"eval/Macro-F1",
    "mrpc":"eval/accuracy",
    "qnli":"eval/accuracy",
}

#### Performance analysis for syntactic shifts

In [3]:
task_name = "qnli" # Task to analyze; must be a key of eval_metrics ("sst3", "mrpc" or "qnli").

In [4]:
# Build one row per run of the selected task:
#   [perturbed_type, reverse_order, random_order, inoculation_p, seed, best_perf_metrics]
# The configuration is decoded from the run name, which is a "_"-separated
# sequence of "<key>_<value>" tokens (e.g. "..._seed_42_data_wikitext-15M_...").
metric_key = eval_metrics[task_name]
all_data = []
for run in runs:
    run_name = run.name
    if task_name not in run_name:
        continue

    # A run that never logged the metric has nothing to take a maximum of;
    # skip it instead of crashing on an empty history.
    history = run.history(keys=[metric_key])
    if metric_key not in history or len(history[metric_key]) == 0:
        continue
    best_perf_metrics = max(history[metric_key])

    # Reset every parsed field for each run. Without this, a run whose name
    # lacks one of the tokens would silently inherit the value parsed from the
    # previous run (or raise NameError if it happened to be the first run).
    seed = None
    reverse_order = None
    random_order = None
    perturbed_type = None
    inoculation_p = None

    name_list = run_name.split("_")
    # Stop one short of the end so that name_list[i + 1] always exists.
    for i in range(len(name_list) - 1):
        key, value = name_list[i], name_list[i + 1]
        if key == "seed":
            seed = int(value)
        elif key == "reverse":
            reverse_order = value == "True"
        elif key == "random":
            # The value may carry a trailing "/", so strip it before comparing.
            random_order = value.strip("/") == "True"
        elif key == "data":
            # "<corpus>-<size>-<perturbation...>": everything after the second
            # "-" names the perturbation; two or fewer parts means unperturbed.
            data_parts = value.split("-")
            perturbed_type = "-".join(data_parts[2:]) if len(data_parts) > 2 else "null"
        elif key == "inoculation":
            inoculation_p = float(value)

    parsed = (perturbed_type, reverse_order, random_order, inoculation_p, seed)
    if any(field is None for field in parsed):
        # The run name does not encode the full configuration (for example a
        # baseline run using a different naming scheme); leave it out rather
        # than record a row with made-up settings.
        continue
    all_data.append([perturbed_type, reverse_order, random_order, inoculation_p, seed, best_perf_metrics])

In [5]:
# Tabulate the parsed runs; the column order matches the order in which each
# row of all_data was assembled.
syntactic_columns = [
    "perturbed_type",
    "reverse_order",
    "random_order",
    "inoculation_p",
    "seed",
    "best_perf_metrics",
]
df = pd.DataFrame(data=all_data, columns=syntactic_columns)

In [6]:
# Rows with perturbed_type "null" and both ordering flags False, by inoculation_p.
plain_mask = (
    (df["perturbed_type"] == "null")
    & (df["random_order"] == False)
    & (df["reverse_order"] == False)
)
df.loc[plain_mask].sort_values(by="inoculation_p")

Unnamed: 0,perturbed_type,reverse_order,random_order,inoculation_p,seed,best_perf_metrics
2,,False,False,0.0,42,0.931539
1,,False,False,1.0,42,0.931539


In [7]:
# Rows with perturbed_type "null" and random_order set, by inoculation_p.
random_mask = (df["perturbed_type"] == "null") & df["random_order"]
df.loc[random_mask].sort_values(by="inoculation_p")

Unnamed: 0,perturbed_type,reverse_order,random_order,inoculation_p,seed,best_perf_metrics
6,,False,True,0.0,42,0.881018
5,,False,True,1.0,42,0.885228


In [8]:
# Rows with perturbed_type "null" and reverse_order set, by inoculation_p.
reverse_mask = (df["perturbed_type"] == "null") & df["reverse_order"]
df.loc[reverse_mask].sort_values(by="inoculation_p")

Unnamed: 0,perturbed_type,reverse_order,random_order,inoculation_p,seed,best_perf_metrics
4,,True,False,0.0,42,0.89676
3,,True,False,1.0,42,0.908292


In [9]:
# Rows whose perturbed_type is "en~fr@N~fr@V", by inoculation_p.
fr_fr_mask = df["perturbed_type"] == "en~fr@N~fr@V"
df.loc[fr_fr_mask].sort_values(by="inoculation_p")

Unnamed: 0,perturbed_type,reverse_order,random_order,inoculation_p,seed,best_perf_metrics
0,en~fr@N~fr@V,False,False,0.0,42,0.90756
10,en~fr@N~fr@V,False,False,1.0,42,0.914882


In [10]:
# Rows whose perturbed_type is "en~jaktc@N~jaktc@V", by inoculation_p.
jaktc_jaktc_mask = df["perturbed_type"] == "en~jaktc@N~jaktc@V"
df.loc[jaktc_jaktc_mask].sort_values(by="inoculation_p")

Unnamed: 0,perturbed_type,reverse_order,random_order,inoculation_p,seed,best_perf_metrics
11,en~jaktc@N~jaktc@V,False,False,0.0,42,0.912868
9,en~jaktc@N~jaktc@V,False,False,1.0,42,0.91836


In [11]:
# Rows whose perturbed_type is "en~fr@N~jaktc@V", by inoculation_p.
fr_jaktc_mask = df["perturbed_type"] == "en~fr@N~jaktc@V"
df.loc[fr_jaktc_mask].sort_values(by="inoculation_p")

Unnamed: 0,perturbed_type,reverse_order,random_order,inoculation_p,seed,best_perf_metrics
8,en~fr@N~jaktc@V,False,False,0.0,42,0.904448
7,en~fr@N~jaktc@V,False,False,1.0,42,0.913784


#### Performance analysis for tokenization

In [82]:
def fetch_data(task_name, run_source=None, metric_keys=None):
    """Collect best-metric rows for the tokenizer-comparison runs of one task.

    Each returned row is
    ``[model_type, tokenizer_type, reinit_random, reinit_avg, inoculation_p,
    seed, best_perf_metrics]``.

    Three kinds of runs contribute a row:

    * runs whose name contains one of the ``tokenizer_combo`` substrings
      (excluding names containing "NEED RERUN" or "9-6"); their settings are
      decoded from the "_"-separated ``<key>_<value>`` tokens of the run name;
    * the roberta-base baseline without inoculation (recorded as 0.0, seed 42);
    * the roberta-base baseline with inoculation 1.0 (seed 42).

    Args:
        task_name: Task substring to look for in run names; also the key used
            to look up the metric name.
        run_source: Iterable of run objects exposing ``.name`` and
            ``.history(keys=[...])``. Defaults to the notebook-level ``runs``.
        metric_keys: Mapping from task name to metric history key. Defaults to
            the notebook-level ``eval_metrics``.

    Returns:
        A list of rows (lists) in the order the runs were iterated.

    Raises:
        KeyError: if ``task_name`` is not a key of ``metric_keys``.
    """
    if run_source is None:
        run_source = runs
    if metric_keys is None:
        metric_keys = eval_metrics
    metric_key = metric_keys[task_name]

    tokenizer_combo = ["roberta-base_albert-base-v2", "roberta-base_bert-base-cased"]
    baseline_no_inoculation = "finetune_roberta-base_reinit_emb_True_reinit_avg_False"
    baseline_inoculated = "finetune_roberta-base_roberta-base_seed_42_data_wikitext-15M_inoculation_1.0_reverse_False_random_False_reinit_emb_True_reinit_avg_False_reinit_emb_False_reinit_avg_False"

    all_data = []
    for run in run_source:
        run_name = run.name
        if task_name not in run_name:
            continue

        is_valid = (
            any(combo in run_name for combo in tokenizer_combo)
            and "NEED RERUN" not in run_name
            and "9-6" not in run_name
        )
        # maybe it is other baselines?
        is_baseline_no_inoculation = (
            baseline_no_inoculation in run_name
            and "9-7" not in run_name
            and "9-8" not in run_name
        )
        is_baseline_inoculated = baseline_inoculated in run_name
        if not (is_valid or is_baseline_no_inoculation or is_baseline_inoculated):
            # Decide relevance from the name first so that the history of
            # runs that contribute nothing is never requested.
            continue

        # A run that never logged the metric has no maximum; skip it rather
        # than crash on an empty history.
        history = run.history(keys=[metric_key])
        if metric_key not in history or len(history[metric_key]) == 0:
            continue
        best_perf_metrics = max(history[metric_key])

        if is_valid:
            name_list = run_name.split("_")
            # Reset per run: otherwise a name lacking a token would silently
            # reuse the previous run's value (or raise UnboundLocalError).
            seed = None
            reinit_avg = None
            inoculation_p = None
            # Stop one short of the end so name_list[i + 1] always exists.
            for i in range(len(name_list) - 1):
                key, value = name_list[i], name_list[i + 1]
                if key == "seed":
                    seed = int(value)
                elif key == "inoculation":
                    inoculation_p = float(value)
                elif (
                    key == "reinit"
                    and value == "avg"
                    and reinit_avg is None  # only the first reinit_avg flag counts
                    and i + 2 < len(name_list)
                ):
                    # The value may carry a trailing "/", so strip it first.
                    reinit_avg = name_list[i + 2].strip("/") == "True"

            if all(field is not None for field in (seed, reinit_avg, inoculation_p)):
                # Positions 4 and 5 of the name hold the model and tokenizer
                # (e.g. "..._finetune_roberta-base_albert-base-v2_...").
                model_type = name_list[4]
                tokenizer_type = name_list[5]
                # NOTE(review): the reinit_random column is hard-coded to True,
                # exactly as before; the reinit_emb flag in the run name was
                # parsed previously but never used. Confirm this is intended.
                all_data.append([model_type, tokenizer_type, True, reinit_avg, inoculation_p, seed, best_perf_metrics])

        if is_baseline_no_inoculation:
            all_data.append(["roberta-base", "roberta-base", True, False, 0.0, 42, best_perf_metrics])
        if is_baseline_inoculated:
            all_data.append(["roberta-base", "roberta-base", True, False, 1.0, 42, best_perf_metrics])
    return all_data

In [91]:
task_name = "qnli" # Task to analyze; must be a key of eval_metrics ("sst3", "mrpc" or "qnli").

In [92]:
# Gather the tokenizer-comparison rows for the selected task (fetch_data reads the W&B runs).
all_data = fetch_data(task_name)

9-20_task_qnli_finetune_roberta-base_reinit_emb_False_reinit_avg_False_token_s_True_word_s_False
9-16_task_qnli_finetune_roberta-base_albert-base-v2_seed_42_data_wikitext-15M_inoculation_1.0_reverse_False_random_False_reinit_emb_False_reinit_avg_False_reinit_emb_False_reinit_avg_False
9-15_task_qnli_finetune_roberta-base_bert-base-cased_seed_42_data_wikitext-15M_inoculation_1.0_reverse_False_random_False_reinit_emb_False_reinit_avg_False_reinit_emb_False_reinit_avg_False
9-15_task_qnli_finetune_roberta-base_roberta-base_seed_42_data_wikitext-15M_inoculation_1.0_reverse_False_random_False_reinit_emb_True_reinit_avg_False_reinit_emb_False_reinit_avg_False
9-12_task_qnli_finetune_roberta-base_reinit_emb_True_reinit_avg_False
9-11_task_qnli_finetune_roberta-base_bert-base-cased_seed_42_data_wikitext-15M_inoculation_0.0_reverse_False_random_False_reinit_emb_False_reinit_avg_False
9-9_task_qnli_finetune_roberta-base_albert-base-v2_seed_42_data_wikitext-15M_inoculation_0.0_reverse_False_rando

In [93]:
# Tabulate the rows returned by fetch_data; the column order matches the order
# in which fetch_data assembles each row.
tokenization_columns = [
    "model_type",
    "tokenizer_type",
    "reinit_random",
    "reinit_avg",
    "inoculation_p",
    "seed",
    "best_perf_metrics",
]
df = pd.DataFrame(data=all_data, columns=tokenization_columns)

In [94]:
df

Unnamed: 0,model_type,tokenizer_type,reinit_random,reinit_avg,inoculation_p,seed,best_perf_metrics
0,roberta-base,albert-base-v2,True,False,1.0,42,0.817683
1,roberta-base,bert-base-cased,True,False,1.0,42,0.814571
2,roberta-base,roberta-base,True,False,1.0,42,0.811276
3,roberta-base,roberta-base,True,False,0.0,42,0.548417
4,roberta-base,bert-base-cased,True,False,0.0,42,0.556288
5,roberta-base,albert-base-v2,True,False,0.0,42,0.552444


In [34]:
task_name = "qnli" # Task to analyze; must be a key of eval_metrics ("sst3", "mrpc" or "qnli").

In [None]:
# Gather the tokenizer-comparison rows for the selected task (fetch_data reads the W&B runs).
all_data = fetch_data(task_name)

In [None]:
# Tabulate the rows returned by fetch_data; the column order matches the order
# in which fetch_data assembles each row.
tokenization_columns = [
    "model_type",
    "tokenizer_type",
    "reinit_random",
    "reinit_avg",
    "inoculation_p",
    "seed",
    "best_perf_metrics",
]
df = pd.DataFrame(data=all_data, columns=tokenization_columns)

In [None]:
df