In [1]:
import os
import json
import glob
import pandas as pd

In [2]:
# Base models under evaluation. The later cells iterate this list to build
# each table, so its order is also the row order of the results.
models = [
    'EleutherAI/pythia-70m',
    'google/gemma-3-270m-it',
    'Qwen/Qwen3-0.6B'
]

# Datasets whose columns are moved to the right-hand end of every table.
# NOTE(review): presumably these are held out from training (hence "zero shot") -- confirm.
zero_shot_datasets = [
    'ChilleD/SVAMP',
    'tau/commonsense_qa',
    'ehovy/race.middle'
]

# Task identifier (as it appears in the result JSON files) -> column label
# shown in the rendered tables.
dataset_names = {
    'allenai/ai2_arc.ARC-Challenge': 'ARC Challenge',
    'allenai/ai2_arc.ARC-Easy': 'ARC Easy',
    'allenai/openbookqa.main': 'OpenBookQA',
    'allenai/winogrande.winogrande_m': 'Winogrande (M)',
    'ChilleD/SVAMP': 'SVAMP',
    'ehovy/race.middle': 'RACE (Middle)',
    'google/boolq': 'BoolQ',
    'openai/gsm8k.main': "GSM8K",  # was misspelled "GMS8K"
    'Rowan/hellaswag': "HELLASWAG",
    'stanfordnlp/snli': "SNLI",
    "tau/commonsense_qa": "CommonSenseQA"
}

## Base Model Scores

In [3]:
base_path = '../_results/base'

# Step 1: gather the raw per-task accuracies of every base model from its
# summary file(s) on disk.
raw_base_scores = {}
for model in models:
    raw_base_scores[model] = {}
    summary_path = os.path.join(base_path, model.replace('/', '_'), '_summary.*.json')
    # glob returns matches in arbitrary order; sorting makes it deterministic
    # which file wins when several summaries report the same task.
    summary_files = sorted(glob.glob(summary_path))
    for summary_file in summary_files:
        # Context manager so the file handle is closed instead of leaked.
        with open(summary_file, 'r') as summary_handle:
            summary_content = json.load(summary_handle)
        raw_base_scores[model].update(summary_content['task_accuracies'])

# Step 2: convert accuracies to percentage strings keyed by display label,
# with the zero-shot datasets placed after all other datasets.
base_scores = {}
for model in models:
    regular_scores = {}
    zero_shot_scores = {}
    # Order columns by the part of the task id that follows the '/'.
    for dataset, score in sorted(raw_base_scores[model].items(), key=lambda x: x[0].split('/')[1]):
        dataset_name = dataset_names[dataset]
        # float() guards against an integer accuracy (0 or 1), for which the
        # '.3' precision spec would raise ValueError.
        percent = float(score) * 100
        score_text = f"{percent:.3}"
        if 'e' in score_text:
            # Three significant digits switches to scientific notation at
            # 100% ('1e+02'); use fixed-point for those values instead.
            score_text = f"{percent:.1f}"
        score_text += "%"
        if dataset in zero_shot_datasets:
            zero_shot_scores[dataset_name] = score_text
        else:
            regular_scores[dataset_name] = score_text
    base_scores[model] = {**regular_scores, **zero_shot_scores}

base_scores_df = pd.DataFrame.from_dict(base_scores, orient='index')
display(base_scores_df)

Unnamed: 0,ARC Challenge,ARC Easy,BoolQ,GMS8K,HELLASWAG,OpenBookQA,SNLI,Winogrande (M),SVAMP,CommonSenseQA,RACE (Middle)
EleutherAI/pythia-70m,9.13%,7.07%,14.2%,1.29%,4.94%,8.2%,0.519%,6.08%,2.33%,3.44%,8.64%
google/gemma-3-270m-it,18.9%,23.0%,43.5%,4.55%,24.7%,24.6%,34.3%,50.6%,19.3%,18.4%,20.8%
Qwen/Qwen3-0.6B,21.7%,31.7%,63.6%,50.0%,25.9%,34.2%,42.4%,50.4%,73.0%,38.2%,34.4%


## Individual LoRA Scores

In [4]:
lora_base_path = '../_results/lora'

# Step 1: read one results file per adapter directory and record the accuracy
# under the task that file names.
raw_lora_scores = {}
for model in models:
    raw_lora_scores[model] = {}
    model_path = os.path.join(lora_base_path, model.replace('/', '_'), '*')
    # Entries ending in 'mix_8' are excluded here. glob order is arbitrary,
    # so sort to make the traversal reproducible.
    lora_paths = sorted(path for path in glob.glob(model_path) if not path.endswith('mix_8'))

    for lora_path in lora_paths:
        results_files = sorted(glob.glob(os.path.join(lora_path, '*.results.*.json')))
        if not results_files:
            # Previously surfaced as a bare IndexError with no hint of the cause.
            raise FileNotFoundError(f"No '*.results.*.json' file found in {lora_path!r}")
        # If several files match, the lexicographically first one is used.
        results_file = results_files[0]
        # Context manager so the file handle is closed instead of leaked.
        with open(results_file, 'r') as results_handle:
            results_data = json.load(results_handle)
        dataset = results_data['task']
        score = results_data['metrics']['accuracy']
        raw_lora_scores[model][dataset] = score

# Step 2: convert accuracies to percentage strings keyed by display label,
# with the zero-shot datasets placed after all other datasets.
lora_scores = {}
for model in models:
    regular_scores = {}
    zero_shot_scores = {}
    # Order columns by the part of the task id that follows the '/'.
    for dataset, score in sorted(raw_lora_scores[model].items(), key=lambda x: x[0].split('/')[1]):
        dataset_name = dataset_names[dataset]
        # float() guards against an integer accuracy (0 or 1), for which the
        # '.3' precision spec would raise ValueError.
        percent = float(score) * 100
        score_text = f"{percent:.3}"
        if 'e' in score_text:
            # Three significant digits switches to scientific notation at
            # 100% ('1e+02'); use fixed-point for those values instead.
            score_text = f"{percent:.1f}"
        score_text += "%"
        if dataset in zero_shot_datasets:
            zero_shot_scores[dataset_name] = score_text
        else:
            regular_scores[dataset_name] = score_text
    lora_scores[model] = {**regular_scores, **zero_shot_scores}

lora_scores_df = pd.DataFrame.from_dict(lora_scores, orient='index')
display(lora_scores_df)

Unnamed: 0,ARC Challenge,ARC Easy,BoolQ,GMS8K,HELLASWAG,OpenBookQA,SNLI,Winogrande (M)
EleutherAI/pythia-70m,22.8%,23.7%,53.5%,1.14%,25.4%,22.4%,34.9%,49.7%
google/gemma-3-270m-it,25.6%,25.0%,56.4%,3.64%,24.2%,26.4%,42.9%,53.8%
Qwen/Qwen3-0.6B,50.7%,71.8%,78.8%,35.9%,50.7%,62.2%,84.2%,48.7%


## Mixed LoRA Scores

In [5]:
lora_base_path = '../_results/lora'

# Step 1: gather the raw per-task accuracies of the 'mix_8' run of every
# model from its summary file(s) on disk.
raw_mixed_lora_scores = {}
for model in models:
    raw_mixed_lora_scores[model] = {}
    summary_path = os.path.join(lora_base_path, model.replace('/','_'), 'mix_8', '*_summary.*.json')
    # glob returns matches in arbitrary order; sorting makes it deterministic
    # which file wins when several summaries report the same task.
    summary_files = sorted(glob.glob(summary_path))

    for summary_file in summary_files:
        # Context manager so the file handle is closed instead of leaked.
        with open(summary_file, 'r') as summary_handle:
            summary_content = json.load(summary_handle)
        raw_mixed_lora_scores[model].update(summary_content['task_accuracies'])

# Step 2: convert accuracies to percentage strings keyed by display label,
# with the zero-shot datasets placed after all other datasets.
mixed_lora_scores = {}
for model in models:
    regular_scores = {}
    zero_shot_scores = {}
    # Order columns by the part of the task id that follows the '/'.
    for dataset, score in sorted(raw_mixed_lora_scores[model].items(), key=lambda x: x[0].split('/')[1]):
        dataset_name = dataset_names[dataset]
        # float() guards against an integer accuracy (0 or 1), for which the
        # '.3' precision spec would raise ValueError.
        percent = float(score) * 100
        score_text = f"{percent:.3}"
        if 'e' in score_text:
            # Three significant digits switches to scientific notation at
            # 100% ('1e+02'); use fixed-point for those values instead.
            score_text = f"{percent:.1f}"
        score_text += "%"
        if dataset in zero_shot_datasets:
            zero_shot_scores[dataset_name] = score_text
        else:
            regular_scores[dataset_name] = score_text
    mixed_lora_scores[model] = {**regular_scores, **zero_shot_scores}

mixed_lora_scores_df = pd.DataFrame.from_dict(mixed_lora_scores, orient='index')
display(mixed_lora_scores_df)

Unnamed: 0,ARC Challenge,ARC Easy,BoolQ,GMS8K,HELLASWAG,OpenBookQA,SNLI,Winogrande (M),SVAMP,CommonSenseQA,RACE (Middle)
EleutherAI/pythia-70m,24.1%,22.3%,53.9%,1.97%,24.6%,26.6%,31.0%,44.3%,3.0%,18.3%,18.7%
google/gemma-3-270m-it,25.3%,21.3%,52.0%,3.41%,23.5%,25.2%,33.3%,50.9%,4.67%,20.6%,22.8%
Qwen/Qwen3-0.6B,54.4%,68.8%,68.6%,34.5%,33.0%,53.4%,76.9%,48.5%,52.7%,50.9%,64.8%


## Hypernet Scores

In [6]:
hypernet_base_path = '../_results/hypernet'

# Step 1: gather the raw per-task accuracies of the 'mix_8_d1024_r2_a8' run
# of every model from its summary file(s) on disk.
raw_hypernet_scores = {}

for model in models:
    raw_hypernet_scores[model] = {}
    summary_path = os.path.join(hypernet_base_path, model.replace('/', '_'), 'mix_8_d1024_r2_a8', '*_summary.*.json')
    # glob returns matches in arbitrary order; sorting makes it deterministic
    # which file wins when several summaries report the same task.
    summary_files = sorted(glob.glob(summary_path))

    for summary_file in summary_files:
        # Context manager so the file handle is closed instead of leaked.
        with open(summary_file, 'r') as summary_handle:
            summary_content = json.load(summary_handle)
        raw_hypernet_scores[model].update(summary_content['task_accuracies'])

# Step 2: convert accuracies to percentage strings keyed by display label,
# with the zero-shot datasets placed after all other datasets.
hypernet_scores = {}
for model in models:
    regular_scores = {}
    zero_shot_scores = {}
    # Order columns by the part of the task id that follows the '/'.
    for dataset, score in sorted(raw_hypernet_scores[model].items(), key=lambda x: x[0].split('/')[1]):
        dataset_name = dataset_names[dataset]
        # float() guards against an integer accuracy (0 or 1), for which the
        # '.3' precision spec would raise ValueError.
        percent = float(score) * 100
        score_text = f"{percent:.3}"
        if 'e' in score_text:
            # Three significant digits switches to scientific notation at
            # 100% ('1e+02'); use fixed-point for those values instead.
            score_text = f"{percent:.1f}"
        score_text += "%"
        if dataset in zero_shot_datasets:
            zero_shot_scores[dataset_name] = score_text
        else:
            regular_scores[dataset_name] = score_text
    hypernet_scores[model] = {**regular_scores, **zero_shot_scores}

hypernet_scores_df = pd.DataFrame.from_dict(hypernet_scores, orient='index')
display(hypernet_scores_df)

Unnamed: 0,ARC Challenge,ARC Easy,BoolQ,GMS8K,HELLASWAG,OpenBookQA,SNLI,Winogrande (M),SVAMP,CommonSenseQA,RACE (Middle)
EleutherAI/pythia-70m,24.9%,23.9%,54.5%,1.67%,24.8%,28.6%,29.5%,48.4%,1.0%,19.7%,25.1%
google/gemma-3-270m-it,21.1%,23.7%,45.7%,5.84%,24.5%,27.6%,34.7%,47.8%,16.3%,21.5%,23.0%
Qwen/Qwen3-0.6B,35.2%,48.5%,64.1%,52.5%,29.0%,38.8%,49.2%,52.1%,74.7%,45.9%,44.2%


## Combined Scores

In [7]:
# Row label for each evaluation mode, paired with the table it came from.
frames_by_mode = {
    'Base': base_scores_df,
    'LoRA (Individual)': lora_scores_df,
    'LoRA (Mixed)': mixed_lora_scores_df,
    'TaskWeaver (Ours)': hypernet_scores_df,
}

# Stack the tables under a (Mode, Model) index.
stacked_scores = pd.concat(
    list(frames_by_mode.values()),
    keys=list(frames_by_mode.keys()),
    names=['Mode', 'Model'],
)

# Put Model on the outer level, order rows by `models`, and show a dash
# wherever a mode has no score for a dataset.
scores_df = (
    stacked_scores
    .swaplevel()
    .reindex(models, level=0)
    .fillna('-')
)

display(scores_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,ARC Challenge,ARC Easy,BoolQ,GMS8K,HELLASWAG,OpenBookQA,SNLI,Winogrande (M),SVAMP,CommonSenseQA,RACE (Middle)
Model,Mode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
EleutherAI/pythia-70m,Base,9.13%,7.07%,14.2%,1.29%,4.94%,8.2%,0.519%,6.08%,2.33%,3.44%,8.64%
EleutherAI/pythia-70m,LoRA (Individual),22.8%,23.7%,53.5%,1.14%,25.4%,22.4%,34.9%,49.7%,-,-,-
EleutherAI/pythia-70m,LoRA (Mixed),24.1%,22.3%,53.9%,1.97%,24.6%,26.6%,31.0%,44.3%,3.0%,18.3%,18.7%
EleutherAI/pythia-70m,TaskWeaver (Ours),24.9%,23.9%,54.5%,1.67%,24.8%,28.6%,29.5%,48.4%,1.0%,19.7%,25.1%
google/gemma-3-270m-it,Base,18.9%,23.0%,43.5%,4.55%,24.7%,24.6%,34.3%,50.6%,19.3%,18.4%,20.8%
google/gemma-3-270m-it,LoRA (Individual),25.6%,25.0%,56.4%,3.64%,24.2%,26.4%,42.9%,53.8%,-,-,-
google/gemma-3-270m-it,LoRA (Mixed),25.3%,21.3%,52.0%,3.41%,23.5%,25.2%,33.3%,50.9%,4.67%,20.6%,22.8%
google/gemma-3-270m-it,TaskWeaver (Ours),21.1%,23.7%,45.7%,5.84%,24.5%,27.6%,34.7%,47.8%,16.3%,21.5%,23.0%
Qwen/Qwen3-0.6B,Base,21.7%,31.7%,63.6%,50.0%,25.9%,34.2%,42.4%,50.4%,73.0%,38.2%,34.4%
Qwen/Qwen3-0.6B,LoRA (Individual),50.7%,71.8%,78.8%,35.9%,50.7%,62.2%,84.2%,48.7%,-,-,-
