In [33]:
import pandas as pd

# 确保所有列都能显示出来
pd.set_option('display.max_columns', 500)

# 确保列宽足够，不会把长字符串（比如 Method 名）截断
pd.set_option('display.max_colwidth', 100)

# 确保表格的总宽度足够，不会换行显示
pd.set_option('display.width', 1000)

In [34]:
TASK_METRIC = {
    "cola": ["eval_matthews_correlation"],
    "mnli": ["matched_accuracy", "mismatched_accuracy"],
    "mrpc": ["eval_accuracy", "eval_f1"],
    "qnli": ["eval_accuracy"],
    "qqp": ["eval_accuracy", "eval_f1"],
    "rte": ["eval_accuracy"],
    "sst2": ["eval_accuracy"],
    "stsb": ["eval_pearson", "eval_spearman"],
    "wnli": ["eval_accuracy"],
}

METRIC_NAME_MAP = {
    'eval_matthews_correlation': 'Mcc',
    'matched_accuracy': 'm',
    'mismatched_accuracy': 'mm',
    'eval_accuracy': 'Acc',
    'eval_f1': 'F1',
    'eval_pearson': 'Corr_p',
    'eval_spearman': 'Corr_s',
}

TASK_NAME_MAP = {
    'mnli': 'MNLI',
    'sst2': 'SST-2',
    'cola': 'CoLA',
    'qqp': 'QQP',
    'qnli': 'QNLI',
    'rte': 'RTE',
    'mrpc': 'MRPC',
    'stsb': 'STS-B',
}

FAMILY_NAME_MAP = {
    'bert': 'BERT-b',
    'roberta': 'RoB-b',
    'deberta': 'DeB-b',
}

METHOD_NAME_MAP = {
    'lora': 'LoRA',
    'olora': 'OLoRA',
    'dora': 'DoRA',
    'mrlora': 'MR-LoRA',
    'adalora': 'AdaLoRA',
    'mrlora-rs': 'MR-LoRA-RS',
    'rslora': 'RS-LoRA'
}

In [35]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from dictor import dictor
import seaborn as sns
import matplotlib.pyplot as plt

def extract_experiment_data(json_file):
    variant = Path(json_file).relative_to('./results').parts[0]

    with open(json_file, 'r') as f:
        data = json.load(f)

    data['variant'] = variant
    # with open(json_file, 'w') as f:
    #     json.dump(data, f, indent=4)

    # Extract metadata
    model_family = dictor(data, 'args.model_family')
    peft_method = dictor(data, 'args.peft')
    task = dictor(data, 'args.task')

    eval_runtime = data.get('eval_runtime', -1)

    # Get training-specific metrics
    trainable_params = dictor(data, 'train.trainable_params_count', -1)
    train_runtime = dictor(data, 'train.train_time', -1)

    # Calculate Average GPU Memory (Allocated)
    memory_list = dictor(data, 'train.memory_allocated', [])
    avg_memory = np.mean(memory_list) if memory_list else -1

    rank = dictor(data, 'args.rank')
    if 'mrlora' in peft_method:
        rank = 2*rank - 1 # r = 2*R - 1
        
    # Get metrics
    # Some tasks use eval_accuracy, others eval_matthews_correlation
    for key in TASK_METRIC[task]:
        if key in data:
            accuracy = data[key]
            yield {
                "family": model_family,
                "peft": peft_method,
                "task": task,
                "variant": variant,
                "value": round(accuracy, 4),
                "metric": key,
                "params": round(trainable_params, 4),
                "traintime": round(train_runtime, 2),
                "evaltime": round(eval_runtime, 2),
                "gpumem": round(avg_memory, 2),
                "rank": rank, # total rank.
            }


def aggregate_experiment_results(root_dir):
    """
    Finds all .json files under a directory recursively, extracts data,
    and concatenates them into one large DataFrame.
    """
    root_path = Path(root_dir)
    # Recursively find all JSON files
    json_files = list(root_path.rglob("*.json"))

    if not json_files:
        print(f"No JSON files found in {root_dir}")
        return pd.DataFrame()

    all_dfs = []
    for f in json_files:
        try:
            rows = extract_experiment_data(f)
            all_dfs.extend(rows)
        except Exception as e:
            print(f"Failed to extract data from {f}")
            raise e

    if not all_dfs:
        print("No valid data extracted from found files.")
        return pd.DataFrame()

    # Concatenate all individual DataFrames by row
    final_df = pd.DataFrame.from_records(all_dfs)

    return final_df

df = aggregate_experiment_results('./results/')

In [36]:
for key, value in METRIC_NAME_MAP.items():
    df.replace(key, value, inplace=True)
for key, value in TASK_NAME_MAP.items():
    df.replace(key, value, inplace=True)
for key, value in METHOD_NAME_MAP.items():
    df.replace(key, value, inplace=True)
for key, value in FAMILY_NAME_MAP.items():
    df.replace(key, value, inplace=True)

In [37]:
df['value'] = df.value * 100

## FFT, KD-LoRA, LoRA

In [38]:
# 1. 格式化 params 的函数
def format_params(x):
    val = float(x)
    # 如果是整数（如 184.0），显示为 184M
    if val.is_integer():
        return f"{int(val)}M"
    # 如果有小数（如 0.312），保留两位显示为 0.31M
    else:
        return f"{val:.2f}M"


In [39]:
TOTAL_RANKS = [15, 31]
MODEL_FAMILY = 'bert'

In [40]:
df



Unnamed: 0,family,peft,task,variant,value,metric,params,traintime,evaltime,gpumem,rank
0,BERT-b,OLoRA,wnli,kd-lora,36.62,Acc,0.8870,0.93,0.02,300.54,16
1,BERT-b,RS-LoRA,wnli,kd-lora,40.85,Acc,0.7396,1.09,0.03,299.26,8
2,BERT-b,LoRA,wnli,kd-lora,42.25,Acc,0.7396,1.48,0.02,297.93,8
3,BERT-b,DoRA,wnli,kd-lora,40.85,Acc,1.1912,1.06,0.02,304.50,32
4,BERT-b,LoRA,wnli,kd-lora,40.85,Acc,1.7718,0.97,0.02,313.68,64
...,...,...,...,...,...,...,...,...,...,...,...
4730,BERT-b,MR-LoRA-RS,MRPC,LoRA,90.66,F1,2.3241,20.26,0.31,493.23,63
4731,BERT-b,LoRA,MRPC,LoRA,79.17,Acc,0.5914,7.19,0.15,466.33,16
4732,BERT-b,LoRA,MRPC,LoRA,86.18,F1,0.5914,7.19,0.15,466.33,16
4733,BERT-b,MR-LoRA,MRPC,LoRA,79.66,Acc,2.3241,17.01,0.20,493.23,63


In [41]:
df['rank'].unique(), df.family.unique(), df.peft.unique(), df.task.unique(), df.metric.unique()

(array([ 16,   8,  32,  64, 127,  15,  31,  63]),
 array(['BERT-b', 'DeB-b', 'RoB-b'], dtype=object),
 array(['OLoRA', 'RS-LoRA', 'LoRA', 'DoRA', 'MR-LoRA-RS', 'MR-LoRA'],
       dtype=object),
 array(['wnli', 'MNLI', 'CoLA', 'MRPC', 'QQP', 'SST-2', 'RTE', 'QNLI',
        'STS-B'], dtype=object),
 array(['Acc', 'm', 'mm', 'Mcc', 'F1', 'Corr_p', 'Corr_s'], dtype=object))

In [42]:
import pandas as pd

df['params'] = df.groupby(['variant', 'rank'])['params'].transform('first')
df['params_formatted'] = df['params'].apply(format_params)

# 2. Combine multi-metric tasks (MNLI m/mm and QQP Acc/F1)
# We create a helper function to merge values into strings
def format_values(group):
    task = group['task'].iloc[0]
    if task == 'MNLI':
        # Assumes 'm' and 'mm' metrics exist for MNLI
        m = group[group['metric'] == 'm']['value'].iloc[0]
        mm = group[group['metric'] == 'mm']['value'].iloc[0]
        return pd.Series({'val': f"{m:.2f}/{mm:.2f}", 'met': 'm/mm'})
    elif task == 'QQP':
        # Assumes 'Acc' and 'F1' metrics exist for QQP
        acc = group[group['metric'] == 'Acc']['value'].iloc[0]
        f1 = group[group['metric'] == 'F1']['value'].iloc[0]
        return pd.Series({'val': f"{acc:.2f}/{f1:.2f}", 'met': 'Acc/F1'})
    elif task == 'STS-B':
        corr_s = group[group['metric'] == 'Corr_s']['value'].iloc[0]
        corr_p = group[group['metric'] == 'Corr_p']['value'].iloc[0]
        corr_mean = (corr_s + corr_p) / 2
        return pd.Series({'val': f"{corr_mean:.2f}", 'met': 'Corr'})
    else:
        # Standard tasks with single metrics
        return pd.Series({'val': f"{group['value'].iloc[0]:.2f}", 'met': group['metric'].iloc[0]})

df_transformed = df.groupby(['variant', 'rank', 'Method', 'params_formatted', 'task'], as_index=False).apply(format_values)

# 1. Create a numeric version of the task scores for averaging
def get_task_score(group):
    # Average the 'value' column for the task (e.g., average of m and mm for MNLI)
    return group['value'].mean()

# Calculate task-level means
task_means = df.groupby(['variant', 'rank', 'Method', 'params_formatted', 'task'])['value'].mean().reset_index()

# 2. Calculate the 'All' average across all tasks for each method
all_avg = task_means.groupby(['variant', 'rank', 'Method', 'params_formatted'])['value'].mean().reset_index()
all_avg['task'] = 'All'
all_avg['met'] = 'Ave.'
all_avg['val'] = all_avg['value'].apply(lambda x: f"{x:.2f}")

# 3. Append this to your transformed dataframe
df_with_avg = pd.concat([df_transformed, all_avg[['variant', 'rank', 'Method', 'params_formatted', 'task', 'met', 'val']]], ignore_index=True)

# 3. Pivot the table
# Index: Method and Params
# Columns: Task and the combined Metric name
pivot_df = df_with_avg.pivot(
    index=['variant', 'rank', 'Method', 'params_formatted'],
    columns=['task', 'met'],
    values='val'
)

# 4. Custom Sorting
# Use variant to put 'fft' first, then rank descending (31 then 15)
pivot_df = pivot_df.sort_index(level=['variant', 'rank'], ascending=[True, False])

# Clean up index: remove the helper columns used for sorting
pivot_df.index = pivot_df.index.droplevel(['variant', 'rank'])

# 5. Column Ordering (to match the image)
task_order = ['MNLI', 'SST-2', 'CoLA', 'QQP', 'QNLI', 'RTE', 'MRPC', 'STS-B', 'All']
# Filter tasks to only those present in your data
existing_tasks = [t for t in task_order if t in pivot_df.columns.get_level_values(0)]
pivot_df = pivot_df.reindex(columns=existing_tasks, level=0)

pivot_df.index.names = ['Method', r'\# Params']
pivot_df.columns.names = [None, None]
pivot_df.index.names = [None, None]

# Display result
print(pivot_df)

KeyError: 'Method'