In [1]:
import pandas as pd

# 确保所有列都能显示出来
pd.set_option('display.max_columns', 500)

# 确保列宽足够，不会把长字符串（比如 Method 名）截断
pd.set_option('display.max_colwidth', 100)

# 确保表格的总宽度足够，不会换行显示
pd.set_option('display.width', 1000)

In [2]:
TASK_METRIC = {
    "cola": ["eval_matthews_correlation"],
    "mnli": ["matched_accuracy", "mismatched_accuracy"],
    "mrpc": ["eval_accuracy", "eval_f1"],
    "qnli": ["eval_accuracy"],
    "qqp": ["eval_accuracy", "eval_f1"],
    "rte": ["eval_accuracy"],
    "sst2": ["eval_accuracy"],
    "stsb": ["eval_pearson", "eval_spearman"],
    "wnli": ["eval_accuracy"],
}

METRIC_NAME_MAP = {
    'eval_matthews_correlation': 'Mcc',
    'matched_accuracy': 'm',
    'mismatched_accuracy': 'mm',
    'eval_accuracy': 'Acc',
    'eval_f1': 'F1',
    'eval_pearson': 'Corr_p',
    'eval_spearman': 'Corr_s',
}

TASK_NAME_MAP = {
    'mnli': 'MNLI',
    'sst2': 'SST-2',
    'cola': 'CoLA',
    'qqp': 'QQP',
    'qnli': 'QNLI',
    'rte': 'RTE',
    'mrpc': 'MRPC',
    'stsb': 'STS-B',
}

FAMILY_NAME_MAP = {
    'bert': 'BERT-b',
    'roberta': 'RoB-b',
    'deberta': 'DeB-b',
}

METHOD_NAME_MAP = {
    'lora': 'LoRA',
    'olora': 'OLoRA',
    'dora': 'DoRA',
    'mrlora': 'MR-LoRA',
    'adalora': 'AdaLoRA',
    'mrlora-lcoef': 'MR-LoRA',
    'rslora': 'RS-LoRA'
}
VARIANT_NAME_MAP = {
    'fft': 'FFT',
    'lora': 'LoRA-Finetuning',
    'kd-lora': 'KD-LoRA-Finetuning'
}

REMOVE_PEFT = ['mrlora-rs']

In [3]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from dictor import dictor
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import  NA

def extract_experiment_data(json_file, root_dir):
    variant = Path(json_file).relative_to(root_dir).parts[0]

    with open(json_file, 'r') as f:
        data = json.load(f)

    # Extract metadata
    model_family = dictor(data, 'args.model_family')
    peft_method = dictor(data, 'args.peft')
    task = dictor(data, 'args.task')

    # for mnli, need patching.
    if 'eval_runtime' in data:
        eval_runtime = data.get('eval_runtime')
    else:
        eval_runtime_history = []
        for item in data['log_history']:
            if 'eval_runtime' in item:
                eval_runtime_history.append(item['eval_runtime'])
        eval_runtime = sum(eval_runtime_history) / len(eval_runtime_history)

    # Get training-specific metrics
    trainable_params = dictor(data, 'train.trainable_params_count', NA)
    train_runtime = dictor(data, 'train.train_time', NA)

    # Calculate Average GPU Memory (Allocated)
    memory_list = dictor(data, 'train.memory_allocated', [])
    avg_memory = np.mean(memory_list) if memory_list else NA

    rank = dictor(data, 'args.rank')

    # Get metrics
    # Some tasks use eval_accuracy, others eval_matthews_correlation
    for key in TASK_METRIC[task]:
        if key in data:
            accuracy = data[key]
            yield {
                "family": model_family,
                "peft": peft_method,
                "task": task,
                "variant": variant,
                "value": round(accuracy, 4),
                "metric": key,
                "params": round(trainable_params, 4),
                "traintime": round(train_runtime, 2),
                "evaltime": round(eval_runtime, 2),
                "gpumem": round(avg_memory, 2),
                "rank": rank, # total rank.
                'seed': dictor(data, 'args.seed'),
                'path': str(json_file)
            }


def aggregate_experiment_results(root_dir):
    """
    Finds all .json files under a directory recursively, extracts data,
    and concatenates them into one large DataFrame.
    """
    root_path = Path(root_dir)
    # Recursively find all JSON files
    json_files = list(root_path.rglob("*.json"))

    if not json_files:
        print(f"No JSON files found in {root_dir}")
        return pd.DataFrame()

    all_dfs = []
    for f in json_files:
        try:
            rows = extract_experiment_data(f, root_dir)
            all_dfs.extend(rows)
        except Exception as e:
            print(f"Failed to extract data from {f}")
            raise e

    if not all_dfs:
        print("No valid data extracted from found files.")
        return pd.DataFrame()

    # Concatenate all individual DataFrames by row
    final_df = pd.DataFrame.from_records(all_dfs)

    return final_df

df_ba = aggregate_experiment_results('./results/')
df_ba = df_ba[~df_ba.peft.str.contains('mrlora')]
df_ab = aggregate_experiment_results('./ablation2/')
df = pd.concat([df_ba, df_ab], axis=0)

In [4]:
df[df.peft.str.contains('mrlora')].groupby('peft').count()

Unnamed: 0_level_0,family,task,variant,value,metric,params,traintime,evaltime,gpumem,rank,seed,path
peft,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
mrlora,26,26,26,26,26,26,26,26,26,26,26,26
mrlora-rs,25,25,25,25,25,25,25,25,25,25,25,25


In [5]:
df = df[df['rank'] == 8]
df = df[df['variant'].isin(['lora', 'fft'])]
df = df[df.peft.isin(['lora', 'rslora', 'olora', 'dora', 'mrlora-rs'])]


In [6]:
df.peft.unique()

array(['lora', 'rslora', 'olora', 'dora', 'mrlora-rs'], dtype=object)

In [7]:
df['rank'].unique()

array([8])

In [8]:
# 1. 格式化 params 的函数
def format_params(x):
    val = float(x)
    # 如果是整数（如 184.0），显示为 184M
    if val.is_integer():
        return f"{int(val)}M"
    # 如果有小数（如 0.312），保留两位显示为 0.31M
    else:
        return f"{val:.2f}M"

In [9]:
for key, value in METRIC_NAME_MAP.items():
    df.replace(key, value, inplace=True)
for key, value in TASK_NAME_MAP.items():
    df.replace(key, value, inplace=True)
for key, value in FAMILY_NAME_MAP.items():
    df.replace(key, value, inplace=True)

In [10]:
import pandas as pd
import numpy as np

# 1. Standardize Method Names
def get_method_name(row):
    if row['variant'] == 'fft': return 'FFT'
    if 'mrlora' in row['peft']:
        return 'MR-LoRA'
    return METHOD_NAME_MAP[row['peft']]

df['Method'] = df.apply(get_method_name, axis=1)

df['params'] = df.groupby(['family', 'variant', 'rank'])['params'].transform('mean')

# 2. SEED AVERAGING (The "Fix")
# We group by the configuration identifying columns.
# We include 'params' here NOT as a grouper, but to average it
# (though it should be constant across seeds).
# We exclude 'metric' from the grouper so we can aggregate different metrics later.
df_agged = df.groupby(
    ['family', 'variant', 'rank', 'Method', 'task', 'metric'],
    as_index=False
).agg({
    'value': 'mean',  # Performance Metric
    'params': 'mean'  # Efficiency Metric
})

# 3. Handle Multi-Metric Tasks (MNLI, QQP, STS-B)
def format_task_entries(group):
    task = group['task'].iloc[0]
    # Create a map for the averaged performance metrics
    perf_map = dict(zip(group['metric'], group['value']))

    # Take the mean of params for this group (efficiency is constant for the method)
    avg_params = group['params'].mean()

    if task == 'MNLI':
        val = f"{perf_map.get('m', 0):.2f}/{perf_map.get('mm', 0):.2f}"
        met = 'm/mm'
    elif task == 'QQP':
        val = f"{perf_map.get('Acc', 0):.2f}/{perf_map.get('F1', 0):.2f}"
        met = 'Acc/F1'
    elif task == 'STS-B':
        # GLUE standard: average of Pearson and Spearman
        avg_corr = (perf_map.get('Corr_s', 0) + perf_map.get('Corr_p', 0)) / 2
        val, met = f"{avg_corr:.2f}", 'Corr'
    else:
        val, met = f"{group['value'].iloc[0]:.2f}", group['metric'].iloc[0]

    return pd.Series({
        'val': val,
        'met': met,
        'params': avg_params,
        'numeric_score': group['value'].mean() # Used for 'All Ave.'
    })

# Apply the formatting logic
df_transformed = df_agged.groupby(
    ['family', 'variant', 'rank', 'Method', 'task'],
    as_index=False
).apply(format_task_entries)

# 4. Calculate 'All Ave.' Column
# Grouping by the method configuration to average performance across all tasks
all_avg = df_transformed.groupby(
    ['family', 'variant', 'rank', 'Method']
).agg({
    'numeric_score': 'mean',
    'params': 'mean'
}).reset_index()

all_avg['task'], all_avg['met'] = 'All', 'Ave.'
all_avg['val'] = all_avg['numeric_score'].apply(lambda x: f"{x:.2f}")

# 5. Pivot and Final Formatting
df_final = pd.concat([df_transformed, all_avg], ignore_index=True)

# Formatting the Efficiency Metric (Params) for the Index
df_final['# Params'] = df_final['params'].apply(format_params)

pivot_df = df_final.pivot(
    index=['family', 'variant', 'rank', 'Method', '# Params'],
    columns=['task', 'met'],
    values='val'
)

# Sorting and Cleaning
pivot_df = pivot_df.sort_index(level=['family', 'variant', 'rank'], ascending=[True, True, False])
pivot_df.index = pivot_df.index.droplevel(['variant', 'rank'])
pivot_df.index.names = [None, None, None]
pivot_df.columns.names = [None, None]

# Column Ordering
task_order = ['MNLI', 'SST-2', 'CoLA', 'QQP', 'QNLI', 'RTE', 'MRPC', 'STS-B', 'WNLI', 'All']
existing_tasks = [t for t in task_order if t in pivot_df.columns.get_level_values(0)]
pivot_df = pivot_df.reindex(columns=existing_tasks, level=0)

print(pivot_df)

                             MNLI SST-2  CoLA        QQP  QNLI   RTE  MRPC STS-B   All
                             m/mm   Acc   Mcc     Acc/F1   Acc   Acc   Acc  Corr  Ave.
BERT-b FFT     109.48M  0.82/0.83  0.92  0.61  0.89/0.86  0.91  0.66  0.84  0.89  0.82
       DoRA    0.30M    0.78/0.79  0.92  0.58  0.76/0.49  0.89  0.69  0.86  0.89  0.78
       LoRA    0.30M    0.78/0.79  0.92  0.59  0.76/0.49  0.89  0.70  0.86  0.89  0.78
       OLoRA   0.30M    0.69/0.71  0.92  0.59  0.58/0.11  0.90  0.68  0.84  0.89  0.73
       RS-LoRA 0.30M    0.80/0.80  0.92  0.59  0.85/0.82  0.90  0.70  0.86  0.89  0.81
DeB-b  FFT     184.42M  0.90/0.90  0.96  0.71  0.90/0.88  0.94  0.83  0.89  0.91  0.88
       DoRA    0.30M    0.88/0.88  0.95  0.68  0.47/0.32  0.93  0.83  0.89  0.90  0.81
       LoRA    0.30M    0.88/0.88  0.95  0.68  0.47/0.32  0.93  0.83  0.89  0.90  0.81
       MR-LoRA 0.30M    0.32/0.32  0.51  0.67  0.37/0.54  0.51  0.81  0.88  0.90  0.63
       OLoRA   0.30M    0.88/0.89  0.95  0.

  ).apply(format_task_entries)
