In [55]:
import pandas as pd

# 确保所有列都能显示出来
pd.set_option('display.max_columns', 500)

# 确保列宽足够，不会把长字符串（比如 Method 名）截断
pd.set_option('display.max_colwidth', 100)

# 确保表格的总宽度足够，不会换行显示
pd.set_option('display.width', 1000)

In [56]:
TASK_METRIC = {
    "cola": ["eval_matthews_correlation"],
    "mnli": ["matched_accuracy", "mismatched_accuracy"],
    "mrpc": ["eval_accuracy", "eval_f1"],
    "qnli": ["eval_accuracy"],
    "qqp": ["eval_accuracy", "eval_f1"],
    "rte": ["eval_accuracy"],
    "sst2": ["eval_accuracy"],
    "stsb": ["eval_pearson", "eval_spearman"],
    "wnli": ["eval_accuracy"],
}

METRIC_NAME_MAP = {
    'eval_matthews_correlation': 'Mcc',
    'matched_accuracy': 'm',
    'mismatched_accuracy': 'mm',
    'eval_accuracy': 'Acc',
    'eval_f1': 'F1',
    'eval_pearson': 'Corr_p',
    'eval_spearman': 'Corr_s',
}

TASK_NAME_MAP = {
    'mnli': 'MNLI',
    'sst2': 'SST-2',
    'cola': 'CoLA',
    'qqp': 'QQP',
    'qnli': 'QNLI',
    'rte': 'RTE',
    'mrpc': 'MRPC',
    'stsb': 'STS-B',
    'wnli': 'WNLI',
}

FAMILY_NAME_MAP = {
    'bert': 'BERT-b',
    'roberta': 'RoB-b',
    'deberta': 'DeB-b',
}

In [57]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from dictor import dictor
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import  NA

def extract_experiment_data(json_file, root_dir):
    variant = Path(json_file).relative_to(root_dir).parts[0]

    with open(json_file, 'r') as f:
        data = json.load(f)

    # Extract metadata
    model_family = dictor(data, 'args.model_family')
    peft_method = dictor(data, 'args.peft')
    task = dictor(data, 'args.task')

    # for mnli, need patching.
    if 'eval_runtime' in data:
        eval_runtime = data.get('eval_runtime')
    else:
        eval_runtime_history = []
        for item in data['log_history']:
            if 'eval_runtime' in item:
                eval_runtime_history.append(item['eval_runtime'])
        eval_runtime = sum(eval_runtime_history) / len(eval_runtime_history)

    # Get training-specific metrics
    trainable_params = dictor(data, 'train.trainable_params_count', NA)
    train_runtime = dictor(data, 'train.train_time', NA)

    # Calculate Average GPU Memory (Allocated)
    memory_list = dictor(data, 'train.memory_allocated', [])
    avg_memory = np.mean(memory_list) if memory_list else NA

    rank = dictor(data, 'args.rank')

    # Get metrics
    # Some tasks use eval_accuracy, others eval_matthews_correlation
    for key in TASK_METRIC[task]:
        if key in data:
            accuracy = data[key]
            yield {
                "family": model_family,
                "peft": peft_method,
                "task": task,
                "variant": variant,
                "value": round(accuracy, 4),
                "metric": key,
                "params": round(trainable_params, 4),
                "traintime": round(train_runtime, 2),
                "evaltime": round(eval_runtime, 2),
                "gpumem": round(avg_memory, 2),
                "rank": rank, # total rank.
                'seed': dictor(data, 'args.seed'),
                'path': str(json_file)
            }


def aggregate_experiment_results(root_dir):
    """
    Finds all .json files under a directory recursively, extracts data,
    and concatenates them into one large DataFrame.
    """
    root_path = Path(root_dir)
    # Recursively find all JSON files
    json_files = list(root_path.rglob("*.json"))

    if not json_files:
        print(f"No JSON files found in {root_dir}")
        return pd.DataFrame()

    all_dfs = []
    for f in json_files:
        try:
            rows = extract_experiment_data(f, root_dir)
            all_dfs.extend(rows)
        except Exception as e:
            print(f"Failed to extract data from {f}")
            raise e

    if not all_dfs:
        print("No valid data extracted from found files.")
        return pd.DataFrame()

    # Concatenate all individual DataFrames by row
    final_df = pd.DataFrame.from_records(all_dfs)

    return final_df

df = aggregate_experiment_results('./results/')

## FFT, KD-LoRA, LoRA

In [58]:
TOTAL_RANKS = [8]
CAPTION = "Results on GLUE development set for BERT-base (BERT-b), DeBERTa-v3-base (DeB-b), and RoBERTa-base (RoB-b). " +\
    "We compare different fine-tuning strategies: Fully Fine-Tuning (FFT), MR-LoRA Fine-Tuning (MR), and Knowledge Distillation MR-LoRA Fine-Tuning (KD). " \
    "Results of two total ranks $r={}$ are reported. ".format(', '.join(map(str, TOTAL_RANKS))) +\
    "We report the average correlation for STS-B. " +\
    "We report mean of 3 runs using different random seeds. "
LABEL = 'tab:perf-params'

In [59]:
CAPTION

'Results on GLUE development set for BERT-base (BERT-b), DeBERTa-v3-base (DeB-b), and RoBERTa-base (RoB-b). We compare different fine-tuning strategies: Fully Fine-Tuning (FFT), MR-LoRA Fine-Tuning (MR), and Knowledge Distillation MR-LoRA Fine-Tuning (KD). Results of two total ranks $r=8$ are reported. We report the average correlation for STS-B. We report mean of 3 runs using different random seeds. '

In [60]:
df = df[(df.variant == 'fft') | (df.peft.str.contains('mrlora') & df['rank'].isin(TOTAL_RANKS))]

In [61]:
df.family.unique()

array(['bert', 'deberta', 'roberta'], dtype=object)

In [62]:
for key, value in METRIC_NAME_MAP.items():
    df.replace(key, value, inplace=True)
for key, value in TASK_NAME_MAP.items():
    df.replace(key, value, inplace=True)
for key, value in FAMILY_NAME_MAP.items():
    df.replace(key, value, inplace=True)

In [63]:
df['value'] = df.value * 100

In [64]:
df['rank'].unique(), df.family.unique(), df.peft.unique(), df.task.unique(), df.metric.unique()

(array([8]),
 array(['BERT-b', 'DeB-b', 'RoB-b'], dtype=object),
 array(['mrlora-lcoef', 'lora', 'mrlora-rs-olora', 'mrlora', 'mrlora-rs',
        'mrlora-olora'], dtype=object),
 array(['QQP', 'CoLA', 'MNLI', 'SST-2', 'MRPC', 'STS-B', 'QNLI', 'RTE'],
       dtype=object),
 array(['Acc', 'F1', 'Mcc', 'm', 'mm', 'Corr_p', 'Corr_s'], dtype=object))

In [65]:
# 1. 格式化 params 的函数
def format_params(x):
    val = float(x)
    # 如果是整数（如 184.0），显示为 184M
    if val.is_integer():
        return f"{int(val)}M"
    # 如果有小数（如 0.312），保留两位显示为 0.31M
    else:
        return f"{val:.2f}M"


In [66]:
import pandas as pd
import numpy as np

# 1. Standardize Method Names
def get_method_name(row):
    if row['variant'] == 'fft': return 'FFT'
    prefix = 'MR' if row['variant'] == 'lora' else 'KD'
    return f"{prefix}$_{{r={int(row['rank'])}}}$"

df['Method'] = df.apply(get_method_name, axis=1)

df['params'] = df.groupby(['family', 'variant', 'rank'])['params'].transform('mean')

# 2. SEED AVERAGING (The "Fix")
# We group by the configuration identifying columns. 
# We include 'params' here NOT as a grouper, but to average it 
# (though it should be constant across seeds).
# We exclude 'metric' from the grouper so we can aggregate different metrics later.
df_agged = df.groupby(
    ['family', 'variant', 'rank', 'Method', 'task', 'metric'], 
    as_index=False
).agg({
    'value': 'mean',  # Performance Metric
    'params': 'mean'  # Efficiency Metric
})

# 3. Handle Multi-Metric Tasks (MNLI, QQP, STS-B)
def format_task_entries(group):
    task = group['task'].iloc[0]
    # Create a map for the averaged performance metrics
    perf_map = dict(zip(group['metric'], group['value']))
    
    # Take the mean of params for this group (efficiency is constant for the method)
    avg_params = group['params'].mean()
    
    if task == 'MNLI':
        val = f"{perf_map.get('m', 0):.2f}/{perf_map.get('mm', 0):.2f}"
        met = 'm/mm'
    elif task == 'QQP':
        val = f"{perf_map.get('Acc', 0):.2f}/{perf_map.get('F1', 0):.2f}"
        met = 'Acc/F1'
    elif task == 'STS-B':
        # GLUE standard: average of Pearson and Spearman
        avg_corr = (perf_map.get('Corr_s', 0) + perf_map.get('Corr_p', 0)) / 2
        val, met = f"{avg_corr:.2f}", 'Corr'
    else:
        val, met = f"{group['value'].iloc[0]:.2f}", group['metric'].iloc[0]
    
    return pd.Series({
        'val': val, 
        'met': met, 
        'params': avg_params,
        'numeric_score': group['value'].mean() # Used for 'All Ave.'
    })

# Apply the formatting logic
df_transformed = df_agged.groupby(
    ['family', 'variant', 'rank', 'Method', 'task'], 
    as_index=False
).apply(format_task_entries)

# 4. Calculate 'All Ave.' Column
# Grouping by the method configuration to average performance across all tasks
all_avg = df_transformed.groupby(
    ['family', 'variant', 'rank', 'Method']
).agg({
    'numeric_score': 'mean',
    'params': 'mean'
}).reset_index()

all_avg['task'], all_avg['met'] = 'All', 'Ave.'
all_avg['val'] = all_avg['numeric_score'].apply(lambda x: f"{x:.2f}")

# 5. Pivot and Final Formatting
df_final = pd.concat([df_transformed, all_avg], ignore_index=True)

# Formatting the Efficiency Metric (Params) for the Index
df_final['# Params'] = df_final['params'].apply(format_params)

pivot_df = df_final.pivot(
    index=['family', 'variant', 'rank', 'Method', '# Params'],
    columns=['task', 'met'],
    values='val'
)

# Sorting and Cleaning
pivot_df = pivot_df.sort_index(level=['family', 'variant', 'rank'], ascending=[True, True, False])
pivot_df.index = pivot_df.index.droplevel(['variant', 'rank'])
pivot_df.index.names = [None, None, None]
pivot_df.columns.names = [None, None]

# Column Ordering
task_order = ['MNLI', 'SST-2', 'CoLA', 'QQP', 'QNLI', 'RTE', 'MRPC', 'STS-B', 'WNLI', 'All']
existing_tasks = [t for t in task_order if t in pivot_df.columns.get_level_values(0)]
pivot_df = pivot_df.reindex(columns=existing_tasks, level=0)

print(pivot_df)

                                  MNLI  SST-2   CoLA          QQP   QNLI    RTE   MRPC  STS-B    All
                                  m/mm    Acc    Mcc       Acc/F1    Acc    Acc    Acc   Corr   Ave.
BERT-b FFT        109.48M  82.47/82.74  92.18  60.51  88.97/85.62  90.95  66.48  83.99  89.13  81.96
       KD$_{r=8}$ 0.74M    76.29/77.81  89.10  49.65  75.73/69.72  87.77  58.56  82.79  85.25  75.70
       MR$_{r=8}$ 0.29M    78.63/79.66  91.03  56.55  85.92/81.24  89.95  68.95  84.92  88.47  80.59
DeB-b  FFT        184.42M  89.61/89.60  95.85  70.84  90.45/87.52  93.82  82.73  89.34  91.06  87.97
       KD$_{r=8}$ 0.15M    43.67/43.61  93.53  61.74   63.35/2.53  90.84  72.13  88.09  88.21  71.60
       MR$_{r=8}$ 0.30M    88.35/88.59  94.91  66.96  75.32/55.43  93.06  82.67  87.99  89.92  83.89
RoB-b  FFT        124.65M  86.19/86.20  94.31  63.13  89.05/85.91  92.54  78.28  88.60  90.87  85.38
       KD$_{r=8}$ 0.74M    69.71/70.30  91.38  57.90  72.89/41.15  89.28  62.10  86.08  86.

  ).apply(format_task_entries)


In [67]:
# 2. Use Styler to generate the LaTeX code
latex_code = pivot_df.style.to_latex(
    column_format='l|l|c|' + 'c' * len(pivot_df.columns),
    hrules=True,
    multicol_align="c",
    multirow_align="c"
).strip()

# 3. Adjust spacing for the 'tight' look in the image
final_latex = (
    "\\begin{table}[h]\n"
    "\\centering\n"
    "\\setlength{\\tabcolsep}{4pt} % Smaller column gap\n"
    "\\renewcommand{\\arraystretch}{1.2} % Better vertical spacing\n"
    "\\caption{" + CAPTION + "}\n"
    "\\label{" + LABEL + "}\n"
    "\\resizebox{\\textwidth}{!}{% <--- Start resize\n"
    f"{latex_code}"
    "% <--- End resize\n}\n"
    "\\end{table}"
)

In [68]:
final_latex = final_latex.replace('&  &  &',
 r'\multirow{2}{*}{\textbf{Model}} & \multirow{2}{*}{\textbf{Method}} & \multirow{2}{*}{\textbf{\# Params}} &', 1)

In [69]:
task_values = list(TASK_NAME_MAP.values()) + ['All']
for task in task_values:
    final_latex = final_latex.replace('& ' + task, r'& \textbf{'+task+'}')

print(final_latex)

\begin{table}[h]
\centering
\setlength{\tabcolsep}{4pt} % Smaller column gap
\renewcommand{\arraystretch}{1.2} % Better vertical spacing
\caption{Results on GLUE development set for BERT-base (BERT-b), DeBERTa-v3-base (DeB-b), and RoBERTa-base (RoB-b). We compare different fine-tuning strategies: Fully Fine-Tuning (FFT), MR-LoRA Fine-Tuning (MR), and Knowledge Distillation MR-LoRA Fine-Tuning (KD). Results of two total ranks $r=8$ are reported. We report the average correlation for STS-B. We report mean of 3 runs using different random seeds. }
\label{tab:perf-params}
\resizebox{\textwidth}{!}{% <--- Start resize
\begin{tabular}{l|l|c|ccccccccc}
\toprule
 \multirow{2}{*}{\textbf{Model}} & \multirow{2}{*}{\textbf{Method}} & \multirow{2}{*}{\textbf{\# Params}} & \textbf{MNLI} & \textbf{SST-2} & \textbf{CoLA} & \textbf{QQP} & \textbf{QNLI} & \textbf{RTE} & \textbf{MRPC} & \textbf{STS-B} & \textbf{All} \\
 &  &  & m/mm & Acc & Mcc & Acc/F1 & Acc & Acc & Acc & Corr & Ave. \\
\midrule
\multi

In [70]:
import re

def add_latex_family_dividers(latex_str):
    # This regex looks for a line starting with a family name (not an empty cell)
    # It avoids the header by looking for rows after the first \midrule
    
    # 1. Find the start of the data rows (usually after the metric header)
    header_end_split = latex_str.split('\\midrule', 1)
    if len(header_end_split) < 2:
        return latex_str
    
    header = header_end_split[0] + '\\midrule'
    body = header_end_split[1]

    # 2. Regex to find the start of a NEW family block.
    # In MultiIndex LaTeX, a new index level starts with text, 
    # while sub-rows start with ' &' or '  &'.
    # We look for: Start of line + word characters + '&'
    # But we skip the very first line of the body to avoid double midrules.
    
    lines = body.split('\n')
    new_body = []
    
    # We track if we are at the very first data line
    first_data_line = True
    
    for line in lines:
        # Check if the line starts a new family (e.g., 'BERT-b &')
        # This regex matches lines that start with text before the first '&'
        if re.match(r'\\multirow.* &', line.strip()):
            if not first_data_line:
                # Insert a midrule before this line
                new_body.append('\\midrule')
            first_data_line = False
        
        new_body.append(line)

    return header + '\n'.join(new_body)

# Usage:
final_latex = add_latex_family_dividers(final_latex)
print(final_latex)

\begin{table}[h]
\centering
\setlength{\tabcolsep}{4pt} % Smaller column gap
\renewcommand{\arraystretch}{1.2} % Better vertical spacing
\caption{Results on GLUE development set for BERT-base (BERT-b), DeBERTa-v3-base (DeB-b), and RoBERTa-base (RoB-b). We compare different fine-tuning strategies: Fully Fine-Tuning (FFT), MR-LoRA Fine-Tuning (MR), and Knowledge Distillation MR-LoRA Fine-Tuning (KD). Results of two total ranks $r=8$ are reported. We report the average correlation for STS-B. We report mean of 3 runs using different random seeds. }
\label{tab:perf-params}
\resizebox{\textwidth}{!}{% <--- Start resize
\begin{tabular}{l|l|c|ccccccccc}
\toprule
 \multirow{2}{*}{\textbf{Model}} & \multirow{2}{*}{\textbf{Method}} & \multirow{2}{*}{\textbf{\# Params}} & \textbf{MNLI} & \textbf{SST-2} & \textbf{CoLA} & \textbf{QQP} & \textbf{QNLI} & \textbf{RTE} & \textbf{MRPC} & \textbf{STS-B} & \textbf{All} \\
 &  &  & m/mm & Acc & Mcc & Acc/F1 & Acc & Acc & Acc & Corr & Ave. \\
\midrule
\multi

In [71]:
Path('./MrLoRA/table1.tex').write_text(final_latex)

2109