In [72]:
import pandas as pd

# 确保所有列都能显示出来
pd.set_option('display.max_columns', 500)

# 确保列宽足够，不会把长字符串（比如 Method 名）截断
pd.set_option('display.max_colwidth', 100)

# 确保表格的总宽度足够，不会换行显示
pd.set_option('display.width', 1000)

In [73]:
TASK_METRIC = {
    "cola": ["eval_matthews_correlation"],
    "mnli": ["matched_accuracy", "mismatched_accuracy"],
    "mrpc": ["eval_accuracy", "eval_f1"],
    "qnli": ["eval_accuracy"],
    "qqp": ["eval_accuracy", "eval_f1"],
    "rte": ["eval_accuracy"],
    "sst2": ["eval_accuracy"],
    "stsb": ["eval_pearson", "eval_spearman"],
    "wnli": ["eval_accuracy"],
}

METRIC_NAME_MAP = {
    'eval_matthews_correlation': 'Mcc',
    'matched_accuracy': 'm',
    'mismatched_accuracy': 'mm',
    'eval_accuracy': 'Acc',
    'eval_f1': 'F1',
    'eval_pearson': 'Corr_p',
    'eval_spearman': 'Corr_s',
}

TASK_NAME_MAP = {
    'mnli': 'MNLI',
    'sst2': 'SST-2',
    'cola': 'CoLA',
    'qqp': 'QQP',
    'qnli': 'QNLI',
    'rte': 'RTE',
    'mrpc': 'MRPC',
    'stsb': 'STS-B',
}

FAMILY_NAME_MAP = {
    'bert': 'BERT-b',
    'roberta': 'RoB-b',
    'deberta': 'DeB-b',
}

In [74]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from dictor import dictor
import seaborn as sns
import matplotlib.pyplot as plt

def extract_experiment_data(json_file):
    variant = Path(json_file).relative_to('./results').parts[0]

    with open(json_file, 'r') as f:
        data = json.load(f)

    data['variant'] = variant
    # with open(json_file, 'w') as f:
    #     json.dump(data, f, indent=4)

    # Extract metadata
    model_family = dictor(data, 'args.model_family')
    peft_method = dictor(data, 'args.peft')
    task = dictor(data, 'args.task')

    eval_runtime = data.get('eval_runtime', -1)

    # Get training-specific metrics
    trainable_params = dictor(data, 'train.trainable_params_count', -1)
    train_runtime = dictor(data, 'train.train_time', -1)

    # Calculate Average GPU Memory (Allocated)
    memory_list = dictor(data, 'train.memory_allocated', [])
    avg_memory = np.mean(memory_list) if memory_list else -1

    rank = dictor(data, 'args.rank')
    if 'mrlora' in peft_method:
        rank = 2*rank - 1 # r = 2*R - 1
        
    # Get metrics
    # Some tasks use eval_accuracy, others eval_matthews_correlation
    for key in TASK_METRIC[task]:
        if key in data:
            accuracy = data[key]
            yield {
                "family": model_family,
                "peft": peft_method,
                "task": task,
                "variant": variant,
                "value": round(accuracy, 4),
                "metric": key,
                "params": round(trainable_params, 4),
                "traintime": round(train_runtime, 2),
                "evaltime": round(eval_runtime, 2),
                "gpumem": round(avg_memory, 2),
                "rank": rank, # total rank.
            }


def aggregate_experiment_results(root_dir):
    """
    Finds all .json files under a directory recursively, extracts data,
    and concatenates them into one large DataFrame.
    """
    root_path = Path(root_dir)
    # Recursively find all JSON files
    json_files = list(root_path.rglob("*.json"))

    if not json_files:
        print(f"No JSON files found in {root_dir}")
        return pd.DataFrame()

    all_dfs = []
    for f in json_files:
        try:
            rows = extract_experiment_data(f)
            all_dfs.extend(rows)
        except Exception as e:
            print(f"Failed to extract data from {f}")
            raise e

    if not all_dfs:
        print("No valid data extracted from found files.")
        return pd.DataFrame()

    # Concatenate all individual DataFrames by row
    final_df = pd.DataFrame.from_records(all_dfs)

    return final_df

df = aggregate_experiment_results('./results/')

## FFT, KD-LoRA, LoRA

In [75]:
TOTAL_RANKS = [15, 31]

In [76]:
df = df[(df.variant == 'fft') | (df.peft.str.contains('mrlora') & df['rank'].isin(TOTAL_RANKS))]

In [77]:
df.family.unique()

array(['bert', 'deberta', 'roberta'], dtype=object)

In [78]:
for key, value in METRIC_NAME_MAP.items():
    df.replace(key, value, inplace=True)
for key, value in TASK_NAME_MAP.items():
    df.replace(key, value, inplace=True)
for key, value in FAMILY_NAME_MAP.items():
    df.replace(key, value, inplace=True)

In [79]:
df['value'] = df.value * 100

In [80]:
df['rank'].unique(), df.family.unique(), df.peft.unique(), df.task.unique(), df.metric.unique()

(array([15, 31,  8]),
 array(['BERT-b', 'DeB-b', 'RoB-b'], dtype=object),
 array(['mrlora', 'mrlora-rs', 'lora'], dtype=object),
 array(['wnli', 'MNLI', 'CoLA', 'MRPC', 'QQP', 'SST-2', 'RTE', 'QNLI',
        'STS-B'], dtype=object),
 array(['Acc', 'm', 'mm', 'Mcc', 'F1', 'Corr_p', 'Corr_s'], dtype=object))

In [81]:
# 1. 格式化 params 的函数
def format_params(x):
    val = float(x)
    # 如果是整数（如 184.0），显示为 184M
    if val.is_integer():
        return f"{int(val)}M"
    # 如果有小数（如 0.312），保留两位显示为 0.31M
    else:
        return f"{val:.2f}M"


In [82]:
import pandas as pd

# 1. Define the Method Label logic
def get_method_name(row):
    if row['variant'] == 'fft':
        return 'FFT'
    elif row['variant'] == 'lora':
        return f"LoRA$_{{r={int(row['rank'])}}}$"
    elif row['variant'] == 'kd-lora':
        return f"KD$_{{r={int(row['rank'])}}}$"
        # return f"KD-LoRA$_{{r={int(row['rank'])}}}$"
    return row['variant']

df['Method'] = df.apply(get_method_name, axis=1)
df['params'] = df.groupby(['variant', 'rank', 'family'])['params'].transform('first')
df['params_formatted'] = df['params'].apply(format_params)

# 2. Combine multi-metric tasks (MNLI m/mm and QQP Acc/F1)
# We create a helper function to merge values into strings
def format_values(group):
    task = group['task'].iloc[0]
    if task == 'MNLI':
        # Assumes 'm' and 'mm' metrics exist for MNLI
        m = group[group['metric'] == 'm']['value'].iloc[0]
        mm = group[group['metric'] == 'mm']['value'].iloc[0]
        return pd.Series({'val': f"{m:.2f}/{mm:.2f}", 'met': 'm/mm'})
    elif task == 'QQP':
        # Assumes 'Acc' and 'F1' metrics exist for QQP
        acc = group[group['metric'] == 'Acc']['value'].iloc[0]
        f1 = group[group['metric'] == 'F1']['value'].iloc[0]
        return pd.Series({'val': f"{acc:.2f}/{f1:.2f}", 'met': 'Acc/F1'})
    elif task == 'STS-B':
        corr_s = group[group['metric'] == 'Corr_s']['value'].iloc[0]
        corr_p = group[group['metric'] == 'Corr_p']['value'].iloc[0]
        corr_mean = (corr_s + corr_p) / 2
        return pd.Series({'val': f"{corr_mean:.2f}", 'met': 'Corr'})
    else:
        # Standard tasks with single metrics
        return pd.Series({'val': f"{group['value'].iloc[0]:.2f}", 'met': group['metric'].iloc[0]})
# 1. Update the transformations to include 'family'
# Modify your groupby to include the family field
df_transformed = df.groupby(['family', 'variant', 'rank', 'Method', 'params_formatted', 'task'], as_index=False).apply(format_values)

# 1. Create a numeric version of the task scores for averaging
def get_task_score(group):
    # Average the 'value' column for the task (e.g., average of m and mm for MNLI)
    return group['value'].mean()

# Update task means for averaging
task_means = df.groupby(['family', 'variant', 'rank', 'Method', 'params_formatted', 'task'])['value'].mean().reset_index()

# Update 'All' average to be family-specific
all_avg = task_means.groupby(['family', 'variant', 'rank', 'Method', 'params_formatted'])['value'].mean().reset_index()
all_avg['task'] = 'All'
all_avg['met'] = 'Ave.'
all_avg['val'] = all_avg['value'].apply(lambda x: f"{x:.2f}")

# Append with family preserved
df_with_avg = pd.concat([df_transformed, all_avg[['family', 'variant', 'rank', 'Method', 'params_formatted', 'task', 'met', 'val']]], ignore_index=True)

# 2. Pivot with 'family' as the top index level
pivot_df = df_with_avg.pivot(
    index=['family', 'variant', 'rank', 'Method', 'params_formatted'],
    columns=['task', 'met'],
    values='val'
)

# 3. Custom Sorting: Family first, then your existing logic
pivot_df = pivot_df.sort_index(level=['family', 'variant', 'rank'], ascending=[True, True, False])

# 4. Clean up Index
# Keep 'family', 'Method', and 'params_formatted'
pivot_df.index = pivot_df.index.droplevel(['variant', 'rank'])

# Set index names (you can leave family as a label or remove it for a cleaner look)
pivot_df.index.names = ['Family', 'Method', r'\# Params']

# 5. Column Ordering (to match the image)
task_order = ['MNLI', 'SST-2', 'CoLA', 'QQP', 'QNLI', 'RTE', 'MRPC', 'STS-B', 'All']
# Filter tasks to only those present in your data
existing_tasks = [t for t in task_order if t in pivot_df.columns.get_level_values(0)]
pivot_df = pivot_df.reindex(columns=existing_tasks, level=0)

pivot_df.columns.names = [None, None]
pivot_df.index.names = [None, None, None]

# Display result
print(pivot_df)

                                     MNLI  SST-2   CoLA          QQP   QNLI    RTE   MRPC  STS-B    All
                                     m/mm    Acc    Mcc       Acc/F1    Acc    Acc    Acc   Corr   Ave.
BERT-b FFT           109.48M  83.27/83.58  92.55  55.21  90.93/88.01  90.74  64.98  87.01  88.03  77.57
       KD$_{r=31}$   1.16M    80.02/80.44  89.68  44.28  87.72/83.40  86.53  55.23  71.81  83.34  74.38
       KD$_{r=15}$   0.87M    79.24/80.04  90.60  42.10  87.44/82.96  86.14  53.07  72.55  83.29  73.12
       LoRA$_{r=31}$ 1.14M    82.80/83.43  91.28  50.23  89.07/85.27  90.44  55.96  78.92  87.70  77.05
       LoRA$_{r=15}$ 0.55M    83.16/83.54  91.51  48.60  88.64/84.63  90.26  56.32  79.66  87.22  76.94
DeB-b  FFT           184.42M  89.21/89.52  95.87  64.58  92.30/89.81  93.68  80.14  89.46  91.23  83.54
       KD$_{r=31}$   0.57M    87.39/86.95  93.46  55.73  89.96/86.45  91.36  52.71  69.61  81.95  77.34
       KD$_{r=15}$   0.28M    86.94/86.71  93.35  54.43  89.55/8

  df_transformed = df.groupby(['family', 'variant', 'rank', 'Method', 'params_formatted', 'task'], as_index=False).apply(format_values)


In [83]:
# 2. Use Styler to generate the LaTeX code
latex_code = pivot_df.style.to_latex(
    column_format='l|l|c|' + 'c' * len(pivot_df.columns),
    hrules=True,
    multicol_align="c",
    multirow_align="c"
)

# 3. Adjust spacing for the 'tight' look in the image
final_latex = (
    "\\begin{table}[h]\n"
    "\\centering\n"
    "\\setlength{\\tabcolsep}{4pt} % Smaller column gap\n"
    "\\renewcommand{\\arraystretch}{1.2} % Better vertical spacing\n"
    f"{latex_code}\n"
    "\\end{table}"
)

In [84]:
final_latex = final_latex.replace('&  &  &',
 r'\multirow{2}{*}{\textbf{Model}} & \multirow{2}{*}{\textbf{Method}} & \multirow{2}{*}{\textbf{\# Params}} &', 1)

In [85]:
task_values = list(TASK_NAME_MAP.values()) + ['All']
for task in task_values:
    final_latex = final_latex.replace(task, r'\textbf{'+task+'}')

print(final_latex)

\begin{table}[h]
\centering
\setlength{\tabcolsep}{4pt} % Smaller column gap
\renewcommand{\arraystretch}{1.2} % Better vertical spacing
\begin{tabular}{l|l|c|ccccccccc}
\toprule
 \multirow{2}{*}{\textbf{Model}} & \multirow{2}{*}{\textbf{Method}} & \multirow{2}{*}{\textbf{\# Params}} & \textbf{MNLI} & \textbf{SST-2} & \textbf{CoLA} & \textbf{QQP} & \textbf{QNLI} & \textbf{RTE} & \textbf{MRPC} & \textbf{STS-B} & \textbf{All} \\
 &  &  & m/mm & Acc & Mcc & Acc/F1 & Acc & Acc & Acc & Corr & Ave. \\
\midrule
\multirow[c]{5}{*}{BERT-b} & FFT & 109.48M & 83.27/83.58 & 92.55 & 55.21 & 90.93/88.01 & 90.74 & 64.98 & 87.01 & 88.03 & 77.57 \\
 & KD$_{r=31}$ & 1.16M & 80.02/80.44 & 89.68 & 44.28 & 87.72/83.40 & 86.53 & 55.23 & 71.81 & 83.34 & 74.38 \\
 & KD$_{r=15}$ & 0.87M & 79.24/80.04 & 90.60 & 42.10 & 87.44/82.96 & 86.14 & 53.07 & 72.55 & 83.29 & 73.12 \\
 & LoRA$_{r=31}$ & 1.14M & 82.80/83.43 & 91.28 & 50.23 & 89.07/85.27 & 90.44 & 55.96 & 78.92 & 87.70 & 77.05 \\
 & LoRA$_{r=15}$ & 0.55M & 8