# QPS-Recall Trade-off Plot
本 Notebook 用于绘制不同 workload 类型下 QPS (Queries Per Second) 与 Recall@100 的权衡曲线。
数据来源：
*   `../results-x86/results-x86-ag_news.csv`
*   `../results-hifive/results-sifive-ag_news.csv`
*   `../results-x86/recall-ag_news.csv`
绘图采用帕累托最优前沿。

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import MaxNLocator


In [None]:
# --- Configuration ---
# Performance result CSVs, one per platform.
X86_RESULTS_FILE = '../results-x86/results-x86-ag_news.csv'
# An earlier revision pointed at the '-part' variant of this file
# ('../results-hifive/results-sifive-ag_news-part.csv').
SIFIVE_RESULTS_FILE = '../results-hifive/results-sifive-ag_news.csv'
# Recall measured on x86 is reused as a proxy for every platform.
RECALL_FILE = '../results-x86/recall-ag_news.csv'

# Number of queries used as the numerator when QPS is computed.
NUM_QUERIES = 7600

# Destination for generated plots and CSV exports; created up front.
OUTPUT_DIR = 'plots_agnews_sifive_x86'
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [None]:
# --- Plot style settings ---
def set_fonts():
    """Build the font properties used for plot text.

    Returns:
        A ``(font_ch, font_en)`` tuple of ``FontProperties``: ``font_ch``
        points at the SimSun font file when it exists on disk (otherwise a
        default 12 pt font), and ``font_en`` requests Times New Roman, 12 pt.
    """
    simsun_path = "/usr/share/fonts/SIMSUN.TTC"
    # FontProperties only records the path; a missing file would not raise
    # here but later, when text is rendered. Check for the file explicitly so
    # the fallback actually takes effect.
    if os.path.isfile(simsun_path):
        font_ch = FontProperties(fname=simsun_path, size=12)
    else:
        print("Warning: SIMSUN.TTC not found. Using default font.")
        font_ch = FontProperties(size=12)  # Fallback
    font_en = FontProperties(family="Times New Roman", size=12)
    return font_ch, font_en

# Style cycles for plotted series; each series picks an entry by its index
# modulo the list length.
linestyles = ["-", "--", "-.", ":"]
markers = [
    "o", "D", "s", "^", "v",
    "<", ">", "p", "*", "h",
]


In [None]:
# --- Data loading and preprocessing ---
df_x86 = pd.read_csv(X86_RESULTS_FILE)
df_sifive = pd.read_csv(SIFIVE_RESULTS_FILE)
df_recall = pd.read_csv(RECALL_FILE)

# Rename workload types so every label carries its platform. Series.replace
# with plain strings matches whole values only, so 'avx' does not touch
# 'avx512'. Any other workload_type value is left unchanged.
df_x86['workload_type'] = df_x86['workload_type'].replace({
    'base': 'X86-BASE',
    'sse42': 'X86-SSE42',
    'avx': 'X86-AVX2',
    'avx512': 'X86-AVX512',
})
df_sifive['workload_type'] = df_sifive['workload_type'].replace('base', 'RISCV-BASE')

# Stack the performance rows of both platforms.
df_perf = pd.concat([df_x86, df_sifive], ignore_index=True)

# Recall is assumed to depend only on (M, ef), not on the SIMD level or
# platform. De-duplicate on the join keys themselves so that each (M, ef)
# maps to exactly one recall value; otherwise rows with the same keys but
# differing recall would multiply the performance rows in the merge below.
df_recall_proxy = df_recall[['M', 'ef', 'avg_recall(%)']].drop_duplicates(subset=['M', 'ef'])

# Attach recall to every performance row (rows without a match get NaN).
df_merged = pd.merge(df_perf, df_recall_proxy, on=['M', 'ef'], how='left')

# QPS = NUM_QUERIES / (avg_search_time(ms) / 1000)
# NOTE(review): this treats avg_search_time(ms) as the time needed to run all
# NUM_QUERIES queries -- confirm against the benchmark that wrote the CSV.
df_merged['QPS'] = NUM_QUERIES / (df_merged['avg_search_time(ms)'] / 1000.0)

df_merged = df_merged.rename(columns={'avg_recall(%)': 'Recall'})

# Keep only the columns needed for plotting and drop incomplete rows
# (e.g. performance rows that had no recall match).
df_tradeoff = df_merged[['workload_type', 'M', 'ef', 'Recall', 'QPS']].dropna()


In [None]:
# Exclude rows whose workload_type is exactly 'sse'.
not_sse = df_tradeoff['workload_type'].ne('sse')
df_tradeoff = df_tradeoff.loc[not_sse]


In [None]:
# Quick look at the processed table and the workload labels it contains.
print("Data loaded and processed:")
print(df_tradeoff.head())
workload_names = sorted(df_tradeoff['workload_type'].unique())
print(f"\nUnique workload types: {workload_names}")

# Export the table that feeds the plot next to the figures.
csv_name = os.path.join(OUTPUT_DIR, "qps_recall_tradeoff_ag_news-origin.csv")
df_tradeoff.to_csv(csv_name, index=False)
print(f"Data saved as {csv_name}")


In [None]:
# --- Pareto frontier helper ---
def find_pareto_frontier(df_group):
    """Return the rows of *df_group* on the Recall/QPS Pareto frontier.

    A row is dominated when some other row has ``Recall`` and ``QPS`` both
    greater than or equal to its own, with at least one strictly greater.
    Non-dominated rows are returned in their original order. Rows with
    identical ``(Recall, QPS)`` do not dominate each other, so all of them
    are kept. Rows with NaN in either column are never dominated, because
    every comparison against NaN is false.

    Rows are compared by position rather than by index label, so the result
    is correct even when the index contains duplicate labels.

    Args:
        df_group: DataFrame with numeric ``Recall`` and ``QPS`` columns.

    Returns:
        The non-dominated subset of ``df_group`` (empty if the input is empty).
    """
    recall = df_group['Recall'].to_numpy()
    qps = df_group['QPS'].to_numpy()

    # Pairwise comparison: entry [i, j] asks "does row j dominate row i?".
    at_least_as_good = (recall[None, :] >= recall[:, None]) & (qps[None, :] >= qps[:, None])
    strictly_better = (recall[None, :] > recall[:, None]) | (qps[None, :] > qps[:, None])

    # A row never dominates itself: it is "at least as good" but not
    # "strictly better", so no explicit self-skip is needed.
    dominated = (at_least_as_good & strictly_better).any(axis=1)
    return df_group.loc[~dominated]


In [None]:
# --- Plotting function ---
def plot_qps_recall_tradeoff(df, output_dir, *, xlim=(90, 100), ylim=(0, 6000),
                             basename="qps_recall_tradeoff_ag_news"):
    """Plot the QPS vs. Recall Pareto frontier of every workload type.

    For each distinct upper-cased ``workload_type`` the Pareto-optimal
    (Recall, QPS) points are drawn as one line. The figure is written to
    ``<output_dir>/<basename>.pdf`` and the plotted table (including the
    derived ``plot_label`` column) to ``<output_dir>/<basename>.csv``.

    Args:
        df: DataFrame with ``workload_type``, ``Recall`` and ``QPS`` columns.
            It is not modified.
        output_dir: Directory receiving the PDF and CSV; created if missing.
        xlim: ``(left, right)`` limits of the Recall axis.
        ylim: ``(bottom, top)`` limits of the QPS axis.
        basename: File name (without extension) shared by both outputs.
    """
    _, en_font = set_fonts()
    os.makedirs(output_dir, exist_ok=True)

    # Add the display label on a copy so the caller's frame stays untouched.
    df = df.assign(plot_label=df['workload_type'].str.upper())

    # Legend order: non-RISCV platforms first, and within each platform the
    # BASE variant ahead of the others. False sorts before True, hence the
    # `not in` test for BASE.
    label_order = sorted(
        df['plot_label'].unique(),
        key=lambda name: ('RISCV' in name, 'BASE' not in name, name),
    )

    plt.figure(figsize=(8, 6))  # a little extra height to leave room for the legend

    series_idx = 0  # counts plotted series only, so styles cycle without gaps
    for label in label_order:
        frontier = find_pareto_frontier(df[df['plot_label'] == label])
        if frontier.empty:
            print(f"No Pareto points found for {label}")
            continue

        # Sort by Recall so the connecting line runs left to right.
        frontier = frontier.sort_values('Recall')

        plt.plot(
            frontier['Recall'],
            frontier['QPS'],
            marker=markers[series_idx % len(markers)],
            linestyle=linestyles[series_idx % len(linestyles)],
            label=label.replace('_', '-'),  # hyphens read better in the legend
            linewidth=1.5,
            markersize=5,
        )
        series_idx += 1

    plt.xticks(fontproperties=en_font)
    plt.yticks(fontproperties=en_font)
    plt.xlabel("Recall@100 (%)", fontproperties=en_font)
    plt.ylabel("QPS", fontproperties=en_font)

    # Legend above the axes, centred, without a frame. The font size comes
    # from `prop`; a separate `fontsize` argument would be ignored when
    # `prop` is given, so none is passed.
    plt.legend(
        frameon=False,
        prop=en_font,
        loc='upper center',
        bbox_to_anchor=(0.5, 1.15),  # (0.5, 1.0) is the top centre; 1.15 lifts it slightly
        ncol=3,
    )

    plt.grid(axis="both", linestyle="--", linewidth=0.5, color="gray", alpha=0.6)

    plt.ylim(bottom=ylim[0], top=ylim[1])
    plt.xlim(left=xlim[0], right=xlim[1])

    fig_name = os.path.join(output_dir, f"{basename}.pdf")
    # bbox_inches='tight' expands the saved area so the legend outside the
    # axes is not clipped.
    plt.savefig(fig_name, bbox_inches='tight', pad_inches=0.1)
    plt.close()
    print(f"Plot saved as {fig_name}")

    # Save the data behind the plot under the same base name.
    csv_name = os.path.join(output_dir, f"{basename}.csv")
    df.to_csv(csv_name, index=False)
    print(f"Data saved as {csv_name}")


In [None]:
# --- Generate RISCV-ALIGN data ---
# Remove RISCV-ALIGN rows left over from an earlier execution of this cell so
# that re-running it does not stack duplicate rows.
df_tradeoff = df_tradeoff[df_tradeoff['workload_type'] != 'RISCV-ALIGN']

# RISCV-ALIGN is derived from RISCV-BASE: same M / ef / Recall, with QPS
# scaled by 2.1 / 1.2.
# NOTE(review): the origin of the 2.1 and 1.2 factors is not visible in this
# notebook -- confirm and document where they come from.
df_riscv_base = df_tradeoff[df_tradeoff['workload_type'] == 'RISCV-BASE'].copy()
df_riscv_align = df_riscv_base.assign(
    workload_type='RISCV-ALIGN',
    QPS=df_riscv_base['QPS'] * 2.1 / 1.2,
)

# Append the derived rows to the main table.
df_tradeoff = pd.concat([df_tradeoff, df_riscv_align], ignore_index=True)

print("RISCV-ALIGN data generated and added:")
print(df_riscv_align.head())


In [None]:
# --- Run the plot ---
# Pass a copy of df_tradeoff so the plotting call cannot modify the original data.
plot_qps_recall_tradeoff(df_tradeoff.copy(), OUTPUT_DIR)
