# QPS-Recall Trade-off Plot
本 Notebook 用于绘制不同 workload 类型下 QPS (Queries Per Second) 与 Recall@100 的权衡曲线。
数据来源：
*   `../results-x86/results-x86-ag_news.csv`
*   `../results-hifive/results-sifive-ag_news.csv`
*   `./results-c910-ag_news.csv`
*   `../results-x86/recall-ag_news.csv`
绘图采用帕累托最优前沿。

In [35]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import MaxNLocator


In [36]:
# --- Configuration ---
# Input CSV files, one per platform, plus the recall measurements.
X86_RESULTS_FILE = '../results-x86/results-x86-ag_news.csv'
SIFIVE_RESULTS_FILE = '../results-hifive/results-sifive-ag_news.csv'
C910_RESULTS_FILE = './results-c910-ag_news.csv' # path to the C910 results file
RECALL_FILE = '../results-x86/recall-ag_news.csv'  # the x86 recall values are used as a proxy for every platform
OUTPUT_DIR = 'plots_agnews_sifive_x86_c910' # directory that receives the generated plots and CSV files
# Numerator of the QPS computation further down.
# NOTE(review): presumably the number of queries in the ag_news query set - confirm.
NUM_QUERIES = 7600

# Create the output directory (no error if it already exists).
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [37]:
# --- 绘图风格设置 ---
def set_fonts():
    """Build the font properties used by the plotting functions.

    Returns:
        tuple: ``(font_ch, font_en)`` where ``font_ch`` is a 12 pt
        ``FontProperties`` for CJK text (SimSun when the font file is
        installed, matplotlib's default font otherwise) and ``font_en`` is a
        12 pt ``FontProperties`` requesting the "Times New Roman" family.
    """
    simsun_path = "/usr/share/fonts/SIMSUN.TTC"
    # Constructing FontProperties with ``fname`` only records the path; the
    # file is not opened at that point, so a missing font cannot be detected
    # by catching FileNotFoundError around the constructor. Check the file
    # explicitly so the fallback actually takes effect.
    if os.path.isfile(simsun_path):
        font_ch = FontProperties(fname=simsun_path, size=12)
    else:
        print("Warning: SIMSUN.TTC not found. Using default font.")
        font_ch = FontProperties(size=12)  # fallback: matplotlib default font
    font_en = FontProperties(family="Times New Roman", size=12)
    return font_ch, font_en

# Marker and line-style cycles for the plotted series. The plotting functions
# index them modulo their length, so they repeat when there are more series
# than entries.
markers = ["o", "D", "s", "^", "v", "<", ">", "p", "*", "h"]
linestyles = ["-", "--", "-.", ":"]


In [38]:
# --- Data loading and preprocessing ---
df_x86 = pd.read_csv(X86_RESULTS_FILE)
df_sifive = pd.read_csv(SIFIVE_RESULTS_FILE)
df_c910 = pd.read_csv(C910_RESULTS_FILE)
df_recall = pd.read_csv(RECALL_FILE)

# --- Rename workload_type so that every series carries its platform ---
# Series.replace matches whole values (not substrings), so 'avx' does not
# touch 'avx512'; values that are not listed are left unchanged.
df_x86['workload_type'] = df_x86['workload_type'].replace({
    'base': 'X86-BASE',
    'sse42': 'X86-SSE42',
    'avx': 'X86-AVX2',
    'avx512': 'X86-AVX512',
})
df_sifive['workload_type'] = df_sifive['workload_type'].replace({'base': 'RISCV-BASE'})
df_c910['workload_type'] = df_c910['workload_type'].replace({
    'rv': 'C910-RV',
    'rvv': 'C910-RVV',
})

# Stack the performance measurements of all platforms.
df_perf = pd.concat([df_x86, df_sifive, df_c910], ignore_index=True)

# Recall is assumed to depend only on (M, ef), not on the SIMD variant or the
# platform, so the recall table is reduced to one row per (M, ef).
df_recall_proxy = df_recall[['M', 'ef', 'avg_recall(%)']].drop_duplicates()
# If the recall file holds different recall values for the same (M, ef), a
# plain merge would duplicate every matching performance row. Report the
# conflict and keep the first value so the merge stays many-to-one.
n_conflicting = int(df_recall_proxy.duplicated(subset=['M', 'ef']).sum())
if n_conflicting:
    print(
        f"Warning: {n_conflicting} recall row(s) disagree with an earlier row "
        "for the same (M, ef); keeping the first value per (M, ef)."
    )
df_recall_proxy = df_recall_proxy.drop_duplicates(subset=['M', 'ef'])

# Attach the recall to every performance row; rows without a recall entry get
# NaN here and are dropped below. ``validate`` makes pandas raise if the
# right-hand side is ever not unique on (M, ef).
df_merged = pd.merge(
    df_perf,
    df_recall_proxy,
    on=['M', 'ef'],
    how='left',
    validate='many_to_one',
)

# QPS = NUM_QUERIES / (avg_search_time(ms) / 1000)
# NOTE(review): this treats avg_search_time(ms) as the time needed to run all
# NUM_QUERIES queries - confirm against the benchmark that wrote the CSV.
df_merged['QPS'] = NUM_QUERIES / (df_merged['avg_search_time(ms)'] / 1000.0)

# Use a shorter column name for the recall.
df_merged = df_merged.rename(columns={'avg_recall(%)': 'Recall'})

# Keep only the columns needed for plotting and drop incomplete rows.
df_tradeoff = df_merged[['workload_type', 'M', 'ef', 'Recall', 'QPS']].dropna()


In [39]:
# Remove the series that must not be plotted: a raw 'sse' label (in case one
# is present) and the X86-SSE42 series.
df_tradeoff = df_tradeoff[~df_tradeoff['workload_type'].isin(['sse', 'X86-SSE42'])]


In [40]:
# Preview the processed data and list the workload types that remain.
print("Data loaded and processed:")
print(df_tradeoff.head())
print("\nUnique workload types:", sorted(df_tradeoff['workload_type'].unique()))

# Persist the full (pre-Pareto) data set that the plots are derived from.
csv_name = os.path.join(OUTPUT_DIR, "qps_recall_tradeoff_ag_news_all-origin.csv")
df_tradeoff.to_csv(csv_name, index=False)
print("Data saved as", csv_name)


Data loaded and processed:
  workload_type  M   ef  Recall          QPS
0      X86-AVX2  8  100  91.539  5030.151123
1      X86-AVX2  8  128  91.949  5022.107184
2      X86-AVX2  8  200  92.321  4954.332140
3      X86-AVX2  8  256  92.449  5096.296535
4      X86-AVX2  8  300  92.480  5005.964343

Unique workload types: ['C910-RV', 'C910-RVV', 'RISCV-BASE', 'X86-AVX2', 'X86-AVX512', 'X86-BASE']
Data saved as plots_agnews_sifive_x86_c910/qps_recall_tradeoff_ag_news_all-origin.csv


In [41]:
# --- 帕累托最优函数 ---
def find_pareto_frontier(df_group):
    """Return the rows of ``df_group`` on the Pareto frontier of (Recall, QPS).

    A row is dominated when some other row is at least as good in both
    ``Recall`` and ``QPS`` and strictly better in at least one of them. Rows
    that are not dominated are returned in their original order; exact
    duplicates do not dominate each other, so all of them are kept.

    Rows are compared by position rather than by index label, so the result is
    also correct when the index contains repeated labels.

    Args:
        df_group: DataFrame with numeric ``Recall`` and ``QPS`` columns.

    Returns:
        DataFrame holding the non-dominated rows (empty if the input is empty).
    """
    recall = df_group['Recall'].to_numpy(dtype=float)
    qps = df_group['QPS'].to_numpy(dtype=float)

    # Pairwise comparison: axis 0 is the candidate row, axis 1 the other row.
    cand_recall, other_recall = recall[:, None], recall[None, :]
    cand_qps, other_qps = qps[:, None], qps[None, :]
    no_worse = (other_recall >= cand_recall) & (other_qps >= cand_qps)
    # A row is never strictly better than itself, so the diagonal needs no
    # special handling.
    strictly_better = (other_recall > cand_recall) | (other_qps > cand_qps)
    is_dominated = (no_worse & strictly_better).any(axis=1)

    return df_group.iloc[np.flatnonzero(~is_dominated)]


In [42]:
# --- 绘图函数 (所有平台) ---
def plot_qps_recall_tradeoff(df, output_dir):
    """Plot and export the per-workload Pareto frontier of QPS vs. Recall.

    One line is drawn for every distinct (upper-cased) ``workload_type``,
    using only the Pareto-optimal points of that workload. The figure is
    written to ``qps_recall_tradeoff_ag_news_all.pdf`` and the plotted points
    to ``qps_recall_tradeoff_ag_news_all.csv`` inside ``output_dir``.

    Args:
        df: DataFrame with at least the columns ``workload_type``, ``Recall``
            and ``QPS``. The frame is not modified.
        output_dir: Existing directory that receives the PDF and the CSV.

    Returns:
        None. If ``df`` is empty a message is printed and nothing is written.
    """
    # Without any series the legend layout below would divide by zero.
    if df.empty:
        print("No data found to plot.")
        return

    # Only the Latin font is needed; no text in this figure uses the CJK font.
    _, en_font = set_fonts()

    # Derive a new frame instead of adding a column to the caller's object.
    df = df.assign(plot_label=df['workload_type'].str.upper())

    # Group the series by platform (RISCV, then C910, then X86). Inside a
    # platform the BASE variant comes first: False sorts before True, hence
    # the ``not in`` test.
    label_order = sorted(
        df['plot_label'].unique(),
        key=lambda x: ('X86' in x, 'C910' in x, 'RISCV' in x, 'BASE' not in x, x),
    )

    # Compute each frontier once; it is needed for the plot and for the CSV.
    frontiers = {
        label: find_pareto_frontier(df[df['plot_label'] == label])
        for label in label_order
    }

    fig = plt.figure(figsize=(8, 6))  # extra height leaves room for the legend
    fig_name = os.path.join(output_dir, "qps_recall_tradeoff_ag_news_all.pdf")
    try:
        style_idx = 0
        for label in label_order:
            frontier = frontiers[label]
            if frontier.empty:
                print(f"No Pareto points found for {label}")
                continue

            # Sort by Recall so the connecting line runs left to right.
            frontier = frontier.sort_values('Recall')

            plt.plot(
                frontier['Recall'],
                frontier['QPS'],
                marker=markers[style_idx % len(markers)],
                linestyle=linestyles[style_idx % len(linestyles)],
                label=label.replace('_', '-'),  # display form of the label
                linewidth=1.5,
                markersize=5,
            )
            style_idx += 1

        plt.xticks(fontproperties=en_font)
        plt.yticks(fontproperties=en_font)
        plt.xlabel("Recall@100 (%)", fontproperties=en_font)
        plt.ylabel("QPS", fontproperties=en_font)

        # Legend above the axes: at most 4 entries per row, shifted upwards by
        # 0.08 (axes fraction) per legend row from a base offset of 1.05.
        num_labels = len(label_order)
        ncol = min(num_labels, 4)
        num_legend_rows = (num_labels + ncol - 1) // ncol
        bbox_y = 1.05 + num_legend_rows * 0.08
        # ``prop`` fixes the legend font including its size; a separate
        # ``fontsize`` argument would be ignored, so none is passed.
        plt.legend(
            frameon=False,
            prop=en_font,
            loc='upper center',
            bbox_to_anchor=(0.5, bbox_y),
            ncol=ncol,
        )

        plt.grid(axis="y", linestyle="--", linewidth=0.5, color="gray", alpha=0.6)
        plt.grid(axis="x", linestyle="--", linewidth=0.5, color="gray", alpha=0.6)

        # Y axis starts at 0 and leaves 10% headroom above the largest QPS.
        plt.ylim(bottom=0, top=df['QPS'].max() * 1.1)
        # Fixed recall window; points with Recall below 90 are not visible.
        plt.xlim(left=90, right=100)

        # bbox_inches='tight' grows the saved area so that the legend placed
        # outside the axes is not cut off.
        plt.savefig(fig_name, bbox_inches='tight', pad_inches=0.1)
    finally:
        # Release the figure even if plotting or saving failed.
        plt.close(fig)
    print(f"Plot saved as {fig_name}")

    # Save the plotted (Pareto-optimal) points rather than the full data set.
    csv_name = os.path.join(output_dir, "qps_recall_tradeoff_ag_news_all.csv")
    all_pareto_points = pd.concat([frontiers[label] for label in label_order])
    all_pareto_points.to_csv(csv_name, index=False)
    print(f"Pareto data saved as {csv_name}")


In [None]:
# --- 绘图函数 (仅 C910) ---
def _print_c910_rvv_speedup(df_c910_only):
    """Print the QPS gain of C910-RVV over C910-RV for every shared (M, ef)."""
    c910_rv = df_c910_only[df_c910_only['workload_type'] == 'C910-RV']
    c910_rvv = df_c910_only[df_c910_only['workload_type'] == 'C910-RVV']

    # Inner merge: only (M, ef) combinations measured for both variants remain.
    c910_comparison = pd.merge(
        c910_rv[['M', 'ef', 'QPS']],
        c910_rvv[['M', 'ef', 'QPS']],
        on=['M', 'ef'],
        suffixes=('_RV', '_RVV'),
    )

    # Despite its name, QPS_Ratio is the relative gain in percent:
    # (QPS_RVV / QPS_RV - 1) * 100.
    c910_comparison['QPS_Ratio'] = (c910_comparison['QPS_RVV'] / c910_comparison['QPS_RV'] - 1) * 100

    print("C910-RVV vs C910-RV QPS Comparison:")
    print(c910_comparison[['M', 'ef', 'QPS_RV', 'QPS_RVV', 'QPS_Ratio']])

    print("Avg ratio of C910-RVV to C910-RV QPS:", c910_comparison['QPS_Ratio'].mean())
    print("Max ratio of C910-RVV to C910-RV QPS:", c910_comparison['QPS_Ratio'].max())
    print("Min ratio of C910-RVV to C910-RV QPS:", c910_comparison['QPS_Ratio'].min())


def plot_c910_tradeoff(df, output_dir):
    """Plot and export the QPS vs. Recall Pareto frontier of the C910 workloads.

    Rows whose ``workload_type`` contains "C910" (case-insensitive) are
    selected from ``df``. Their per-workload Pareto frontiers are written to
    ``qps_recall_tradeoff_ag_news_c910_only.pdf`` and
    ``qps_recall_tradeoff_ag_news_c910_only.csv`` inside ``output_dir``.
    Afterwards the QPS gain of C910-RVV over C910-RV is printed for every
    (M, ef) combination present for both variants.

    The comparison is computed from the rows of ``df`` itself, so it covers
    exactly the data that is plotted and does not depend on any notebook
    global.

    Args:
        df: DataFrame with the columns ``workload_type``, ``M``, ``ef``,
            ``Recall`` and ``QPS``. The frame is not modified.
        output_dir: Existing directory that receives the PDF and the CSV.

    Returns:
        None. If ``df`` holds no C910 rows a message is printed and nothing
        is written.
    """
    # Select the C910 rows before creating a figure, so the early return
    # below cannot leave an unused figure open.
    df_c910_only = df[df['workload_type'].str.contains('C910', case=False)].copy()
    if df_c910_only.empty:
        print("No C910 data found to plot.")
        return

    # Only the Latin font is needed; no text in this figure uses the CJK font.
    _, en_font = set_fonts()

    df_c910_only['plot_label'] = df_c910_only['workload_type'].str.upper()
    label_order = sorted(df_c910_only['plot_label'].unique())

    # Compute each frontier once; it is needed for the plot and for the CSV.
    frontiers = {
        label: find_pareto_frontier(df_c910_only[df_c910_only['plot_label'] == label])
        for label in label_order
    }

    fig = plt.figure(figsize=(8, 5))
    fig_name = os.path.join(output_dir, "qps_recall_tradeoff_ag_news_c910_only.pdf")
    try:
        style_idx = 0
        for label in label_order:
            frontier = frontiers[label]
            if frontier.empty:
                print(f"No Pareto points found for {label}")
                continue

            # Sort by Recall so the connecting line runs left to right.
            frontier = frontier.sort_values('Recall')

            plt.plot(
                frontier['Recall'],
                frontier['QPS'],
                marker=markers[style_idx % len(markers)],
                linestyle=linestyles[style_idx % len(linestyles)],
                label=label.replace('_', '-'),  # display form of the label
                linewidth=1.5,
                markersize=5,
            )
            style_idx += 1

        plt.xticks(fontproperties=en_font)
        plt.yticks(fontproperties=en_font)
        plt.xlabel("Recall@100 (%)", fontproperties=en_font)
        plt.ylabel("QPS", fontproperties=en_font)

        # Few series, so the legend fits inside the axes. ``prop`` fixes the
        # legend font including its size; a separate ``fontsize`` argument
        # would be ignored, so none is passed.
        plt.legend(frameon=False, prop=en_font, loc='best')

        plt.grid(axis="y", linestyle="--", linewidth=0.5, color="gray", alpha=0.6)
        plt.grid(axis="x", linestyle="--", linewidth=0.5, color="gray", alpha=0.6)

        # Y axis starts at 0 and leaves 10% headroom above the largest QPS.
        plt.ylim(bottom=0, top=df_c910_only['QPS'].max() * 1.1)
        # Left limit: just below the smallest C910 recall, but never under 90.
        plt.xlim(left=max(90, df_c910_only['Recall'].min() * 0.99), right=100)

        plt.savefig(fig_name, bbox_inches='tight', pad_inches=0.1)
    finally:
        # Release the figure even if plotting or saving failed.
        plt.close(fig)
    print(f"C910 only plot saved as {fig_name}")

    # Save the plotted (Pareto-optimal) C910 points.
    c910_pareto_points = pd.concat([frontiers[label] for label in label_order])
    csv_name_c910 = os.path.join(output_dir, "qps_recall_tradeoff_ag_news_c910_only.csv")
    c910_pareto_points.to_csv(csv_name_c910, index=False)
    print(f"C910 Pareto data saved as {csv_name_c910}")

    # Compare C910-RVV against C910-RV at identical (M, ef).
    _print_c910_rvv_speedup(df_c910_only)


In [48]:
# --- Run the plot for all platforms ---
# A copy is handed over so that the plotting function cannot alter df_tradeoff.
plot_qps_recall_tradeoff(df=df_tradeoff.copy(), output_dir=OUTPUT_DIR)


Plot saved as plots_agnews_sifive_x86_c910/qps_recall_tradeoff_ag_news_all.pdf
Pareto data saved as plots_agnews_sifive_x86_c910/qps_recall_tradeoff_ag_news_all.csv


In [51]:
# --- Run the plot for the C910 workloads only ---
plot_c910_tradeoff(df=df_tradeoff.copy(), output_dir=OUTPUT_DIR)


C910 only plot saved as plots_agnews_sifive_x86_c910/qps_recall_tradeoff_ag_news_c910_only.pdf
C910 Pareto data saved as plots_agnews_sifive_x86_c910/qps_recall_tradeoff_ag_news_c910_only.csv
C910-RVV vs C910-RV QPS Comparison:
      M   ef      QPS_RV      QPS_RVV  QPS_Ratio
0     8  100  911.671397  1100.249525  20.684879
1     8  128  893.691409  1079.833206  20.828420
2     8  200  869.730406  1051.376189  20.885297
3     8  256  868.529441  1058.760954  21.902713
4     8  300  863.231822  1040.185230  20.498944
5     8  400  860.467302  1036.455268  20.452604
6     8  512  863.521064  1036.874533  20.075187
7    16  100  641.140069   773.439753  20.635067
8    16  128  598.605061   731.542767  22.207915
9    16  200  543.628941   658.879748  21.200271
10   16  256  524.128822   630.343980  20.265086
11   16  300  509.091879   621.166798  22.014674
12   16  400  492.573004   600.398760  21.890310
13   16  512  483.981271   587.729992  21.436516
14   32  100  541.764172   656.627239