# 检索与建索时间随维度变化绘图
本 notebook 用于读取 results-*.csv 文件，分别绘制不同 workload 下维度与平均建索时间（单位：秒）和检索时间（单位：毫秒）的折线图。
所有生成的图片将保存到 `plots_output/` 文件夹中。

In [55]:
import os
import glob
import re
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import MaxNLocator


In [56]:
# Output directory for the generated figures; it is passed to the plotting
# functions in the batch cell, and each of them creates it if it is missing.
output_dir = 'plots_output'


## 字体设置函数

In [57]:
def set_fonts(zh_font_path="/usr/share/fonts/SIMSUN.TTC", size=12):
    """Build the font settings used for Chinese and English text in the plots.

    Args:
        zh_font_path: Path of the font file used for Chinese text. Defaults to
            the SimSun location the notebook originally hard-coded.
        size: Point size applied to both fonts.

    Returns:
        A ``(font_ch, font_en)`` tuple of ``FontProperties`` objects: the first
        for Chinese labels, the second (Times New Roman) for English text.
    """
    if os.path.isfile(zh_font_path):
        font_ch = FontProperties(fname=zh_font_path, size=size)
    else:
        # A missing font file would otherwise only surface later, when the
        # figure is rendered. Fall back to a lookup by family name so the
        # plots can still be produced, and tell the user why glyphs may differ.
        import warnings

        warnings.warn(
            f"Chinese font file not found: {zh_font_path}; "
            "falling back to font lookup by family name."
        )
        font_ch = FontProperties(
            family=["SimSun", "Noto Sans CJK SC", "sans-serif"], size=size
        )
    font_en = FontProperties(family="Times New Roman", size=size)
    return font_ch, font_en


In [58]:
# Marker symbols for the plotted series; the plotting functions pick one per
# series by indexing modulo the list length.
markers = "o D s ^ v < > p * h + x | _".split()

# Line styles for the plotted series, selected the same way as the markers.
linestyles = ["--", "-", "-.", ":"]


In [59]:
def plot_index_time_vs_dim(csv_path, output_dir):
    """Plot average index-build time against dimension, one line per workload.

    Reads ``csv_path``, converts the ``avg_index_time(ms)`` column to seconds
    and saves the figure as ``index_time_vs_dim-<suffix>.pdf`` in
    ``output_dir``. ``<suffix>`` is taken from a ``results-<suffix>.csv`` file
    name and is ``unknown`` when the name does not match that pattern.

    Args:
        csv_path: CSV file providing the ``workload_type``, ``dimension`` and
            ``avg_index_time(ms)`` columns.
        output_dir: Directory that receives the PDF; created if missing.

    Raises:
        ValueError: If the CSV contains no data rows.
    """
    os.makedirs(output_dir, exist_ok=True)

    # .copy() makes the column subset an independent frame, so the column
    # assignments below are not chained assignments on a slice of the CSV data.
    df = pd.read_csv(csv_path)[
        ["workload_type", "dimension", "avg_index_time(ms)"]
    ].copy()
    if df.empty:
        # Without rows the y-axis limit below would be NaN; fail with a clear
        # message instead of an axis-limit error from matplotlib.
        raise ValueError(f"No data rows found in {csv_path}")
    df["dimension"] = df["dimension"].astype(int)
    df["avg_index_time(s)"] = df["avg_index_time(ms)"] / 1000.0

    zh_font, en_font = set_fonts()
    fig = plt.figure(figsize=(4, 3))
    try:
        for i, (workload_type, group) in enumerate(df.groupby("workload_type")):
            group = group.sort_values("dimension")
            plt.plot(
                group["dimension"],
                group["avg_index_time(s)"],
                # Cycle through the shared style lists so each series differs.
                marker=markers[i % len(markers)],
                linestyle=linestyles[i % len(linestyles)],
                label=workload_type.upper(),
                linewidth=1,
                markersize=3,
            )
        plt.xticks(fontproperties=en_font)
        plt.yticks(fontproperties=en_font)
        plt.xlabel("数据规模/向量维度", fontproperties=zh_font)
        plt.ylabel("建索引时间 (s)", fontproperties=zh_font)
        plt.xscale("log", base=2)
        # Restrict the y axis to integer-valued major ticks.
        plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
        plt.legend(frameon=True, prop=en_font, loc="upper left")
        # Start the y axis at zero and leave 5% headroom above the largest value.
        plt.ylim(0, df["avg_index_time(s)"].max() * 1.05)
        plt.grid(axis="y", linestyle="--", linewidth=0.5, color="gray", alpha=0.6)
        plt.tight_layout()

        m = re.match(r"results-(.+)\.csv", os.path.basename(csv_path))
        suffix = m.group(1) if m else "unknown"
        fig_name = os.path.join(output_dir, f"index_time_vs_dim-{suffix}.pdf")
        plt.savefig(fig_name)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    print(f"Plot saved as {fig_name}")


## 检索时间随维度变化折线图（单位：毫秒）

In [60]:
def plot_search_time_vs_dim(csv_path, output_dir):
    """Plot average search time against dimension, one line per workload.

    Reads ``csv_path``, scales the ``avg_search_time(ms)`` column and saves the
    figure as ``search_time_vs_dim-<suffix>.pdf`` in ``output_dir``.
    ``<suffix>`` is taken from a ``results-<suffix>.csv`` file name and is
    ``unknown`` when the name does not match that pattern.

    Args:
        csv_path: CSV file providing the ``workload_type``, ``dimension`` and
            ``avg_search_time(ms)`` columns.
        output_dir: Directory that receives the PDF; created if missing.

    Raises:
        ValueError: If the CSV contains no data rows.
    """
    os.makedirs(output_dir, exist_ok=True)

    # .copy() makes the column subset an independent frame, so the column
    # assignments below are not chained assignments on a slice of the CSV data.
    df = pd.read_csv(csv_path)[
        ["workload_type", "dimension", "avg_search_time(ms)"]
    ].copy()
    if df.empty:
        # Without rows the y-axis limit below would be NaN; fail with a clear
        # message instead of an axis-limit error from matplotlib.
        raise ValueError(f"No data rows found in {csv_path}")
    df["dimension"] = df["dimension"].astype(int)
    # NOTE(review): the CSV value is divided by 100 while the axis is still
    # labelled in ms - presumably the column holds a total over 100 queries
    # and this yields a per-query average; confirm against the benchmark.
    df["avg_avg_search_time(ms)"] = df["avg_search_time(ms)"] / 100.0

    zh_font, en_font = set_fonts()
    fig = plt.figure(figsize=(4, 3))
    try:
        for i, (workload_type, group) in enumerate(df.groupby("workload_type")):
            group = group.sort_values("dimension")
            plt.plot(
                group["dimension"],
                group["avg_avg_search_time(ms)"],
                # Cycle through the shared style lists so each series differs.
                marker=markers[i % len(markers)],
                linestyle=linestyles[i % len(linestyles)],
                label=workload_type.upper(),
                linewidth=1,
                markersize=3,
            )
        # Use the English font for tick labels, as the index-time plot does.
        plt.xticks(fontproperties=en_font)
        plt.yticks(fontproperties=en_font)
        plt.xlabel("数据规模/向量维度", fontproperties=zh_font)
        plt.ylabel("平均检索时间 (ms)", fontproperties=zh_font)
        plt.xscale("log", base=2)
        # Restrict the y axis to integer-valued major ticks.
        plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
        # Start the y axis at zero and leave 5% headroom above the largest value.
        plt.ylim(0, df["avg_avg_search_time(ms)"].max() * 1.05)
        plt.legend(frameon=True, prop=en_font, loc="upper left")
        plt.grid(axis="y", linestyle="--", linewidth=0.5, color="gray", alpha=0.6)
        plt.tight_layout()

        m = re.match(r"results-(.+)\.csv", os.path.basename(csv_path))
        suffix = m.group(1) if m else "unknown"
        fig_name = os.path.join(output_dir, f"search_time_vs_dim-{suffix}.pdf")
        plt.savefig(fig_name)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    print(f"Plot saved as {fig_name}")


## 查找并批量绘制所有 results-*.csv 文件

In [61]:
# glob returns matches in arbitrary order; sort them so the files are always
# processed (and their plots reported) in the same order, as the analysis
# cell does.
csv_files = sorted(glob.glob("results-*.csv"))
if not csv_files:
    print("No results-*.csv file found.")
else:
    # Produce both figures (index-build time and search time) for every file.
    for csv_path in csv_files:
        plot_index_time_vs_dim(csv_path, output_dir)
        plot_search_time_vs_dim(csv_path, output_dir)


Plot saved as plots_output/index_time_vs_dim-r908fdv.pdf
Plot saved as plots_output/search_time_vs_dim-r908fdv.pdf
Plot saved as plots_output/index_time_vs_dim-c920v3.pdf
Plot saved as plots_output/search_time_vs_dim-r908fdv.pdf
Plot saved as plots_output/index_time_vs_dim-c920v3.pdf
Plot saved as plots_output/search_time_vs_dim-c920v3.pdf
Plot saved as plots_output/index_time_vs_dim-c908v.pdf
Plot saved as plots_output/search_time_vs_dim-c920v3.pdf
Plot saved as plots_output/index_time_vs_dim-c908v.pdf
Plot saved as plots_output/search_time_vs_dim-c908v.pdf
Plot saved as plots_output/index_time_vs_dim-c907fdvm.pdf
Plot saved as plots_output/search_time_vs_dim-c908v.pdf
Plot saved as plots_output/index_time_vs_dim-c907fdvm.pdf
Plot saved as plots_output/search_time_vs_dim-c907fdvm.pdf
Plot saved as plots_output/index_time_vs_dim-c906fdv.pdf
Plot saved as plots_output/search_time_vs_dim-c907fdvm.pdf
Plot saved as plots_output/index_time_vs_dim-c906fdv.pdf
Plot saved as plots_output/sear

## 分析讨论

1.  **索引构建时间 (图~\ref{fig:fpga_build_time_all}):**
    *   a. 整体趋势：RVV 优化在所有核心上是否都缩短了索引构建时间？
    *   b. 核心间对比：比较不同核心的绝对构建时间（基线和 RVV）以及 RVV 带来的相对加速比。缓存配置（如 C906 无 L2 vs C920v3 有 2MB L2）对构建时间的影响如何？RVV 在不同缓存配置下的加速效果是否有差异？
    *   c. 维度影响：随着向量维度的增加，构建时间如何变化？RVV 的加速效果是否随维度增加而更明显？
2.  **平均查询延迟 (图~\ref{fig:fpga_query_latency_all}):**
    *   a. 整体趋势：RVV 优化在所有核心上是否都降低了平均查询延迟？
    *   b. 核心间对比：与构建时间类似，比较不同核心的绝对查询延迟和 RVV 加速比。缓存配置对查询延迟的影响如何？
    *   c. 维度影响：查询延迟随维度的变化趋势？RVV 加速效果与维度的关系？
3.  **综合分析：**
    *   a. RVV 优化对构建时间和查询延迟的影响是否一致？
    *   b. 结合核心配置（特别是缓存），解释观察到的性能差异和 RVV 加速效果的差异。访存是否仍然是 FPGA 仿真环境下的一个重要因素？

*TODO: 填充详细的分析内容，利用新的图表布局进行横向对比*

## 数据分析代码

In [62]:
import pandas as pd
import glob
import os

# Per metric: (baseline "rv" column, "rvv" column, ratio column, reduction column).
# The first two names are what the flattened pivot produces; the last two are
# derived columns added by this cell.
metric_columns = [
    ('avg_index_time(ms)_rv', 'avg_index_time(ms)_rvv',
     'index_time_ratio', 'index_time_reduction_%'),
    ('avg_search_time(ms)_rv', 'avg_search_time(ms)_rvv',
     'search_time_ratio', 'search_time_reduction_%'),
]

# Collect every results file in sorted order.
csv_files_analysis = sorted(glob.glob("results-*.csv"))

if csv_files_analysis:
    all_results = []
    for csv_path in csv_files_analysis:
        # The part of the file name between "results-" and ".csv" labels the source.
        m = re.match(r"results-(.+)\.csv", os.path.basename(csv_path))
        suffix = "unknown" if m is None else m.group(1)
        print(f"--- Analyzing {suffix} ({csv_path}) ---")
        df = pd.read_csv(csv_path)

        # One row per dimension, one column per (metric, workload_type) pair.
        df_pivot = df.pivot(
            index='dimension',
            columns='workload_type',
            values=['avg_index_time(ms)', 'avg_search_time(ms)'],
        )
        # Collapse the two-level column labels into "<metric>_<workload>" names.
        df_pivot.columns = ['_'.join(pair).strip() for pair in df_pivot.columns]

        # Both the rv and the rvv measurement of every metric must be present.
        required_cols = [
            col
            for rv_name, rvv_name, _, _ in metric_columns
            for col in (rv_name, rvv_name)
        ]
        if any(col not in df_pivot.columns for col in required_cols):
            print(f"Skipping {suffix}: Missing required rv/rvv columns.")
            print("Available columns:", df_pivot.columns.tolist())
            print("\n")
            continue

        # Ratio: rvv_time / rv_time.
        for rv_col, rvv_col, ratio_col, _ in metric_columns:
            df_pivot[ratio_col] = df_pivot[rvv_col] / df_pivot[rv_col]
        # Percentage reduction: (rv_time - rvv_time) / rv_time * 100.
        for rv_col, rvv_col, _, reduction_col in metric_columns:
            df_pivot[reduction_col] = (
                (df_pivot[rv_col] - df_pivot[rvv_col]) / df_pivot[rv_col] * 100
            )

        # Keep the measured and derived columns, metric by metric, and tag the
        # rows with the file suffix so sources stay distinguishable once merged.
        result_df = df_pivot[
            [col for group in metric_columns for col in group]
        ].copy()
        result_df['source_file'] = suffix
        all_results.append(result_df.reset_index())
        print(result_df)
        print("\n")

    if all_results:
        # Stack the per-file tables into one frame.
        combined_results = pd.concat(all_results, ignore_index=True)
        # Only the header line is printed; the combined table itself is
        # written to a tab-separated file rather than displayed.
        print("--- Combined Results (includes source_file column to distinguish cores) ---")
        output_txt_file = 'analysis_results.txt'
        combined_results.to_csv(
            output_txt_file, sep='\t', index=False, float_format='%.6f'
        )
        print(f"\nCombined analysis results saved to {output_txt_file}")

        print("\n--- Summary Statistics ---")
        # 1. Mean reduction over all files for each dimension.
        reduction_cols = ['index_time_reduction_%', 'search_time_reduction_%']
        avg_reduction_by_dim = combined_results.groupby('dimension')[reduction_cols].mean()
        print("\nAverage Reduction Percentage across all files (grouped by dimension):")
        print(avg_reduction_by_dim)

        # 2. Mean / min / max reduction for each source file.
        reduction_stat_specs = {
            'index_reduction_mean': ('index_time_reduction_%', 'mean'),
            'index_reduction_min': ('index_time_reduction_%', 'min'),
            'index_reduction_max': ('index_time_reduction_%', 'max'),
            'search_reduction_mean': ('search_time_reduction_%', 'mean'),
            'search_reduction_min': ('search_time_reduction_%', 'min'),
            'search_reduction_max': ('search_time_reduction_%', 'max'),
        }
        core_summary_stats = combined_results.groupby('source_file').agg(
            **reduction_stat_specs
        )
        print("\nReduction Percentage Statistics (Mean, Min, Max) by Core:")
        print(core_summary_stats)

        # The summary keeps its index (the source-file label) as the first column.
        output_summary_file = 'analysis_summary_by_core.txt'
        core_summary_stats.to_csv(output_summary_file, sep='\t', float_format='%.2f')
        print(f"\nPer-core summary statistics (mean, min, max) saved to {output_summary_file}")
else:
    print("No results-*.csv file found for analysis.")


--- Analyzing c906fdv (results-c906fdv.csv) ---
           avg_index_time(ms)_rv  avg_index_time(ms)_rvv  index_time_ratio  \
dimension                                                                    
8                       8648.435                8509.137          0.983893   
16                     10923.280               10387.010          0.950906   
32                     15133.201               14604.190          0.965043   
64                     20824.405               21526.471          1.033714   
128                    31015.834               31577.619          1.018113   
256                    51055.570               51053.538          0.999960   
512                    92957.575               92027.066          0.989990   
1024                  173069.950              170489.722          0.985091   
2048                  339395.095              333876.059          0.983739   
3072                  495008.328              486500.066          0.982812   
4096            