# 检索与建索时间随维度变化绘图
本 notebook 读取当前目录下的 results-*.csv 文件，按 workload 分别绘制平均建索引时间（单位：秒）和平均检索时间（单位：毫秒）随维度变化的折线图。
所有生成的图片（PDF 格式）将保存到 `plots_output/` 文件夹中。

In [None]:
import os
import glob
import re
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import MaxNLocator


In [None]:
# Directory that receives every generated figure.
output_dir = "plots_output"


## 字体设置函数

In [None]:
def set_fonts():
    """Build the two font configurations used by the plots.

    Returns:
        A ``(font_ch, font_en)`` tuple of ``FontProperties`` objects: the
        first is loaded from the font file ``/usr/share/fonts/SIMSUN.TTC``,
        the second selects the "Times New Roman" family. Both use size 12.
    """
    point_size = 12
    chinese = FontProperties(fname="/usr/share/fonts/SIMSUN.TTC", size=point_size)
    english = FontProperties(family="Times New Roman", size=point_size)
    return chinese, english


In [None]:
# Marker symbols; the plotting functions pick one per workload, wrapping
# around with a modulo when there are more workloads than markers.
markers = [
    "o", "D", "s", "^", "v", "<", ">",
    "p", "*", "h", "+", "x", "|", "_",
]

# Line styles, cycled through the same way as the markers.
linestyles = ["--", "-", "-.", ":"]


## 建索时间随维度变化折线图（单位：秒）

In [None]:
def plot_index_time_vs_dim(csv_path, output_dir):
    """Plot average index-build time (seconds) against dimension for one CSV.

    One line is drawn per distinct ``workload_type`` value. The figure is
    saved as ``<output_dir>/index_time_vs_dim-<suffix>.pdf``, where
    ``<suffix>`` is the part of the CSV file name between ``results-`` and
    ``.csv`` (``unknown`` if the name does not match that pattern).

    Args:
        csv_path: Path of a CSV file containing the columns
            ``workload_type``, ``dimension`` and ``avg_index_time(ms)``.
        output_dir: Directory for the PDF; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Take an explicit copy: adding columns to a column-subset of a DataFrame
    # can otherwise trigger pandas' SettingWithCopyWarning.
    df = pd.read_csv(csv_path)[
        ["workload_type", "dimension", "avg_index_time(ms)"]
    ].copy()
    df["dimension"] = df["dimension"].astype(int)
    df["avg_index_time(s)"] = df["avg_index_time(ms)"] / 1000.0

    zh_font, en_font = set_fonts()
    fig = plt.figure(figsize=(4, 3))
    try:
        for i, (workload_type, group) in enumerate(df.groupby("workload_type")):
            group = group.sort_values("dimension")
            plt.plot(
                group["dimension"],
                group["avg_index_time(s)"],
                marker=markers[i % len(markers)],
                linestyle=linestyles[i % len(linestyles)],
                # str() keeps non-string workload labels from breaking .upper().
                label=str(workload_type).upper(),
                linewidth=1,
                markersize=3,
            )

        plt.xticks(fontproperties=en_font)
        plt.yticks(fontproperties=en_font)
        plt.xlabel("数据规模/向量维度", fontproperties=zh_font)
        plt.ylabel("建索引时间 (s)", fontproperties=zh_font)
        plt.xscale("log", base=2)
        plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
        plt.legend(frameon=True, prop=en_font, loc="upper left")

        # Leave 5% headroom above the tallest point. An empty or all-NaN
        # column yields NaN here, which matplotlib rejects as an axis limit,
        # so fall back to pinning only the lower bound.
        y_upper_limit = df["avg_index_time(s)"].max() * 1.05
        if pd.notna(y_upper_limit) and y_upper_limit > 0:
            plt.ylim(0, y_upper_limit)
        else:
            plt.ylim(bottom=0)

        plt.grid(axis="y", linestyle="--", linewidth=0.5, color="gray", alpha=0.6)
        plt.tight_layout()

        m = re.match(r"results-(.+)\.csv", os.path.basename(csv_path))
        suffix = m.group(1) if m else "unknown"
        fig_name = os.path.join(output_dir, f"index_time_vs_dim-{suffix}.pdf")
        plt.savefig(fig_name)
        print(f"Plot saved as {fig_name}")
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)


## 检索时间随维度变化折线图（单位：毫秒）

In [None]:
def plot_search_time_vs_dim(csv_path, output_dir):
    """Plot scaled average search time against dimension for one CSV.

    One line is drawn per distinct ``workload_type`` value; the plotted value
    is ``avg_search_time(ms) / 100``. The figure is saved as
    ``<output_dir>/search_time_vs_dim-<suffix>.pdf``, where ``<suffix>`` is
    the part of the CSV file name between ``results-`` and ``.csv``
    (``unknown`` if the name does not match that pattern).

    Args:
        csv_path: Path of a CSV file containing the columns
            ``workload_type``, ``dimension`` and ``avg_search_time(ms)``.
        output_dir: Directory for the PDF; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Take an explicit copy: adding columns to a column-subset of a DataFrame
    # can otherwise trigger pandas' SettingWithCopyWarning.
    df = pd.read_csv(csv_path)[
        ["workload_type", "dimension", "avg_search_time(ms)"]
    ].copy()
    df["dimension"] = df["dimension"].astype(int)
    # NOTE(review): the source column and the y-axis label are both in ms,
    # yet the values are divided by 100 — presumably the CSV value covers a
    # batch of 100 queries; confirm against the benchmark that writes it.
    df["avg_avg_search_time(ms)"] = df["avg_search_time(ms)"] / 100.0

    zh_font, en_font = set_fonts()
    fig = plt.figure(figsize=(4, 3))
    try:
        for i, (workload_type, group) in enumerate(df.groupby("workload_type")):
            group = group.sort_values("dimension")
            plt.plot(
                group["dimension"],
                group["avg_avg_search_time(ms)"],
                marker=markers[i % len(markers)],
                linestyle=linestyles[i % len(linestyles)],
                # str() keeps non-string workload labels from breaking .upper().
                label=str(workload_type).upper(),
                linewidth=1,
                markersize=3,
            )

        # Use the English font for tick labels, matching the index-time plot.
        plt.xticks(fontproperties=en_font)
        plt.yticks(fontproperties=en_font)
        plt.xlabel("数据规模/向量维度", fontproperties=zh_font)
        plt.ylabel("平均检索时间 (ms)", fontproperties=zh_font)
        plt.xscale("log", base=2)
        plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))

        # Leave 5% headroom above the tallest point. An empty or all-NaN
        # column yields NaN here, which matplotlib rejects as an axis limit,
        # so fall back to pinning only the lower bound.
        y_upper_limit = df["avg_avg_search_time(ms)"].max() * 1.05
        if pd.notna(y_upper_limit) and y_upper_limit > 0:
            plt.ylim(0, y_upper_limit)
        else:
            plt.ylim(bottom=0)

        plt.legend(frameon=True, prop=en_font, loc="upper left")
        plt.grid(axis="y", linestyle="--", linewidth=0.5, color="gray", alpha=0.6)
        plt.tight_layout()

        m = re.match(r"results-(.+)\.csv", os.path.basename(csv_path))
        suffix = m.group(1) if m else "unknown"
        fig_name = os.path.join(output_dir, f"search_time_vs_dim-{suffix}.pdf")
        plt.savefig(fig_name)
        print(f"Plot saved as {fig_name}")
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)


## 查找并批量绘制所有 results-*.csv 文件

In [None]:
# Find every results file in the current working directory and render both
# figures (index-build time and search time) for each one.
csv_files = glob.glob("results-*.csv")
if csv_files:
    for csv_path in csv_files:
        plot_index_time_vs_dim(csv_path, output_dir)
        plot_search_time_vs_dim(csv_path, output_dir)
else:
    print("No results-*.csv file found.")
