## **Experiment 1: Scaling Behavior — Single-Domain vs Multi-Domain Retrieval Across Backends**

This section visualizes and analyzes retrieval performance for four vector-store backends (FAISS Single, FAISS Sharded, Chroma, and Spark) across three datastore sizes (10k, 30k, 50k).  
We compare:
- **Single-domain (Wiki) vs Multi-domain**
- **p50 and p90 retrieval latency**
- **Throughput (QPS) scaling**
- **Backend-wise performance differences**

The goal is to evaluate how well each backend scales as the number of chunks increases, and how domain heterogeneity affects retrieval speed.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ============================
# 1. Mount Drive
# ============================
from google.colab import drive
drive.mount('/content/drive')


# ============================
# 2. Imports
# ============================
import os
import json
import re
import pandas as pd
import matplotlib.pyplot as plt


# ============================
# 3. Set correct base directory
# ============================
BASE_DIR = "/content/drive/MyDrive/AMS 560 PROJECT VIZ/results/bench_summary"


# ============================
# 4. Helper to find bench summary files
# ============================
def find_bench_summaries(base_dir: str):
    """Recursively collect every bench-summary JSON file under *base_dir*.

    Walks the directory tree and returns the full paths of all files whose
    name ends with ``_bench_summary.json``, sorted lexicographically.
    """
    suffix = "_bench_summary.json"
    matches = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(base_dir)
        for filename in filenames
        if filename.endswith(suffix)
    ]
    matches.sort()
    return matches


# ============================
# 5. Parse filename (domain, backend, size)
# ============================
def parse_filename(path: str):
    """Extract domain, backend and datastore size from a bench-summary path.

    The basename is expected to look like
    ``<domain>_<backend...>_<N>k_bench_summary.json``, for example
    ``multidomain_faiss_sharded_30k_bench_summary.json``.  Any ``llm`` token
    sitting between the domain and the size is dropped from the backend name.

    Args:
        path: Path (or bare filename) of a bench-summary JSON file.

    Returns:
        dict with keys ``filename``, ``path``, ``domain``, ``backend``,
        ``size`` (number of chunks), ``size_k`` (thousands of chunks) and
        ``size_label`` (the raw size token, e.g. ``"30k"``).

    Raises:
        ValueError: if the filename contains no ``<N>k`` size token.
    """
    name = os.path.basename(path)
    stem = name.replace("_bench_summary.json", "")
    tokens = stem.split("_")

    domain = tokens[0]  # wiki or multidomain

    # Locate the first "<digits>k" token in a single pass.  Accepting any
    # size (not only 10k/30k/50k) lets new datastore sizes be benchmarked
    # without editing this parser.
    idx_size = next(
        (i for i, tok in enumerate(tokens) if re.fullmatch(r"[0-9]+k", tok)),
        None,
    )
    if idx_size is None:
        raise ValueError(
            f"Cannot find size token (e.g. 10k/30k/50k) in filename: {name}"
        )

    size_token = tokens[idx_size]
    size_k = int(size_token[:-1])
    size = size_k * 1000

    # backend = everything between domain and size (skip 'llm')
    backend = "_".join(t for t in tokens[1:idx_size] if t != "llm")

    return {
        "filename": name,
        "path": path,
        "domain": domain,
        "backend": backend,
        "size": size,
        "size_k": size_k,
        "size_label": size_token,
    }


# ============================
# 6. Load all JSON files
# ============================
paths = find_bench_summaries(BASE_DIR)
print("Found bench_summary files:", len(paths))
for p in paths:
    print("  ", p)

# Without any summary file the DataFrame below would have no columns and the
# sort would fail with an unhelpful KeyError; fail early with a clear message.
if not paths:
    raise FileNotFoundError(f"No *_bench_summary.json files found under: {BASE_DIR}")

# One row per summary file: filename-derived metadata (domain, backend, size)
# merged with the retrieval and LLM metrics read from the JSON body.
rows = []
for path in paths:
    meta = parse_filename(path)

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        js = json.load(f)

    # .get() leaves a metric as None when the summary does not report it.
    meta.update({
        "retrieval_avg_ms": js.get("avg_ms"),
        "retrieval_p50_ms": js.get("p50_ms"),
        "retrieval_p90_ms": js.get("p90_ms"),
        "qps": js.get("qps"),
        "llm_avg_ms": js.get("llm_avg_ms"),
        "llm_p50_ms": js.get("llm_p50_ms"),
        "llm_p95_ms": js.get("llm_p95_ms"),
        "llm_qps": js.get("llm_qps"),
    })

    rows.append(meta)

df = pd.DataFrame(rows).sort_values(["domain", "backend", "size"])
df.reset_index(drop=True, inplace=True)

# Bare expression: displays the table as the notebook cell's output.
df

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found bench_summary files: 24
   /content/drive/MyDrive/AMS 560 PROJECT VIZ/results/bench_summary/multidomain_chroma_10k_bench_summary.json
   /content/drive/MyDrive/AMS 560 PROJECT VIZ/results/bench_summary/multidomain_chroma_30k_bench_summary.json
   /content/drive/MyDrive/AMS 560 PROJECT VIZ/results/bench_summary/multidomain_chroma_50k_bench_summary.json
   /content/drive/MyDrive/AMS 560 PROJECT VIZ/results/bench_summary/multidomain_faiss_sharded_10k_bench_summary.json
   /content/drive/MyDrive/AMS 560 PROJECT VIZ/results/bench_summary/multidomain_faiss_sharded_30k_bench_summary.json
   /content/drive/MyDrive/AMS 560 PROJECT VIZ/results/bench_summary/multidomain_faiss_sharded_50k_bench_summary.json
   /content/drive/MyDrive/AMS 560 PROJECT VIZ/results/bench_summary/multidomain_faiss_single_10k_bench_summary.json
   /content/drive/MyDrive/AMS 560 PROJECT VI

In [None]:
# ============================
# Plotting utilities
# ============================

# Display names for the values of df["backend"]; used for legend entries and
# plot titles.  The plotting helpers fall back to the raw key when a backend
# is missing from this mapping.
BACKEND_LABELS = {
    "faiss_single": "FAISS (Single)",
    "faiss_sharded": "FAISS (Sharded)",
    "chroma": "Chroma",
    "spark": "Spark",
}

# Display names for the values of df["domain"]; used for plot titles and
# legend entries.
DOMAIN_LABELS = {
    "wiki": "Wiki (Single-domain)",
    "multidomain": "Multi-domain",
}


def plot_retrieval_latency(df, metric="retrieval_p50_ms"):
    """Plot a latency metric against datastore size, one figure per domain.

    For each of the ``wiki`` and ``multidomain`` domains a new figure is
    opened, one line per backend is drawn (x = ``size_k``, y = ``df[metric]``)
    and the figure is shown.

    Args:
        df: Benchmark table with ``domain``, ``backend``, ``size_k`` and
            *metric* columns.
        metric: Name of the latency column to plot.
    """
    for domain in ("wiki", "multidomain"):
        domain_rows = df[df["domain"] == domain]

        plt.figure()
        for backend, backend_rows in domain_rows.groupby("backend"):
            ordered = backend_rows.sort_values("size_k")
            legend_name = BACKEND_LABELS.get(backend, backend)
            plt.plot(ordered["size_k"], ordered[metric], marker="o", label=legend_name)

        figure_title = f"{DOMAIN_LABELS[domain]} – {metric}"
        y_label = metric + " (ms)"

        plt.title(figure_title)
        plt.xlabel("Datastore size (k chunks)")
        plt.ylabel(y_label)
        plt.grid(True, alpha=0.3)
        plt.legend()
        plt.show()


def plot_qps(df):
    """Plot retrieval throughput (QPS) against datastore size per domain.

    Opens one figure for ``wiki`` and one for ``multidomain``; each shows one
    line per backend with x = ``size_k`` and y = ``qps``.

    Args:
        df: Benchmark table with ``domain``, ``backend``, ``size_k`` and
            ``qps`` columns.
    """
    for domain in ("wiki", "multidomain"):
        domain_rows = df[df["domain"] == domain]

        plt.figure()
        for backend, backend_rows in domain_rows.groupby("backend"):
            ordered = backend_rows.sort_values("size_k")
            legend_name = BACKEND_LABELS.get(backend, backend)
            plt.plot(ordered["size_k"], ordered["qps"], marker="o", label=legend_name)

        figure_title = f"{DOMAIN_LABELS[domain]} – Retrieval QPS"

        plt.title(figure_title)
        plt.xlabel("Datastore size (k chunks)")
        plt.ylabel("QPS")
        plt.grid(True, alpha=0.3)
        plt.legend()
        plt.show()


def plot_domain_comparison(df, backend="faiss_single", metric="retrieval_p50_ms"):
    """Compare domains for a single backend on one figure.

    Draws one line per domain found in the rows of *backend*
    (x = ``size_k``, y = ``df[metric]``) and shows the figure.

    Args:
        df: Benchmark table with ``domain``, ``backend``, ``size_k`` and
            *metric* columns.
        backend: Value of the ``backend`` column to select.
        metric: Name of the column to plot on the y-axis.
    """
    backend_rows = df[df["backend"] == backend]

    plt.figure()
    for domain, domain_rows in backend_rows.groupby("domain"):
        ordered = domain_rows.sort_values("size_k")
        legend_name = DOMAIN_LABELS.get(domain, domain)
        plt.plot(ordered["size_k"], ordered[metric], marker="o", label=legend_name)

    backend_name = BACKEND_LABELS.get(backend, backend)
    figure_title = f"{backend_name} – {metric} (Wiki vs Multidomain)"
    y_label = metric + " (ms)"

    plt.title(figure_title)
    plt.xlabel("Datastore size (k chunks)")
    plt.ylabel(y_label)
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.show()

In [None]:
# Retrieval latency vs size: one set of figures for p50, then one for p90
for latency_metric in ("retrieval_p50_ms", "retrieval_p90_ms"):
    plot_retrieval_latency(df, metric=latency_metric)

# Throughput (QPS) vs size
plot_qps(df)

# Wiki vs multi-domain p50 latency, one figure per backend
for backend_name in ("faiss_single", "faiss_sharded", "chroma", "spark"):
    plot_domain_comparison(df, backend=backend_name, metric="retrieval_p50_ms")

In [None]:
# Combined Summary Visualization — Experiment 1 (Single-domain vs Multi-domain)

import matplotlib.pyplot as plt

# Fixed backend order so every panel draws its lines in the same sequence.
backends_order = ["faiss_single", "faiss_sharded", "chroma", "spark"]
backend_labels = {
    "faiss_single": "FAISS (Single)",
    "faiss_sharded": "FAISS (Sharded)",
    "chroma": "Chroma",
    "spark": "Spark",
}

# One colour per backend, shared by all panels that plot per-backend lines.
colors = {
    "faiss_single": "#2ca02c",
    "faiss_sharded": "#ff7f0e",
    "chroma": "#1f77b4",
    "spark": "#d62728",
}

fig, axs = plt.subplots(2, 2, figsize=(18, 12))
fig.suptitle(
    "Experiment 1 Summary — Single-domain vs Multi-domain Scaling",
    fontsize=18,
    fontweight="bold",
)


# =============================
# 1 & 2. p50 Latency (top row: Wiki, then Multidomain)
# =============================
# Each entry: (axes, domain value, panel title, draw a legend?).
# Only the Wiki panel gets a legend; the Multidomain panel uses the same
# backend colours and is left without one.
latency_panels = [
    (axs[0, 0], "wiki", "Wiki (Single-domain) — p50 Retrieval Latency", True),
    (axs[0, 1], "multidomain", "Multidomain — p50 Retrieval Latency", False),
]

for ax, domain, panel_title, with_legend in latency_panels:
    sub = df[df["domain"] == domain]

    for backend in backends_order:
        grp = sub[sub["backend"] == backend].sort_values("size_k")
        ax.plot(
            grp["size_k"],
            grp["retrieval_p50_ms"],
            marker="o",
            label=backend_labels[backend],
            color=colors[backend],
        )

    ax.set_title(panel_title)
    ax.set_xlabel("Datastore size (k chunks)")
    ax.set_ylabel("Latency (ms)")
    ax.grid(True, alpha=0.3)
    if with_legend:
        ax.legend()


# =============================
# 3. QPS (Wiki vs Multidomain — FAISS Single)
# =============================
ax = axs[1, 0]
faiss_single_rows = df[df["backend"] == "faiss_single"]

for domain, line_label in (("wiki", "Wiki"), ("multidomain", "Multidomain")):
    grp = faiss_single_rows[faiss_single_rows["domain"] == domain].sort_values("size_k")
    ax.plot(grp["size_k"], grp["qps"], marker="o", label=line_label)

ax.set_title("FAISS Single — QPS Comparison")
ax.set_xlabel("Datastore size (k chunks)")
ax.set_ylabel("QPS")
ax.grid(True, alpha=0.3)
ax.legend()


# =============================
# 4. Combined Domain Gap (p50 Latency)
# =============================
ax = axs[1, 1]

# Per domain: (domain value, marker, line style, legend suffix).
# Wiki is drawn solid with circles, Multidomain dashed with crosses; both
# share the backend's colour so the gap between them is easy to read.
domain_styles = (
    ("wiki", "o", "-", "Wiki"),
    ("multidomain", "x", "--", "Multi"),
)

for backend in backends_order:
    for domain, marker, linestyle, suffix in domain_styles:
        grp = df[(df["backend"] == backend) & (df["domain"] == domain)].sort_values("size_k")
        ax.plot(
            grp["size_k"],
            grp["retrieval_p50_ms"],
            marker=marker,
            linestyle=linestyle,
            color=colors[backend],
            label=f"{backend_labels[backend]} ({suffix})",
        )

ax.set_title("p50 Latency Gap — Wiki vs Multidomain")
ax.set_xlabel("Datastore size (k chunks)")
ax.set_ylabel("Latency (ms)")
ax.grid(True, alpha=0.3)
# Eight legend entries: anchor the legend outside the axes, to the right.
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left")


# Leave the top 3% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

## **Conclusion: Key Findings from Experiment 1**

- **FAISS (Single & Sharded) shows excellent scalability**  
  Retrieval latency stays low (~11–14 ms) across all dataset sizes and both domains.  
  Domain heterogeneity (multi-domain) only slightly increases latency.

- **Chroma also maintains stable performance**  
  Latency remains consistent (~12–14 ms) with small fluctuations.  
  Scaling from 10k → 50k does not degrade performance significantly.

- **Spark performs poorly for real-time retrieval**  
  Retrieval latency is extremely high (18–28 seconds).  
  Throughput is effectively zero, confirming Spark is not built for low-latency vector search.

- **Single-domain datasets are slightly faster than multi-domain**  
  Particularly visible in FAISS Single and Chroma, where multi-domain introduces small overhead.  
  This indicates embedding heterogeneity increases search complexity slightly.

- **QPS results reinforce backend differences**  
  FAISS and Chroma support high throughput (40–80 QPS), while Spark collapses to near-zero QPS.

- **Overall takeaway**  
  - FAISS is the **most robust and scalable** backend across domains and dataset sizes.  
  - Chroma performs well and consistently but slightly slower.  
  - Spark is unsuitable for interactive retrieval workloads.  
  - Multi-domain scaling is successful — latency remains stable and predictable even as data diversity grows.

## **Experiment 2: Large LLM without RAG vs Small LLM with RAG**

This experiment compares a **small 1.1B LLM with RAG** (TinyLlama + multidomain 50k datastore) against a **larger standalone LLM without RAG**.  
The goal is to see whether a small model, paired with a retrieval datastore, can match or outperform a much larger model on **latency** and **throughput**, while keeping retrieval cost low.

We reuse the **multidomain 50k FAISS (Single)** bench summary from Experiment 1 for the “Small + RAG” system, and load a separate bench summary generated by the `llm_no_RAG.py` script for the Large LLM baseline.  
The code below builds a comparison table and simple bar plots for:

- p50 / average end-to-end latency (ms)  
- QPS (queries per second)  
- (optional, not implemented in the code below) tokens/sec, if present in the big-LLM summary.

In [None]:
import json
import pandas as pd
import numpy as np

# Paths
# Small + RAG system: the multidomain / FAISS single / 50k bench summary.
SMALL_RAG_PATH = "/content/drive/MyDrive/AMS 560 PROJECT VIZ/results/bench_summary/multidomain_faiss_single_50k_bench_summary.json"
# Big-LLM baseline (no retrieval stage): its standalone summary file.
BIG_NO_RAG_PATH = "/content/drive/MyDrive/AMS 560 PROJECT VIZ/results/bench_summary/llm_only_summary.json"

def load_summary(path):
    """Read a summary JSON file and return its parsed content.

    The file is decoded explicitly as UTF-8, the encoding JSON text is
    specified to use, instead of the platform's locale default.

    Args:
        path: Location of the JSON file.

    Returns:
        The deserialized JSON value.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

small_js = load_summary(SMALL_RAG_PATH)
big_js = load_summary(BIG_NO_RAG_PATH)

print("Small + RAG summary:", small_js)
print("Big LLM (no RAG) summary:", big_js)

# QPS for the big model is derived from its per-query timings:
# number of queries divided by total time in seconds.
big_total_ms = sum(big_js["raw_times_ms"])
big_qps = big_js["n_queries"] / (big_total_ms / 1000.0)

# Small + RAG: prefer the end-to-end LLM metrics and fall back to the
# retrieval-only figures when an "llm_*" key is absent from the summary.
small_rag_row = {
    "system": "Small LLM (1.1B) + RAG (50k)",
    "type": "small+rag",
    "lat_p50_ms": small_js.get("llm_p50_ms", small_js.get("p50_ms")),
    "lat_avg_ms": small_js.get("llm_avg_ms", small_js.get("avg_ms")),
    "qps": small_js.get("llm_qps", small_js.get("qps")),
    "retrieval_p50_ms": small_js.get("p50_ms"),
    "retrieval_avg_ms": small_js.get("avg_ms"),
}

# Big LLM only: there is no retrieval stage, so those columns are NaN.
big_no_rag_row = {
    "system": "Big LLM (No RAG)",
    "type": "big_no_rag",
    "lat_p50_ms": big_js.get("p50_ms"),
    "lat_avg_ms": big_js.get("avg_ms"),
    "qps": big_qps,
    "retrieval_p50_ms": np.nan,
    "retrieval_avg_ms": np.nan,
}

rows = [small_rag_row, big_no_rag_row]

exp2_df = pd.DataFrame(rows)
# Bare expression: displays the table as the notebook cell's output.
exp2_df

In [None]:
plt.figure(figsize=(12, 4))

# One bar chart per metric, left to right:
# (column of exp2_df, panel title, y-axis label).
bar_panels = [
    ("lat_p50_ms", "End-to-end p50 Latency", "Latency (ms)"),
    ("lat_avg_ms", "End-to-end Avg Latency", "Latency (ms)"),
    ("qps", "Throughput (QPS)", "Queries per second"),
]

for position, (column, panel_title, y_label) in enumerate(bar_panels, start=1):
    plt.subplot(1, 3, position)
    plt.bar(exp2_df["system"], exp2_df[column])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xticks(rotation=20)  # tilt the system names so the tick labels are easier to read
    plt.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.show()

## **Experiment 2 Conclusion**

- The small 1.1B model with RAG (TinyLlama + 50k multidomain index) has end-to-end latency around **11.5–11.6 seconds per query**, mainly dominated by slow LLM decoding.
- The large standalone LLM without RAG answers in about **1.2–1.3 seconds per query**, achieving **~0.78 QPS**, roughly **9–10× faster** than the small + RAG system.
- Retrieval itself is cheap (≈12–13 ms), but in this setup the small model's decoding is so slow that its smaller size does not translate into lower latency or higher throughput.
- This suggests that, with the current models and infrastructure, a **well-optimized larger LLM without RAG can outperform a slow small-LLM+RAG stack on latency and throughput**, even though RAG may still help with grounding and knowledge freshness.