In [None]:
import subprocess
import time
from datetime import timedelta
import gzip
import pandas as pd
import numpy as np
import json
import requests
from pathlib import Path
import io
from zipfile import ZipFile

In [None]:
# --- Vespa container settings -------------------------------------------
vespa_name = "benchmark_vespa"   # name given to the Docker container
vespa_host = "localhost"
vespa_port = 8090                # host port published for container port 8080
vespa_management_port = 19081    # host port published for container port 19071
vespa_version = "8.319.9"        # tag of the vespaengine/vespa image

# --- Dataset locations and sizing ---------------------------------------
content_path = Path("dataset") / "passages-c400-jawiki-20230403"
# Directory holding the "<offset>.npz" embedding archives.
embedding_path = content_path / "multilingual-e5-base-passage"
num_of_docs = 5_555_583   # upper bound used when wrapping the query-embedding offset
index_size = 5_000_000    # documents to feed; also the offset of the query embeddings
bulk_size = 10_000        # documents per feed batch

# Name used for both the schema and its document type.
index_name = "contents"

In [None]:
def run_vespa():
    """Launch the Vespa container in detached mode and print the outcome.

    Runs ``docker run -d`` for the ``vespaengine/vespa:{vespa_version}`` image,
    naming the container ``vespa_name`` and publishing container ports 8080 and
    19071 on ``vespa_port`` and ``vespa_management_port``.  Prints ``[OK]``
    when docker exits with 0; otherwise prints ``[FAIL]`` followed by the
    captured stdout and stderr.  Returns nothing.
    """
    print(f"Starting {vespa_name}... ", end="")

    command = ["docker", "run", "-d", "--name", vespa_name]
    for port_mapping in (f"{vespa_port}:8080", f"{vespa_management_port}:19071"):
        command += ["-p", port_mapping]
    # Optional volume mount, currently disabled: "-v", "./data:/opt/vespa/var"
    command.append(f"vespaengine/vespa:{vespa_version}")

    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode == 0:
        print("[OK]")
        return

    print("[FAIL]")
    for label, captured in (("STDOUT:", completed.stdout), ("STDERR:", completed.stderr)):
        print(label)
        print(captured)

In [None]:
def stop_vespa():
    """Stop the Vespa container and print the outcome.

    Invokes ``docker stop`` on the container named ``vespa_name``.  Prints
    ``[OK]`` when docker exits with 0; otherwise prints ``[FAIL]`` followed by
    the captured stdout and stderr.  Returns nothing.
    """
    print(f"Stopping {vespa_name}... ", end="")
    completed = subprocess.run(
        ["docker", "stop", vespa_name],
        capture_output=True,
        text=True,
    )
    if completed.returncode != 0:
        print("[FAIL]")
        # One item per line, matching four consecutive print() calls.
        print("STDOUT:", completed.stdout, "STDERR:", completed.stderr, sep="\n")
        return
    print("[OK]")

In [None]:
def prune_docker():
    """Run ``docker system prune -f`` and print the outcome.

    Prints ``[OK]`` when docker exits with 0; otherwise prints ``[FAIL]``
    followed by the captured stdout and stderr.  Returns nothing.

    NOTE(review): the prune is not scoped to the benchmark container --
    confirm that cleaning up other unused Docker resources on this host is
    acceptable.
    """
    print("Cleaning up... ", end="")
    outcome = subprocess.run(
        "docker system prune -f".split(),
        capture_output=True,
        text=True,
    )
    succeeded = outcome.returncode == 0
    print("[OK]" if succeeded else "[FAIL]")
    if not succeeded:
        print("STDOUT:")
        print(outcome.stdout)
        print("STDERR:")
        print(outcome.stderr)

In [None]:
# services.xml of the application package deployed by create_index().
# It declares one container cluster (search + document-api enabled) and one
# content cluster ("wikipedia") that stores documents of type `index_name`
# in index mode, both on the single host alias "node1".
# NOTE(review): redundancy is set to 2 although only one content node is
# listed -- confirm this is intended.
service_xml_str = f"""<?xml version='1.0' encoding='UTF-8'?>
<services version="1.0" xmlns:deploy="vespa" xmlns:preprocess="properties">
  <container id='default' version='1.0'>
    <search></search>
    <document-api></document-api>
    <nodes>
      <node hostalias='node1'></node>
    </nodes>
  </container>
  <content id='wikipedia' version='1.0'>
    <redundancy>2</redundancy>
    <documents>
      <document type="{index_name}" mode="index"/>
    </documents>
    <nodes>
      <node hostalias="node1" distribution-key="0" />
    </nodes>
  </content>
</services>
"""

# Schema definition written to schemas/<index_name>.sd by create_index().
# This is a plain (non-f) string because the schema syntax itself is full of
# braces; the literal "{index_name}" placeholders are filled in by the
# trailing str.replace() call.
# Fields: page_id / rev_id / section as attributes, title / text as BM25-enabled
# index fields, and `embedding`, a 768-dimensional float tensor with an HNSW
# index and angular distance.
# Rank profiles: `default` (nativeRank over title and text) and `closeness`,
# which scores by closeness to the `embedding` field.
sd_str = """
schema {index_name} {
    document {index_name} {
        field page_id type int {
            indexing: attribute | summary
        }
        field rev_id type int {
            indexing: attribute | summary
        }
        field title type string {
            indexing: index | summary
            index: enable-bm25
        }
        field section type string {
            indexing: attribute | summary
            attribute: fast-search
        }
        field text type string {
            indexing: index | summary
            index: enable-bm25
        }
        field embedding type tensor<float>(x[768]) {
            indexing: attribute | index
            attribute {
                distance-metric: angular
            }
            index {
                hnsw {
                    max-links-per-node: 16
                    neighbors-to-explore-at-insert: 100
                }
            }
        }
    }

    fieldset default {
        fields: title,text
    }

    rank-profile default {
        first-phase {
            expression: nativeRank(title, text)
        }
    }

    rank-profile closeness {
        num-threads-per-search: 1
        match-features: distance(field, embedding)

        inputs {
            query(q)  tensor<float>(x[768])
            query(qa) tensor<float>(x[768])
        }

        first-phase {
            expression: closeness(field, embedding)
        }
    }
}
""".replace("{index_name}", index_name)

def create_index():
    """Deploy the application package (services.xml + schema) to Vespa.

    Builds an in-memory zip containing ``services.xml`` and
    ``schemas/{index_name}.sd``, then POSTs it to the ``prepareandactivate``
    endpoint on ``vespa_management_port``.  Prints ``[OK]`` on HTTP 200,
    otherwise ``[FAIL]`` followed by the response body.  Returns nothing.
    """
    package = io.BytesIO()
    members = (
        ("services.xml", service_xml_str),
        (f"schemas/{index_name}.sd", sd_str),
    )
    with ZipFile(package, "w") as archive:
        for member_name, member_body in members:
            archive.writestr(member_name, member_body)
    package.seek(0)  # rewind so the upload starts at the first byte

    print(f"Creating {index_name}... ", end="")
    deploy_url = (
        f"http://{vespa_host}:{vespa_management_port}"
        "/application/v2/tenant/default/prepareandactivate"
    )
    response = requests.post(
        deploy_url,
        headers={"Content-Type": "application/zip"},
        data=package,
    )
    print("[OK]" if response.status_code == 200 else f"[FAIL]\n{response.text}")


In [None]:
def delete_index():
    """Send a DELETE for the ``default`` tenant to the Vespa management API.

    Prints ``[OK]`` on HTTP 200, otherwise ``[FAIL]`` followed by the response
    body.  Returns nothing.

    The request targets ``vespa_host`` on ``vespa_management_port``, the same
    endpoint ``create_index`` deploys to.  The previous URL used the
    management port number as the hostname and the query port as the port, so
    the request could never reach the management API.

    NOTE(review): the path addresses the tenant rather than a specific
    application -- confirm this is the intended deletion scope.
    """
    print(f"Deleting {index_name}... ", end="")
    response = requests.delete(
        f"http://{vespa_host}:{vespa_management_port}/application/v2/tenant/default"
    )
    if response.status_code == 200:
        print("[OK]")
    else:
        print(f"[FAIL]\n{response.text}")


In [None]:
def print_indices():
    """Print the number of documents of type ``index_name`` reported by Vespa.

    Issues a YQL query matching every document whose ``sddocname`` is
    ``index_name`` and prints ``count: <totalCount>``.  On a non-200 response
    it prints ``[FAIL][<status>] <body>`` instead of trying to parse the body.
    If the response carries no ``root.fields.totalCount`` the count is printed
    as ``None`` rather than raising.  Returns nothing.
    """
    response = requests.get(f"http://{vespa_host}:{vespa_port}/search/",
                            headers={"Content-Type": "application/json"},
                            params={
                                # Use the configured name so this stays in sync with the schema.
                                "yql": f"select * from sources * where sddocname contains '{index_name}';"
                            })
    if response.status_code != 200:
        print(f"[FAIL][{response.status_code}] {response.text}")
        return
    obj = json.loads(response.text)
    # Either level may be missing from the response; fall back to empty dicts.
    root = obj.get("root") or {}
    fields = root.get("fields") or {}
    print(f"count: {fields.get('totalCount')}")


In [None]:
def wait_for_index():
    """Block until a query against the ``index_name`` document type returns 200.

    Polls the search endpoint once per second, printing a dot per failed
    attempt and a final dot plus newline on success.  There is no retry limit.
    Returns nothing.

    Only ``requests`` errors (connection refused, timeouts, ...) are treated
    as "not ready yet".  A bare ``except`` would also swallow
    ``KeyboardInterrupt``, which makes this endless loop hard to interrupt.
    """
    query_params = {
        # Use the configured name so this stays in sync with the schema.
        "yql": f"select * from sources * where sddocname contains '{index_name}';"
    }
    while True:
        try:
            response = requests.get(f"http://{vespa_host}:{vespa_port}/search/",
                                    headers={"Content-Type": "application/json"},
                                    params=query_params)
            if response.status_code == 200:
                break
        except requests.exceptions.RequestException:
            # Endpoint not reachable yet; fall through to the retry delay.
            pass
        print(".", end="")
        time.sleep(1)
    print(".")


In [None]:
def wait_for_vespa(retry_count=60):
    """Poll the health endpoint until Vespa reports status code ``"up"``.

    Args:
        retry_count: maximum number of attempts, made one second apart.

    Prints a dot per unsuccessful attempt, then ``[OK]`` as soon as
    ``/state/v1/health`` on ``vespa_management_port`` answers 200 with
    ``status.code == "up"``, or ``[FAIL]`` once all attempts are used up.
    Returns nothing.

    Only ``requests`` errors and unparsable response bodies count as "not
    ready yet".  A bare ``except`` would also swallow ``KeyboardInterrupt``.
    """
    print(f"Waiting for {vespa_name}", end="")
    for _ in range(retry_count):
        try:
            response = requests.get(f"http://{vespa_host}:{vespa_management_port}/state/v1/health")
            if response.status_code == 200:
                # "status" may be absent; treat that as not up yet.
                status = json.loads(response.text).get("status") or {}
                if status.get("code") == "up":
                    print("[OK]")
                    return
        except (requests.exceptions.RequestException, ValueError):
            # Connection not possible yet, or the body was not valid JSON.
            pass
        print(".", end="")
        time.sleep(1)
    print("[FAIL]")


In [None]:
def get_embedding(embedding_index, embedding_data, id):
    """Return the embedding for document ``id``, reusing the loaded chunk if possible.

    Embeddings are read from ``{embedding_path}/<start>.npz`` (array key
    ``"embs"``), where ``<start>`` is ``id`` rounded down to a multiple of
    100000.  The caller passes in the chunk it currently holds; the file is
    only opened when that chunk is missing or belongs to another range.

    Args:
        embedding_index: start offset of the chunk held in ``embedding_data``.
        embedding_data: currently loaded chunk, or ``None`` if nothing is loaded.
        id: document id whose embedding is wanted.

    Returns:
        ``(chunk_start, chunk, embedding)`` -- the first two are meant to be
        passed back in on the next call.
    """
    chunk_start = 100000 * int(id / 100000)
    cache_hit = embedding_data is not None and embedding_index == chunk_start
    if not cache_hit:
        with np.load(embedding_path / f"{chunk_start}.npz") as archive:
            embedding_data = archive["embs"]
    row_in_chunk = id - chunk_start
    return chunk_start, embedding_data, embedding_data[row_in_chunk]


def insert_data(bulk_size, max_size):
    """Feed up to ``max_size`` embedding-only documents into Vespa in batches.

    Rows are read from the ``*.parquet`` files under ``content_path`` in
    sorted order.  For each row the embedding is looked up by ``row.id`` and a
    ``put`` operation with id ``id:{index_name}:{index_name}::<n>`` is queued
    (``n`` counts from 1).  Every ``bulk_size`` operations are written to
    ``vespa_docs.jsonl`` in the working directory and sent with the
    ``vespa feed`` command-line tool.

    Args:
        bulk_size: number of documents per feed batch.
        max_size: total number of documents to feed.

    Prints one status line per batch and a final
    ``Execution Time: HH:MM:SS.ss <total feed seconds>`` line.  A failed batch
    is reported and contributes 0 to the feed time; it is not retried.
    Returns nothing.
    """
    start_time = time.time()

    def send_data(batch, pos):
        """Write ``batch`` to vespa_docs.jsonl, feed it, and return the feed time in seconds."""
        print(f"Sending {int(len(batch))} docs ({pos}/{max_size})... ", end="")
        with open("vespa_docs.jsonl", "wt") as f:
            for doc in batch:
                f.write(doc)
                f.write("\n")
        # use vespa command
        vespa_cmd = [
            "vespa", "feed", "vespa_docs.jsonl",
            "--target", f"http://{vespa_host}:{vespa_port}"
        ]
        now = time.time()
        result = subprocess.run(vespa_cmd, capture_output=True, text=True)
        if result.returncode == 0:
            t = time.time() - now
            print(f"[OK] {t}")
            return t
        print(f"[FAIL] 0 {result.returncode} STDOUT:{result.stdout} STDERR:{result.stderr}")
        return 0

    total_time = 0
    count = 0
    embedding_index = -1
    embedding_data = None
    docs = []
    for content_file in sorted(content_path.glob("*.parquet")):
        if count >= max_size:
            # Quota reached: do not load the remaining parquet files.
            break
        df = pd.read_parquet(content_file)
        # itertuples avoids building a Series per row, which iterrows does.
        for row in df.itertuples(index=False):
            if count >= max_size:
                break
            embedding_index, embedding_data, embedding = get_embedding(embedding_index, embedding_data, row.id)
            count += 1
            docs.append(json.dumps({
                "put": f"id:{index_name}:{index_name}::{count}",
                "fields": {
                    # "page_id": row.pageid,
                    # "rev_id": row.revid,
                    # "title": row.title,
                    # "section": row.section,
                    # "text": row.text,
                    "embedding": embedding.tolist(),
                }
            }))
            if len(docs) >= bulk_size:
                total_time += send_data(docs, count)
                docs = []

    # Flush the final, partially filled batch.
    if len(docs) > 0:
        total_time += send_data(docs, count)

    execution_time = time.time() - start_time
    hours, remainder = divmod(execution_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    # Width 5 ("SS.ss") is needed for zero-padding; width 2 never pads.
    print(f"Execution Time: {int(hours):02d}:{int(minutes):02d}:{seconds:05.2f} {total_time}")


In [None]:
def search(query):
    """POST ``query`` to the Vespa search endpoint and summarise the response.

    Args:
        query: JSON-serialisable dict sent as the request body.

    Returns:
        ``(took_ms, documents, ids, scores)`` on HTTP 200, where ``took_ms``
        is the client-side round-trip time in milliseconds, ``documents`` is
        ``root.coverage.documents`` (0 if absent) and ``ids`` / ``scores`` are
        the ``id`` and ``relevance`` of each hit.  A response without
        ``root.children`` yields empty lists instead of raising.  On any other
        status the failure is printed and ``(-1, -1, [], [])`` is returned.
    """
    now = time.time()
    response = requests.post(f"http://{vespa_host}:{vespa_port}/search/",
                             headers={"Content-Type": "application/json"},
                             data=json.dumps(query))
    # Stop the clock before parsing so only the request itself is timed.
    took = time.time() - now

    if response.status_code == 200:
        root = json.loads(response.text).get("root") or {}
        # "children" is missing when the query matched nothing.
        hits = root.get("children") or []
        product_ids = [hit.get("id") for hit in hits]
        scores = [hit.get("relevance") for hit in hits]
        coverage = root.get("coverage") or {}
        return took * 1000, int(coverage.get("documents", 0)), product_ids, scores
    print(f"[FAIL][{response.status_code}] {response.text}")
    return -1, -1, [], []


In [None]:
def search_with_knn_queries(output_path, max_size=10000, page_size=100, offset=0,
                            max_consecutive_failures=100):
    """Run approximate nearest-neighbour queries and log one result per line.

    Query vectors are read from ``{embedding_path}/<pos>.npz`` (array key
    ``"embs"``), starting at ``pos = offset`` and advancing by 100000 per file,
    wrapping to 0 once ``pos`` exceeds ``num_of_docs``.  Each successful query
    is written to ``output_path`` (gzip-compressed JSON lines) with keys
    ``id``, ``took``, ``total_hits``, ``ids`` and ``scores``.  Failed queries
    are skipped and do not count towards ``max_size``.

    Args:
        output_path: destination ``.jsonl.gz`` file; an existing file is
            overwritten and the parent directory is created if missing.
        max_size: number of successful queries to record.
        page_size: used both as ``hits`` and as ``targetHits``.
        offset: start offset of the first embedding file to load; it must
            correspond to an existing ``<offset>.npz`` file.
        max_consecutive_failures: stop after this many failed queries in a
            row.  Without this bound a permanently failing endpoint would keep
            the loop running forever, because failures never advance the
            query count.

    Prints progress every 10000 recorded queries and a final
    ``Execution Time: HH:MM:SS.ss`` line.  Returns nothing.
    """
    print("Sending knn queries...")
    start_time = time.time()

    # gzip.open does not create directories; make sure the target exists.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the schema in this notebook defines no `word` field; only
    # the hit id and relevance are consumed -- confirm the select list.
    yql = ("select word from " + index_name
           + " where {approximate:true,targetHits:" + str(page_size)
           + "}nearestNeighbor(embedding,q)")

    pos = offset
    count = 0
    consecutive_failures = 0
    running = True
    with gzip.open(output_path, "wt", encoding="utf-8") as f:
        while running:
            with np.load(embedding_path / f"{pos}.npz") as data:
                embedding_data = data["embs"]
            for embedding in embedding_data:
                if count >= max_size:
                    running = False
                    break
                query = {
                    "hits": page_size,
                    "yql": yql,
                    "ranking": "closeness",
                    "input.query(q)": embedding.tolist(),
                }
                took, total_hits, ids, scores = search(query=query)
                # print(f"{took}, {total_hits}, {ids}, {scores}")
                if took == -1:
                    consecutive_failures += 1
                    if consecutive_failures >= max_consecutive_failures:
                        print(f"[FAIL] Aborting after {consecutive_failures} consecutive failed queries.")
                        running = False
                        break
                    continue
                consecutive_failures = 0
                result = {
                    "id": (count + 1),
                    "took": took,
                    "total_hits": total_hits,
                    "ids": ids,
                    "scores": scores,
                }
                f.write(json.dumps(result, ensure_ascii=False))
                f.write("\n")
                count += 1
                if count % 10000 == 0:
                    print(f"Sent {count}/{max_size} queries.")

            # Move on to the next embedding file, wrapping to the first one.
            pos += 100000
            if pos > num_of_docs:
                pos = 0

    execution_time = time.time() - start_time
    hours, remainder = divmod(execution_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    # Width 5 ("SS.ss") is needed for zero-padding; width 2 never pads.
    print(f"Execution Time: {int(hours):02d}:{int(minutes):02d}:{seconds:05.2f}")


In [None]:
def get_output_filename(vespa_version, name):
    """Build the results path ``output/vespa<version>_<name>.jsonl.gz``.

    Dots in ``vespa_version`` are replaced by underscores, e.g.
    ``("8.319.9", "knn_10")`` gives ``output/vespa8_319_9_knn_10.jsonl.gz``.
    """
    version_tag = "_".join(vespa_version.split("."))
    return f"output/vespa{version_tag}_{name}.jsonl.gz"


In [None]:
def print_took_and_total_hits(filename, min_hits=0):
    """Print summary statistics of ``took`` and ``total_hits`` as a Markdown table.

    Args:
        filename: gzip-compressed JSON-lines file with one query result per
            line, each carrying ``took`` and ``total_hits``.
        min_hits: results whose ``total_hits`` is below this value are left
            out of the statistics.
    """
    with gzip.open(filename, "rt", encoding="utf-8") as f:
        records = [json.loads(line) for line in f]

    selected = [record for record in records if record.get("total_hits") >= min_hits]
    stats_input = pd.DataFrame({
        "took": [record.get("took") for record in selected],
        "total_hits": [record.get("total_hits") for record in selected],
    })
    print(stats_input.describe().to_markdown())

In [None]:
# Clean up unused Docker resources, start a new Vespa container, and wait
# until its health endpoint reports "up" before deploying.
prune_docker()
print(f"<<<Vespa {vespa_version}>>>")
run_vespa()
wait_for_vespa()

In [None]:
# Deploy the application package, block until the query endpoint answers,
# then print the current document count.
create_index()
wait_for_index()
print_indices()

In [None]:
# Feed index_size documents in batches of bulk_size, then print the document
# count again.
insert_data(bulk_size=bulk_size, max_size=index_size)
print_indices()

In [None]:
# Benchmark kNN search for several result sizes.
# For each page size a 1000-query warm-up pass runs first. It writes to the
# same file as the measured pass, which reopens that file in write mode, so
# only the measured results remain. The measured pass reads its query vectors
# starting at offset index_size.
for page_size in [10, 100, 400]:
    print(f"page size: {page_size}")
    filename = get_output_filename(vespa_version, f"knn_{page_size}")
    search_with_knn_queries(filename, page_size=page_size, max_size=1000) # warmup
    search_with_knn_queries(filename, page_size=page_size, offset=index_size)
    print_took_and_total_hits(filename)

In [None]:
# Teardown: stop the container. The delete_index() call is left disabled.
# delete_index()
stop_vespa()