Skip to content

Commit

Permalink
Merge pull request #510 from kemingy/pgvectors
Browse files Browse the repository at this point in the history
add benchmark for pgvecto.rs
  • Loading branch information
erikbern committed Apr 15, 2024
2 parents a393581 + c86ce6d commit 75043ab
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/workflows/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ jobs:
- panng_ngt
- pg_embedding
- pgvector
- pgvecto_rs
- pynndescent
- redisearch
- qdrant
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ Evaluated
* [Milvus](https://github.com/milvus-io/milvus) ![https://img.shields.io/github/stars/milvus-io/milvus?style=social](https://img.shields.io/github/stars/milvus-io/milvus?style=social): [Knowhere](https://github.com/milvus-io/knowhere)
* [Zilliz(Glass)](https://github.com/hhy3/pyglass)
* [pgvector](https://github.com/pgvector/pgvector) ![https://img.shields.io/github/stars/pgvector/pgvector?style=social](https://img.shields.io/github/stars/pgvector/pgvector?style=social)
* [pgvecto.rs](https://github.com/tensorchord/pgvecto.rs) ![https://img.shields.io/github/stars/tensorchord/pgvecto.rs?style=social](https://img.shields.io/github/stars/tensorchord/pgvecto.rs?style=social)
* [RediSearch](https://github.com/redisearch/redisearch) ![https://img.shields.io/github/stars/redisearch/redisearch?style=social](https://img.shields.io/github/stars/redisearch/redisearch?style=social)
* [pg_embedding](https://github.com/neondatabase/pg_embedding) ![https://img.shields.io/github/stars/neondatabase/pg_embedding?style=social](https://img.shields.io/github/stars/neondatabase/pg_embedding?style=social)
* [Descartes(01AI)](https://github.com/xiaoming-01ai/descartes)
Expand Down
26 changes: 26 additions & 0 deletions ann_benchmarks/algorithms/pgvecto_rs/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Benchmark image for pgvecto.rs: starts from the upstream pgvecto-rs image
# (the tag suggests PostgreSQL 16 with pgvecto.rs v0.3.0-alpha.1) and adds the
# Python benchmark runner on top.
FROM tensorchord/pgvecto-rs:pg16-v0.3.0-alpha.1

# https://github.com/tensorchord/pgvecto.rs

# Install pip so the Python dependencies below can be installed.
RUN apt-get update \
&& apt-get install -y python3-pip

WORKDIR /home/app
COPY requirements.txt .

# --break-system-packages: install into the system interpreter even though the
# distribution marks it as externally managed (no virtualenv is used here).
RUN python3 -m pip install --break-system-packages -r requirements.txt
# PostgreSQL driver imported by the pgvecto_rs benchmark module.
RUN python3 -m pip install --break-system-packages psycopg[binary]

COPY run_algorithm.py .

# The benchmark module connects as user "postgres" with password "password".
# NOTE(review): presumably these variables are consumed by the base image's
# tooling; the entrypoint below calls initdb directly -- confirm they take effect.
ENV POSTGRES_PASSWORD=password
ENV POSTGRES_USER=postgres

# Generate entrypoint.sh, which:
#   1. initialises a database cluster as the postgres OS user,
#   2. starts the server in the background with vectors.so preloaded,
#   3. waits a fixed 5 seconds,
#   4. runs run_algorithm.py, forwarding the container arguments ("$@").
# NOTE(review): the fixed sleep assumes the server accepts connections within
# 5 seconds; a readiness check would be more robust -- confirm on slow hosts.
RUN printf '#!/bin/bash\n\
runuser -u postgres -- initdb \n\
runuser -u postgres -- postgres -c shared_preload_libraries=vectors.so &\n\
sleep 5\n\
python3 -u run_algorithm.py "$@"' > entrypoint.sh \
&& chmod u+x entrypoint.sh

ENTRYPOINT ["/home/app/entrypoint.sh"]
17 changes: 17 additions & 0 deletions ann_benchmarks/algorithms/pgvecto_rs/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Benchmark definition for the pgvecto_rs algorithm.
# NOTE(review): the nesting (float -> any -> list entry -> run_groups -> group)
# is implied by key order; indentation is not visible here -- confirm against the file.
float:
any:
# base_args: '@metric' is presumably replaced by the dataset's distance metric
# and passed as the constructor's first argument -- TODO confirm with the runner.
- base_args: ['@metric']
# Class in the module below that implements the algorithm.
constructor: PGVectoRS
disabled: false
docker_tag: ann-benchmarks-pgvecto_rs
module: ann_benchmarks.algorithms.pgvecto_rs
name: pgvecto_rs
run_groups:
# Each run group supplies the build parameters the constructor reads
# ("M" and "efConstruction") and a list of query-time values; the latter are
# presumably fed one at a time to set_query_arguments (ef_search) -- TODO confirm.
M-16:
arg_groups: [{M: 16, efConstruction: 200}]
args: {}
query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
M-24:
arg_groups: [{M: 24, efConstruction: 200}]
args: {}
query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
99 changes: 99 additions & 0 deletions ann_benchmarks/algorithms/pgvecto_rs/module.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import struct
import time

import numpy as np
import psycopg
from psycopg.adapt import Dumper, Loader
from psycopg.pq import Format
from psycopg.types import TypeInfo

from ..base.module import BaseANN


class VectorDumper(Dumper):
    """Binary psycopg dumper that serializes a sequence of numbers as a vector.

    The payload is a little-endian unsigned 16-bit element count followed by
    that many little-endian 32-bit floats.
    """

    format = Format.BINARY

    def dump(self, obj):
        """Return the packed binary payload for ``obj``.

        ``obj`` must support ``len()`` and iteration over numeric values.
        """
        dim = len(obj)
        layout = f"<H{dim}f"
        return struct.pack(layout, dim, *obj)


class VectorLoader(Loader):
    """Binary psycopg loader that decodes a vector payload into a NumPy array.

    The expected payload is a little-endian unsigned 16-bit element count
    followed by that many little-endian 32-bit floats.
    """

    # psycopg loaders default to the text format. This parser only understands
    # the binary layout, so declare it explicitly; otherwise it is registered
    # as a text loader and would mis-decode textual vector values.
    format = Format.BINARY

    def load(self, buf):
        """Decode ``buf`` (bytes or memoryview) into a 1-D float32 array."""
        if isinstance(buf, memoryview):
            # Take an owned copy of the data.
            # NOTE(review): presumably so the returned array does not alias a
            # driver-owned buffer -- confirm.
            buf = bytes(buf)
        (dim,) = struct.unpack_from("<H", buf)
        # Skip the 2-byte count header and view the next ``dim`` floats.
        return np.frombuffer(buf, dtype="<f", count=dim, offset=2)


def register_vector(conn: psycopg.Connection):
    """Fetch the ``vector`` type info from ``conn`` and register its adapters.

    Raises:
        ValueError: propagated from ``register_vector_type`` when the type
            lookup returns ``None``.
    """
    register_vector_type(conn, TypeInfo.fetch(conn=conn, name="vector"))


def register_vector_type(conn: psycopg.Connection, info: TypeInfo):
    """Register the vector type and its dumper/loader on ``conn``.

    Args:
        conn: connection whose adapter map is updated.
        info: type information for ``vector``; ``None`` means it was not found.

    Raises:
        ValueError: if ``info`` is ``None``.
    """
    if info is None:
        raise ValueError("vector type not found")

    info.register(conn)
    vector_oid = info.oid

    # Subclass per registration so the dumper carries this database's OID.
    class VectorBinaryDumper(VectorDumper):
        oid = vector_oid

    registry = conn.adapters
    # Both plain lists and NumPy arrays are sent as vectors.
    for python_type in (list, np.ndarray):
        registry.register_dumper(python_type, VectorBinaryDumper)
    registry.register_loader(vector_oid, VectorLoader)


class PGVectoRS(BaseANN):
    """Benchmark adapter that stores vectors in PostgreSQL via pgvecto.rs.

    Vectors are loaded into an ``items`` table, indexed with an HNSW index and
    queried with ``ORDER BY embedding <op> ... LIMIT k``.
    """

    # metric name -> (distance operator, index operator class)
    _METRIC_OPS = {
        "angular": ("<=>", "vector_cos_ops"),
        "euclidean": ("<->", "vector_l2_ops"),
    }
    # Index build polling: up to 3600 checks, 10 seconds apart.
    _INDEX_POLL_ATTEMPTS = 3600
    _INDEX_POLL_INTERVAL_S = 10

    def __init__(self, metric, method_param) -> None:
        """Prepare the SQL statements and open the database connection.

        Args:
            metric: ``"angular"`` or ``"euclidean"``.
            method_param: mapping providing ``"M"`` and ``"efConstruction"``.

        Raises:
            RuntimeError: if ``metric`` is not one of the supported names.
        """
        self.metric = metric
        self.m = method_param["M"]
        self.ef_construction = method_param["efConstruction"]
        self.ef_search = 100

        ops = self._METRIC_OPS.get(metric)
        if ops is None:
            raise RuntimeError(f"unknown metric {metric}")
        operator, opclass = ops

        self.query_sql = f"SELECT id FROM items ORDER BY embedding {operator} %s LIMIT %s"
        self.index_sql = (
            f"CREATE INDEX ON items USING vectors (embedding {opclass}) "
            "WITH (options = $$[indexing.hnsw]\n"
            f"m = {self.m}\nef_construction = {self.ef_construction}$$)"
        )

        self.connect = psycopg.connect(user="postgres", password="password", autocommit=True)
        self.connect.execute("SET search_path = \"$user\", public, vectors")
        self.connect.execute("CREATE EXTENSION IF NOT EXISTS vectors")
        register_vector(self.connect)

    def fit(self, X):
        """Load ``X`` into the ``items`` table and build the index.

        ``X.shape[1]`` is used as the vector dimension and ``X`` is iterated
        row by row; the row position becomes the ``id``.

        Raises:
            RuntimeError: if no index statistics row exists for ``items``.
        """
        dim = X.shape[1]

        # Context-managed so the cursor is closed even if loading fails.
        with self.connect.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS items")
            cur.execute(f"CREATE TABLE items (id int, embedding vector({dim}))")
            with cur.copy("COPY items (id, embedding) FROM STDIN WITH (FORMAT BINARY)") as copy:
                copy.set_types(["int4", "vector"])
                for i, emb in enumerate(X):
                    copy.write_row((i, emb))

            cur.execute(self.index_sql)
            print("waiting for indexing to finish...")
            for _ in range(self._INDEX_POLL_ATTEMPTS):
                cur.execute("SELECT idx_indexing FROM vectors.pg_vector_index_stat WHERE tablename='items'")
                row = cur.fetchone()
                if row is None:
                    # Fail with a clear message instead of a TypeError on None[0].
                    raise RuntimeError("no index statistics found for table 'items'")
                if not row[0]:
                    break
                time.sleep(self._INDEX_POLL_INTERVAL_S)
            else:
                # Keep going as before, but do not hide that the index may be incomplete.
                print("warning: indexing still in progress after polling limit; results may be degraded")

    def set_query_arguments(self, ef_search):
        """Record ``ef_search`` and apply it to the current session."""
        self.ef_search = ef_search
        self.connect.execute(f"SET vectors.hnsw_ef_search = {ef_search}")

    def query(self, vec, num):
        """Return the ids of the ``num`` rows ordered first by distance to ``vec``."""
        cur = self.connect.execute(self.query_sql, (vec, num), binary=True, prepare=True)
        # Each row is a 1-tuple; avoid shadowing the builtin ``id``.
        return [row[0] for row in cur.fetchall()]

    def __str__(self):
        return (
            f"PGVectoRS(metric={self.metric}, m={self.m}, "
            f"ef_construction={self.ef_construction}, ef_search={self.ef_search})"
        )

0 comments on commit 75043ab

Please sign in to comment.