Skip to content

Commit

Permalink
Add IVF bench for PgVector extension
Browse files Browse the repository at this point in the history
Signed-off-by: Artem Barger <artem@bargr.net>
  • Loading branch information
C0rWin committed Apr 16, 2024
1 parent 75043ab commit 5d4a8d0
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 1 deletion.
11 changes: 10 additions & 1 deletion ann_benchmarks/algorithms/pgvector/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ float:
disabled: false
docker_tag: ann-benchmarks-pgvector
module: ann_benchmarks.algorithms.pgvector
name: pgvector
name: pgvector_hnsw
run_groups:
M-16:
arg_groups: [{M: 16, efConstruction: 200}]
Expand All @@ -15,3 +15,12 @@ float:
arg_groups: [{M: 24, efConstruction: 200}]
args: {}
query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
- base_args: ['@metric']
constructor: PGVectorIVF
disabled: false
docker_tag: ann-benchmarks-pgvector
module: ann_benchmarks.algorithms.pgvector
name: pgvector_ivf
run_groups:
base:
args: [[32, 64, 128, 256, 512, 1024]]
52 changes: 52 additions & 0 deletions ann_benchmarks/algorithms/pgvector/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,55 @@ def get_memory_usage(self):

def __str__(self):
    """Return a label listing this instance's m, ef_construction and ef_search values."""
    m, ef_construction, ef_search = self._m, self._ef_construction, self._ef_search
    return f"PGVector(m={m}, ef_construction={ef_construction}, ef_search={ef_search})"


class PGVectorIVF(BaseANN):
    """Benchmark wrapper that indexes vectors with pgvector's ``ivfflat`` index.

    The data set is loaded into an ``items`` table of a local PostgreSQL
    instance and an IVFFlat index with ``n_list`` lists is built on it.

    NOTE(review): nothing in this class issues ``SET ivfflat.probes``, so
    queries run with whatever probe count the session defaults to. Confirm
    that this is intended for the benchmark.
    """

    # metric name -> (distance operator used in ORDER BY, index operator class)
    _METRIC_SQL = {
        "angular": ("<=>", "vector_cosine_ops"),
        "euclidean": ("<->", "vector_l2_ops"),
    }

    def __init__(self, metric, n_list):
        """Store the index parameters and prepare the query text.

        Args:
            metric: ``"angular"`` or ``"euclidean"``.
            n_list: value passed as ``lists`` when the IVFFlat index is created.

        Raises:
            RuntimeError: if ``metric`` is not one of the supported names.
        """
        try:
            distance_op, ops_class = self._METRIC_SQL[metric]
        except (KeyError, TypeError):
            # TypeError covers unhashable values, which were also rejected
            # with RuntimeError by the previous equality checks.
            raise RuntimeError(f"unknown metric {metric}") from None

        self._metric = metric
        self._n_list = n_list
        self._ops_class = ops_class
        self._cur = None  # set by fit(); None means "no index built yet"
        self._query = f"SELECT id FROM items ORDER BY embedding {distance_op} %s LIMIT %s"

    def fit(self, X):
        """Start PostgreSQL, load ``X`` into ``items`` and build the index.

        Args:
            X: 2-D array-like; ``X.shape[1]`` is used as the vector dimension
                and each row is stored with its row number as ``id``.
        """
        subprocess.run("service postgresql start", shell=True, check=True, stdout=sys.stdout, stderr=sys.stderr)
        conn = psycopg.connect(user="ann", password="ann", dbname="ann", autocommit=True)
        pgvector.psycopg.register_vector(conn)
        cur = conn.cursor()

        cur.execute("DROP TABLE IF EXISTS items")
        cur.execute("CREATE TABLE items (id int, embedding vector(%d))" % X.shape[1])
        cur.execute("ALTER TABLE items ALTER COLUMN embedding SET STORAGE PLAIN")

        print("copying data...")
        with cur.copy("COPY items (id, embedding) FROM STDIN WITH (FORMAT BINARY)") as copy:
            copy.set_types(["int4", "vector"])
            for i, embedding in enumerate(X):
                copy.write_row((i, embedding))

        print("creating index...")
        # The index is left unnamed on purpose: get_memory_usage() looks it up
        # under PostgreSQL's default name for it, ``items_embedding_idx``.
        cur.execute(
            "CREATE INDEX ON items USING ivfflat (embedding %s) WITH (lists = %d)"
            % (self._ops_class, self._n_list)
        )
        print("done!")

        # The cursor is kept for the lifetime of the object and reused by
        # query() and get_memory_usage().
        self._cur = cur

    def query(self, v, n):
        """Return the ids of the ``n`` rows nearest to ``v``, in result order."""
        self._cur.execute(self._query, (v, n), binary=True, prepare=True)
        return [row_id for (row_id,) in self._cur.fetchall()]

    def get_memory_usage(self):
        """Return the index's ``pg_relation_size`` divided by 1024.

        Returns 0 when ``fit`` has not been called yet.
        """
        if self._cur is None:
            return 0
        self._cur.execute("SELECT pg_relation_size('items_embedding_idx')")
        return self._cur.fetchone()[0] / 1024

    def __str__(self):
        """Return a label identifying this configuration by its ``n_list``."""
        return f"PGVectorIVF(n_list={self._n_list})"

0 comments on commit 5d4a8d0

Please sign in to comment.