## Create persistent vector database

In [1]:
import chromadb
import pandas as pd
from tqdm import tqdm

In [2]:
# Chroma client whose storage path is the relative directory "book_vec_db"
# (resolved against the kernel's working directory).
persistentClient = chromadb.PersistentClient(path="book_vec_db")

In [3]:
# Open the "books" collection, creating it on the first run.
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises an error once "books" already exists in the persistent store, which
# previously meant swapping in a get_collection call by hand.
books_collection = persistentClient.get_or_create_collection(
    name="books",
    metadata={"hnsw:space": "cosine"},  # use cosine distance for the HNSW index
)

In [4]:
# Load the book catalogue from the CSV next to the notebook.
# on_bad_lines="skip" drops malformed rows without raising or warning, so the
# frame may hold fewer rows than the file has lines.
# NOTE(review): "isbn13" is later cast to str and used as the record id; if
# pandas infers a float dtype here the ids would carry a ".0" suffix — confirm.
book_df = pd.read_csv("./dataset.csv", on_bad_lines="skip", encoding="latin-1")

In [5]:
# Show the column names available in the loaded frame.
book_df.columns

Index(['authors', 'bestsellers-rank', 'categories', 'description',
       'dimension-x', 'dimension-y', 'dimension-z', 'edition',
       'edition-statement', 'for-ages', 'format', 'id', 'illustrations-note',
       'image-checksum', 'image-path', 'image-url', 'imprint', 'index-date',
       'isbn10', 'isbn13', 'lang', 'publication-date', 'publication-place',
       'rating-avg', 'rating-count', 'title', 'url', 'weight'],
      dtype='object')

In [7]:
# Add every book to the collection in batches: the title is the document,
# a few descriptive columns are stored as metadata, and the ISBN-13 (as a
# string) is the record id.
batch_size = 100
metadata_columns = ["authors", "for-ages", "publication-date", "description"]

# Iterate over batch start offsets instead of a count of full batches:
# range(len(book_df) // batch_size) silently skipped the trailing
# len(book_df) % batch_size rows, so they were never added.
for batch_start in tqdm(range(0, len(book_df), batch_size)):
    # iloc slicing clamps at the end of the frame, so the last batch may be short.
    batch = book_df.iloc[batch_start:batch_start + batch_size]
    books_collection.add(
        documents=batch["title"].to_list(),
        metadatas=batch[metadata_columns].to_dict(orient="records"),
        ids=batch["isbn13"].astype(str).to_list(),
    )

100%|██████████| 11093/11093 [5:04:59<00:00,  1.65s/it] 


In [9]:
# Sanity check: fetch the three nearest entries for a sample query text and
# print the matched documents (the stored titles).
sample_query = "Soldier"
results = books_collection.query(
    query_texts=[sample_query],
    n_results=3,
)
print(results["documents"])

[['Soldier', 'Soldier', 'The Soldier']]
