# Indexing script for products

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_core.documents.base import Document
import json



# set parameters
# All user-tunable settings for this indexing script live in this section;
# the statements further down only read these names.
#---------------------------------------------------------------------------------------
# general parameters
# The file is parsed with json.load and iterated as a list of products; every
# product is read for the keys "description", "full_description", "url",
# "brand", "product_name", "price" and "composition".
product_file = "notino_products_info.json" # json file containing product information

# Qdrant parameters
collection_name = "product_collection" # name of the Qdrant collection (created below if it does not exist yet)
db_path = "test_rag_db" # path handed to QdrantClient(path=...) for the Qdrant database
distance = Distance.COSINE # distance metric, only applied when the collection is newly created

# embedding model parameters
# The vector size of the collection is derived from this model at runtime
# (by embedding a dummy text), so it does not need to be configured here.
model_name = "sentence-transformers/all-mpnet-base-v2" # embedding model name passed to HuggingFaceEmbeddings
#---------------------------------------------------------------------------------------

# load and process items from the product file
# The encoding is stated explicitly: JSON text is UTF-8, whereas open() would
# otherwise fall back to the platform's locale encoding and could mis-decode
# (or fail on) non-ASCII characters in the product descriptions.
with open(product_file, encoding="utf-8") as f:
    product_list = json.load(f)

# The file is only needed for parsing, so the documents are built after it
# has been closed.
# Fields of each product that are copied unchanged into the document metadata.
metadata_fields = ("url", "brand", "product_name", "price", "composition")

# One Document per product: the short and the full description are joined
# (separated by a blank line) to form the text that gets embedded, and the
# remaining product attributes travel along as metadata.
docs = [
    Document(
        page_content=product["description"] + "\n\n" + product["full_description"],
        metadata={field: product[field] for field in metadata_fields},
    )
    for product in product_list
]

# Point ids are the positions of the documents in the list: 0 .. len(docs) - 1.
ids = list(range(len(docs)))

# Text-to-vector model used both to size the collection and to embed the documents
embeddings = HuggingFaceEmbeddings(model_name=model_name)

# Qdrant client backed by the database at db_path
client = QdrantClient(path=db_path)

try:
    collection_missing = not client.collection_exists(collection_name)
    if collection_missing:
        # The collection needs a fixed vector size up front; embed a throwaway
        # text once and measure the resulting vector to obtain it.
        probe_vectors = embeddings.embed_documents(["dummy"])
        vector_length = len(probe_vectors[0])
        vector_params = VectorParams(size=vector_length, distance=distance)
        client.create_collection(
            collection_name=collection_name,
            vectors_config=vector_params,
        )

    # Wrap the (now existing) collection so documents can be added through LangChain
    vector_store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embeddings,
    )

    # The documents carry their own metadata, so only the ids are passed separately
    _ = vector_store.add_documents(docs, ids=ids)
    print(f"Added {len(docs)} documents to the collection '{collection_name}'")
finally:
    # Always close the client, whether or not indexing succeeded
    client.close()

Added 10 documents to the collection 'product_collection'


In [19]:
# Show the first Document to inspect its page_content and metadata
docs[0]

Document(metadata={'url': 'https://www.notino.co.uk/catrice/under-eye-brightener-highlighter-to-treat-under-eye-circles/', 'brand': 'Catrice', 'product_name': 'Under Eye Brightener', 'price': 3.7, 'composition': 'PENTAERYTHRITYL TETRAISOSTEARATE, ETHYLHEXYL PALMITATE, MICA, SORBITAN SESQUIOLEATE, EUPHORBIA CERIFERA (CANDELILLA) WAX, HELIANTHUS ANNUUS (SUNFLOWER) SEED WAX, SYNTHETIC FLUORPHLOGOPITE, TRIBEHENIN, DIISOSTEAROYL POLYGLYCERYL-3 DIMER DILINOLEATE, TAPIOCA STARCH, BUTYROSPERMUM PARKII (SHEA) BUTTER, BISABOLOL, TOCOPHEROL, HELIANTHUS ANNUUS (SUNFLOWER) SEED OIL, RICINUS COMMUNIS (CASTOR) SEED OIL, SODIUM HYALURONATE, TOCOPHERYL ACETATE, ASCORBYL PALMITATE, HYDROGENATED CASTOR OIL, CI 77491 (IRON OXIDES), CI 77492 (IRON OXIDES), CI 77499 (IRON OXIDES), CI 77891 (TITANIUM DIOXIDE).'}, page_content='highlighter for under eye circles\n\nVegan\nComplexion type all skin types\nEffects Brightening, Against dark circles\nWhen to use day and night\nLook fresh, whether you slept in or ju