### Indexing of Documents used for RAG

In [2]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_core.documents.base import Document
import os
import json



# Configuration for the indexing run. Every value below is read by the
# statements that follow in this cell; change them here, not inline.
#---------------------------------------------------------------------------------------
# general parameters
rag_docs_folder = "Rag_docs" # folder that is scanned for the *.json report files to index

# text splitter parameters (passed to RecursiveCharacterTextSplitter)
chunk_size = 1000 # chunk size in characters
chunk_overlap = 200 # overlap between chunks in characters
add_start_index = True # ask the splitter to track each chunk's start index in the original text
# NOTE(review): the loading loop calls split_text(), which returns plain strings —
# confirm whether add_start_index has any effect on that code path.

# Qdrant parameters
min_len = 20 # report chunks are only kept when they are strictly longer than this many characters
collection_name = "demo_collection" # name of the Qdrant collection that receives the documents
db_path = "test_rag_db" # local path handed to QdrantClient(path=...)
distance = Distance.COSINE # distance metric used when the collection is created

# embedding model parameters
model_name = "sentence-transformers/all-mpnet-base-v2" # model name handed to HuggingFaceEmbeddings
#---------------------------------------------------------------------------------------


# Collect the JSON report files from the Rag_docs folder.
# os.listdir() returns names in arbitrary order; sort them so that the
# position-based document ids assigned after loading are reproducible
# from one run (and one platform) to the next.
paths = sorted(path for path in os.listdir(rag_docs_folder) if path.endswith(".json"))

# Documents (text + metadata) collected for upload to Qdrant.
# The matching ids are derived from this list once loading has finished.
docs = []

# Text splitter that cuts each report into smaller chunks for retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,  # chunk size (characters)
    chunk_overlap=chunk_overlap,  # chunk overlap (characters)
    add_start_index=add_start_index,  # track index in original document
)

# Load every JSON report file, split its report text into chunks and turn the
# chunks (and any listed products) into Document objects for Qdrant.
for path in paths:
    # os.path.join keeps this portable (a hard-coded "\\" only works on Windows);
    # JSON is read as UTF-8 explicitly instead of relying on the locale default.
    with open(os.path.join(rag_docs_folder, path), encoding="utf-8") as f:
        report_data = json.load(f)

    # Metadata shared by every document created from this file.
    base_metadata = {
        "company": report_data["company"],
        "topic": report_data["topic"],
        "source_file": path,
    }

    # Only the report text is chunked; products are indexed separately below.
    text_chunks = text_splitter.split_text(report_data["report"])

    # Keep only chunks longer than min_len characters. Chunk documents carry an
    # empty "report" field because the chunk itself already is the report text.
    chunk_docs = [
        Document(page_content=chunk, metadata={**base_metadata, "report": ""})
        for chunk in text_chunks
        if len(chunk) > min_len
    ]
    docs.extend(chunk_docs)

    print(f"Loaded {len(chunk_docs)} reports from {path}")

    # Each product becomes its own document; the full report is attached as
    # metadata so the surrounding context is available when a product is retrieved.
    if "products" in report_data:
        product_docs = [
            Document(
                page_content=str(product),
                metadata={**base_metadata, "report": report_data["report"]},
            )
            for product in report_data["products"]
        ]
        docs.extend(product_docs)

        print(f"Loaded {len(product_docs)} products from {path}")

# Qdrant point ids: the position of each document in the combined list.
ids = list(range(len(docs)))

# Embedding model that converts text into vectors.
embeddings = HuggingFaceEmbeddings(model_name=model_name)

# Qdrant client backed by the local path db_path.
client = QdrantClient(path=db_path)

try:
    collection_missing = not client.collection_exists(collection_name)
    if collection_missing:
        # Embed a throwaway string once so the new collection's vector size
        # matches the length of the vectors the model produces.
        probe_vectors = embeddings.embed_documents(["dummy"])
        vector_length = len(probe_vectors[0])
        vector_params = VectorParams(size=vector_length, distance=distance)
        client.create_collection(
            collection_name=collection_name,
            vectors_config=vector_params,
        )

    vector_store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embeddings,
    )

    # Metadata is carried by the Document objects, so only the documents
    # and their ids are passed; the returned value is not needed.
    _ = vector_store.add_documents(docs, ids=ids)
finally:
    # Always release the client, whether or not the upload succeeded.
    client.close()

Loaded 3 reports from ahava_env.json
Loaded 3 reports from ahava_ethics.json
Loaded 2 reports from ahava_info.json
Loaded 4 reports from avon_env.json
Loaded 2 products from avon_env.json
Loaded 3 reports from avon_ethics.json
Loaded 2 products from avon_ethics.json
Loaded 3 reports from avon_info.json
Loaded 2 products from avon_info.json
Loaded 7 reports from cerave_env.json
Loaded 6 reports from cerave_ethics.json
Loaded 5 reports from cerave_info.json
Loaded 3 reports from darphin_env.json
Loaded 3 reports from darphin_ethics.json
Loaded 3 reports from darphin_info.json
Loaded 3 reports from delia_cosmetics_env.json
Loaded 3 reports from delia_cosmetics_ethics.json
Loaded 3 reports from delia_cosmetics_info.json
Loaded 4 reports from estee_lauder_env.json
Loaded 4 reports from estee_lauder_ethics.json
Loaded 4 reports from estee_lauder_info.json
Loaded 4 reports from korres_env.json
Loaded 4 reports from korres_ethics.json
Loaded 3 reports from korres_info.json
Loaded 3 reports fro