In [1]:
from src.rag.components.shared.io import IOManager

In [2]:
from pathlib import Path

In [3]:
# Location of the parsed documents (with embeddings), resolved against the
# current working directory, and the IOManager built on top of it.
output_path = Path.cwd() / "datasets" / "parsed_documents_with_embeddings"
io_manager = IOManager(output_path)

In [4]:
# Sanity check: confirm the path exposed as io_manager.document_path exists
# before attempting to load anything from it.
io_manager.document_path.exists()

True

In [5]:
# Load the first NUM_DOCUMENTS documents in fixed-size chunks and collect
# them into one flat list.
# NOTE(review): load_documents(start, stop) is presumably a half-open index
# range — confirm against IOManager.
NUM_DOCUMENTS = 20  # total number of document indices requested
chunk_size = 5      # number of indices requested per load_documents call
all_documents = []
for start in range(0, NUM_DOCUMENTS, chunk_size):
    all_documents.extend(io_manager.load_documents(start, start + chunk_size))

In [6]:
# Pick a random *valid* index into all_documents.
# random.randint(0, 20) is inclusive on both ends, so it could return 20 and
# overrun a list built from indices 0..19; randrange(len(...)) always stays
# within bounds regardless of how many documents were actually loaded.
import random
random_number = random.randrange(len(all_documents))

In [7]:
# Select the document at the randomly chosen index.
sample_doc = all_documents[random_number]

In [8]:
# Convert the sample document with its convert_to_milvus() method.
# NOTE(review): the next cell indexes the result and calls .get() on an item,
# so this is presumably a list of dict-like records — confirm against the
# document schema.
nodes = sample_doc.convert_to_milvus()

In [9]:
# Inspect the "text" field of the first converted node.
nodes[0].get("text")

'passage: \nUSE OUR FLOW CHART TO SEE \nWHICH ACCOMMODATION IS BEST \nFOR YOU... <br><br>I require an \nadapted \nroom*<br><br>Yes \nNo <br><br>I would like to \nbe closer to \nthe town \ncentre and \nother local \namenities \nI don’t mind \nbeing slightly \nout of the \ncentre of \ncampus  I don’t mind a \nshort \n(picturesque) \nstroll to \ncampus \nI want to be \nclose to the \ncentre of \ncampus  I want to be \nclose to the \ncentre of \ncampus \nI like the idea \nof a 4 foot \nbed <br><br>Copse \n(UG Only) \nStudio flats \nalso available  I like the finer \nthings in life \n(and a 4 foot \nbed) \nI’m looking \nfor a social \nvibe  I’m looking \nfor a social \nvibe \nI’m after a \nchilled vibe  I’m after a \nchilled vibe  Quays \n(UG & PG) \nMeadows <br><br>I want to keep \nit as cheap as \npossible & \ndon’t mind \nsharing a \nbathroom \nI don’t mind \nsharing a \nbathroom \nwith 1 other \nperson \nCopse \n(UG Only) \nStudio flats \nalso available <br><br>Houses \n(UG Only)  South

In [10]:
from src.rag.components.shared.databases.milvus import MilvusDatabase

In [11]:
# Name of the Milvus collection used by the settings, the count query and the
# vector search further down.
COLLECTION_NAME = "my_collection"

In [12]:
from src.rag.components.shared.databases.milvus_settings import MilvusSettings

In [13]:
# NOTE(review): duplicate of the COLLECTION_NAME assignment two cells earlier;
# it assigns the same value, so this cell is redundant and can be removed.
COLLECTION_NAME = "my_collection"

In [14]:
# Connection/collection settings handed to MilvusDatabase below.
# - uri points at a Milvus instance on localhost, port 19530.
# - vector_dimension is 1024. NOTE(review): presumably the embedding size of
#   the model used to build the collection — confirm it matches the
#   embedding model loaded later in this notebook.
settings = MilvusSettings(
    uri="http://localhost:19530",
    collection_name=COLLECTION_NAME,
    vector_dimension=1024,
)

In [15]:
# Build the Milvus database wrapper from the settings defined above.
milvus_client = MilvusDatabase(milvus_settings=settings)

[32m2025-05-28 23:23:52 | INFO     | milvus_database:connect:27 | Connecting to Milvus...[0m
[32m2025-05-28 23:23:52 | INFO     | milvus_database:connect:31 | Connected to Milvus successfully.[0m


In [16]:
# Open the connection to Milvus.
# NOTE(review): the log output of the cell that constructs MilvusDatabase
# already shows "Connecting to Milvus..." / "Connected to Milvus
# successfully.", so the constructor appears to connect on its own; this
# explicit call looks redundant — confirm and consider removing it.
milvus_client.connect()

[32m2025-05-28 23:23:53 | INFO     | milvus_database:connect:27 | Connecting to Milvus...[0m
[32m2025-05-28 23:23:53 | INFO     | milvus_database:connect:31 | Connected to Milvus successfully.[0m


In [17]:
# Make sure the collection/index is in place; per the logged output this only
# reports that the collection already exists when it has been created before.
milvus_client.create_index_if_not_exists()

[32m2025-05-28 23:23:53 | INFO     | milvus_database:create_index_if_not_exists:43 | Collection 'my_collection' already exists.[0m


In [18]:
# Count the entities currently stored in the collection via the count(*)
# output field.
milvus_client.client.query(collection_name=COLLECTION_NAME, output_fields=["count(*)"])

data: ["{'count(*)': 23050}"]

In [19]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [20]:

# Model identifier passed to SentenceTransformer below.
# NOTE(review): presumably the same model that produced the stored document
# embeddings; its output size must match the collection's vector dimension —
# confirm.
embedding_model_name = "intfloat/multilingual-e5-large"

In [21]:
# Instantiate the sentence-transformers model used to embed the questions.
embedding_model = SentenceTransformer(embedding_model_name)

In [50]:
# Questions to run against the collection. The "query: " prefix mirrors the
# "passage: " prefix visible on the stored texts.
# NOTE(review): presumably the E5 query/passage prefix convention — confirm.
test_questions = ["query: what is dependency injection?"]

In [51]:
# Embed every question; convert_to_tensor=True asks encode() for a tensor,
# which the next cell moves to CPU and converts to NumPy.
question_embeddings  = embedding_model.encode(test_questions, convert_to_tensor=True)

In [52]:
# Move the embeddings to CPU and convert them to a NumPy array for the search.
# NOTE(review): this rebinds the same name to a different type, so the cell is
# not idempotent — re-running it on its own fails because the NumPy result has
# no .cpu(); re-run the encode cell first, or use a separate name.
question_embeddings = question_embeddings.cpu().numpy()



In [54]:
# Retrieve the 5 nearest stored nodes for each question embedding, returning
# the listed fields alongside each hit.
# The search configuration must be passed as `search_params`: `params` is not
# a named parameter of MilvusClient.search, so the original keyword fell into
# **kwargs and the requested metric was not applied as intended.
# NOTE(review): assumes milvus_client.client is a pymilvus MilvusClient and
# that the collection's index was built with the COSINE metric — confirm.
responses = milvus_client.client.search(
    collection_name=COLLECTION_NAME,
    data=question_embeddings,
    limit=5,
    output_fields=["node_id", "text", "metadata", "bbox", "elements"],
    search_params={"metric_type": "COSINE"},
)

In [55]:
# Number of result lists returned; the output shows 1, matching the single
# entry in test_questions.
len(responses)


1

In [56]:
# Display the raw search response.
# NOTE(review): this dumps every hit with its full text and metadata; consider
# showing a trimmed view (e.g. only ids and distances) to keep the output small.
responses

data: [[{'node_id': 'c4c3284b-2795-424c-97dd-177c03b4417f', 'distance': 0.8030880093574524, 'entity': {'bbox': [{'page': 7, 'page_height': 414.0, 'page_width': 735.12, 'x0': 379.34, 'y0': 265.31, 'x1': 680.9, 'y1': 356.63}], 'elements': [], 'node_id': 'c4c3284b-2795-424c-97dd-177c03b4417f', 'text': 'passage: Adipisicing Lorem et quis magna consequat ut irure <br><br>tempor eu dolor nisi. Do irure in cupidatat laborum <br><br>occaecat officia non nisi non amet mollit duis <br><br>ullamco. Duis amet velit sint consectetur. Fugiat <br><br>magna cillum cillum do voluptate. Dolore fugiat <br><br>voluptate veniam sint dolor anim. ', 'metadata': {'filename': 'slides.example.pdf', 'num_pages': 23, 'coordinate_system': 'bottom-left', 'table_parsing_kwargs': None, 'last_modified_date': '2024-10-08T00:00:00', 'last_accessed_date': '2025-02-21T00:00:00', 'creation_date': '2024-10-08T00:00:00', 'file_size': 683459}}}, {'node_id': 'b288673c-cf1c-4391-9e13-45452d2d3d03', 'distance': 0.802629113197326

In [57]:
from src.rag.schemas.document import Node

In [59]:
# Print the hits for each question: id, similarity distance and text, followed
# by the bounding-box data stored with the node.
# The second label used to read "the entity is of type:" although the value
# printed is the bbox field itself, not a type; the label now says what is shown.
for question, response in zip(test_questions, responses):
    print(f"Question: {question}")
    print("===" * 20)
    for result in response:
        print(result.id, result.distance, result.entity.get("text"))
        print("bbox:", result.entity.get("bbox"))
    print("===" * 20)

Question: query: what is dependency injection?
c4c3284b-2795-424c-97dd-177c03b4417f 0.8030880093574524 passage: Adipisicing Lorem et quis magna consequat ut irure <br><br>tempor eu dolor nisi. Do irure in cupidatat laborum <br><br>occaecat officia non nisi non amet mollit duis <br><br>ullamco. Duis amet velit sint consectetur. Fugiat <br><br>magna cillum cillum do voluptate. Dolore fugiat <br><br>voluptate veniam sint dolor anim. 
the entity is of type: [{'page': 7, 'page_height': 414.0, 'page_width': 735.12, 'x0': 379.34, 'y0': 265.31, 'x1': 680.9, 'y1': 356.63}]
b288673c-cf1c-4391-9e13-45452d2d3d03 0.8026291131973267 passage: Eu mollit nulla voluptate dolore dolore tempor velit aliqua cillum irure quis ea. Aute laboris sit quis aliquip <br><br>tempor elit adipisicing duis in sint sit eiusmod exercitation consectetur. Ut deserunt qui veniam dolore sint <br><br>excepteur. Occaecat minim ea eu esse enim deserunt veniam ad ullamco nostrud est. 
the entity is of type: [{'page': 4, 'page_h

In [None]:
### TODO: come back and check these results, then implement vector search.