In [1]:
from src.rag.components.shared.io import IOManager

In [2]:
from pathlib import Path

In [3]:
# Parsed documents are read from <cwd>/datasets/parsed_documents_with_embeddings,
# so the notebook must be started from the directory that contains "datasets".
output_path = Path.cwd() / "datasets" / "parsed_documents_with_embeddings"
io_manager = IOManager(output_path)

In [4]:
# Sanity check: the path exposed as io_manager.document_path must exist before loading.
io_manager.document_path.exists()

True

In [5]:
# Walk the start offsets 0, 5, 10, 15 and load one (start, start + chunk_size)
# window per call, accumulating everything into a single flat list.
chunk_size = 5
all_documents = []
for start in range(0, 20, chunk_size):
    stop = start + chunk_size
    all_documents += io_manager.load_documents(start, stop)

In [6]:
# Pick a random *valid* index into all_documents.
# random.randint(0, 20) includes 20 itself, which is one past the end of a
# 20-element list (and further out of range if fewer documents were loaded),
# so the later all_documents[random_number] lookup could raise IndexError.
# randrange(len(...)) always yields an index in [0, len(all_documents) - 1].
import random
random_number = random.randrange(len(all_documents))

In [7]:
# Take the document at the randomly chosen index for manual inspection.
sample_doc = all_documents[random_number]

In [8]:
# Convert the sampled document into the records that would be inserted into Milvus.
# NOTE(review): presumably a list of dict-like nodes — TODO confirm against the schema.
nodes = sample_doc.convert_to_milvus()

In [9]:
# Show the "text" field of the first converted node.
nodes[0].get("text")

'passage: \nUSE OUR FLOW CHART TO SEE \nWHICH ACCOMMODATION IS BEST \nFOR YOU... <br><br>I require an \nadapted \nroom*<br><br>Yes \nNo <br><br>I would like to \nbe closer to \nthe town \ncentre and \nother local \namenities \nI don’t mind \nbeing slightly \nout of the \ncentre of \ncampus  I don’t mind a \nshort \n(picturesque) \nstroll to \ncampus \nI want to be \nclose to the \ncentre of \ncampus  I want to be \nclose to the \ncentre of \ncampus \nI like the idea \nof a 4 foot \nbed <br><br>Copse \n(UG Only) \nStudio flats \nalso available  I like the finer \nthings in life \n(and a 4 foot \nbed) \nI’m looking \nfor a social \nvibe  I’m looking \nfor a social \nvibe \nI’m after a \nchilled vibe  I’m after a \nchilled vibe  Quays \n(UG & PG) \nMeadows <br><br>I want to keep \nit as cheap as \npossible & \ndon’t mind \nsharing a \nbathroom \nI don’t mind \nsharing a \nbathroom \nwith 1 other \nperson \nCopse \n(UG Only) \nStudio flats \nalso available <br><br>Houses \n(UG Only)  South

In [1]:
from src.rag.components.shared.databases.milvus import MilvusDatabase

In [2]:
# Name of the Milvus collection that the settings, count query and search below all use.
COLLECTION_NAME = "my_collection"

In [3]:
from src.rag.components.shared.databases.milvus_settings import MilvusSettings

In [4]:
# NOTE(review): COLLECTION_NAME is already assigned the same value two cells earlier;
# this second assignment is redundant and can be dropped once the cells are tidied.
COLLECTION_NAME = "my_collection"

In [5]:
# Connection and collection settings for the local Milvus instance.
settings = MilvusSettings(
    uri="http://localhost:19530",  # local Milvus endpoint; assumes a server is running on this host
    collection_name=COLLECTION_NAME,
    vector_dimension=1024,  # presumably must equal the embedding model's output size — TODO confirm
)

In [6]:
# Build the project's Milvus wrapper from the settings defined above.
milvus_client = MilvusDatabase(milvus_settings=settings)

[32m2025-06-03 22:22:41 | INFO     | milvus_database:connect:29 | Connecting to Milvus...[0m
[32m2025-06-03 22:22:41 | INFO     | milvus_database:connect:33 | Connected to Milvus successfully.[0m


In [7]:
# NOTE(review): the log output of the cell that constructs MilvusDatabase already shows
# "Connecting to Milvus..." / "Connected to Milvus successfully.", so this explicit
# connect() looks like a second connection attempt — confirm whether it is still needed.
milvus_client.connect()

[32m2025-06-03 22:22:42 | INFO     | milvus_database:connect:29 | Connecting to Milvus...[0m
[32m2025-06-03 22:22:42 | INFO     | milvus_database:connect:33 | Connected to Milvus successfully.[0m


In [8]:
# Make sure the collection/index exists before querying; when the collection is
# already present the wrapper logs "Collection '...' already exists."
milvus_client.create_index_if_not_exists()

[32m2025-06-03 22:22:43 | INFO     | milvus_database:create_index_if_not_exists:45 | Collection 'my_collection' already exists.[0m


In [9]:
# Ask Milvus for the number of entities currently stored in the collection.
milvus_client.client.query(collection_name=COLLECTION_NAME, output_fields=["count(*)"])

data: ["{'count(*)': 7310}"]

In [10]:
# milvus_client.delete_collection()

In [11]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [12]:

# Identifier of the sentence-embedding model handed to SentenceTransformer below.
embedding_model_name = "intfloat/multilingual-e5-large"

In [13]:
# Load the sentence-transformers model used to embed the test questions.
embedding_model = SentenceTransformer(embedding_model_name)

In [14]:
# Questions to run against the collection. The "query: " prefix mirrors the
# "passage: " prefix visible on the node text produced by convert_to_milvus().
test_questions = ["query: what is dependency injection?"]

In [15]:
# Embed every test question; the result is a tensor that a later cell converts to NumPy.
question_embeddings = embedding_model.encode(
    test_questions,
    convert_to_tensor=True,
)

In [16]:
# Bring the embeddings to host memory as a NumPy array before passing them to Milvus.
# This cell rebinds the same name from a tensor to an ndarray, so running it a second
# time used to fail with AttributeError (an ndarray has no .cpu()). Converting only
# while the value still exposes .cpu() makes the cell safe to re-run.
if hasattr(question_embeddings, "cpu"):
    question_embeddings = question_embeddings.cpu().numpy()



In [17]:
# Vector search: top 5 nearest nodes for each question embedding.
# The search options must be passed as `search_params`; pymilvus' MilvusClient.search
# has no `params` keyword, so the previous `params={...}` was swallowed by **kwargs
# and the requested metric was never sent to the server.
# NOTE(review): the metric given here has to match the metric the collection's index
# was built with, otherwise Milvus rejects the request — confirm the index uses COSINE.
responses = milvus_client.client.search(
    collection_name=COLLECTION_NAME,
    data=question_embeddings,
    limit=5,
    output_fields=["node_id", "text",
                   "metadata", "bbox", "elements"],
    search_params={"metric_type": "COSINE"},
)

In [18]:
# Number of result lists returned; the loop further down pairs them one-to-one with test_questions.
len(responses)


1

In [19]:
# Display the raw search response for inspection.
responses

data: [[{'node_id': 'f02578c1-ce67-45ae-94ff-b770518199fe', 'distance': 0.8108667731285095, 'entity': {'node_id': 'f02578c1-ce67-45ae-94ff-b770518199fe', 'text': 'THE TRAP: SHIFTING THE BURDEN \nTO THE INTERVENOR <br><br>Shifting the burden, dependence, and addiction arise when \na solution to a systemic problem reduces (or disguises) the \nsymptoms, but does nothing to solve the underlying problem. \nWhether it is a substance that dulls one’s perception or a policy \nthat hides the underlying trouble, the drug of choice interferes \nwith the actions that could solve the real problem. <br><br>If the intervention designed to correct the problem causes \nthe self-maintaining capacity of the original system to atrophy \nor erode, then a destructive reinforcing feedback loop is set in \nmotion. The system deteriorates; more and more of the solution \nis then required. The system will become more and more depen- \ndent on the intervention and less and less able to maintain its own \ndesired

In [20]:
from src.rag.schemas.document import Node

In [21]:
# Print every hit for each question: primary key, similarity score and node text,
# followed by the Python type of the stored "bbox" field.
# The previous message claimed to show a type ("the entity is of type:") but printed
# the bbox value itself; it now reports type(bbox) and names the right field.
for question, response in zip(test_questions, responses):
    print(f"Question: {question}")
    print("===" * 20)
    for result in response:
        print(result.id, result.distance, result.entity.get("text"))
        bbox = result.entity.get("bbox")
        print("the bbox is of type:", type(bbox))
    print("===" * 20)

Question: query: what is dependency injection?
f02578c1-ce67-45ae-94ff-b770518199fe 0.8108667731285095 THE TRAP: SHIFTING THE BURDEN 
TO THE INTERVENOR <br><br>Shifting the burden, dependence, and addiction arise when 
a solution to a systemic problem reduces (or disguises) the 
symptoms, but does nothing to solve the underlying problem. 
Whether it is a substance that dulls one’s perception or a policy 
that hides the underlying trouble, the drug of choice interferes 
with the actions that could solve the real problem. <br><br>If the intervention designed to correct the problem causes 
the self-maintaining capacity of the original system to atrophy 
or erode, then a destructive reinforcing feedback loop is set in 
motion. The system deteriorates; more and more of the solution 
is then required. The system will become more and more depen- 
dent on the intervention and less and less able to maintain its own 
desired state. <br><br>THE WAY OUT <br><br>Again, the best way out of this trap

In [None]:
### TODO: come back and check the results, then implement vector search.