In [2]:
from src.rag.components.shared.io import IOManager

In [3]:
from pathlib import Path

In [10]:
output_path = Path.cwd().joinpath("datasets", "parsed_documents_with_embeddings")
io_manager = IOManager(output_path)

In [11]:
io_manager.document_path.exists()

True

In [30]:
chunk_size = 5
all_documents = []
for i in range(0, 20, chunk_size):
	documents = io_manager.load_documents(i, i + chunk_size)
	all_documents.extend(documents)

In [33]:
# generate a random number between 0 and 20
import random
random_number = random.randint(0, 20)

In [35]:
sample_doc = all_documents[random_number]

In [36]:
nodes = sample_doc.convert_to_milvus()

In [37]:
nodes[0].get("text")

'Model Complexity \nImpact On Bias And Variance <br><br>Increasing model complexity reduces bias by allowing the model to \ncapture more intricate patterns in the data, but it also increases \nvariance, making the model more sensitive to noise and potentially \nleading to overfitting. '

In [50]:
from src.rag.components.shared.databases.milvus import MilvusDatabase

In [51]:
milvus_client = MilvusDatabase(
    host="http://localhost:19530",
    vector_dimension=1024,
    collection_name="my_documents",
    token=""
)

In [10]:
milvus_client.connect()

[32m2025-04-23 21:55:28 | INFO     | milvus_database:connect:16 | Connecting to Milvus...[0m
[32m2025-04-23 21:55:28 | INFO     | milvus_database:connect:21 | Connected to Milvus successfully.[0m


In [11]:
milvus_client.create_index_if_not_exists()

[32m2025-04-23 21:55:29 | INFO     | milvus_database:create_index_if_not_exists:33 | Collection 'my_documents' already exists.[0m


In [12]:
milvus_client.write_data(data=doc_nodes)

[32m2025-04-23 21:56:44 | INFO     | milvus_database:write_data:57 | Writing embeddings to Milvus...[0m
[32m2025-04-23 21:56:44 | INFO     | milvus_database:write_data:64 | Successfully inserted 3 entities into Milvus.[0m
[32m2025-04-23 21:56:44 | INFO     | milvus_database:write_data:70 | Completed writing embeddings to Milvus.[0m


In [13]:
milvus_client.client.query(collection_name="my_documents", output_fields=["count(*)"])

data: ["{'count(*)': 3}"]

In [15]:
milvus_client.client.get_collection_stats(collection_name="my_collection")

{'row_count': 23050}

In [42]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [43]:

embedding_model_name = "intfloat/multilingual-e5-large"

In [44]:
embedding_model = SentenceTransformer(embedding_model_name)

In [45]:
test_questions = [
    "How does increasing model complexity affect bias?",
    "What is the relationship between model complexity and variance?",
    "Why does a more complex model have lower bias?",
    "What risk arises when model complexity increases?",
    "How can increasing model complexity lead to overfitting?",
    "What happens to a model's sensitivity to noise as complexity increases?",
    "What is the trade-off between bias and variance when adjusting model complexity?",
    "Why might a very complex model perform poorly on new, unseen data?",
    "How does model complexity influence the model's ability to capture patterns in data?",
    "What are the consequences of a model having high variance due to increased complexity?"
]


In [46]:
question_embeddings  = embedding_model.encode(test_questions, convert_to_tensor=True)

In [58]:
question_embeddings = question_embeddings.cpu().numpy()

In [59]:
milvus_client.connect()

[32m2025-04-23 23:23:24 | INFO     | milvus_database:connect:16 | Connecting to Milvus...[0m
[32m2025-04-23 23:23:24 | INFO     | milvus_database:connect:21 | Connected to Milvus successfully.[0m


In [60]:
responses = milvus_client.client.search(
    collection_name="my_documents",
    data=question_embeddings,
    limit=5,
    output_fields=["text", "metadata"],
    params={"metric_type": "L2"}
)

In [61]:
responses

data: [[{'id': '05efe299-fe43-4565-8904-46509274336a', 'distance': 0.7705314755439758, 'entity': {'text': '<br><br>U IVERSITE LIBRE DES PAYS DES GRANDS LACS \nP.O. Box· 368 Goma Dem Republi \n**Office of the General Secretary Academic**<br><br>c of the Congo <br><br>Faculty of Sciences and Applied Technologies <br><br>**N**\nGRADE TRA SCRIPT: \nTechnical \n2 \n**N**\nDEPARTME T \ning \nEngineer  Computer <br><br>ACADEMIC YEAR \n2015-2016 \n**N**\nborn in BUKAVU, on 12 July 1991  MURHABAZI BUll A ESPOIR (registr \nation number 6692), <br><br>r N \nI <br><br>°**COURSE**<br><br>**Grades**\n**Out of 20**\n**Hour Credits**\n**Load**\nTITLE <br><br>01 Ethics \n02 Elements \nj 03 Programming \nto Peace \n04 Education \n05 Human-Machine \n06 Telecommunication \n07 OS Supplements \nPositioning \n08 \n09 Real time \n10 Artificial \n11 Enterprise \n12 Project \n13 Parallel \n14 Automatism \n15 Embedded \n16 OpenCL Programming \n17 Logical \n18 Signal processing <br><br>and professional \nof Psych

In [None]:
### Need to come and chech the results and implement vector search.