In [2]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..')))

In [9]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaLLM
from src.rag_components import create_rag_chain 
from src.vector_store import build_vector_store 
from src.data_processing import process_pdf_to_chunks
from pprint import pprint

In [7]:
DATA_PATH = "/home/enzo/Desktop/riza/data/raw"
MANUAL_COMPLETO = os.path.join(DATA_PATH, "galaxy_z_flip_7.pdf")
MANUAL_TESTE = os.path.join(DATA_PATH, "galaxy_z_flip_7_teste.pdf")

In [8]:
persist_directory = "./chroma_db_test"

In [14]:
final_chunks = process_pdf_to_chunks(MANUAL_TESTE)

2025-09-29 22:58:15,723 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-29 22:58:15,726 - INFO - Going to convert document batch...
2025-09-29 22:58:15,726 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e647edf348883bed75367b22fbe60347
2025-09-29 22:58:15,727 - INFO - Accelerator device: 'cpu'


Iniciando conversão do arquivo: /home/enzo/Desktop/riza/data/raw/galaxy_z_flip_7_teste.pdf


2025-09-29 22:58:17,481 - INFO - Accelerator device: 'cpu'
2025-09-29 22:58:18,281 - INFO - Accelerator device: 'cpu'
2025-09-29 22:58:18,581 - INFO - Processing document galaxy_z_flip_7_teste.pdf
2025-09-29 22:59:20,431 - INFO - Finished converting document galaxy_z_flip_7_teste.pdf in 64.71 sec.


Conversão para Markdown concluída.
Iniciando o processo de chunking híbrido...
Chunking concluído. 106 chunks intermediários gerados.
Iniciando limpeza e filtragem dos chunks...
Limpeza concluída. 87 chunks finais restantes.


In [15]:
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},
)


2025-09-29 22:59:20,498 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [17]:
db = build_vector_store(
    embedding_model=embedding_model,
    persist_directory=persist_directory,
    chunks=final_chunks
)

Criando Vector Store com 87 chunks em './chroma_db_test'...


2025-09-29 23:00:31,779 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Vector Store criado com sucesso.


In [18]:
llm = OllamaLLM(model="gemma:2b")

In [19]:
retriever = db.as_retriever(search_kwargs={"k": 3})

In [20]:
qa_chain = create_rag_chain(
                            llm=llm, 
                            retriever=retriever
                            )

In [21]:
query = "how can i charge the phone?"
result = qa_chain.invoke({"query": query})

2025-09-29 23:00:53,596 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


In [22]:
pprint(result)

{'query': 'how can i charge the phone?',
 'result': 'The device can be charged using a wired charging cable or a '
           'wireless charger.\n'
           '\n'
           'If using a wired charging cable, connect the USB cable to the USB '
           "power adapter and plug the cable into the device's multipurpose "
           'jack to charge the battery.\n'
           '\n'
           'If using a wireless charger, follow these steps:\n'
           '\n'
           '1. Fold the device before charging the battery.\n'
           "2. Place the device's back on the center of the wireless charger "
           'to charge the battery.\n'
           '3. After fully charging, disconnect the device from the wireless '
           'charger.',
 'source_documents': [Document(metadata={'Header 2': 'Wired charging'}, page_content="[Wired charging] Connect the USB cable to the USB power adapter and plug the cable into the device's multipurpose jack to charge the battery. After fully charging, disconn