# Database Comparison Test

Compare `final_db.db` (chunks stored with chunk titles) against `document_db.db` (basic chunks).

In [3]:
# compare database sizes
from pymilvus import MilvusClient

# Upper bound on rows fetched per collection when counting chunks.
# The count is len() of the returned rows, so a collection holding more rows
# than this would be under-reported; that case is flagged in the printed output.
COUNT_QUERY_LIMIT = 16000


def fetch_chunk_ids(client, collection_name, limit=COUNT_QUERY_LIMIT):
    """Return the ``id`` rows of ``collection_name``, at most ``limit`` of them.

    Args:
        client: an open MilvusClient.
        collection_name: name of the collection to scan.
        limit: maximum number of rows to request.

    Returns:
        The list of rows returned by ``client.query`` (only the ``id`` field is requested).
    """
    return client.query(
        collection_name=collection_name,
        filter="",
        output_fields=["id"],
        limit=limit,
    )


def truncation_note(row_count, limit=COUNT_QUERY_LIMIT):
    """Return a warning suffix when a scan filled its limit, otherwise an empty string."""
    if row_count >= limit:
        return " (query limit reached - actual count may be higher)"
    return ""


# connect to both databases; the nested try/finally blocks guarantee that every
# client that was opened is closed again, even if a later call raises
final_db = MilvusClient(uri="./final_db.db")
try:
    document_db = MilvusClient(uri="./document_db.db")
    try:
        # get collection info
        final_collections = final_db.list_collections()
        document_collections = document_db.list_collections()

        print("=== Database Comparison ===")
        print(f"final_db collections: {final_collections}")
        print(f"document_db collections: {document_collections}")

        # only the first collection of each database is counted
        if final_collections:
            final_results = fetch_chunk_ids(final_db, final_collections[0])
            final_count = len(final_results)
            print(f"\nfinal_db chunks: {final_count}{truncation_note(final_count)}")
        else:
            print("\nfinal_db: empty")

        if document_collections:
            document_results = fetch_chunk_ids(document_db, document_collections[0])
            document_count = len(document_results)
            print(f"document_db chunks: {document_count}{truncation_note(document_count)}")
        else:
            print("document_db: empty")
    finally:
        document_db.close()
finally:
    final_db.close()

=== Database Comparison ===
final_db collections: ['pdf_embeddings_with_titles']
document_db collections: ['pdf_embeddings']

final_db chunks: 293
document_db chunks: 179


In [5]:
# inspect first NUMBER chunks from final_db with all metadata
from pymilvus import MilvusClient

# how many chunks to fetch and print
NUMBER = 50

# fields requested for display; the vector field is deliberately left out
OUTPUT_FIELDS = ["id", "filename", "page_number", "chunk_id", "chunk_title", "text"]

client = MilvusClient(uri="./final_db.db")
try:
    collections = client.list_collections()

    if not collections:
        print("No collections found in final_db")
    else:
        # only the first collection in the database is inspected
        collection_name = collections[0]
        print(f"Collection: {collection_name}")

        # an empty filter with a limit returns up to NUMBER rows
        results = client.query(
            collection_name=collection_name,
            filter="",
            output_fields=OUTPUT_FIELDS,
            limit=NUMBER
        )

        print(f"\n=== First {NUMBER} chunks ===")
        for i, chunk in enumerate(results):
            # .get() with a default keeps the listing going when a row lacks a field
            print(f"\n--- Chunk {i+1} ---")
            print(f"ID: {chunk.get('id', 'N/A')}")
            print(f"Filename: {chunk.get('filename', 'N/A')}")
            print(f"Page: {chunk.get('page_number', 'N/A')}")
            print(f"Chunk ID: {chunk.get('chunk_id', 'N/A')}")
            print(f"Title: {chunk.get('chunk_title', 'N/A')}")
            text = chunk.get('text', 'N/A')
            print(f"Text: {text}")
finally:
    # release the database handle even if listing or querying raised
    client.close()

Collection: pdf_embeddings_with_titles

=== First 50 chunks ===

--- Chunk 1 ---
ID: 461248658660917248
Filename: ccnl_commercio_terziario_distribuzione_e_servizi.pdf
Page: 1
Chunk ID: 0
Title: UNKNOWN
Text: CCNL Contratto Collettivo Nazionale di Lavoro Commercio, Terziario, Distribuzione e Servizi In vigore dal 01.02.2023 al 31.01.2026 Sottoscritto dalle parti sindacali datoriali e dei lavoratori: CONFLAVORO PMI CONFSAL FESICA

--- Chunk 2 ---
ID: 461248658660917249
Filename: ccnl_commercio_terziario_distribuzione_e_servizi.pdf
Page: 2
Chunk ID: 0
Title: UNKNOWN
Text: In vigore dal 01/02/2023 al 31/01/2026 L’anno 2023 il giorno 17 del mese di Gennaio in Roma, presso la sede della CONFLAVORO PMI, in Via del Consolato n. 6, tra CONFLAVORO PMI, Confederazione Nazionale delle Piccole e Medie Imprese rappresentata dal Presidente Nazionale Roberto Capobianco, Enzo Capobianco , Giuseppe Pullara, Alessandro Mattesini, Bertino Trolese, Anna Maria Domenici, Andrea Bigi e FESICA-CONFSAL, Federaz