Provide the Milvus connection details and API key below.

In [None]:
# Connect to Milvus
from pymilvus import Collection, connections, utility,  CollectionSchema, FieldSchema, DataType
# Register the connection under the "default" alias. Replace the <...>
# placeholders with your own endpoint and API key before running this cell.
connections.connect(
    alias="default", 
    uri = 'https://<GRPC host>:<GRPC port>',
    user="ibmlhapikey",
    password="<apikey>"  # the same API key is used when adding the Milvus custom extension
)
print("Connected to Milvus")

# List the collections visible through this connection.
utility.list_collections()

Provide the collection name below.

In [None]:
from sentence_transformers import SentenceTransformer

# Initialize the embedding model once; generate_embeddings() below reads this
# module-level `model`.
model = SentenceTransformer('all-MiniLM-L6-v2')   # Sentence Transformer model

# Name of the Milvus collection to work with; create_collection_if_not_exists()
# below reads this module-level name. Replace the placeholder before running.
collection_name = "<collection name>"   # Provide collection name

# Step 1: Extract page-wise text
def extract_text_by_page(pdf_path):
    """Read a PDF and return its text one page at a time.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        A list with one dict per page, in document order. Each dict has the
        page's extracted text under ``"text"`` (an empty string when the
        extractor yields nothing) and its 1-based index under
        ``"page_number"``.
    """
    pdf = PdfReader(pdf_path)
    extracted = []
    for page_number, page in enumerate(pdf.pages, start=1):
        page_text = page.extract_text()
        # Normalise a falsy extraction result (e.g. None) to an empty string.
        extracted.append({"text": page_text or "", "page_number": page_number})
    return extracted


In [None]:
# Step 2: Chunk text with page number embedded directly in text
def preprocess_page_chunks(pages, chunk_size=512):
    """Split each page's text into fixed-size chunks tagged with the page number.

    Args:
        pages: Iterable of dicts with a ``"text"`` string and a
            ``"page_number"`` value, as produced by ``extract_text_by_page``.
        chunk_size: Maximum number of characters taken from the page text for
            one chunk. Must be positive.

    Returns:
        A list of ``{"text": "[Page N] <chunk>"}`` dicts in page order.
        Newlines inside a chunk are replaced by spaces and surrounding
        whitespace is stripped. Chunks that are empty after this cleanup are
        skipped so that no content-free entries get embedded and stored.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    chunks = []
    for page in pages:
        text = page["text"]
        page_number = page["page_number"]
        for start in range(0, len(text), chunk_size):
            cleaned_chunk = text[start:start + chunk_size].replace('\n', ' ').strip()
            if not cleaned_chunk:
                # Whitespace-only slice: nothing meaningful to embed.
                continue
            chunks.append({"text": f"[Page {page_number}] {cleaned_chunk}"})
    return chunks


In [None]:
# Step 3: Generate embeddings
def generate_embeddings(texts):
    """Encode ``texts`` with the module-level sentence-transformer ``model``.

    Args:
        texts: The input passed unchanged to ``model.encode``; the main cell
            passes a list of chunk strings.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``convert_to_numpy=True`` (presumably a NumPy array with one vector
        per input text; confirm against the model's documentation).
    """
    vectors = model.encode(texts, convert_to_numpy=True)
    return vectors


Inserting Data 

In [None]:
# Step 4: Insert into Milvus
def batch_insert(collection, embeddings, chunks, file_name, batch_size=500):
    """Insert embeddings and their chunk texts into ``collection`` in batches.

    Args:
        collection: Object exposing ``insert(data)``; in this notebook a
            pymilvus ``Collection``.
        embeddings: Sequence of vectors, one per chunk. Either an array-like
            with a ``tolist()`` method (e.g. a NumPy array) or a plain
            sequence of vectors.
        chunks: Sequence of dicts with a ``"text"`` key, aligned index by
            index with ``embeddings``.
        file_name: Value stored in the ``file_name`` column for every row.
        batch_size: Maximum number of rows sent per ``insert`` call. Must be
            positive.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if ``embeddings``
            and ``chunks`` differ in length (rows would be misaligned).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    num_entries = len(embeddings)
    if num_entries != len(chunks):
        raise ValueError(
            f"embeddings and chunks must have the same length, "
            f"got {num_entries} and {len(chunks)}"
        )

    for start in range(0, num_entries, batch_size):
        end = min(start + batch_size, num_entries)
        batch_embeddings = embeddings[start:end]
        if hasattr(batch_embeddings, "tolist"):
            # Array-like input: convert to nested Python lists.
            batch_vectors = batch_embeddings.tolist()
        else:
            # Plain sequence input: convert each vector individually.
            batch_vectors = [
                vector.tolist() if hasattr(vector, "tolist") else list(vector)
                for vector in batch_embeddings
            ]

        batch_texts = [chunk['text'] for chunk in chunks[start:end]]
        batch_file_names = [file_name] * len(batch_texts)

        # Column-wise data in the order of the non-auto-id schema fields.
        data = [
            batch_vectors,                 # embedding
            batch_texts,                   # text
            batch_file_names               # file_name
        ]

        collection.insert(data)
        print(f"Inserted batch from {start} to {end}")


In [None]:
from pymilvus import Collection, CollectionSchema, FieldSchema, DataType, utility

def create_collection_if_not_exists():
    """Return the Milvus collection named by ``collection_name``, creating it if absent.

    When the name already appears in ``utility.list_collections()`` the
    existing collection is opened as-is. Otherwise a new collection is created
    with an auto-generated INT64 primary key ``id``, a 384-dimensional float
    vector ``embedding``, and VARCHAR fields ``text`` (max length 2048) and
    ``file_name`` (max length 512).

    Returns:
        The opened or newly created ``Collection``.
    """
    already_exists = collection_name in utility.list_collections()
    if already_exists:
        return Collection(collection_name)

    # Column layout of the new collection.
    primary_key = FieldSchema(name='id', dtype=DataType.INT64, is_primary=True, auto_id=True)
    vector_field = FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, dim=384)
    text_field = FieldSchema(name='text', dtype=DataType.VARCHAR, max_length=2048)
    source_field = FieldSchema(name='file_name', dtype=DataType.VARCHAR, max_length=512)

    schema = CollectionSchema(
        [primary_key, vector_field, text_field, source_field],
        description="Collection for PDF text chunks",
    )
    created = Collection(name=collection_name, schema=schema)

    print(f"Created collection '{collection_name}' with schema.")
    return created


Create an index on the `embedding` column of the inserted data.

In [None]:
def create_index(collection):
    """Rebuild the vector index on ``embedding`` and load the collection.

    The collection is first released and any existing index dropped; both
    steps are best-effort, so a failure is printed and the function carries
    on. A new IVF_FLAT index (L2 metric, ``nlist`` 128) is then created on the
    ``embedding`` field and the collection is loaded so it can be searched.

    Args:
        collection: Object exposing ``release()``, ``drop_index()``,
            ``create_index(field_name=..., index_params=...)`` and ``load()``;
            in this notebook a pymilvus ``Collection``.
    """
    # Release first; a failure here is reported but not fatal.
    try:
        collection.release()
    except Exception as release_error:
        print(f"Error releasing collection: {release_error}")
    else:
        print("Collection released.")

    # Drop whatever index is currently defined; also best-effort.
    try:
        collection.drop_index()
    except Exception as drop_error:
        print(f"No existing index to drop or error: {drop_error}")
    else:
        print("Existing index dropped.")

    # Build the new index on the vector field.
    collection.create_index(
        field_name="embedding",
        index_params={
            "metric_type": "L2",
            "index_type": "IVF_FLAT",
            "params": {"nlist": 128},
        },
    )
    print("Index created successfully.")

    # Make the collection available for search.
    collection.load()


Provide the PDF path below.

In [None]:
import os
from pymilvus import Collection, CollectionSchema, FieldSchema, DataType
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader

# Step 7: Main execution - extract, chunk, embed, insert, then index.
pdf_path = '<provide complete path to your pdf file> '   # PDF path: replace the whole placeholder, including its trailing space
file_name = os.path.basename(pdf_path)  # base name passed to batch_insert() as the file_name value

# Page texts -> "[Page N] ..." chunks -> one embedding per chunk.
pages = extract_text_by_page(pdf_path)
chunks = preprocess_page_chunks(pages)
embeddings = generate_embeddings([chunk['text'] for chunk in chunks])

# Open (or create) the collection, insert the rows, then rebuild the index.
# `collection` is reused by the search and ID-lookup cells further down.
collection = create_collection_if_not_exists()
batch_insert(collection, embeddings, chunks, file_name)
create_index(collection)

Run a test search locally to verify that the data was inserted.

In [None]:
# Load the collection before searching (create_index() above also ends by loading it).
collection.load()

# Define the search function
_QUERY_MODEL_CACHE = {}


def _get_query_model(model_name='all-MiniLM-L6-v2'):
    """Return the SentenceTransformer for ``model_name``, loading it only once."""
    if model_name not in _QUERY_MODEL_CACHE:
        # Imported lazily so callers that pass their own model never need it here.
        from sentence_transformers import SentenceTransformer

        _QUERY_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _QUERY_MODEL_CACHE[model_name]


def search_text_chunks(collection, query_text, top_k=5, embedding_model=None):
    """Embed ``query_text`` and return the closest stored chunks.

    Args:
        collection: Object exposing ``search(data=..., anns_field=...,
            param=..., limit=..., output_fields=...)``; in this notebook a
            loaded pymilvus ``Collection``.
        query_text: A single query string.
        top_k: Maximum number of hits to request.
        embedding_model: Optional object with an
            ``encode(text, convert_to_numpy=True)`` method. When omitted, the
            'all-MiniLM-L6-v2' SentenceTransformer is loaded on first use and
            reused on later calls instead of being re-created every time.

    Returns:
        A list of dicts with keys ``'id'``, ``'distance'``, ``'text'`` and
        ``'file_name'``, one per hit, in the order the search returned them.

    Raises:
        ValueError: If the query embedding is not a single 384-dimensional
            vector, i.e. its shape is not ``(384,)``.
        Exception: Any error raised by ``collection.search`` is printed and
            re-raised unchanged.
    """
    if embedding_model is None:
        embedding_model = _get_query_model()
    query_embedding = embedding_model.encode(query_text, convert_to_numpy=True)

    # Compare the full shape, not just the first axis: a (384, k) batch must
    # not be mistaken for a single 384-dimensional query vector.
    if query_embedding.shape != (384,):
        raise ValueError(f"Query embedding must have shape (384,), got {query_embedding.shape}")

    search_params = {"metric_type": "L2", "params": {"nprobe": 64}}

    try:
        results = collection.search(
            data=[query_embedding],
            anns_field="embedding",
            param=search_params,
            limit=top_k,
            output_fields=["text", "file_name"]
        )
    except Exception as e:
        print("❌ Search failed:", e)
        raise

    # Flatten the per-query hit lists into plain dicts.
    return [
        {
            'id': hit.id,
            'distance': hit.distance,
            'text': hit.entity.get('text'),
            'file_name': hit.entity.get('file_name'),
        }
        for result in results
        for hit in result
    ]

# Query and display results. Replace the placeholder with a real question.
query = "<user query>"
# Sample question: What are the steps for dismantling of brake cylinder?
results = search_text_chunks(collection, query)

# Output results: one block per hit, separated by a dashed line.
# The printed IDs are the values to copy into the ID lookup step that follows.
print("Search Results:")
for res in results:
    print(f"ID: {res['id']}, Distance: {res['distance']:.4f}")
    print(f"File: {res['file_name']}")
    print(f"Text: {res['text']}")
    print("-" * 60)


Provide the IDs returned in the steps above in the step below.

In [None]:
# Primary-key values to look up. Replace these sample values with IDs from
# your own collection (for example, the IDs printed by the search step).
ids_to_query = [
    457984710530969383,
    457984710530968943,
    457984710530968939,
    457984710530969377,
    457984710530968944
]

# Initialize a list to store results
all_results = []

# Query the collection for each ID. The loop variable is deliberately not
# called `id`, which would shadow the builtin for the rest of the session.
for entity_id in ids_to_query:
    try:
        query_expr = f"id == {entity_id}"
        results = collection.query(
            expr=query_expr,
            output_fields=["id", "text", "file_name"]
        )

        all_results.extend(results)
    except Exception as e:
        # Best-effort lookup: report the failing ID and continue with the rest.
        print(f"Error querying ID {entity_id}: {e}")

# Output results
for result in all_results:
    print("************")
    print(f"ID: {result['id']}")
    print(f"Text: {result['text']}")
    print(f"File: {result.get('file_name')}")