In [None]:
import os
import re
import fitz
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

# Name of the embedding checkpoint, kept in one place so it is easy to swap.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Single embedding function handed to every Chroma(...) call in this notebook.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Extract the PDF text and split it into chunks on the —— (double em dash) separator
def pdf_to_doc(pdf_path, document_name, document_type):
    """Read a PDF and split its text into one ``Document`` per section.

    The text of every page is concatenated, each run of line breaks
    (LF, CR, U+2028, U+2029) is collapsed to a single space, and the result
    is split wherever two or more consecutive em dashes (U+2014) occur.
    Sections that are empty after stripping are discarded.

    Args:
        pdf_path: Path passed to ``fitz.open``.
        document_name: Value stored under ``metadata["document_name"]``.
        document_type: Value stored under ``metadata["document_type"]``.

    Returns:
        A list of ``Document`` objects in reading order. Each carries
        ``metadata["section_number"]``, the 0-based index of the section
        among the non-empty sections. The list is empty when the PDF yields
        no text.
    """
    # The context manager releases the PDF handle even if extraction raises.
    with fitz.open(pdf_path) as pdf:
        full_text = "".join(page.get_text() for page in pdf)

    full_text = re.sub(r'[\u2028\u2029\n\r]+', ' ', full_text).strip()

    # The separator is written as an escape (U+2014, two or more in a row) so
    # the pattern cannot be corrupted by a wrong file encoding; a single em
    # dash inside running text is not treated as a separator.
    sections = [sec.strip() for sec in re.split(r'\u2014{2,}', full_text)]

    return [
        Document(
            page_content=section,
            metadata={
                "document_name": document_name,
                "document_type": document_type,
                "section_number": i
            }
        )
        for i, section in enumerate(sec for sec in sections if sec)
    ]

In [3]:
#Store documents in Chroma DB
def store_in_chroma(pdf_path, document_name, document_type, persist_directory="./chroma_phi"):
    """Chunk a PDF and add the chunks to the Chroma store, unless the name is taken.

    Args:
        pdf_path: PDF to read (passed through to ``pdf_to_doc``).
        document_name: Name recorded in each chunk's metadata; also the key
            used to detect an already-stored document.
        document_type: Type label recorded in each chunk's metadata.
        persist_directory: Directory of the Chroma database.

    Returns:
        None. The outcome is reported with ``print``.
    """
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

    # Ask the store for at most one entry with this name (the same kind of
    # `where` filter view_document uses) instead of loading every entry and
    # scanning it in Python; this also avoids touching entries whose metadata
    # is None.
    match = vectordb.get(where={"document_name": document_name}, limit=1)
    if match['ids']:
        print(f"‚ö†Ô∏è Document with name '{document_name}' already exists. Use update_entries() to replace it.")
        return

    docs = pdf_to_doc(pdf_path, document_name, document_type)
    if not docs:
        # Nothing to embed: skip the write instead of handing the store an
        # empty batch and then reporting a misleading success.
        print(f"No text could be extracted from '{pdf_path}'; nothing was stored.")
        return

    vectordb.add_documents(docs)
    print(f"‚úÖ Stored {len(docs)} chunks from '{document_name}' as '{document_type}'.")

In [4]:
def list_documents(persist_directory="./chroma_phi"):
    """Print every distinct (document_name, document_type) pair in the store.

    Pairs are printed once each, in the order they are first encountered in
    the metadata returned by the store. A missing key is shown as "Unknown".
    A notice is printed instead when the directory does not exist or the
    store holds no documents.
    """
    # Checked before Chroma is constructed, so a missing path is only reported.
    if not os.path.exists(persist_directory):
        print("‚ö†Ô∏è No Chroma database found.")
        return

    store = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)
    contents = store.get()
    if not contents['documents']:
        print("üì≠ No documents found in the vector store.")
        return

    print("üìÑ Documents stored:")
    already_listed = set()
    for entry_meta in contents['metadatas']:
        name = entry_meta.get("document_name", "Unknown")
        dtype = entry_meta.get("document_type", "Unknown")
        pair = (name, dtype)
        if pair in already_listed:
            continue
        print(f"‚Ä¢ Name: {name} | Type: {dtype}")
        already_listed.add(pair)


In [5]:
def clear_database(persist_directory="./chroma_phi"):
    """Remove every entry from the Chroma store at ``persist_directory``.

    All ids currently returned by the store are deleted in a single call and
    the number removed is printed; an empty store only produces a notice.
    """
    store = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

    # The ids of everything currently stored are all that deletion needs.
    all_ids = store.get().get('ids', [])

    if all_ids:
        store.delete(ids=all_ids)
        print(f"üßπ Cleared {len(all_ids)} documents from the vector DB at '{persist_directory}'.")
    else:
        print("üì≠ Vector DB is already empty.")


In [19]:
# Delete all entries by document name
def delete_entries_by_name(document_name, persist_directory="./chroma_phi"):
    """Delete every stored chunk whose ``document_name`` metadata matches.

    Args:
        document_name: Exact name to match against the ``document_name``
            metadata field.
        persist_directory: Directory of the Chroma database.

    Returns:
        None. The number of deleted entries, or a notice that nothing
        matched, is printed.
    """
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

    # Filter inside the store (as view_document does) rather than fetching
    # the whole collection and comparing names in Python. Entries whose
    # metadata is None are then never inspected, and only metadata is
    # requested because just the ids are needed.
    data = vectordb.get(where={"document_name": document_name}, include=["metadatas"])
    matching_ids = data['ids']

    if matching_ids:
        vectordb.delete(ids=matching_ids)
        print(f"üßπ Deleted {len(matching_ids)} entries with document_name = '{document_name}'.")
    else:
        print(f"‚ö†Ô∏è No entries found with document_name = '{document_name}'.")

In [20]:
# Update entries by re-uploading document
def update_entries(pdf_path, document_name, document_type, persist_directory="./chroma_phi"):
    """Replace the stored chunks of ``document_name`` with those of a new PDF.

    The new PDF is parsed before anything is removed, so an unreadable path
    or a PDF without extractable text leaves the existing entries intact.

    Args:
        pdf_path: PDF holding the replacement content.
        document_name: Name whose existing entries are replaced.
        document_type: Type label recorded on the new chunks.
        persist_directory: Directory of the Chroma database.

    Returns:
        None. Progress is reported with ``print``. Exceptions raised while
        reading the PDF propagate to the caller before any deletion happens.
    """
    # Parse first: if this raises or yields nothing, the old entries survive.
    docs = pdf_to_doc(pdf_path, document_name, document_type)
    if not docs:
        print(f"No text could be extracted from '{pdf_path}'; existing entries for '{document_name}' were left unchanged.")
        return

    delete_entries_by_name(document_name, persist_directory)

    # The chunks are already parsed and the name was just cleared, so they are
    # added directly instead of re-reading the PDF through store_in_chroma.
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)
    vectordb.add_documents(docs)
    print(f"‚úÖ Stored {len(docs)} chunks from '{document_name}' as '{document_type}'.")
    print(f"üîÅ Updated entries for '{document_name}'.")

In [42]:
def view_document(document_name, persist_directory="./chroma_phi"):
    """Print the stored chunks of one document, numbered from 1.

    Chunks are looked up with a ``document_name`` metadata filter and printed
    in the order the store returns them, each followed by a 50-dash rule.
    A notice is printed when no chunk matches.
    """
    store = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)
    result = store.get(where={"document_name": document_name})
    sections = result['documents']

    if sections:
        print(f"üìë Contents of '{document_name}':\n")
        i = 0
        for chunk in sections:
            print(f"[Section {i+1}]\n{chunk}\n{'-'*50}")
            i += 1
    else:
        print(f"‚ùå No document found with name '{document_name}'.")

In [43]:
# Resolve the PDF under the current user's home directory rather than a
# hard-coded /Users/<name> path, so the cell is not tied to one account.
store_in_chroma(os.path.expanduser("~/Desktop/projects pdf/Zoomie.pdf"), "Zoomie", "Project")

‚úÖ Stored 15 chunks from 'Zoomie' as 'Project'.


In [44]:
# Resolve the PDF under the current user's home directory rather than a
# hard-coded /Users/<name> path, so the cell is not tied to one account.
update_entries(os.path.expanduser("~/Desktop/projects pdf/cookbook.pdf"), "cookbook", "Project")

‚ö†Ô∏è No entries found with document_name = 'cookbook'.
‚úÖ Stored 15 chunks from 'cookbook' as 'Project'.
üîÅ Updated entries for 'cookbook'.


In [45]:
view_document("Zoomie")

üìë Contents of 'Zoomie':

[Section 1]
Name of project: Zoomie
--------------------------------------------------
[Section 2]
Introduction: Zoomie is an intelligent autonomous vehicle developed for low-cost field data  collection. Built on a Raspberry Pi 4 platform, the vehicle uses a camera for navigation and  object-distance detection via the YOLO algorithm. It drives autonomously toward a specified  location while avoiding obstacles. Once it reaches the destination, Zoomie begins recording the  surrounding audio. This audio is processed using speech-to-text conversion via Whisper, and the  transcribed content is then summarized using a transformer-based model. The final summary is  made available through a React-based frontend, allowing users to review information collected by  the vehicle without any direct intervention.
--------------------------------------------------
[Section 3]
Summary: Zoomie is a compact autonomous vehicle powered by Raspberry Pi 4 that navigates to  a desi

In [46]:
delete_entries_by_name("cookbook")

üßπ Deleted 15 entries with document_name = 'cookbook'.


In [47]:
list_documents()

üìÑ Documents stored:
‚Ä¢ Name: Zoomie | Type: Project


In [48]:
clear_database()
#list_documents()

üßπ Cleared 15 documents from the vector DB at './chroma_phi'.
