# In [4]:
# ingest_component.py
"""
This component takes a local document (a PDF or plain-text file), splits it into chunks,
embeds the chunks, and adds them to a local vector store for later retrieval.
"""

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter # For text splitting
import os

# Directory handed to Chroma as `persist_directory` by ingest_document().
# The path is relative, so it resolves against the current working directory.
VECTOR_DB_PATH = "./vector_db"

def ingest_document(file_path: str) -> None:
    """
    Ingest a single document into the OSHA vector store.

    The file is loaded, split into overlapping chunks, embedded, and the
    chunks are added to the Chroma store persisted under ``VECTOR_DB_PATH``.
    A one-line summary is printed on success.

    Args:
        file_path (str): Path to the document (PDF or TXT). The extension is
            matched case-insensitively.

    Raises:
        ValueError: If ``file_path`` does not end in ``.pdf`` or ``.txt``.
    """
    # Choose loader based on extension (lower-case once, compare many times)
    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif lowered_path.endswith(".txt"):
        loader = TextLoader(file_path)
    else:
        raise ValueError(f"Unsupported file format: {file_path}")

    # Load and split into chunks; consecutive chunks share up to 200
    # characters so text near a chunk boundary is present in both neighbours.
    docs = loader.load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(docs)

    # Create or update vector store.
    # The module imports OllamaEmbeddings; the previous OpenAIEmbeddings()
    # call referenced a name that was never imported and raised NameError.
    # NOTE(review): this relies on OllamaEmbeddings' default model/server
    # settings - confirm they match what the retrieval side uses.
    embeddings = OllamaEmbeddings()
    vectorstore = Chroma(persist_directory=VECTOR_DB_PATH, embedding_function=embeddings)
    vectorstore.add_documents(chunks)
    vectorstore.persist()

    print(f"✅ Ingested {len(chunks)} chunks from {file_path} into {VECTOR_DB_PATH}")
