# In [None]:  (notebook cell marker left over from export)
# ingest_osha.py
import os
from dotenv import load_dotenv

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.ollama import OllamaEmbeddings

from osha_consumer import OSHAConsumer


# Pull settings from a local .env file into the process environment, then
# look up where the vector database should live (None if the variable is unset).
load_dotenv()
VectorDBPath = os.environ.get("VECTOR_DB_PATH")


def ingest_osha(path: str, persist_dir=VectorDBPath) -> None:
    """Fetch an OSHA document, chunk it, embed it, and store it in Chroma.

    Args:
        path: Location passed unchanged to ``OSHAConsumer.fetch_and_parse``.
        persist_dir: Directory in which the Chroma collection is persisted.
            Defaults to the module-level ``VectorDBPath``, which is read
            from the ``VECTOR_DB_PATH`` environment variable.

    Raises:
        ValueError: If ``persist_dir`` is ``None`` (for example when
            ``VECTOR_DB_PATH`` is not defined).
    """
    # Fail fast: without a target directory the final ``persist()`` step
    # cannot succeed, so reject the call before paying for the fetch and
    # the embedding run instead of after them.
    if persist_dir is None:
        raise ValueError(
            "persist_dir is not set; pass it explicitly or define "
            "VECTOR_DB_PATH in the environment or .env file."
        )

    # 1. Fetch & parse
    consumer = OSHAConsumer()
    docs = consumer.fetch_and_parse(path)

    # 2. Split into chunks (500 characters each, 50 characters of overlap)
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(docs)

    # 3. Create embeddings
    embeddings = OllamaEmbeddings(model="nomic-embed-text")

    # 4. Store in Chroma DB
    vectorstore = Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=persist_dir,
    )
    # NOTE(review): newer Chroma releases reportedly persist automatically
    # and treat this call as deprecated -- confirm against the installed
    # version before removing it.
    vectorstore.persist()
    print(f"Ingested {len(chunks)} chunks into {persist_dir}")


if __name__ == "__main__":
    # Example: OSHA regulations for General Industry
    example_path = "/laws-regs/regulations/standardnumber/1910/1910.23"
    ingest_osha(example_path)
