In [None]:
# ingest_and_chunk.py
from pathlib import Path
import json
from typing import List

# LangChain ingestion primitives
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# -----------------------
# 1) Load mixed documents
# -----------------------
def load_docs_from_directory(root: str) -> List:
    """
    Recursively gather every PDF and DOCX file under `root` as LangChain
    Document objects.

    Each file type is handled by its own DirectoryLoader (PyPDFLoader for
    "**/*.pdf", Docx2txtLoader for "**/*.docx"). PDF results are placed first
    in the returned list, DOCX results after them.

    Raises FileNotFoundError if `root` does not exist.
    """
    base_dir = Path(root).resolve()
    if not base_dir.exists():
        raise FileNotFoundError(f"Path not found: {base_dir}")

    # (glob pattern, loader class) pairs, processed in this order.
    #   PDF:  one Document per page, metadata includes {"page": int, "source": str}
    #   DOCX: one Document per file, metadata includes {"source": str}
    loader_specs = (
        ("**/*.pdf", PyPDFLoader),
        ("**/*.docx", Docx2txtLoader),
    )

    collected = []
    for pattern, loader_cls in loader_specs:
        directory_loader = DirectoryLoader(
            str(base_dir),
            glob=pattern,
            loader_cls=loader_cls,
            show_progress=True,
            use_multithreading=True,
            max_concurrency=8,
        )
        collected.extend(directory_loader.load())

    print(f"[ingest] Loaded {len(collected)} Documents from {base_dir}")
    return collected

# -------------------------
# 2) Split into useful bits
# -------------------------
def split_docs(docs: List, chunk_size=1200, chunk_overlap=200):
    """
    Break documents into overlapping chunks with RecursiveCharacterTextSplitter.

    The overlap keeps some shared text on both sides of each chunk boundary.
    The splitter is asked to record a 'start_index' in each chunk's metadata,
    and afterwards every chunk is guaranteed to have a 'chunk_id' (its
    position in the returned list) and a 'source' (defaulting to "unknown");
    values already present in the metadata are left as they are.
    """
    chunks = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],  # tried coarse -> fine
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,  # 'start_index' metadata for traceability in the source text
    ).split_documents(docs)

    # Fill in minimal provenance without overwriting existing keys.
    for position, chunk in enumerate(chunks):
        meta = chunk.metadata
        meta.setdefault("chunk_id", position)
        meta.setdefault("source", "unknown")

    print(f"[split] Produced {len(chunks)} chunks (size={chunk_size}, overlap={chunk_overlap})")
    return chunks

# -------------------------
# 3) Save chunks to JSONL
# -------------------------
def save_chunks_jsonl(chunks: List, out_path="chunks_output.jsonl"):
    """
    Saves chunks to a JSONL file with page_content + metadata.
    This is the most portable form for future pipelines.

    Args:
        chunks: Objects exposing `page_content` (text) and `metadata` (dict).
        out_path: Destination file; missing parent directories are created
            and an existing file is overwritten.

    Returns:
        The resolved output path as a string.

    Each output line is a JSON object {"text": ..., "metadata": ...}.
    Metadata values that are not JSON-native (e.g. Path or date objects)
    are written as their str() form instead of aborting the export.
    """
    out_file = Path(out_path).resolve()
    out_file.parent.mkdir(parents=True, exist_ok=True)

    with out_file.open("w", encoding="utf-8") as f:
        for d in chunks:
            record = {
                "text": d.page_content,
                "metadata": d.metadata,
            }
            # default=str is deliberate: loader metadata is free-form, and a
            # single non-serializable value would otherwise raise TypeError
            # mid-loop and leave a truncated file behind.
            f.write(json.dumps(record, ensure_ascii=False, default=str) + "\n")

    print(f"[export] Wrote {len(chunks)} chunks to {out_file}")
    return str(out_file)

# -------------------------
# 4) Glue it together
# -------------------------
def main(src_dir="../Test_Files", out_jsonl="./chunks_output.jsonl"):
    """
    Run the full pipeline: load documents, split them, export to JSONL.

    Args:
        src_dir: Folder to scan for source documents.
        out_jsonl: Where to save the extracted chunks.

    The defaults reproduce the previously hard-coded paths, so calling
    `main()` with no arguments behaves as before. After exporting, up to
    three chunks are printed as a quick sanity check.
    """
    docs = load_docs_from_directory(src_dir)
    chunks = split_docs(docs, chunk_size=1200, chunk_overlap=200)
    save_chunks_jsonl(chunks, out_jsonl)

    # Optional: quick peek at a couple of chunks for sanity
    for d in chunks[:3]:
        print("\n--- SAMPLE CHUNK ---")
        print("SOURCE:", d.metadata.get("source"))
        if "page" in d.metadata:
            print("PAGE:", d.metadata["page"])
        print("CHUNK_ID:", d.metadata.get("chunk_id"))
        # Show only the first 300 characters; mark truncation with "..."
        print(d.page_content[:300], "..." if len(d.page_content) > 300 else "")

# Run the pipeline only when this code is executed directly (as a script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()



[A
100%|██████████| 2/2 [00:00<00:00, 13.98it/s]

100%|██████████| 2/2 [00:00<00:00, 124.63it/s]

[ingest] Loaded 17 Documents from C:\Users\cwell\OneDrive\Desktop\Datatize\Test_Files
[split] Produced 23 chunks (size=1200, overlap=200)
[export] Wrote 23 chunks to C:\Users\cwell\OneDrive\Desktop\Datatize\Test_Jupyters\chunks_output.jsonl

--- SAMPLE CHUNK ---
SOURCE: C:\Users\cwell\OneDrive\Desktop\Datatize\Test_Files\Example_1.pdf
PAGE: 0
CHUNK_ID: 0
Statement of Work (SOW) 
 
Project: Data Roadmap & AWS Infrastructure Modernization 
Client: LogiTech 
Consultant: ConsultingCo 
Date: August 27, 2025 
Duration: 6 Months 
Total Cost: $1,000,000 
 
Introduction 
 
This Statement of Work (“SOW”) outlines the objectives, scope, deliverables, timeline, ...

--- SAMPLE CHUNK ---
SOURCE: C:\Users\cwell\OneDrive\Desktop\Datatize\Test_Files\Example_1.pdf
PAGE: 1
CHUNK_ID: 1
Migrate legacy systems and data workloads to AWS. 
 
Provide best practices for governance, compliance, and cost optimization. 
 
Enable knowledge transfer and training for LogiTech teams. 
 
Scope of Work 
In-Scope 
 
Cur




In [None]:
# Chroma vector store client; the PyPI package and module name is `chromadb`.
import chromadb
