In [10]:
import sys
import os

# Backend source root to expose to the import system; the `app.*` imports in
# the following cell are presumably resolved from here (verify on your setup).
new_path = '/workspace/rag/knowledgeforge/backend'

# Register the directory only once: re-running this cell must not keep
# appending duplicate entries to sys.path.
if new_path not in sys.path:
    sys.path.append(new_path)

In [11]:
import time
from pathlib import Path

from app.core.config import KnowledgeForgeConfig
from app.services.chunking import StructureAwareChunker
from app.services.extraction import ContentExtractor, ContentType
from app.services.parsing import DocumentParser
from app.services.transformation import ContentTransformer

In [3]:
# Location of the sample PDF that main() reads through this module-level name.
# Needs `os` from the first cell, so run the cells top to bottom.
pdf_path = os.path.join(
    "/workspace/rag/knowledgeforge/",
    "reference",
    "travel.pdf",
)

In [14]:
def fmt_time(seconds: float) -> str:
    """Format a duration in seconds as a short human-readable string.

    Durations below one second are shown as whole milliseconds (``"15ms"``);
    everything else is shown as seconds with two decimals (``"5.65s"``).

    Args:
        seconds: Duration in seconds.

    Returns:
        The formatted duration.
    """
    if seconds < 1:
        millis = f"{seconds * 1000:.0f}"
        # Values just under one second (>= 0.9995s) round up to "1000";
        # fall through so they read "1.00s" instead of the odd "1000ms".
        if millis != "1000":
            return f"{millis}ms"
    return f"{seconds:.2f}s"

In [15]:
def _banner(title: str) -> None:
    """Print ``title`` framed by two 70-character ``=`` rules."""
    rule = "=" * 70
    print(rule)
    print(title)
    print(rule)


def _format_metadata(metadata: dict) -> str:
    """Render a metadata mapping as a comma-separated ``key=value`` list."""
    return ", ".join(f"{k}={v}" for k, v in metadata.items())


def _print_truncated(content: str, limit: int = 15) -> None:
    """Print the first ``limit`` lines of ``content``, noting the total if cut."""
    lines = content.split("\n")
    for line in lines[:limit]:
        print(f"  {line}")
    # Compare the real line count so content with exactly limit + 1 lines
    # still gets the truncation notice.
    if len(lines) > limit:
        print(f"  ... ({len(lines)} total lines)")


def main() -> None:
    """Run parse → extract → transform → chunk on travel.pdf and print results.

    Reads the module-level ``pdf_path``, builds a default
    ``KnowledgeForgeConfig`` and runs the four pipeline stages in order,
    timing each with ``time.perf_counter``. After each stage a report is
    printed to stdout: counts per content type, sample items, a markdown
    preview, before/after comparisons for tables and changed text items,
    the full chunk list and a timing summary with a bar chart.

    Returns:
        None. If ``pdf_path`` does not exist an error line is printed and
        the function returns before any stage runs.
    """
    # Fail early with a readable message instead of an error from the parser.
    if not Path(pdf_path).exists():
        print(f"ERROR: {pdf_path} not found")
        return

    config = KnowledgeForgeConfig()
    timings: dict[str, float] = {}
    diff_line_limit = 20  # only this many leading lines are diffed per table

    # =====================================================================
    # STAGE 1: PARSING
    # =====================================================================
    _banner("STAGE 1: PARSING")

    parser = DocumentParser(config)
    t0 = time.perf_counter()
    parse_result = parser.parse(str(pdf_path))
    timings["parse"] = time.perf_counter() - t0

    print(f"  Time:         {fmt_time(timings['parse'])}")
    print(f"  Pages:        {parse_result.page_count}")
    print(f"  Tokens (est): {parse_result.estimated_token_count}")
    print(f"  Content types: {parse_result.content_types}")
    print(f"  Has raw doc:  {parse_result.raw_document is not None}")
    print(f"  Structure:    {len(parse_result.structure)} pages with breakdown")
    for ps in parse_result.structure:
        print(f"    Page {ps.page_number}: {dict(ps.content_types)}")
    print()

    # =====================================================================
    # STAGE 2: EXTRACTION
    # =====================================================================
    _banner("STAGE 2: EXTRACTION")

    extractor = ContentExtractor(config)
    t0 = time.perf_counter()
    extracted = extractor.extract(parse_result)
    timings["extract"] = time.perf_counter() - t0

    print(f"  Time:           {fmt_time(timings['extract'])}")
    print(f"  Total items:    {len(extracted)}")

    # Group items by the string value of their content type for the summary.
    by_type: dict[str, list] = {}
    for item in extracted:
        by_type.setdefault(item.content_type.value, []).append(item)
    for ctype, items in sorted(by_type.items()):
        print(f"    {ctype}: {len(items)}")
    print()

    # Show first 5 items
    print("  --- Sample extracted items (first 5) ---")
    for i, item in enumerate(extracted[:5]):
        # Escape newlines so each preview stays on a single output line.
        preview = item.content[:120].replace("\n", "\\n")
        print(f"  [{i}] {item.content_type.value:18s} page={item.page_number}  "
              f"header={item.header_path!r}")
        print(f"       {preview}")
        if item.metadata:
            print(f"       meta: {_format_metadata(item.metadata)}")
        print()

    # =====================================================================
    # STAGE 3: TRANSFORMATION
    # =====================================================================
    _banner("STAGE 3: TRANSFORMATION")

    transformer = ContentTransformer(config)
    t0 = time.perf_counter()
    transform_result = transformer.transform(
        extracted, raw_document=parse_result.raw_document
    )
    timings["transform"] = time.perf_counter() - t0

    transformed = transform_result.items
    doc_markdown = transform_result.document_markdown

    print(f"  Time:              {fmt_time(timings['transform'])}")
    print(f"  Total items:       {len(transformed)}")
    print(f"  All transformed:   "
          f"{all(t.metadata.get('transformed') for t in transformed)}")

    # Count how many items actually changed content.
    # NOTE(review): pairs items positionally; assumes the transformer keeps
    # the order and count of the extracted items — confirm.
    changed_count = sum(
        1 for e, t in zip(extracted, transformed) if e.content != t.content
    )
    print(f"  Content changed:   {changed_count}/{len(extracted)} items")
    print(f"  Document markdown: {len(doc_markdown)} chars")
    print()

    # =====================================================================
    # STRUCTURED MARKDOWN PREVIEW
    # =====================================================================
    if doc_markdown:
        _banner("STRUCTURED MARKDOWN (first 80 lines)")
        md_lines = doc_markdown.split("\n")
        for line in md_lines[:80]:
            print(f"  {line}")
        if len(md_lines) > 80:
            print(f"  ... ({len(md_lines)} total lines)")
        print()

    # =====================================================================
    # TABLE BEFORE/AFTER COMPARISON
    # =====================================================================
    tables_ext = [e for e in extracted if e.content_type == ContentType.TABLE]
    tables_trn = [t for t in transformed if t.content_type == ContentType.TABLE]

    if tables_ext:
        _banner(f"TABLE COMPARISON ({len(tables_ext)} table(s) found)")

        for idx, (ext, trn) in enumerate(zip(tables_ext, tables_trn)):
            changed = ext.content != trn.content
            print(f"\n  --- Table {idx + 1} (page {ext.page_number}) "
                  f"{'[CHANGED]' if changed else '[UNCHANGED]'} ---")

            if ext.metadata:
                print(f"  Metadata: {_format_metadata(ext.metadata)}")

            print("\n  BEFORE (extraction output):")
            print("  " + "-" * 50)
            _print_truncated(ext.content)

            print("\n  AFTER (transformation output):")
            print("  " + "-" * 50)
            _print_truncated(trn.content)

            if changed:
                # Show specific differences
                ext_lines = ext.content.split("\n")
                trn_lines = trn.content.split("\n")
                print("\n  DIFF (line-by-line changes):")
                print("  " + "-" * 50)
                max_lines = max(len(ext_lines), len(trn_lines))
                diffs_shown = 0
                for li in range(min(max_lines, diff_line_limit)):
                    el = ext_lines[li] if li < len(ext_lines) else "<missing>"
                    tl = trn_lines[li] if li < len(trn_lines) else "<missing>"
                    if el != tl:
                        print(f"  Line {li + 1}:")
                        print(f"    - {el!r}")
                        print(f"    + {tl!r}")
                        diffs_shown += 1
                if diffs_shown == 0:
                    # The content differs, so the first differing line must
                    # lie beyond the window that was compared above.
                    print(f"  (no differences within the first "
                          f"{diff_line_limit} lines)")
            print()

    # =====================================================================
    # TEXT BEFORE/AFTER SAMPLES
    # =====================================================================
    text_ext = [e for e in extracted if e.content_type == ContentType.TEXT]
    text_trn = [t for t in transformed if t.content_type == ContentType.TEXT]
    text_changed = [
        (e, t) for e, t in zip(text_ext, text_trn) if e.content != t.content
    ]

    if text_changed:
        _banner(f"TEXT ITEMS THAT CHANGED ({len(text_changed)} of {len(text_ext)})")
        for e, t in text_changed[:5]:
            print(f"\n  Page {e.page_number}, header={e.header_path!r}")
            print(f"    Before: {e.content[:100]!r}")
            print(f"    After:  {t.content[:100]!r}")
        if len(text_changed) > 5:
            print(f"\n  ... and {len(text_changed) - 5} more")
        print()

    # =====================================================================
    # STAGE 4: CHUNKING
    # =====================================================================
    _banner("STAGE 4: CHUNKING")

    chunker = StructureAwareChunker(config)
    t0 = time.perf_counter()
    chunks = chunker.chunk(transformed)
    timings["chunk"] = time.perf_counter() - t0

    print(f"  Time:           {fmt_time(timings['chunk'])}")
    print(f"  Total chunks:   {len(chunks)}")
    total_chunk_tokens = sum(c.token_count for c in chunks)
    print(f"  Total tokens:   {total_chunk_tokens}")

    chunk_by_type: dict[str, list] = {}
    for c in chunks:
        chunk_by_type.setdefault(c.content_type.value, []).append(c)
    for ctype, clist in sorted(chunk_by_type.items()):
        token_sum = sum(c.token_count for c in clist)
        print(f"    {ctype}: {len(clist)} chunks, {token_sum} tokens")
    print()

    # Show chunk details
    print("  --- All chunks ---")
    for c in chunks:
        preview = c.content[:100].replace("\n", "\\n")
        strategy = c.metadata.get("chunking_strategy", "")
        split_by = c.metadata.get("split_by", "")
        # Only mention the optional metadata fields that are actually set.
        extra = ""
        if strategy:
            extra += f" strategy={strategy}"
        if split_by:
            extra += f" split_by={split_by}"
        print(f"  [{c.chunk_index:2d}] {c.content_type.value:18s} "
              f"page={c.page_number}  tokens={c.token_count:4d}  "
              f"header={c.header_path!r}{extra}")
        print(f"       {preview}")
        print()

    # =====================================================================
    # TIMING SUMMARY
    # =====================================================================
    total = sum(timings.values())
    _banner("PIPELINE SUMMARY")
    print(f"  Parse:       {parse_result.page_count} pages, "
          f"{parse_result.estimated_token_count} tokens  [{fmt_time(timings['parse'])}]")
    print(f"  Extract:     {len(extracted)} items                  "
          f"[{fmt_time(timings['extract'])}]")
    print(f"  Transform:   {len(transformed)} items "
          f"({changed_count} changed), "
          f"markdown={len(doc_markdown)} chars  [{fmt_time(timings['transform'])}]")
    print(f"  Chunk:       {len(chunks)} chunks, "
          f"{total_chunk_tokens} tokens  [{fmt_time(timings['chunk'])}]")
    print("  " + "-" * 50)
    print(f"  Total:       {fmt_time(total)}")
    print()

    # Timing breakdown bar
    print("  Timing breakdown:")
    bar_width = 40
    for stage, t in timings.items():
        # Guard against a zero total so the percentage never divides by zero.
        pct = (t / total) * 100 if total > 0 else 0
        filled = int(bar_width * t / total) if total > 0 else 0
        bar = "#" * filled + "." * (bar_width - filled)
        print(f"    {stage:12s} [{bar}] {pct:5.1f}%  {fmt_time(t)}")


In [16]:
# Run the whole pipeline; the stage-by-stage report is printed as this cell's output.
main()

STAGE 1: PARSING


Parameter `strict_text` has been deprecated and will be ignored.


  Time:         5.65s
  Pages:        7
  Tokens (est): 5077
  Content types: {'text': 18, 'section_header': 16, 'page_header': 6, 'list_item': 12, 'footnote': 5, 'table': 5, 'picture': 14}
  Has raw doc:  True
  Structure:    7 pages with breakdown
    Page 1: {'text': 8, 'section_header': 1, 'picture': 8}
    Page 2: {'page_header': 1, 'section_header': 10, 'text': 6, 'list_item': 2, 'footnote': 1, 'table': 1, 'picture': 1}
    Page 3: {'page_header': 1, 'section_header': 1, 'footnote': 1, 'table': 1, 'picture': 1}
    Page 4: {'page_header': 1, 'section_header': 1, 'footnote': 1, 'table': 1, 'picture': 1}
    Page 5: {'page_header': 1, 'section_header': 1, 'footnote': 1, 'table': 1, 'picture': 1}
    Page 6: {'page_header': 1, 'section_header': 1, 'footnote': 1, 'table': 1, 'picture': 1}
    Page 7: {'page_header': 1, 'section_header': 1, 'list_item': 10, 'text': 4, 'picture': 1}

STAGE 2: EXTRACTION
  Time:           15ms
  Total items:    54
    image_description: 14
    table: 5
