In [None]:
# Absolute path of the PDF that a later cell opens with fitz.open().
# Machine-specific: adjust when running outside the original environment.
FILE_PATH="/Users/gmanvel/repos/rag-fast-flow/data/fast_flow.pdf"
# Directory path intended for JSON output. Nothing in the cells shown here reads it.
JSON_OUTPUT_PATH="/Users/gmanvel/repos/rag-fast-flow/data"
# Notebook magic: installs PyMuPDF, the package that provides the `fitz` module.
%pip install PyMuPDF

In [None]:
import fitz
import re

# Span "size" values that classify() maps to block kinds. dominant_size() rounds
# sizes to one decimal before they are compared against these targets.
# Size that marks a block as a "header".
HEADER_SIZE = 34
# Size that marks a block as a "section".
SECTION_SIZE = 18
# Size that marks a block as "content".
CONTENT_SIZE = 13
# Maximum absolute deviation from a target size that still counts as a match;
# 0 means the rounded size must equal the target exactly.
TOL = 0

def dominant_size(block):
    """Return the most frequent span size in *block*, rounded to one decimal.

    Every span under ``block["lines"][*]["spans"]`` contributes its ``"size"``
    value; spans without a size (missing or ``None``) are ignored. Each span
    counts once regardless of how much text it holds. When several sizes are
    equally frequent, the one encountered first wins.

    Returns ``None`` when the block has no span with a usable size.
    """
    tally = {}
    for line in block.get("lines", []):
        for span in line.get("spans", []):
            raw = span.get("size")
            if raw is None:
                continue
            bucket = round(raw, 1)
            tally[bucket] = tally.get(bucket, 0) + 1
    if not tally:
        return None
    # Dicts iterate in insertion order and max() keeps the first maximum,
    # so ties resolve to the earliest size seen.
    return max(tally, key=tally.get)

def block_text(block):
    """Return the whitespace-normalised text of *block*.

    The ``"text"`` of every span is concatenated in order with no separator,
    both between spans and between lines. Runs of spaces/tabs are then
    collapsed to a single space, whitespace around any newline is removed,
    and the result is stripped.

    NOTE(review): because lines are joined without a separator, the last word
    of one line and the first word of the next are glued together unless the
    span text itself carries the whitespace -- confirm this is intended.
    """
    raw = "".join(
        span.get("text", "")
        for line in block.get("lines", [])
        for span in line.get("spans", [])
    )
    collapsed = re.sub(r"[ \t]+", " ", raw)
    return re.sub(r"\s*\n\s*", "\n", collapsed).strip()

def classify(size):
    """Map a dominant span size to a block kind.

    Returns ``"header"``, ``"section"`` or ``"content"`` when *size* lies
    within ``TOL`` of ``HEADER_SIZE``, ``SECTION_SIZE`` or ``CONTENT_SIZE``
    respectively (checked in that order), and ``None`` when *size* is ``None``
    or matches none of them.
    """
    if size is None:
        return None
    # Ordered so that the first matching kind wins, should the tolerance
    # ever make the ranges overlap.
    targets = (
        ("header", HEADER_SIZE),
        ("section", SECTION_SIZE),
        ("content", CONTENT_SIZE),
    )
    for label, expected in targets:
        if abs(size - expected) <= TOL:
            return label
    return None

def sanitize_str(s):
    """Return *s* with characters that cannot be encoded as UTF-8 replaced.

    The string is round-tripped through UTF-8 using the ``"replace"`` error
    handler, so unencodable code points (e.g. lone surrogates) come back as
    ``"?"``. ``None`` is passed through unchanged.
    """
    if s is not None:
        encoded = s.encode("utf-8", "replace")
        return encoded.decode("utf-8")
    return s

def sanitize(obj):
    """Recursively apply ``sanitize_str`` to every string inside *obj*.

    Dicts (keys and values) and lists are rebuilt as new containers with
    their members sanitised; strings go through ``sanitize_str``; any other
    object is returned as-is.
    """
    if isinstance(obj, str):
        return sanitize_str(obj)
    if isinstance(obj, list):
        return list(map(sanitize, obj))
    if isinstance(obj, dict):
        cleaned = {}
        for key, value in obj.items():
            clean_key = sanitize(key)
            cleaned[clean_key] = sanitize(value)
        return cleaned
    return obj

In [None]:
# Build an outline of the PDF as a list of
#   {"header": str, "sections": [{"tile": str, "content": str}, ...]}
# by classifying each text block on the selected pages by its dominant span size.
#
# NOTE(review): the section key "tile" looks like a typo for "title". It is kept
# as-is because consumers of this structure may already rely on it -- confirm
# before renaming.


def _reading_order(block):
    """Sort key placing blocks top-to-bottom, then left-to-right, by bbox origin."""
    x0, y0 = block.get("bbox", (0, 0, 0, 0))[:2]
    return (y0, x0)


result = []
current_header = None
current_section = None

# Open the document in a context manager so the file handle is released once
# extraction finishes (or fails); `doc` stays bound afterwards but is closed.
with fitz.open(FILE_PATH) as doc:
    for page in doc[2:64]:
        blocks = page.get_text("dict").get("blocks", [])
        for b in sorted(blocks, key=_reading_order):
            # Type 0 is a text block in PyMuPDF's dict output; skip everything else.
            if b.get("type") != 0:
                continue
            kind = classify(dominant_size(b))
            if kind is None:
                continue
            text = block_text(b)
            if not text:
                continue

            if kind == "header":
                current_header = {"header": text, "sections": []}
                result.append(current_header)
                # A new header ends whatever section was being filled.
                current_section = None
                continue

            # Sections and content need a parent; create an untitled header
            # when one appears before any real header has been seen.
            if current_header is None:
                current_header = {"header": "", "sections": []}
                result.append(current_header)

            if kind == "section":
                current_section = {"tile": text, "content": ""}
                current_header["sections"].append(current_section)
            elif kind == "content":
                # Content that precedes any section goes into an untitled one.
                if current_section is None:
                    current_section = {"tile": "", "content": ""}
                    current_header["sections"].append(current_section)
                # Blocks are appended back-to-back, with no separator between them.
                current_section["content"] += text

# Trim stray whitespace and drop sections that ended up completely empty.
for h in result:
    h["header"] = h["header"].strip()
    cleaned_sections = []
    for s in h["sections"]:
        s["tile"] = s.get("tile", "").strip()
        s["content"] = s.get("content", "").strip()
        if s["tile"] or s["content"]:
            cleaned_sections.append(s)
    h["sections"] = cleaned_sections

# Replace any characters that cannot be encoded as UTF-8.
result = sanitize(result)

# Bare expression so the notebook displays the extracted structure.
result

# NOTE: the disabled export below refers to OUTPUT_PATH and json, neither of
# which is defined/imported in the cells shown here (JSON_OUTPUT_PATH is, and it
# holds a directory path). Define both before enabling it.
#with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
#    json.dump(result, f, ensure_ascii=False, indent=2)