In [None]:

# step0: patent.md 

# step1:  split to :  figs_MetaDict.json   full_filtered.md  -split
#  文本信息、图片引用 可以嵌入了
# -> full_split.md
# step2: full_filtered.md -->  struct: # 著录信息  # 权利要求书   # 说明书 
#   ---> str
# -> full_split_split.md
# step3: extractor                     record      claims         specification
#                         def record_extractor  claims_extractor  specification_extractor
#   --->  norm_(str) 
#  
# --> full_split_split_norm.md
# 代码逻辑可能存在复用的可能  code ++

# [xxxx] 以开头的段落  去掉[xxxx]
# 无关的内容也可以去掉，



In [None]:
#  split   raw full.md
# ->  figs.json    full_split.md   

import os
import re
import json
import base64
import mimetypes
from typing import Dict, Tuple, List, Any, Optional
from pathlib import Path

# -------------------- Regular expressions --------------------
# Markdown image reference: ![alt](path); exposes the named groups "alt" and "path".
IMG_MD_RE = re.compile(r'!\[(?P<alt>[^\]]*)\]\((?P<path>[^)]+)\)', re.IGNORECASE)
# Any heading line made of 1-6 '#' characters followed by a title (matched per line).
HEADER_RE = re.compile(r'^\s*#{1,6}\s*(?P<title>.+?)\s*$', re.MULTILINE)
# Single-'#' heading whose title is "摘要" (abstract), optionally prefixed with "(57)".
ABSTRACT_HEADER_RE = re.compile(r'^\s*#\s*(?:\(57\))?\s*摘要\s*$', re.MULTILINE)
# Single-'#' heading whose title is "附图说明" (description of drawings).
FIGDESC_HEADER_RE = re.compile(r'^\s*#\s*附图说明\s*$', re.MULTILINE)
CN_INDEX_TAG_RE = re.compile(r'\[\s*\d{3,}\s*\]')  # bracketed paragraph index tags such as [0017]

# -------------------- 工具函数：构建 figs.json --------------------
_DIGIT_TRANS = str.maketrans("０１２３４５６７８９", "0123456789")

def _to_ascii_digits(s: str) -> str:
    return s.translate(_DIGIT_TRANS)

def _clean_desc(desc: str) -> str:
    """把'图N为/是/:'去掉，保留后半句。"""
    s = desc.strip()
    m = re.match(r'^图\s*([0-9０-９]+)\s*[:：\s]*(是|为)?\s*', s)
    if m:
        return s[m.end():].strip()
    return s

def _to_path_str(p: Any) -> Optional[str]:
    if not p:
        return None
    return str(p)

def _img_to_b64(path_str: Optional[str],
                include_b64: bool,
                safe_read: bool,
                max_b64_mb: float,
                data_uri: bool) -> str:
    if not include_b64 or not path_str:
        return ""
    try:
        if safe_read and not os.path.exists(path_str):
            return ""
        sz = os.path.getsize(path_str)
        if max_b64_mb and sz > max_b64_mb * 1024 * 1024:
            return ""
        with open(path_str, "rb") as f:
            b64 = base64.b64encode(f.read()).decode("ascii")
        if data_uri:
            mime = mimetypes.guess_type(path_str)[0] or "image/jpeg"
            return f"data:{mime};base64,{b64}"
        return b64
    except Exception:
        return ""

def build_figs_repo(
    figs: Dict[str, Any],
    *,
    include_b64: bool = True,
    include_path: bool = False,
    safe_read: bool = True,
    max_b64_mb: float = 8.0,
    data_uri: bool = False,
) -> Dict[str, Any]:
    """Normalise a raw ``figs_MetaDict`` mapping into a JSON-serialisable repo.

    Args:
        figs: Raw figure metadata. Keys read here are ``im_abs`` / ``img_abs``
            (a list whose second item is a path), ``im_<n>`` / ``im-<n>`` /
            ``im<n>`` (lists of ``[description, path]``), ``lines_ims`` (list
            of "图N…" strings, used only as a fallback) and ``annos_ims``.
        include_b64: Encode images as base64 into ``ims_bs64`` / ``im_abs[1]``.
        include_path: Store path strings in ``ims_absp`` / ``im_abs[0]``.
        safe_read: Passed to ``_img_to_b64`` (existence check before reading).
        max_b64_mb: Passed to ``_img_to_b64`` (per-image size limit).
        data_uri: Passed to ``_img_to_b64`` (``data:`` URI prefix).

    Returns:
        A dict with the keys ``im_abs`` (``[path_or_None, base64]``),
        ``ims_desc`` (figure number -> description without the "图N为/是"
        prefix), ``ims_absp`` (figure number -> path string), ``ims_bs64``
        (figure number -> base64 string) and ``ims_annos`` (str). The inner
        dicts are keyed by ``int`` and sorted by figure number; ``ims_absp``
        and ``ims_bs64`` are empty when their switch is off.
    """
    repo: Dict[str, Any] = {
        # Abstract figure: [path_str | None, base64_str]
        "im_abs": [None, ""],
        "ims_desc": {},
        "ims_absp": {},
        "ims_bs64": {},
        # ``or ""`` keeps a missing or null annotation from crashing ``.strip()``.
        "ims_annos": str(figs.get("annos_ims") or "").strip(),
    }

    # Accept both spellings of the abstract-figure key.
    abs_entry = figs.get("im_abs") or figs.get("img_abs")
    if isinstance(abs_entry, list) and len(abs_entry) >= 2:
        abs_path_str = _to_path_str(abs_entry[1])
        abs_b64 = _img_to_b64(abs_path_str, include_b64, safe_read, max_b64_mb, data_uri)
        repo["im_abs"] = [abs_path_str if include_path else None, abs_b64]

    # Per-figure entries: im_1, im_2, ...
    for k, v in figs.items():
        m = re.match(r'^im[_-]?(\d+)$', k)
        if not m:
            continue
        if not (isinstance(v, list) and len(v) >= 2):
            continue
        n = int(_to_ascii_digits(m.group(1)))
        desc_raw, p = v[0], v[1]
        # Fall back to the raw text when cleaning leaves nothing (e.g. "图3").
        repo["ims_desc"][n] = _clean_desc(str(desc_raw)) or str(desc_raw).strip()

        p_str = _to_path_str(p)
        if include_path and p_str:
            repo["ims_absp"][n] = p_str
        if include_b64:
            repo["ims_bs64"][n] = _img_to_b64(p_str, include_b64, safe_read, max_b64_mb, data_uri)

    # Fallback: no description came from im_n entries, so scan lines_ims.
    if not repo["ims_desc"] and isinstance(figs.get("lines_ims"), list):
        for line in figs["lines_ims"]:
            m2 = re.match(r'^图\s*([0-9０-９]+)\s*', line)
            if not m2:
                continue
            n = int(_to_ascii_digits(m2.group(1)))
            repo["ims_desc"][n] = _clean_desc(line)

    # Sort by figure number.
    repo["ims_desc"] = dict(sorted(repo["ims_desc"].items()))
    repo["ims_absp"] = dict(sorted(repo["ims_absp"].items())) if include_path else {}

    # Align ims_bs64 with ims_desc only when base64 is requested. Previously
    # the alignment also ran with include_b64=False and filled the dict with
    # "" placeholders, although the field is meant to stay empty in that case.
    if include_b64:
        repo["ims_bs64"] = {n: repo["ims_bs64"].get(n, "") for n in repo["ims_desc"]}
    else:
        repo["ims_bs64"] = {}

    return repo

# -------------------- 主类：一次性产出 full_split.md + figs.json --------------------
class PatentMdSplit:
    """
    Split a raw patent ``full.md`` into a text-only ``<stem>_split.md`` and a
    ``figs.json`` holding the figure metadata, both written next to the input.

    Usage:
        sp = PatentMdSplit(
            mdf=".../full.md",
            include_b64=True,
            include_path=False,
            max_b64_mb=6.0,
            data_uri=False,
            write_meta_raw=True,   # also dump the raw figs_MetaDict.json
        )
        out_md, out_figs = sp()   # returns the two output file paths
    """

    # Full-width -> ASCII digit table, built once instead of on every call.
    _FULLWIDTH_DIGITS = str.maketrans("０１２３４５６７８９", "0123456789")

    def __init__(
        self,
        mdf: str | Path,
        *,
        include_b64: bool = True,
        include_path: bool = False,
        safe_read: bool = True,
        max_b64_mb: float = 8.0,
        data_uri: bool = False,
        write_meta_raw: bool = False,
    ) -> None:
        """Read the markdown file and remember the output options.

        Args:
            mdf: Path of the markdown file to split; it is read immediately
                as UTF-8 with undecodable bytes ignored.
            include_b64, include_path, safe_read, max_b64_mb, data_uri:
                Forwarded unchanged to ``build_figs_repo`` when writing.
            write_meta_raw: Also write the raw ``figs_MetaDict.json``.
        """
        self.mdf      : Path = Path(mdf)
        self.fdir     : Path = self.mdf.parent.resolve()
        # Not validated on purpose: some documents have no figures at all.
        self.imdir    : Path = self.fdir / "images"
        # Optional: first "*_origin.pdf" found below fdir, else None.
        self.pdff     : Optional[Path] = next(self.fdir.rglob('*_origin.pdf'), None)

        self.text: str = self.mdf.read_text(encoding='utf-8', errors="ignore")

        # Intermediate product: raw figure metadata collected by the pipeline.
        self.figs_info: Dict[str, Any] = {}

        # Output switches.
        self.include_b64 = include_b64
        self.include_path = include_path
        self.safe_read = safe_read
        self.max_b64_mb = max_b64_mb
        self.data_uri = data_uri
        self.write_meta_raw = write_meta_raw

    def __call__(self) -> Tuple[Path, Path]:
        """Run the pipeline; see :meth:`pipeline`."""
        return self.pipeline()

    # -------------------- pipeline --------------------
    def pipeline(self) -> Tuple[Path, Path]:
        """Run all steps and return ``(split_md_path, figs_json_path)``."""
        raw_text = self.text

        # Scan the untouched text first so every image and the figure-number
        # caption that follows it are recorded before anything is removed.
        all_imgs = list(self._scan_all_images_with_captions(raw_text))

        # 1) Extract the abstract figure and drop its line.
        content = self._extract_abstract_image_and_strip(raw_text)

        # 2) Extract figure descriptions / reference-sign notes, drop the section.
        fig_text_items, fig_annos_str, content = self._extract_and_strip_fig_section(content)

        if fig_text_items:
            self.figs_info["lines_ims"] = fig_text_items  # List[str]
        if fig_annos_str:
            self.figs_info["annos_ims"] = fig_annos_str   # str

        # 3) Drop the stack of "image reference + 图x" lines left at the end.
        content = self._strip_tail_image_blocks(content)

        # 4) Pair figure numbers with paths and descriptions -> im_1, im_2, ...
        self._build_im_n_entries(all_imgs, fig_text_items)

        # 5) Write the text and figs.json (optionally figs_MetaDict.json too).
        out_md, out_figs = self._write_down(content)

        return out_md, out_figs

    # -------------------- helpers --------------------
    def _section_span(self, text: str, header_re: re.Pattern, next_header_re: re.Pattern = HEADER_RE) -> Tuple[int, int]:
        """Return ``(start, end)`` of the section opened by *header_re*.

        The section runs from the header match to the next *next_header_re*
        match (or the end of *text*). ``(-1, -1)`` means no header was found.
        """
        m = header_re.search(text)
        if not m:
            return -1, -1
        start = m.start()
        m2 = next_header_re.search(text, pos=m.end())
        end = m2.start() if m2 else len(text)
        return start, end

    def _scan_all_images_with_captions(self, text: str):
        """
        Yield one dict ``{alt, rel, abs, span, fig_no}`` per markdown image.

        ``fig_no`` is the integer N of a "图N…" caption that starts a line
        shortly after the image, or ``None`` when there is none. The search
        window is at most 400 characters and stops at the next image, so a
        caption belonging to a later image is never attributed to this one.
        """
        matches = list(IMG_MD_RE.finditer(text))
        for idx, m in enumerate(matches):
            alt = (m.group("alt") or "").strip()
            rel = (m.group("path") or "").strip()
            abs_path = str((self.fdir / rel).resolve())

            lookahead_end = min(len(text), m.end() + 400)
            if idx + 1 < len(matches):
                lookahead_end = min(lookahead_end, matches[idx + 1].start())
            next_chunk = text[m.end():lookahead_end]

            # A negative look-ahead ends the number instead of ``\b``: CJK
            # characters count as word characters, so ``\b`` rejected captions
            # such as "图1为…" where text directly follows the digits.
            cap = re.search(r'^\s*图\s*([0-9０-９]+)(?![0-9０-９])', next_chunk, re.MULTILINE)
            fig_no = int(self._to_halfwidth_digits(cap.group(1))) if cap else None
            yield {
                "alt": alt,
                "rel": rel,
                "abs": abs_path,
                "span": (m.start(), m.end()),
                "fig_no": fig_no,
            }

    def _extract_abstract_image_and_strip(self, text: str) -> str:
        """
        Use the first image inside the abstract section as the abstract figure:
          - figs_info["im_abs"] = ["摘要图", abs_path]
          - the line holding that image reference is removed from the text
        The text is returned unchanged when there is no abstract section or
        the section contains no image.
        """
        s, e = self._section_span(text, ABSTRACT_HEADER_RE)
        if s == -1:
            return text

        sub = text[s:e]
        img = IMG_MD_RE.search(sub)
        if not img:
            return text

        rel = img.group("path").strip()
        abs_path = str((self.fdir / rel).resolve())
        self.figs_info["im_abs"] = ["摘要图", abs_path]

        # Remove the whole line that holds the image (offsets are relative to
        # the section, hence the ``s +``).
        line_start = text.rfind("\n", 0, s + img.start()) + 1
        line_end = text.find("\n", s + img.end())
        if line_end == -1:
            line_end = len(text)
        return text[:line_start] + text[line_end + 1:]

    def _extract_and_strip_fig_section(self, text: str) -> Tuple[List[str], str, str]:
        """
        Extract the whole "附图说明" section and remove it from the text.

        Returns ``(text_ims, fig_annos_str, new_text)``:
          - text_ims: normalised lines such as ["图1为xxx", "图2为xxx", ...]
          - fig_annos_str: the text following the last figure description
            (the reference-sign notes), lightly beautified; "" if the anchor
            is not found
          - new_text: *text* without the section
        When the section is absent the result is ``([], "", text)``.
        """
        s, e = self._section_span(text, FIGDESC_HEADER_RE)
        if s == -1:
            return [], "", text

        section = text[s:e]

        # ``clean`` (index tags removed) is used to pick out the "图n…" items;
        # the raw ``section`` is kept for anchor matching further down.
        clean = CN_INDEX_TAG_RE.sub(" ", section)
        clean = re.sub(r'^\s*#?\s*附图说明\s*', '', clean, flags=re.MULTILINE).strip()
        mfirst = re.search(r'图\s*[0-9０-９]', clean)
        if mfirst:
            clean = clean[mfirst.start():]

        # 1) Figure description items, split on sentence-ending punctuation.
        text_ims: List[str] = []
        for seg in re.split(r'[；;。]\s*', clean):
            seg = seg.strip()
            if not seg:
                continue
            m = re.match(r'^图\s*([0-9０-９]+)\s*(.*)$', seg)
            if not m:
                continue
            n = self._to_halfwidth_digits(m.group(1))
            rest = m.group(2).strip(" ：:为是，,")
            # ``rest`` can no longer start with 为/是/colon after the strip
            # above, so the connector "为" is always re-inserted.
            text_ims.append(f"图{n}为{rest}" if rest else f"图{n}")

        # 2) Anchor on the last figure description and take what follows it.
        fig_annos_str = ""
        if text_ims:
            last_line = text_ims[-1]
            mm = re.match(r'^图\s*([0-9]+)\s*(.*)$', self._to_halfwidth_digits(last_line))
            if mm:
                last_no = mm.group(1)
                core_desc = (mm.group(2) or "")
                core_desc = core_desc.lstrip("为是：:,， ").strip()
                anchor_re = re.compile(
                    r'(?:\[\s*\d{3,}\s*\]\s*)?图\s*' + re.escape(last_no) +
                    r'\s*(?:为|是|:|：)?\s*' +
                    re.escape(core_desc).replace(r'\ ', r'\s*') +
                    r'[^。；;\n]*[。；;]?',
                    re.S
                )
                # Keep the last occurrence of the anchor inside the section.
                last_match = None
                for hit in anchor_re.finditer(section):
                    last_match = hit
                tail = section[last_match.end():] if last_match else ""
                fig_annos_str = self._beautify_annos_str(tail)

        # Remove the whole section.
        new_text = text[:s] + text[e:]
        return text_ims, fig_annos_str, new_text

    def _beautify_annos_str(self, s: str) -> str:
        """Lightly normalise the reference-sign notes.

        Removes index tags, collapses whitespace, unifies colon / comma /
        semicolon to their full-width forms, removes spaces around hyphens
        between digits, and repeats the pass (at most 4 times) until stable.
        """
        if not s:
            return ""

        def _pass(x: str) -> str:
            x = re.sub(r'\[\s*\d{3,}\s*\]', '', x)         # drop tags like [0031]
            x = re.sub(r'[ \t\r\n]+', ' ', x).strip()      # collapse whitespace
            # Unify punctuation.
            x = re.sub(r'\s*[:：]\s*', '：', x)
            x = re.sub(r'\s*[,，]\s*', '，', x)
            x = re.sub(r'\s*[;；]\s*', '；', x)
            # Hyphen between digits without spaces (2- 1 -> 2-1).
            x = re.sub(r'(\d)\s*-\s*(\d)', r'\1-\2', x)
            # Collapse repeated punctuation.
            x = re.sub(r'，{2,}', '，', x)
            x = re.sub(r'；{2,}', '；', x)
            x = x.strip('：，；。 ')
            x = x.rstrip('。；;')
            return x

        prev, cur = None, s
        for _ in range(4):
            prev, cur = cur, _pass(cur)
            if cur == prev:
                break
        return cur

    def _strip_tail_image_blocks(self, text: str) -> str:
        """Remove the trailing run of "image reference + optional 图x line" blocks."""
        tail_re = re.compile(
            r'(?:\s*'
            r'!\[[^\]]*\]\([^)]+\)\s*'
            r'(?:\n\s*图[^\n]*\s*)?'
            r')+\s*$',
            re.S
        )
        return tail_re.sub("", text)

    def _build_im_n_entries(self, all_imgs: List[Dict[str, Any]], text_ims: List[str]) -> None:
        """
        Fill ``figs_info`` with ``im_<n> = [description, abs_path]`` entries:
          - a path is paired first through the figure number found right
            after the image
          - remaining images are assigned in order of appearance to the
            expected figure numbers that still lack a path
          - the description comes from *text_ims* ("图n为…"); otherwise the
            image alt text, otherwise "图n"
        """
        # 1) The abstract figure (already stored as im_abs) is not a candidate.
        abs_img_path = None
        im_abs = self.figs_info.get("im_abs")
        if isinstance(im_abs, list) and len(im_abs) >= 2:
            abs_img_path = im_abs[1]
        candidate_imgs = [it for it in all_imgs if it["abs"] != abs_img_path]

        # 2) text_ims -> {n: "图n为…"}; the prefix is kept here and cleaned
        #    later by build_figs_repo.
        desc_map: Dict[int, str] = {}
        for t in text_ims or []:
            m = re.match(r'^图\s*([0-9０-９]+)\s*(.*)$', t.strip())
            if m:
                n = int(self._to_halfwidth_digits(m.group(1)))
                desc = m.group(2).lstrip("：:为是，, ")
                desc_map[n] = ("图%d为%s" % (n, desc)) if desc else ("图%d" % n)

        # 3) Exact pairing through the caption number; first image wins.
        no_to_path: Dict[int, str] = {}
        used_idxs = set()
        for idx, it in enumerate(candidate_imgs):
            n = it["fig_no"]
            if n is not None and n not in no_to_path:
                no_to_path[n] = it["abs"]
                used_idxs.add(idx)

        # 4) Images not consumed above fill, in order, the expected figure
        #    numbers that still have no path.
        expected_ns = sorted(desc_map.keys()) if desc_map else list(range(1, len(candidate_imgs) + 1))
        rest_imgs = [it for i, it in enumerate(candidate_imgs) if i not in used_idxs]

        j = 0
        for n in expected_ns:
            if n not in no_to_path and j < len(rest_imgs):
                no_to_path[n] = rest_imgs[j]["abs"]
                j += 1

        # 5) Emit the im_n entries. The alt text of the first candidate with a
        #    given path is looked up once instead of rescanning per figure.
        alt_by_path: Dict[str, str] = {}
        for it in candidate_imgs:
            alt_by_path.setdefault(it["abs"], it.get("alt") or "")

        for n in sorted(no_to_path.keys()):
            desc = desc_map.get(n)
            if not desc:
                desc = alt_by_path.get(no_to_path[n], "").strip() or f"图{n}"
            self.figs_info[f"im_{n}"] = [desc, no_to_path[n]]

    @staticmethod
    def _to_halfwidth_digits(s: str) -> str:
        """Convert full-width digits in *s* to ASCII and strip whitespace."""
        return s.translate(PatentMdSplit._FULLWIDTH_DIGITS).strip()

    # -------------------- writing --------------------
    def _write_down(self, content: str) -> Tuple[Path, Path]:
        """Write ``<stem>_split.md`` and ``figs.json`` into the input directory.

        With ``write_meta_raw`` the raw ``figs_info`` is additionally written
        to ``figs_MetaDict.json`` (useful for debugging).
        """
        in_name = Path(self.mdf).stem
        out_md = self.fdir / f"{in_name}_split.md"
        out_md.write_text(content, encoding="utf-8")

        if self.write_meta_raw:
            meta_path = self.fdir / "figs_MetaDict.json"
            with open(meta_path, "w", encoding='utf-8') as fj:
                json.dump(self.figs_info, fj, ensure_ascii=False, indent=2)

        # Normalise the raw metadata -> figs.json
        repo = build_figs_repo(
            self.figs_info,
            include_b64=self.include_b64,
            include_path=self.include_path,
            safe_read=self.safe_read,
            max_b64_mb=self.max_b64_mb,
            data_uri=self.data_uri,
        )
        out_figs = self.fdir / "figs.json"
        with open(out_figs, "w", encoding="utf-8") as fo:
            json.dump(repo, fo, ensure_ascii=False, indent=2)

        return out_md, out_figs


# -------------------- 批量运行（可选） --------------------
if __name__ == '__main__':
    from tqdm import tqdm

    # Batch run: split every */full.md found below the log directory.
    root_dir = r"../.log/SimplePDF"
    tft_mds = list(Path(root_dir).rglob("*/full.md"))
    # tqdm was imported but never used; wrap the loop so progress is shown.
    for md in tqdm(tft_mds):
        md = str(md)
        sp = PatentMdSplit(
            md,
            include_b64=False,   # no base64 payload, keeps figs.json small
            include_path=True,   # store local paths; set False to avoid exposing them
            max_b64_mb=6.0,      # images above 6 MB are not base64-encoded
            data_uri=False,
            write_meta_raw=True, # also dump the raw figs_MetaDict.json
        )
        sp()


In [None]:
# struct   full_split.md 


# normal markdown
"""  
-->
# record       # --> metadata  --> for search 
...
abs

# claims       # 看似不是很重要的东西  --> may be used in answer
1. ...
2. ...
...

# manuals      # 细节     --> add for answer


将“专利文本（粗糙 Markdown）”规范化为结构化 Markdown：
- 归并“著录信息”（含摘要）
- 补齐“权利要求书”一级标题
- 归档“说明书”并抽取子段（标题、背景技术、发明/实用新型内容、具体实施方式）
- 忽略“附图说明 / 说明书附图”段（按用户要求）

"""

import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple


# Line-level patterns used while segmenting the patent markdown.
CLAIM_LINE_RE       = re.compile(r"^\s*[（(]?\s*\d+\s*[)）\.．、]\s*")   # numbered line: 1. / 1、 / 1) / （1）/ 1．
ABS_ANCHOR_RE       = re.compile(r"^\s*#?\s*\(57\)\s*摘要\s*$")          # "(57)摘要" line, with or without a leading '#'
NAME_54_RE          = re.compile(r"^\s*#?\s*\(54\)\s*(?:发明名称|实用新型名称|名称|题名)\s*$")  # "(54) <name label>" line
TYPE_UTILITY_RE     = re.compile(r"\(12\).*?实用新型")   # "(12)" followed by "实用新型" (utility model)
TYPE_INVENTION_RE   = re.compile(r"\(12\).*?发明")       # "(12)" followed by "发明" (invention)
H1_RE               = re.compile(r"^\s*#\s+(?P<title>.+?)\s*$")    # single-'#' heading
ANY_H_RE            = re.compile(r"^\s*#+\s*(?P<title>.+?)\s*$")   # heading with any number of '#'

# ---- Section titles recognised inside the description ----
SEC_TECH        = {"技术领域"}
SEC_BG          = {"背景技术"}
SEC_CONTENT_INV = {"发明内容"}                 # strictly "发明内容" only, as requested
SEC_CONTENT_UM  = {"实用新型内容"}              # strictly "实用新型内容" only, as requested
SEC_IMPL        = {"具体实施方式"}
SEC_FIGS        = {"附图说明", "说明书附图"}     # recognised but left out of the output

# Titles that also act as section/cut anchors when they appear as "bare"
# lines, i.e. without any leading '#'.
PLAIN_HEADINGS = SEC_TECH | SEC_BG | SEC_CONTENT_INV | SEC_CONTENT_UM | SEC_IMPL | SEC_FIGS

@dataclass
class Pieces:
    """Intermediate result of segmenting one patent markdown text."""
    patent_type   : str                  # 'utility' | 'invention' | 'unknown'
    meta_prefix   : List[str]            # (xx) bibliographic lines before the abstract line (leading '#' removed)
    abstract_lines: List[str]            # abstract body (without the "(57) 摘要" line)
    claim_lines   : List[str]            # claims block as a whole, inner blank lines included
    body_rest     : List[str]            # remaining raw lines of the description
    title_text    : Optional[str]        # title
    sections      : Dict[str, List[str]] # description sections keyed by the recognised heading

class PatentMdStruct:
    """
    Restructure ``full_split.md`` and write ``full_split_struct.md`` next to it.

    The output has this shape (headings and labels are emitted in Chinese):

        # 著录信息
        ...(bibliographic lines)
        (57) 摘要
        ...(abstract body)

        # 权利要求书
        1. ...
        2. ...

        # 说明书
        标题：
           xxx
        背景技术：
           ...
        发明内容： / 实用新型内容：
           ...
        具体实施方式：
           ...
    """

    def __init__(self, markdown_file_path: str | Path):
        """Read the source file as UTF-8 and normalise CRLF line endings to LF."""
        self.src = Path(markdown_file_path)
        self.text = self.src.read_text(encoding="utf-8", errors="ignore").replace("\r\n", "\n")

    def __call__(self) -> Path:
        """Segment, compose and write the result; return the output path."""
        pcs = self._segment(self.text)
        md = self._compose(pcs)
        out = self.src.with_name("full_split_struct.md")
        out.write_text(md, encoding="utf-8")
        return out

    # ---------------- segmentation ----------------
    def _segment(self, txt: str) -> Pieces:
        """Split *txt* into bibliographic data, abstract, claims and body."""
        lines = txt.split("\n")

        # Patent type: rough guess from the first 60 lines; utility is tested
        # before invention.
        head60 = "\n".join(lines[:60])
        if TYPE_UTILITY_RE.search(head60):
            ptype = "utility"
        elif TYPE_INVENTION_RE.search(head60):
            ptype = "invention"
        else:
            ptype = "unknown"

        # Locate the "(57) 摘要" anchor line.
        abs_idx = None
        for i, ln in enumerate(lines):
            if ABS_ANCHOR_RE.match(ln.strip()):
                abs_idx = i
                break

        meta_prefix, abstract_lines, claim_lines, body_rest = [], [], [], []

        if abs_idx is None:
            # No abstract heading: everything up to the first claim line or
            # section heading (with or without '#') counts as bibliographic data.
            cut = 0
            while (cut < len(lines)
                   and not CLAIM_LINE_RE.match(lines[cut])
                   and not self._looks_body_heading_or_plain(lines[cut])):
                meta_prefix.append(self._strip_leading_hash(lines[cut]))
                cut += 1
            scan_from = cut
        else:
            # Bibliographic data: everything before the abstract heading.
            for ln in lines[:abs_idx]:
                meta_prefix.append(self._strip_leading_hash(ln))
            # Abstract body: from the next line up to the first claim line or
            # section heading.
            j = abs_idx + 1
            while (j < len(lines)
                   and not CLAIM_LINE_RE.match(lines[j])
                   and not self._looks_body_heading_or_plain(lines[j])):
                abstract_lines.append(lines[j].rstrip())
                j += 1
            scan_from = j

        # Claims block: first numbered line up to the next section heading.
        c_s, c_e = self._find_claims_block(lines, start=scan_from)
        if c_s is not None:
            claim_lines = lines[c_s:c_e]
            body_rest = lines[c_e:]
        else:
            body_rest = lines[scan_from:]

        sections = self._extract_body_sections(body_rest)
        title_text = self._extract_title(lines, sections)

        return Pieces(
            patent_type=ptype,
            meta_prefix=self._trim(meta_prefix),
            abstract_lines=self._trim(abstract_lines),
            # Inner blank lines are kept; only trailing blank lines are dropped.
            claim_lines=self._trim_trailing_blank(claim_lines),
            body_rest=body_rest,
            title_text=title_text,
            sections=sections,
        )

    def _find_claims_block(self, lines: List[str], start: int) -> Tuple[Optional[int], Optional[int]]:
        """
        Find the claims block at or after *start*.

        Returns the half-open range ``[c_start, c_end)`` where ``c_start`` is
        the first line matching ``CLAIM_LINE_RE`` and ``c_end`` is the first
        later line recognised by ``_looks_body_heading_or_plain`` (or the end
        of *lines*). Returns ``(None, None)`` when a section heading is met
        before any numbered line, or when no numbered line exists.
        """
        n = len(lines)
        c_start = None
        for i in range(start, n):
            if CLAIM_LINE_RE.match(lines[i]):
                c_start = i
                break
            # A section heading before any numbered line means: no claims.
            if self._looks_body_heading_or_plain(lines[i]):
                return None, None
        if c_start is None:
            return None, None

        # Consume lines until the next section heading.
        j = c_start + 1
        while j < n and not self._looks_body_heading_or_plain(lines[j]):
            j += 1
        return c_start, j

    # ---------------- compose ----------------
    def _compose(self, p: Pieces) -> str:
        """Render *p* as the structured markdown text."""
        out: List[str] = []

        # Record: bibliographic data + abstract.
        out.append("# 著录信息")
        out.extend(self._trim(p.meta_prefix))
        out.append("")
        out.append("(57) 摘要")
        out.extend(p.abstract_lines)
        out.append("")

        # Claims: the block is emitted unchanged. The heading was previously
        # misspelled "权力要求书"; "权利要求书" is the correct document name.
        out.append("# 权利要求书")
        out.extend(p.claim_lines)
        out.append("")

        # Description with its four sub-sections.
        out.append("# 说明书")

        out.append("标题：")
        if p.title_text:
            out.append(f"   {p.title_text}")
        out.append("")

        out.append("背景技术：")
        out.extend(p.sections.get("背景技术", []))
        out.append("")

        # Either "发明内容" or "实用新型内容".
        content_title, content_lines = self._pick_content_block(p)
        out.append(f"{content_title}：")
        out.extend(content_lines)
        out.append("")

        out.append("具体实施方式：")
        impl = []
        for k in SEC_IMPL:
            if k in p.sections:
                impl = p.sections[k]
                break
        out.extend(impl)
        out.append("")

        return "\n".join(out)

    # ---------------- helpers ----------------
    def _pick_content_block(self, p: Pieces) -> Tuple[str, List[str]]:
        """Return ``(title, lines)`` of the content section.

        "发明内容" is preferred, then "实用新型内容". When neither exists an
        empty block is returned, titled "发明内容" for an invention and
        "实用新型内容" for every other patent type.
        """
        if "发明内容" in p.sections:
            return "发明内容", p.sections["发明内容"]
        if "实用新型内容" in p.sections:
            return "实用新型内容", p.sections["实用新型内容"]
        return ("发明内容" if p.patent_type == "invention" else "实用新型内容"), []

    @staticmethod
    def _strip_leading_hash(s: str) -> str:
        """Remove a leading single-'#' marker ("# "); other lines are only right-stripped."""
        s2 = s.lstrip()
        if s2.startswith("# "):
            return s2[2:].strip()
        return s.rstrip()

    @staticmethod
    def _trim(lines: List[str]) -> List[str]:
        """Drop leading and trailing blank lines and right-strip the rest."""
        i, j = 0, len(lines)
        while i < j and not lines[i].strip():
            i += 1
        while j > i and not lines[j - 1].strip():
            j -= 1
        return [ln.rstrip() for ln in lines[i:j]]

    @staticmethod
    def _trim_trailing_blank(lines: List[str]) -> List[str]:
        """Drop trailing blank lines only (inner ones stay) and right-strip each line."""
        j = len(lines)
        while j > 0 and not lines[j - 1].strip():
            j -= 1
        return [ln.rstrip() for ln in lines[:j]]

    @staticmethod
    def _title_clean(t: str) -> str:
        """Collapse runs of whitespace to one space and strip the ends."""
        return re.sub(r"\s+", " ", t).strip()

    def _extract_title(self, lines: List[str], sections: Dict[str, List[str]]) -> Optional[str]:
        """Pick the patent title, trying three sources in order.

        1. The first non-empty, non-heading line after a "(54) …" name line
           found within the first 200 lines.
        2. The first single-'#' heading that is not a known section title and
           does not start with "(".
        3. Any key of *sections* that is not a known section title.
        Returns ``None`` when all three fail.
        """
        idx54 = None
        for i, ln in enumerate(lines[:200]):
            if NAME_54_RE.match(ln):
                idx54 = i
                break
        if idx54 is not None:
            j = idx54 + 1
            while j < len(lines):
                cand = lines[j].strip()
                if cand and (not ANY_H_RE.match(cand)):
                    return self._title_clean(cand)
                j += 1

        known = PLAIN_HEADINGS
        for ln in lines:
            m = H1_RE.match(ln)
            if m:
                t = m.group("title").strip()
                if t not in known and not t.startswith("("):
                    return self._title_clean(t)

        for k in sections.keys():
            if k not in known:
                return self._title_clean(k)
        return None

    def _looks_body_heading_or_plain(self, line: str) -> bool:
        """True for a known section title, written as a '#' heading or as a bare line."""
        s = line.strip()
        m = ANY_H_RE.match(s)
        if m:
            t = m.group("title").strip()
            return t in (PLAIN_HEADINGS)
        # Bare heading: the whole line equals one of the known titles.
        return s in PLAIN_HEADINGS

    def _extract_body_sections(self, body_lines: List[str]) -> Dict[str, List[str]]:
        """
        Cut *body_lines* (what remains after abstract and claims) at every
        known section title, written as a '#' heading or as a bare line, and
        return ``{title: trimmed lines}``.

        Sections titled "附图说明" / "说明书附图" are skipped and empty
        sections are omitted. If a title occurs more than once the later
        section replaces the earlier one. When no known title is found at
        all, the whole body is returned under "具体实施方式".
        """
        secs: Dict[str, List[str]] = {}
        if not body_lines:
            return secs

        # Start line of every block: (title, index of the heading line).
        indices: List[Tuple[str, int]] = []
        for i, ln in enumerate(body_lines):
            s = ln.strip()
            m = ANY_H_RE.match(s)
            if m:
                title = m.group("title").strip()
                if title in PLAIN_HEADINGS:
                    indices.append((title, i))
            elif s in PLAIN_HEADINGS:
                indices.append((s, i))

        # No heading at all: treat everything as the embodiment section.
        if not indices:
            content = self._trim([x.rstrip() for x in body_lines])
            if content:
                secs["具体实施方式"] = content
            return secs

        # Sentinel marking the end of the last block.
        indices.append(("#END#", len(body_lines)))

        for (title, s), (_, e) in zip(indices, indices[1:]):
            if title in SEC_FIGS:
                continue  # drawings sections are left out
            content = self._trim([ln.rstrip() for ln in body_lines[s + 1:e]])
            if content:
                secs[title] = content

        return secs
    
# ---- 示例运行（可注释）----
if __name__ == "__main__":
    from tqdm import tqdm

    # Batch demo: restructure every full_split.md below <cwd>/../.log/SimplePDF.
    root_dir = Path.cwd().parent / "./.log" / "SimplePDF"
    tft_mds = list(Path(root_dir).rglob("full_split.md"))
    for md in tqdm(tft_mds):
        demo = str(md)
        restructure = PatentMdStruct(demo)
        outp = restructure()
        print("写入：", outp)


In [None]:
# 优化一下 figs.json

import re
import base64
from pathlib import Path
from typing import Any, Dict, Optional
import json 

def build_figs_repo(
    figs: Dict[str, Any],
    include_b64: bool = True,          # encode images as base64
    include_path: bool = True,         # keep local paths in the result
    safe_read: bool = True,            # check that a file exists before using it
    max_b64_mb: float = 5.0,           # images above this size (MiB) are not encoded
    data_uri: bool = False,            # prefix base64 with a data URI header
) -> Dict[str, Any]:
    """
    Normalise the raw ``figs_MetaDict.json`` content into:

    {
        "im_abs": [Path|None, str] | None,  # abstract figure: [path or None, base64 or ""]; None if absent
        "ims_desc": {int: str},             # figure number -> description without the "图n为/是/：" prefix
        "ims_absp": {int: Path},            # figure number -> path (filled only with include_path=True)
        "ims_bs64": {int: str},             # figure number -> base64 (filled only with include_b64=True)
        "ims_annos": str,                   # reference-sign notes, lightly normalised
    }

    Descriptions are merged from ``lines_ims`` and from the first item of
    every ``im_<n>`` entry; for the same figure number the longer text wins
    (the later one on a tie). With ``safe_read`` a path or base64 value is
    only produced for files that exist.

    Note: the path values are ``pathlib.Path`` objects, so the result cannot
    be passed to ``json.dump`` as is when ``include_path`` is True.
    """
    # -------------------- helpers --------------------
    def _to_halfwidth_digits(s: str) -> str:
        # Full-width digits sit 65248 code points above the ASCII digits.
        # ``chr`` is required here: the former ``str(ord(c) - 65248)`` turned
        # "１" into the text "49" (the code point) instead of "1".
        return re.sub(r'[０-９]', lambda m: chr(ord(m.group(0)) - 65248), s)

    def _parse_combo_descs(text: str) -> Dict[int, str]:
        """Parse one or several figures from a line: '图10…图11…' -> {10: '…', 11: '…'}."""
        if not text:
            return {}
        norm = _to_halfwidth_digits(text)
        hits = list(re.finditer(r'图\s*([0-9]{1,3})', norm))
        out: Dict[int, str] = {}
        for i, m in enumerate(hits):
            n = int(m.group(1))
            # Each figure owns the text up to the next "图<digits>" hit.
            end = hits[i + 1].start() if i + 1 < len(hits) else len(norm)
            chunk = norm[m.start():end].strip()
            prefix = rf'^图\s*{n}\s*(?:[:：]?\s*)?(?:为|是)?\s*'
            out[n] = re.sub(prefix, '', chunk).strip(' ：:，,；;.。 \n\t')
        return out

    def _pick_better(a: Optional[str], b: Optional[str]) -> str:
        """Return the longer of two descriptions; *b* wins a tie."""
        a = (a or '').strip()
        b = (b or '').strip()
        if not a:
            return b
        if not b:
            return a
        return b if len(b) >= len(a) else a

    def _mime_from_suffix(p: Path) -> str:
        ext = p.suffix.lower()
        if ext in {".jpg", ".jpeg"}:
            return "image/jpeg"
        if ext in {".png"}:
            return "image/png"
        if ext in {".gif"}:
            return "image/gif"
        if ext in {".webp"}:
            return "image/webp"
        return "application/octet-stream"

    def _file_to_b64(p: Path) -> str:
        """Best-effort encoding: "" for oversized or unreadable files."""
        try:
            if max_b64_mb is not None:
                if p.exists() and p.is_file() and p.stat().st_size > max_b64_mb * 1024 * 1024:
                    return ""
            with open(p, "rb") as f:
                raw = base64.b64encode(f.read()).decode("utf-8")
            return f"data:{_mime_from_suffix(p)};base64,{raw}" if data_uri else raw
        except Exception:
            return ""

    def _beautify_annos_str(s: str) -> str:
        if not s:
            return ""
        s = s.replace("\n", " ").replace("\r", " ")
        s = re.sub(r"\s+", " ", s)
        s = s.replace(":", "：").replace(";", "；").replace(",", "，")
        return s.strip(" ；；。.")

    # -------------------- result container --------------------
    repo: Dict[str, Any] = {
        "im_abs": None,
        "ims_desc": {},
        "ims_absp": {},
        "ims_bs64": {},
        "ims_annos": "",
    }

    # -------------------- abstract figure -> [Path|None, b64|""] --------------------
    im_abs_val = figs.get("im_abs")
    if isinstance(im_abs_val, list) and len(im_abs_val) >= 2:
        abs_path = Path(im_abs_val[1])
        path_ok = (not safe_read) or abs_path.exists()
        abs_b64 = _file_to_b64(abs_path) if (include_b64 and path_ok) else ""
        repo["im_abs"] = [abs_path if (include_path and path_ok) else None, abs_b64]

    # -------------------- paths (optional) --------------------
    im_key_pat = re.compile(r"^im_(\d+)$")
    for k, v in figs.items():
        m = im_key_pat.match(k)
        if not m or not (isinstance(v, list) and len(v) >= 2):
            continue
        n = int(m.group(1))
        p = Path(v[1])
        if include_path and ((not safe_read) or p.exists()):
            repo["ims_absp"][n] = p

    # -------------------- descriptions: lines_ims first, then im_n titles --------------------
    desc_map: Dict[int, str] = {}
    for line in figs.get("lines_ims", []) or []:
        for n, desc in _parse_combo_descs(line).items():
            desc_map[n] = _pick_better(desc_map.get(n), desc)

    for k, v in figs.items():
        m = im_key_pat.match(k)
        if m and isinstance(v, list) and v:
            for n, desc in _parse_combo_descs(str(v[0])).items():
                desc_map[n] = _pick_better(desc_map.get(n), desc)

    # Every im_<n> key gets an entry, even without a description.
    known_ns = {int(m.group(1)) for m in map(im_key_pat.match, figs) if m}
    for n in known_ns:
        desc_map.setdefault(n, "")

    repo["ims_desc"] = dict(sorted(desc_map.items()))

    # -------------------- base64 (optional) --------------------
    if include_b64:
        # Read from the source paths in figs, independent of include_path.
        for k, v in figs.items():
            m = im_key_pat.match(k)
            if not m or not (isinstance(v, list) and len(v) >= 2):
                continue
            n = int(m.group(1))
            p = Path(v[1])
            if (not safe_read) or p.exists():
                repo["ims_bs64"][n] = _file_to_b64(p)

    # -------------------- reference-sign notes --------------------
    repo["ims_annos"] = _beautify_annos_str(figs.get("annos_ims", ""))

    return repo

    

# Demo: normalise the first figs_MetaDict.json found below the log directory.
# The path is built from components so it also works on POSIX; the former
# raw string r"..\.log\SimplePDF" only resolves on Windows.
root_dir = Path("..") / ".log" / "SimplePDF"

tgt_figs = next(Path(root_dir).rglob("figs_MetaDict.json"), None)
if tgt_figs is None:
    # Fail with a clear message instead of a TypeError from open(None).
    raise FileNotFoundError(f"no figs_MetaDict.json found under {root_dir}")

# Sibling output path <dir>/figs.json (same value the former slice produced).
new_figs = str(tgt_figs.with_name("figs.json"))

with open(tgt_figs, "r", encoding='utf-8') as fig:
    figs = json.load(fig)

print(build_figs_repo(figs))

  

    
