# Enrichr

> Fix and clean markdown headings, and enrich documents with figure descriptions, ...

This module aims to fix and enrich markdown headings from OCR'd PDF files by:

1. Fixing heading hierarchy that was corrupted during OCR
2. Adding page numbers to headings for better navigation
3. Enriching figure references with descriptive text and creating a table of figures

In [None]:
#| default_exp enrichr

In [None]:
#| export
from pathlib import Path
import os
import re
from dotenv import load_dotenv
from fastcore.all import *
import dspy
from pydantic import BaseModel
from typing import List

In [None]:
#| export
# Load variables from a `.env` file (if one is found) into the process environment,
# then read the Gemini key. `GEMINI_API_KEY` is None when the variable is not set.
load_dotenv()
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')

In [None]:
#| exports
# Module-wide settings, read through attribute access (e.g. `cfg.lm`).
cfg = AttrDict({
    'fixed_suffix': '_fixed',  # appended to a page's stem to name its corrected copy
    'lm': 'gemini/gemini-2.0-flash-exp',  # model identifier passed to `dspy.LM`
    'api_key': GEMINI_API_KEY,  # None when the environment variable is not set
    'max_tokens': 8192,  # passed as `max_tokens` to `dspy.LM` inside `fix_md`
    'track_usage': False  # default value for dspy's `track_usage` setting
})

In [None]:
#| export
# NOTE(review): hard-coded, notebook-relative path in an exported cell — confirm it should ship with the module.
src_dir = Path("../_data/md_library/49d2fba781b6a7c0d94577479636ee6f")

## Fixing Markdown Headings

In [None]:
#| eval: false
# Pick one document folder, skip already-corrected copies (stem contains `cfg.fixed_suffix`),
# and order the remaining pages by the number in their `page_<n>` stem.
doc = src_dir / 'abridged_evaluation_report_final_olta_ndoja_pdf'
pages = [p for p in doc.ls(file_exts=".md") if cfg.fixed_suffix not in p.stem] 
pages = L(pages).sorted(key=lambda p: int(p.stem.split('_')[1])); pages

(#31) [Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_1.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_2.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_3.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_4.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_5.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_6.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_7.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_8.md'),Path('../_data/md_library/49d2fba781b6a7c

In [None]:
#| exports
def get_hdgs(md_txt):
    "Return every line of `md_txt` that starts with one or more `#` characters"
    hdg_pattern = re.compile(r'^#+.*$', re.MULTILINE)
    return hdg_pattern.findall(md_txt)

In [None]:
#| exports
def get_hdgs_with_pages(
    pages: list[Path] # List of pages
    ):
    "Collect each heading as a `{'heading', 'page'}` dict; `page` is the 1-based position of the file in `pages`"
    return [
        {'heading': hdg, 'page': page_nb}
        for page_nb, page in enumerate(pages, start=1)
        for hdg in get_hdgs(page.read_text())
    ]

In [None]:
#| eval: false
# Build the heading/page list for the sample document and preview the first five entries.
hdgs = get_hdgs_with_pages(pages); hdgs[:5]

[{'heading': '# **PPMi**', 'page': 1},
 {'heading': '# CONTENTS ', 'page': 3},
 {'heading': '# 1. Introduction ', 'page': 4},
 {'heading': '# 2. Background of the JI-HoA ', 'page': 5},
 {'heading': '### 2.1. Context and design of the JI-HoA', 'page': 5}]

In [None]:
#| eval: false
# Same headings as a single flat list of strings, without page numbers.
toc = L([get_hdgs(p.read_text()) for p in pages]).concat(); toc[:5]

(#5) ['# **PPMi**','# CONTENTS ','# 1. Introduction ','# 2. Background of the JI-HoA ','### 2.1. Context and design of the JI-HoA']

In [None]:
#| exports
def format_hdgs(
    hdgs: list[dict] # List of headings with page numbers
    ):
    "Render one `<heading> (Page p, Position k)` line per heading, where k counts headings seen so far on page p"
    seen_on_page = {}

    def _render(entry):
        page_nb = entry['page']
        # Position is the 1-based rank of this heading among those of the same page
        rank = seen_on_page.get(page_nb, 0) + 1
        seen_on_page[page_nb] = rank
        return f"{entry['heading']} (Page {page_nb}, Position {rank})"

    return "\n".join([_render(entry) for entry in hdgs])

In [None]:
#| eval: false
# Preview the first 500 characters of the text that is sent to the LM.
print(format_hdgs(hdgs)[:500])

# **PPMi** (Page 1, Position 1)
# CONTENTS  (Page 3, Position 1)
# 1. Introduction  (Page 4, Position 1)
# 2. Background of the JI-HoA  (Page 5, Position 1)
### 2.1. Context and design of the JI-HoA (Page 5, Position 2)
# 2.2. External factors affecting the implementation of the JI  (Page 7, Position 1)
# 3. Methodology  (Page 8, Position 1)
# 4. Findings  (Page 10, Position 1)
### 4.1. Relevance (Page 10, Position 2)
### 4.1.1. Relevance of programme activities for migrants, returnees, and comm


In [None]:
#| exports
# Configure dspy's global LM and usage tracking when this cell/module is executed.
# NOTE(review): unlike `fix_md`, this LM is built without `max_tokens=cfg.max_tokens`;
# `fix_md` reconfigures dspy on every call, so this setup only matters for code that
# uses dspy before calling it.
lm = dspy.LM(cfg.lm, api_key=cfg.api_key)
dspy.configure(lm=lm)
dspy.settings.configure(track_usage=cfg.track_usage)

In [None]:
#| exports
# One corrected heading, as returned in `FixHeadingHierarchy.results`.
# Documented with comments only: a class docstring would presumably be exposed in the
# model's schema and could change what is sent to the LM — TODO confirm.
class HeadingResult(BaseModel):
    old: str  # heading as found in the page; compared (stripped) against page lines in `apply_corrections_to_page`
    page: int  # 1-based page number; used as grouping key and as `pages_list[page - 1]`
    position: int  # presumably the per-page "Position k" emitted by `format_hdgs` — verify against LM output
    new: str  # heading text written in place of `old`
    changed: bool  # True if correction was made

In [None]:
#| exports
# dspy signature for the heading-repair task: one text input, a list of `HeadingResult` out.
# NOTE(review): DSPy uses a signature's docstring as the task instructions given to the LM
# (see the DSPy Signature docs), so editing the docstring or the `desc` strings below
# changes model behavior, not just documentation.
class FixHeadingHierarchy(dspy.Signature):
    """Fix markdown heading hierarchy by analyzing the document's numbering patterns:
    - Detect numbering scheme (1.2.3, I.A.1, A.1.a, etc.)
    - Apply hierarchy levels based on nesting depth: # for top level, ## for second level, ### for third level
    - When a section number is lower than a previously seen number at the same level (e.g., seeing '2.' after '3.1'), it's likely a subsection or list item, not a main section
    - Unnumbered headings: keep as-is if at document boundaries, treat as subsections if within numbered sections
    - Return ALL headings with their corrected form
    """
    
    # Receives the newline-joined output of `format_hdgs` (see `fix_md`).
    headings_with_pages: str = dspy.InputField(desc="List of headings with page numbers")
    # Consumed by `group_corrections_by_page` / `apply_all_corrections`.
    results: List[HeadingResult] = dspy.OutputField(desc="All headings with corrections and change status")

In [None]:
#| exports
def fix_md(
    hdgs: list[dict], # List of headings with page numbers
    track_usage: bool=cfg.track_usage, # Forwarded to `dspy.settings.configure(track_usage=...)`
    ):
    "Ask the LM to correct the heading hierarchy and return the dspy prediction"
    # dspy is (re)configured globally on every call so `max_tokens` and `track_usage` apply
    dspy.configure(lm=dspy.LM(cfg.lm, api_key=cfg.api_key, max_tokens=cfg.max_tokens))
    dspy.settings.configure(track_usage=track_usage)

    prompt = format_hdgs(hdgs)
    return dspy.ChainOfThought(FixHeadingHierarchy)(headings_with_pages=prompt)

In [None]:
#| eval: false
# Run the LM once on the sample headings with usage tracking enabled,
# then show the prediction and the usage dspy recorded for it.
result = fix_md(hdgs, track_usage=True)
print("Result:", result)
print("Usage:", result.get_lm_usage())

Result: Prediction(
    reasoning='The provided headings have inconsistent hierarchy. I will correct the hierarchy based on the numbering and content. The numbering scheme appears to be a mix of numbered sections (1, 2, 3, etc.) and subsections (2.1, 2.1.1, etc.). I will use this numbering to determine the appropriate heading level. Unnumbered headings within numbered sections will be treated as subsections. Headings that appear to be list items (e.g., "3. Increase attention...") will be treated as top-level headings.',
    results=[HeadingResult(old='# **PPMi**', page=1, position=1, new='# **PPMi**', changed=False), HeadingResult(old='# CONTENTS', page=3, position=1, new='# CONTENTS', changed=False), HeadingResult(old='# 1. Introduction', page=4, position=1, new='# 1. Introduction', changed=False), HeadingResult(old='# 2. Background of the JI-HoA', page=5, position=1, new='# 2. Background of the JI-HoA', changed=False), HeadingResult(old='### 2.1. Context and design of the JI-HoA', pa

In [None]:
#| exports
def group_corrections_by_page(
    results: list[HeadingResult], # List of headings with corrections and change status
    ):
    "Bucket corrections by their `page` attribute; keys appear in first-seen order and each bucket keeps input order"
    by_page = {}
    for entry in results:
        by_page.setdefault(entry.page, []).append(entry)
    return by_page

In [None]:
#| eval: false
# Inspect the LM corrections bucketed by page number.
group_corrections_by_page(result.results)

{1: [HeadingResult(old='# **PPMi**', page=1, position=1, new='# **PPMi**', changed=False)],
 3: [HeadingResult(old='# CONTENTS', page=3, position=1, new='# CONTENTS', changed=False)],
 4: [HeadingResult(old='# 1. Introduction', page=4, position=1, new='# 1. Introduction', changed=False)],
 5: [HeadingResult(old='# 2. Background of the JI-HoA', page=5, position=1, new='# 2. Background of the JI-HoA', changed=False),
  HeadingResult(old='### 2.1. Context and design of the JI-HoA', page=5, position=2, new='## 2.1. Context and design of the JI-HoA', changed=False)],
 7: [HeadingResult(old='# 2.2. External factors affecting the implementation of the JI', page=7, position=1, new='## 2.2. External factors affecting the implementation of the JI', changed=True)],
 8: [HeadingResult(old='# 3. Methodology', page=8, position=1, new='# 3. Methodology', changed=False)],
 10: [HeadingResult(old='# 4. Findings', page=10, position=1, new='# 4. Findings', changed=False),
  HeadingResult(old='### 4.1. Re

In [None]:
#| exports
def apply_corrections_to_page(
    page_nb, # Page number
    corrections, # List of corrections
    pages_list, # List of pages
    suffix=cfg.fixed_suffix, # Suffix for the new file
    ):
    """Write a corrected copy of page `page_nb` with its headings replaced.

    Each line of the page whose stripped text equals a correction's stripped `old`
    is replaced by `"<new> .... page <page_nb>"`. A correction is used at most once,
    in list order, so repeated headings map one-to-one. The result is written next
    to the original as `<stem><suffix>.md`; the original file is left untouched.

    Raises `IndexError` when `page_nb` is not in `1..len(pages_list)`.
    """
    # Page numbers come from LM output: reject anything outside the valid range
    # explicitly, otherwise 0 or a negative number would silently index from the
    # end of the list and rewrite the wrong page.
    if not 1 <= page_nb <= len(pages_list):
        raise IndexError(f"page {page_nb} is out of range (1..{len(pages_list)})")
    page_file = pages_list[page_nb - 1]
    lines = page_file.read_text().splitlines()
    # Keep only this page's corrections (items without a `page` attribute are kept),
    # so passing the full result list cannot apply another page's heading here.
    # This also gives a private list that is safe to shrink below.
    pending = [c for c in corrections if getattr(c, 'page', page_nb) == page_nb]

    for i, line in enumerate(lines):
        stripped = line.strip()
        for j, correction in enumerate(pending):
            if stripped == correction.old.strip():
                lines[i] = f"{correction.new} .... page {page_nb}"
                del pending[j]  # consume it: one correction fixes one line
                break

    new_file = page_file.with_stem(f"{page_file.stem}{suffix}")
    new_file.write_text('\n'.join(lines))

In [None]:
#| eval: false
# Pass only page 5's corrections: the function takes the per-page list, not every result.
apply_corrections_to_page(5, group_corrections_by_page(result.results).get(5, []), pages)

In [None]:
#| exports
def apply_all_corrections(
    results, # List of headings with corrections and change status
    pages_list, # List of pages
    ):
    "Write a corrected copy of every page that has at least one entry in `results`"
    by_page = group_corrections_by_page(results)
    for page_nb in by_page:
        apply_corrections_to_page(page_nb, by_page[page_nb], pages_list)

In [None]:
#| eval: false
# Write the corrected copy of every page the LM returned headings for.
apply_all_corrections(result.results, pages)

In [None]:
#| exports
def fix_doc_hdgs(
    src_dir, # Path to the folder containing the document
    force=False, # Whether to overwrite the existing files
    ):
    """Fix the headings of every page in a document folder.

    If corrected copies (`*<cfg.fixed_suffix>.md`) already exist, nothing is done
    unless `force=True`, in which case they are deleted and regenerated. Pages are
    the remaining `.md` files, ordered by the number in their `page_<n>` stem.
    Returns None; the corrected copies are written next to the originals.
    """
    folder = Path(src_dir)
    fixed_files = list(folder.glob(f"*{cfg.fixed_suffix}.md"))
    if fixed_files and not force:
        print(f"Found {len(fixed_files)} {cfg.fixed_suffix} files. Use force=True to overwrite.")
        return
    # Only reached with `force=True` or when there is nothing to delete.
    # `delete` is not a pathlib method; presumably one of fastcore's `Path` patches.
    for f in fixed_files: f.delete()
    # Never treat a suffixed copy as a source page: it would be counted as an extra
    # page and shift the page numbers that index into `pages`.
    pages = [p for p in folder.ls(file_exts=".md") if cfg.fixed_suffix not in p.stem]
    pages = L(pages).sorted(key=lambda p: int(p.stem.split('_')[1]))
    result = fix_md(get_hdgs_with_pages(pages))
    apply_all_corrections(result.results, pages)

In [None]:
#| eval: false
# End-to-end run on the sample document; `force=True` regenerates existing corrected copies.
print(doc)
fix_doc_hdgs(doc, force=True)

../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf


## Enrich with figures description