# Map references to specific pages

In [None]:
from collections import defaultdict
from typing import List, Set, Dict
from llamore.reference import Reference
import pymupdf


def search_for(term: str, doc: "pymupdf.Document") -> Set[int]:
    """
    Collect the 1-based numbers of all pages in the document that contain the term.

    A page counts as a hit when its own ``search_for`` returns a non-empty result.
    The page numbers are returned as a set (empty if the term occurs nowhere).
    """
    return {
        page_number
        for page_number, page in enumerate(doc, start=1)
        if page.search_for(term)
    }


def search_for_title(title: str, doc: "pymupdf.Document") -> Set[int]:
    """
    Locate a title in the document, falling back to progressively smaller fragments.

    The complete title is tried first, then its two halves, then its four
    quarters. Within one stage the pages matching any of the fragments are
    merged; the first stage that yields at least one page determines the result.

    Returns the set of page numbers reported by `search_for`.
    Raises a ValueError when none of the stages produces a match.
    """
    end = len(title)
    half = end // 2
    quarter = end // 4

    # Each tuple lists the cut points of one stage: whole title, halves, quarters.
    for cuts in ((0, end), (0, half, end), (0, quarter, half, half + quarter, end)):
        hits = set()
        for start, stop in zip(cuts, cuts[1:]):
            hits.update(search_for(title[start:stop], doc))
        if hits:
            return hits

    raise ValueError(f"Title not found in the document: {title}")


def map_refs_to_pages(
    refs: List[Reference], doc: "pymupdf.Document"
) -> Dict[int, List[Reference]]:
    """
    Assign every reference to the single page of the document it is found on.

    For each reference the candidate pages start out as all pages of the
    document and are narrowed down step by step, stopping as soon as only one
    candidate is left:

    1. pages matching the analytic title (via `search_for_title`),
    2. pages matching the journal title,
    3. pages matching each author's surname (or first name if there is no surname).

    Returns a mapping from 1-based page number to the references found on that
    page. Every page of the document is present as a key; pages without any
    reference map to an empty list.

    Raises a ValueError if a reference cannot be located on any page, if it
    remains ambiguous between several pages, or if `search_for_title` fails to
    find the analytic title.
    """
    all_pages = set(range(1, len(doc) + 1))
    mapping = defaultdict(list)
    for ref in refs:
        pages = set(all_pages)
        if ref.analytic_title:
            pages &= search_for_title(ref.analytic_title, doc)

        if len(pages) > 1 and ref.journal_title:
            # A single search is enough: the result used to be unioned with an
            # identical second search, which only doubled the document scan.
            pages &= search_for(ref.journal_title, doc)

        if len(pages) > 1 and ref.authors:
            for person in ref.authors:
                if person.surname:
                    pages &= search_for(person.surname, doc)
                elif person.first_name:
                    pages &= search_for(person.first_name, doc)

        if not pages:
            raise ValueError(f"Reference not found in the document: {ref}")
        if len(pages) > 1:
            raise ValueError(f"Reference found on multiple pages: {pages}")

        # Exactly one candidate is left at this point.
        mapping[next(iter(pages))].append(ref)

    # Make pages without references explicit so every page appears as a key.
    for page in all_pages - set(mapping):
        mapping[page] = []

    return mapping

In [30]:
from pathlib import Path
from rich.progress import track
from lxml import etree
from llamore import TeiBiblStruct
from multiprocessing import Pool


def folder_to_mapping(folder: Path) -> Dict[Path, Dict[int, List[Reference]]]:
    """
    Build the page-to-references mapping for the document stored in a folder.

    The folder must contain exactly one ``*pub2tei.tei.xml`` file and exactly
    one file matching ``*pdf``. The ``biblStruct`` elements of the TEI file are
    parsed into references, which are then located in the PDF with
    `map_refs_to_pages`.

    Returns ``{folder: mapping}`` on success. Returns an empty dict (so the
    folder is skipped when results are merged) if the expected files are
    missing or not unique, if no reference could be parsed, or if
    `map_refs_to_pages` raises a ValueError.
    """
    parser = TeiBiblStruct()
    pub2tei_path = list(folder.glob("*pub2tei.tei.xml"))
    pdf_path = list(folder.glob("*pdf"))
    if len(pub2tei_path) != 1 or len(pdf_path) != 1:
        # Required files are missing or ambiguous: skip this folder.
        return {}

    tree = etree.parse(pub2tei_path[0])
    elements = tree.findall(
        ".//biblStruct", namespaces={None: "http://www.tei-c.org/ns/1.0"}
    )
    if len(elements) == 0:
        # The TEI file contains no bibliography entries.
        return {}

    refs = []
    for el in elements:
        # Only the first reference parsed from each element is kept.
        parsed = parser.to_references(el, raise_empty_error=False)
        if parsed:
            refs.append(parsed[0])

    if len(refs) == 0:
        # None of the elements could be parsed into a reference.
        return {}

    # Open the PDF as a context manager so it is closed again in every case;
    # previously the document was left open after the function returned.
    with pymupdf.open(pdf_path[0]) as doc:
        try:
            folder_mapping = map_refs_to_pages(refs, doc)
        except ValueError:
            # A reference could not be mapped to exactly one page: skip the folder.
            return {}

    return {folder: folder_mapping}


# Every sub-directory of the data path is processed as one input folder.
data_path = Path("/home/david/mpcdf/mplhlt/cupido/data/PLOS_1000/")
folders = [entry for entry in data_path.glob("*") if entry.is_dir()]

# Distribute the folders over 12 worker processes and gather the per-folder
# results (in arbitrary order) while showing a progress bar.
with Pool(12) as p:
    results = list(
        track(p.imap_unordered(folder_to_mapping, folders), total=len(folders))
    )



Output()

In [31]:
# The trailing semicolon evaluates the name without showing its value in the notebook.
results;

In [34]:
# Merge the per-folder result dicts into one mapping. Updating in place avoids
# re-copying the whole accumulated dict for every single folder.
mapping = {}
for d in results:
    mapping.update(d)

In [35]:
# Re-key the mapping by the last path component of each folder instead of the full path.
mapping_name = {folder.name: page_map for folder, page_map in mapping.items()}

In [36]:
# The trailing semicolon evaluates the name without showing its value in the notebook.
mapping_name;

In [None]:
from data import Data, Example

# One example per (file, page) pair; a page without references gets refs=None
# rather than an empty list.
examples = [
    Example(file=file_name, page=page_number, refs=page_refs or None)
    for file_name, page_map in mapping_name.items()
    for page_number, page_refs in page_map.items()
]

data = Data(examples=examples)

## Save/Load JSON data

In [None]:
from pathlib import Path
from data import Data

# Location of the serialized data set, relative to the current working directory.
output = Path("./data/data.json")
# Saving is disabled (commented out); uncomment the next line to write `data` to `output` as JSON.
#output.write_text(data.model_dump_json(exclude_defaults=True, indent=2))

In [10]:
# Rebuild `data` from the JSON text stored at `output` via `Data.model_validate_json`.
data = Data.model_validate_json(output.read_text())

In [11]:
# Number of examples contained in the loaded data set.
len(data.examples)

8391