# Running the 'data to document' step of the pdf2text pipeline

This notebook contains code which processes a folder of intermediate outputs (i.e. folders direct from Adobe Extract or files from pdfalto) into `Document` objects, and saves .json and .txt files to a specified folder.

It's left as a notebook for now, in order to enable experimentation with development of pipeline elements and not make premature decisions about implementation in the product pipeline.

In [2]:
import sys
sys.path.append("..")

from pathlib import Path
from typing import List

from tqdm.auto import tqdm

from extract.extract import DocumentEmbeddedTextExtractor, AdobeAPIExtractor
from extract.document import Document

In [13]:
INTERMEDIATE_FOLDER = Path("../../../data/pdf2text/intermediate-final/")
OUTPUT_FOLDER = Path("../../../data/pdf2text/output-final-220424/")

PDFALTO_PATH = Path("../../../misc/pdfalto/pdfalto")

In [7]:
embedded_extractor = DocumentEmbeddedTextExtractor(pdfalto_path=PDFALTO_PATH)
adobe_extractor = AdobeAPIExtractor(credentials_path=".")

In [8]:
# TODO: it could be useful to restructure the intermediate directory so each PDF parsed has its own folder, rather than many folders or XML files
# To do this we'd have to modify pdf2text, but this is probably better done after the merge.
# For now we identify folders and files belonging to each PDF using the method below

def group_intermediate_dir_by_pdf():
    """
    This assumes a flat structure for the intermediate dir, containing both Adobe and pdfalto outputs.
    It groups related files or folders in the directory by the stem of their PDF filename.
    
    E.g. directory structure: 
    ```
    - cclw-1055-bf17ca3b41b943fe83f0bd5c5ff36823_0
        - structuredData.json
    - cclw-1055-bf17ca3b41b943fe83f0bd5c5ff36823_1
        - structuredData.json
        - tables/
    - cclw-1055-bf17ca3b41b943fe83f0bd5c5ff36823_2
        - structuredData.json
    - cclw-8482-7a59b4bc5d7841cd9d8a0010215c97ec_metadata.xml
    - cclw-8482-7a59b4bc5d7841cd9d8a0010215c97ec_outline.xml
    - cclw-8482-7a59b4bc5d7841cd9d8a0010215c97ec.xml
    ```
    
    output:
    ```
    {
        "cclw-1055-bf17ca3b41b943fe83f0bd5c5ff36823": [
            "cclw-1055-bf17ca3b41b943fe83f0bd5c5ff36823_0",
            "cclw-1055-bf17ca3b41b943fe83f0bd5c5ff36823_1",
            "cclw-1055-bf17ca3b41b943fe83f0bd5c5ff36823_2",
        ],
        "cclw-8482-7a59b4bc5d7841cd9d8a0010215c97ec": [
            "cclw-8482-7a59b4bc5d7841cd9d8a0010215c97ec_metadata.xml",
            "cclw-8482-7a59b4bc5d7841cd9d8a0010215c97ec_outline.xml",
            "cclw-8482-7a59b4bc5d7841cd9d8a0010215c97ec.xml"
        ]
    }
    ```
    """
    pdf_intermediate_mapping = dict()

    pdf_stems = list(set([p.stem.split("_")[0] for p in INTERMEDIATE_FOLDER.iterdir()]))

    for pdf_stem in pdf_stems:
        pdf_intermediate_mapping[pdf_stem] = sorted([p for p in INTERMEDIATE_FOLDER.iterdir() if str(p.name).startswith(pdf_stem)])
        
    return pdf_intermediate_mapping

pdf_intermediate_mapping = group_intermediate_dir_by_pdf()

In [11]:
def parse_adobe_folders(folders: List[Path], pdf_filename: str) -> Document:
    """Parse list of adobe folders into one Document object."""
    pages = []
    # Folders are sorted here to ensure the correct order in parsing
    json_paths = [p / "structuredData.json" for p in sorted(folders)]
    curr_page_offset = 0
    
    for _path in json_paths:
        temp_doc = adobe_extractor.data_to_document(
            data_path=_path, 
            pdf_filename=pdf_filename,
            page_offset=curr_page_offset,
        )

        pages += temp_doc.pages
        if pages:
            curr_page_offset = pages[-1].page_id + 1
        
    return Document(
        pages=pages,
        filename=pdf_filename,
    )

# ----------------------------

pdf_document_objects = dict()

for pdf_stem, related_paths in tqdm(pdf_intermediate_mapping.items()):
    try:
        if all([p.is_dir() for p in related_paths]):
            document = parse_adobe_folders(related_paths, pdf_stem)
            pdf_document_objects[pdf_stem] = document

        elif valid_paths := [p for p in related_paths if p.name == f"{pdf_stem}.xml"]:
            # Finding the correctly named XML file could also mean that folders are 
            # present, but these are from Adobe failures
            if len(valid_paths) == 1:
                document = embedded_extractor.data_to_document(data_path=valid_paths[0], pdf_filename=pdf_stem)
                pdf_document_objects[pdf_stem] = document
            else:
                print(f"Too many paths for {pdf_stem}")
        else:
            # TODO: handle adobe split failures which have fallen back to embedded text extractor
            print(pdf_stem, "?")

    except Exception as e:
        print(f"Failed for {pdf_stem}: {e}")

 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                          | 678/940 [09:29<06:30,  1.49s/it]

.DS ?


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 940/940 [13:15<00:00,  1.18it/s]


In [12]:
# Check that all PDFs have produced a Document object - list should be empty
[k for k,v in pdf_document_objects.items() if not v]

[]

In [24]:
# Serialise results to JSON and txt
for pdf_stem, document in tqdm(pdf_document_objects.items()):
    document.save_json(OUTPUT_FOLDER / f"{pdf_stem}.json")
    document.save_text(OUTPUT_FOLDER / f"{pdf_stem}.txt")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 939/939 [01:39<00:00,  9.40it/s]
