# PDFTriage explorer

In [25]:
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import (
    ExtractPDFOptions,
)
from adobe.pdfservices.operation.pdfops.options.extractpdf.table_structure_type import (
    TableStructureType,
)
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import (
    ExtractElementType,
)

import os.path
import zipfile
import json

import dotenv

In [17]:
# Get environment variables
dotenv.load_dotenv()

client_id = os.getenv("ADOBE_CLIENT_ID")
client_secret = os.getenv("ADOBE_CLIENT_SECRET")

# Some constants
input_pdf = "UK_08.pdf"
input_pdf_dir = "/Users/dvdblk/Downloads/pdf_files_complete/"
input_pdf_path = os.path.join(input_pdf_dir, input_pdf)

out_dir = "../../data/processed/adobe-extract/"

In [None]:

def get_structured_data(input_file_path: str) -> dict:
    input_basename = os.path.basename(input_file_path)
    # Check if .json file exists
    json_file = input_basename + ".json"

    if os.path.isfile(os.path.join(out_dir, json_file)):
        print("File already exists, skipping download")

        # Open and read json file
        with open(os.path.join(out_dir, json_file)) as f:
            data = json.load(f)
        return data

    zip_file = os.path.join(out_dir, f"{input_basename}.zip")

    # Initial setup, create credentials instance.
    credentials = (
        Credentials.service_principal_credentials_builder()
        .with_client_id(client_id)
        .with_client_secret(client_secret)
        .build()
    )

    # Create an ExecutionContext using credentials and create a new operation instance.
    execution_context = ExecutionContext.create(credentials)
    extract_pdf_operation = ExtractPDFOperation.create_new()

    # Set operation input from a source file.
    source = FileRef.create_from_local_file(input_pdf_path)
    extract_pdf_operation.set_input(source)

    # Build ExtractPDF options and set them into the operation
    extract_pdf_options: ExtractPDFOptions = (
        ExtractPDFOptions.builder().with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]).with_table_structure_format(TableStructureType.CSV).build()
    )
    extract_pdf_operation.set_options(extract_pdf_options)

    # Execute the operation.
    result: FileRef = extract_pdf_operation.execute(execution_context)

    # Save the result to the specified location.
    result.save_as(zip_file)

    # Extract json file and save to out_dir
    archive = zipfile.ZipFile(zip_file, "r")
    jsonentry = archive.open("structuredData.json")
    jsondata = jsonentry.read()
    data = json.loads(jsondata)

    with open(os.path.join(out_dir, json_file), "w") as f:
        json.dump(data, f)

    # if os.path.isfile(zip_file):
    #     os.remove(zip_file)

    return data


In [18]:
result = get_structured_data(input_file_path=input_pdf_path)

In [19]:
# Print path, page, text and table file paths for each element in the result document
for element in result["elements"]:
    print("Path: " + element["Path"])
    if page := element.get("Page"):
        print("Page: " + str(element["Page"]))
    if text := element.get("Text"):
        print("Text: " + element["Text"])
    if file_paths := element.get("filePaths"):
        print("File paths: " + str(file_paths))
    print()

Path: //Document/Figure

Path: //Document/P
Text: THE UK  SKILLS SYSTEM 

Path: //Document/P[2]
Text: AN INTRODUCTION 

Path: //Document/Figure[2]

Path: //Document/H1
Page: 1
Text: THE UK SKILLS SYSTEM: 

Path: //Document/H1[2]
Page: 1
Text: AN INTRODUCTION 

Path: //Document/P[3]
Page: 1
Text: This document provides a summary of the UK’s Technical and Vocational Education and Training (TVET) system and how it provides the UK economy with highly skilled people. It contains the following sections: 

Path: //Document/L/LI/Lbl
Page: 1
Text: 1.

Path: //Document/L/LI/LBody
Page: 1
Text:  WHY SKILLS ARE IMPORTANT TO THE UK 

Path: //Document/L/LI[2]/Lbl
Page: 1
Text: 2.

Path: //Document/L/LI[2]/LBody
Page: 1
Text:  WHAT ARE TECHNICAL AND VOCATIONAL SKILLS? 

Path: //Document/L/LI[3]/Lbl
Page: 1
Text: 3.

Path: //Document/L/LI[3]/LBody
Page: 1
Text:  ABOUT THE UK SKILLS SYSTEM 

Path: //Document/L/LI[4]/Lbl
Page: 1
Text: 4.

Path: //Document/L/LI[4]/LBody
Page: 1
Text:  A FLEXIBLE, HIGH-QU

In [292]:
# data structures for the entire PDF that include structural metadata along with text
from typing import List, Optional, Set, Tuple, Union
from enum import StrEnum
import weakref

class TextOrigin(StrEnum):
    PARAGRAPH = "PARAGRAPH"
    ASIDE = "ASIDE"
    """Text that is usually next to a figure or table"""
    LIST = "LIST"
    TABLE = "TABLE"
    FIGURE = "FIGURE"

class Paragraph:

    def __init__(self, text: str, origin: TextOrigin) -> None:
        self.text = text
        self.origin = origin

class Section:

    def __init__(self, title: Optional[str] = None, pages: Set[int] = set(), section_type: Optional[str] = None, paragraphs: List[str] = [], subsections: List["Section"] = [], parent: Optional["Section"] = None) -> None:
        self.title = title
        self.pages = pages
        self.section_type = section_type
        self.paragraphs = paragraphs
        self.parent: Optional[weakref.ReferenceType[Section]] = weakref.ref(parent) if parent else None
        self.subsections: List[Section] = subsections

    def __repr__(self) -> str:
        return f"<Section ({self.section_type}) title={self.title} parent={self.parent}>"

class Document(Section):
    """A tree of Sections"""

    def __init__(self, title: str | None = None, pages: Set[int] = set(), paragraphs: List[str] = [], subsections: List[Section] = [], parent: Section | None = None) -> None:
        super().__init__(title, pages, "document", paragraphs, subsections, parent)

class SectionTree:
    """Represents the document"""

    def __init__(self, sections: List[Section] = []) -> None:
        self.sections = sections
        self.preface: List[Paragraph] = []

def structured_json_to_document(structured_json: dict) -> Document:
    # header_to_section_type = {
    #     "H1": "section",
    #     "H2": "subsection",
    #     "H3": "subsubsection",
    # }
    document = Document()

    curr_section = document
    section_to_insert_header = {}
    elements_iterator = iter(structured_json["elements"])
    elements_stack: List[Tuple[dict, str]] = []

    def get_next_elem() -> Tuple[dict, str]:
        if elements_stack:
            element, path = elements_stack.pop()
        else:
            element = next(elements_iterator)
            path = element.get("Path")
            if path is None:
                raise ValueError("Path is None")

            # Verify that xpath starts with //Document
            if not path.startswith("//Document"):
                raise ValueError(f"Path does not start with //Document: {path}")

            # Remove //Document
            path = path.replace("//Document", "", 1)

        return element, path

    def add_paragraph(paragraph: Paragraph, pages: Optional[Union[Set[int], int]]):
        curr_section.paragraphs.append(paragraph)
        if pages:
            if isinstance(pages, int):
                curr_section.pages.add(pages)
            elif isinstance(pages, set):
                curr_section.pages.update(pages)

    try:
        while True:
            element, path = get_next_elem()

            import re

            if path == "/Figure":
                # Figure
                if figure_text := element.get("Text"):
                    add_paragraph(Paragraph(figure_text, TextOrigin.FIGURE), pages=element.get("Page"))
            elif _ := re.match(r"^\/P(\[\d+\])?$", path):
                # Paragraph
                add_paragraph(Paragraph(element["Text"], TextOrigin.PARAGRAPH), pages=element.get("Page"))
            elif match := re.match(r"^\/(H\d)(\[\d+\])?$", path):
                # Header
                title = element["Text"]
                section_type = match.group(1)

                # Verify next element is a paragraph and not a followup header
                while True:
                    next_element, next_path = get_next_elem()
                    reinsert_back = True
                    if next_match := re.match(r"^\/(H\d)(\[\d+\])$", next_path):
                        if next_match.group(1) == section_type:
                            next_title = next_element["Text"]
                            if not title.endswith(" ") and not next_title.startswith(" "):
                                title += " "
                            title += next_title
                            reinsert_back = False

                    if reinsert_back:
                        # Next element is not a header, push it to the stack
                        elements_stack.append((next_element, next_path))
                        break

                #print(curr_section.section_type, curr_section.title)
                # Create new section
                new_section = Section(
                    title=title,
                    section_type=section_type,
                )

                # Check if new section is a subsection
                if curr_section.section_type == "document":
                    # New section is a top level (H1) section
                    # Add it to the document
                    curr_section.subsections.append(new_section)
                    # Set parent of new section to document
                    new_section.parent = curr_section
                    # Every new H1 has to be inserted into document subsections
                    section_to_insert_header[section_type] = curr_section
                else:
                    if section_type < curr_section.section_type:
                        # H2 -> H1 or H3 -> H2 or H3 -> H1
                        if parent_section := section_to_insert_header.get(section_type):
                            parent_section.subsections.append(new_section)
                            new_section.parent = parent_section
                        else:
                            # TODO: find closest parent of curr_section with section_type < new_section.section_type
                            raise ValueError("Could not find parent section")

                    elif section_type > curr_section.section_type:
                        # H1 -> H2 or H2 -> H3 or H1 -> H3
                        # curr_section is the parent
                        print(curr_section.section_type, curr_section.title, "--->", new_section.section_type, new_section.title)
                        curr_section.subsections.append(new_section)
                        new_section.parent = curr_section
                        section_to_insert_header[section_type] = curr_section
                    else:
                        #print(f"{curr_section} ||||||||||||| {new_section}")
                        # H1 = H1 or H2 = H2 or H3 = H3
                        # curr_section.parent is the parent as they are on the same level
                        curr_section.parent.subsections.append(new_section)
                        new_section.parent = curr_section.parent
                print(curr_section.section_type, curr_section.title, "--->", new_section.section_type, new_section.title)
                print("\t", new_section.title, new_section.subsections)
                print()
                curr_section = new_section
    except StopIteration:
        pass

    return document

In [293]:
document = structured_json_to_document(result)

document None ---> H1 THE UK SKILLS SYSTEM: AN INTRODUCTION 
	 THE UK SKILLS SYSTEM: AN INTRODUCTION  []

H1 THE UK SKILLS SYSTEM: AN INTRODUCTION  ---> H2 Confederation of British Industry, 
	 Confederation of British Industry,  [<Section (H2) title=Confederation of British Industry,  parent=<Section (H1) title=THE UK SKILLS SYSTEM: AN INTRODUCTION  parent=<Section (document) title=None parent=None>>>]

H2 Confederation of British Industry,  ---> H1 1. WHY SKILLS ARE IMPORTANT TO THE UK 
	 1. WHY SKILLS ARE IMPORTANT TO THE UK  [<Section (H2) title=Confederation of British Industry,  parent=<Section (H1) title=THE UK SKILLS SYSTEM: AN INTRODUCTION  parent=<Section (document) title=None parent=None>>>]

H1 1. WHY SKILLS ARE IMPORTANT TO THE UK  ---> H1 2. WHAT ARE TECHNICAL AND VOCATIONAL SKILLS? 
	 2. WHAT ARE TECHNICAL AND VOCATIONAL SKILLS?  [<Section (H2) title=Confederation of British Industry,  parent=<Section (H1) title=THE UK SKILLS SYSTEM: AN INTRODUCTION  parent=<Section (doc

In [285]:
document.subsections[0].subsections

[<Section (H2) title=STRUCTURE OF THE TVET SYSTEM >,
 <Section (H2) title=TVET: DIFFERENT APPROACHES ACROSS THE UK >,
 <Section (H2) title=HOW THE TVET SYSTEM IS FUNDED >,
 <Section (H2) title=EXCELLENT LEARNER EXPERIENCE >,
 <Section (H2) title=IMPORTANCE OF EMPLOYER ENGAGEMENT >,
 <Section (H3) title=Employer influence on institutions at local, regional, and national level through: >,
 <Section (H3) title=Employer involvement in delivery through: >,
 <Section (H2) title=WHAT QUALIFICATIONS DO LEARNERS STUDY IN THE UK? >,
 <Section (H2) title=FLEXIBLE PROGRESSION PATHWAYS >,
 <Section (H2) title=RIGOROUS QUALIFICATIONS >,
 <Section (H2) title=HOW THE UK’S TVET SYSTEM WILL CONTINUE TO DEVELOP >]

In [188]:
# Count number of sections per section type
counts = {}

def count_sections(section: Section):
    if section.section_type not in counts:
        counts[section.section_type] = 0
    counts[section.section_type] += 1
    for subsection in section.subsections:
        count_sections(subsection)

count_sections(document)
print(counts)

RecursionError: maximum recursion depth exceeded