In [1]:
!pip install pdfservices-sdk==1.0.1

You should consider upgrading via the '/Users/kalyan/Documents/CPR/pdf-corpus-analysis/.venv/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
# %load_ext autoreload
# %autoreload 2

import json
import re
from typing import List, Tuple, Set
from collections import defaultdict

from extract.document import TextBlock, Page, Document
from extract.extract import DocumentTextExtractor

In [3]:
# Read the saved JSON file and parse it into `data`, the dict used by the cells below.
with open("temp-adobe.json", mode="r") as f:
    data = json.loads(f.read())


In [4]:
# Show the structure path ("Path") of every entry in data['elements'], in file order.
[el["Path"] for el in data['elements']]

['//Document/Figure',
 '//Document/P',
 '//Document/Figure[2]',
 '//Document/Aside/P',
 '//Document/Aside/P[2]',
 '//Document/Aside/P[3]',
 '//Document/Aside/P[4]',
 '//Document/Aside/P[5]',
 '//Document/P[2]',
 '//Document/P[3]',
 '//Document/P[4]',
 '//Document/P[5]',
 '//Document/P[6]',
 '//Document/P[7]',
 '//Document/Aside[2]/P',
 '//Document/Aside[2]/P[2]',
 '//Document/Aside[2]/P[3]',
 '//Document/Aside[2]/P[4]',
 '//Document/Figure[3]',
 '//Document/Aside[3]/P',
 '//Document/Aside[3]/P[2]',
 '//Document/H1',
 '//Document/P[8]',
 '//Document/P[9]',
 '//Document/P[10]',
 '//Document/P[11]',
 '//Document/P[12]',
 '//Document/P[13]',
 '//Document/P[14]',
 '//Document/Figure[4]',
 '//Document/P[15]',
 '//Document/H1[2]',
 '//Document/P[16]',
 '//Document/P[17]',
 '//Document/H1[3]',
 '//Document/P[18]',
 '//Document/P[19]',
 '//Document/P[20]',
 '//Document/P[21]',
 '//Document/P[22]',
 '//Document/P[23]',
 '//Document/P[24]',
 '//Document/H1[4]',
 '//Document/Figure[5]',
 '//Docume

In [33]:
class AdobeAPIExtractor(DocumentTextExtractor):
    """Build a `Document` of pages and text blocks from Adobe API JSON output.

    The input dict is read as follows: ``data["elements"]`` is a list of dicts
    carrying at least ``Path`` and ``Page`` (and optionally ``Text``, ``Bounds``,
    ``CharBounds``, ``Rotation``, ``attributes`` and ``Kids``), and
    ``data["pages"]`` is a list of dicts with ``width`` and ``height``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Any element whose path contains one of these structure names is skipped.
        self._elements_exclude = [
            "Aside",
            "Figure",
            "Footnote",
            "Reference",
            "TOC",
            "Watermark",
            "Table",
        ]
        # Maximum clockwise or anti-clockwise rotation a text element can have, otherwise it's excluded from the parsing results.
        self._max_rotation_degrees = 20

    @staticmethod
    def _flatten_data(data: dict) -> dict:
        """Flatten out 'Kids' elements which refer to PDF structure.

        Every element that has a ``Kids`` list is replaced by its kids. Each kid
        starts from a copy of the parent's properties (minus ``Kids``) and then
        overrides them with its own, so page numbers and similar properties
        propagate downwards while kid-specific values (e.g. bounding boxes) win.

        Args:
            data: dict with an ``elements`` list; all other keys are copied as-is.

        Returns:
            A new dict with the same top-level keys and a flat ``elements`` list.
        """
        new_data = {k: v for k, v in data.items() if k != "elements"}
        new_data["elements"] = []

        for el in data["elements"]:
            if "Kids" in el:
                parent = {k: v for k, v in el.items() if k != "Kids"}
                for kid in el["Kids"]:
                    new_kid = parent.copy()
                    new_kid.update(kid)
                    # Append the merged dict: appending the raw `kid` would throw
                    # away every property inherited from the parent.
                    new_data["elements"].append(new_kid)
            else:
                new_data["elements"].append(el)

        return new_data

    @staticmethod
    def _get_lines(char_bounds) -> List[List[float]]:
        """Derive the vertical extent of each text line from character bounds.

        Args:
            char_bounds: iterable of per-character bounds; index 1 is used as the
                lower y value and index 3 as the upper y value.

        Returns:
            A list of ``[ymin, ymax]`` intervals sorted by ``ymin``, with
            overlapping (or touching) intervals merged. Empty if there are no
            character bounds.
        """
        # Get lines as ymin and ymax coordinates of each character bounds
        lines = [list(x) for x in {(i[1], i[3]) for i in char_bounds}]
        if not lines:
            return []
        lines.sort(key=lambda interval: interval[0])

        # Merge overlapping lines
        merged = [lines[0]]
        for current in lines[1:]:
            previous = merged[-1]
            if current[0] <= previous[1]:
                previous[1] = max(previous[1], current[1])
            else:
                merged.append(current)

        return merged

    @staticmethod
    def _get_line_number_of_char_bound(char_bound, lines):
        """Return the index of the single line interval that contains `char_bound`.

        Args:
            char_bound: character bounds; indices 1 and 3 are its y extent.
            lines: list of ``[ymin, ymax]`` intervals, e.g. from `_get_lines`.

        Raises:
            ValueError: if the character falls inside zero or several lines.
        """
        line_number_list = [
            idx
            for idx, line in enumerate(lines)
            if char_bound[1] >= line[0] and char_bound[3] <= line[1]
        ]

        if len(line_number_list) != 1:
            raise ValueError(
                f"Character bound {char_bound} matched {len(line_number_list)} lines; expected exactly 1."
            )

        return line_number_list[0]

    def _element_to_text_block(self, el: dict, block_id: str) -> TextBlock:
        """Convert one flattened element with text into a `TextBlock`.

        The element text is split into lines wherever consecutive characters
        fall into different line intervals (derived from ``CharBounds``). When
        the element has no character bounds, the text is kept as a single line.

        Args:
            el: element dict; must contain ``Text``, ``Path``, ``Bounds``, ``Page``.
            block_id: identifier to give the resulting text block.
        """
        text = el["Text"]
        char_bounds = el.get("CharBounds")

        if char_bounds:
            merged_lines = self._get_lines(char_bounds)
            line_of_char = [
                self._get_line_number_of_char_bound(char_bound, merged_lines)
                for char_bound in char_bounds
            ]
            # Character offsets at which the line changes, bracketed by the
            # start and end of the text.
            line_change_idxs = (
                [0]
                + [i for i in range(1, len(line_of_char)) if line_of_char[i] != line_of_char[i - 1]]
                + [len(text)]
            )
            text_by_line = [
                text[start:end]
                for start, end in zip(line_change_idxs, line_change_idxs[1:])
            ]
        else:
            # No per-character geometry available: cannot split into lines.
            text_by_line = [text]

        element_type = self._structure_path(el["Path"], remove_numbers=True)[-1]
        numbered_path = self._structure_path(el["Path"], remove_numbers=False)

        # Store custom attributes for StyleSpans which are nested under another element, e.g subscripts or underlines.
        # Also change their path to the path of their parent element to make them easier to merge later.
        # Type is left as StyleSpan.
        if element_type == "StyleSpan" and el.get("attributes") and el.get("Text"):
            custom_attributes = el.get("attributes")
            path = numbered_path[:-1]
        else:
            custom_attributes = None
            path = numbered_path

        return TextBlock(
            text=text_by_line,
            text_block_id=block_id,
            coords=self._convert_coordinate_axis(el['Bounds'], el['Page']),
            type=element_type,
            path=path,
            custom_attributes=custom_attributes
        )

    def _convert_coordinate_axis(self, coords: List[float], page_number: int) -> List[float]:
        """Convert coordinates so that the origin is at top left, rather than bottom left output by Adobe.

        Page heights are read from ``self._current_data``, which `_convert_data`
        sets before any element is converted.

        Args:
            coords: list of coordinates output by Adobe: [x0, y0, x1, y1] with origin at bottom left.
            page_number: number of page output by Adobe. Indexed at 0.

        Returns:
            [x0, y0, x1, y1] with the y axis flipped.
        """
        page_height = self._current_data['pages'][page_number]['height']

        # To reverse the coordinate system we subtract y0 and y1 from the page height and swap
        # them.
        return [coords[0], page_height-coords[3], coords[2], page_height-coords[1]]

    @staticmethod
    def _structure_path(path: str, remove_numbers: bool = True) -> List[str]:
        """
        Convert a PDF path into a list.
        E.g. '//Document/Aside[3]/P[2]' becomes ['Document', 'Aside', 'P'], or
        ['Document', 'Aside[3]', 'P[2]'] when `remove_numbers` is False.
        """
        # Drop the first two characters (the leading '//') before splitting.
        path_split = path[2:].split("/")

        if not remove_numbers:
            return path_split

        return [re.sub(r"\[\d+\]", "", i) for i in path_split]

    @staticmethod
    def _index_of(val, in_list):
        """Return the index of `val` in `in_list`, or None if it is absent."""
        try:
            return in_list.index(val)
        except ValueError:
            return None

    def _convert_data(self, data: dict, filename: str) -> Document:
        """Convert the JSON dict for one file into a `Document`.

        Elements are skipped when they are rotated beyond the allowed range, are
        marked as superscript, or have an excluded structure name in their path.
        Remaining elements without ``Text`` consume a block number but produce
        no text block.

        Args:
            data: parsed JSON dict (see class docstring for the keys read).
            filename: value stored on the returned `Document`.
        """
        page_id = 0
        block_counter = 1
        text_blocks_by_page = defaultdict(list)
        self._current_data = self._flatten_data(data)

        for el in self._current_data['elements']:
            # Ignore rotated text elements
            element_rotation = el.get("Rotation", 0)
            if self._max_rotation_degrees < element_rotation < 360-self._max_rotation_degrees:
                continue

            # Ignore superscript
            if (el.get("attributes") or {}).get("TextPosition") == "Sup":
                continue

            # TODO: handle subscript

            if el["Page"] != page_id:
                # Follow the element's own page number. Incrementing by one
                # falls out of sync whenever a page yields no processed elements
                # (e.g. a blank page, or one holding only skipped elements).
                page_id = el["Page"]
                block_counter = 1

            element_names = self._structure_path(el["Path"])
            if not any(excluded in element_names for excluded in self._elements_exclude):
                block_id = f"p{page_id}_b{block_counter}"

                # Ignore blocks without any text which haven't already been excluded by type
                if "Text" in el:
                    text_blocks_by_page[page_id].append(
                        self._element_to_text_block(el, block_id)
                    )

                block_counter += 1

        pages = [
            Page(
                text_blocks=page_text_blocks,
                page_id=page_id,
                dimensions=(data['pages'][page_id]['width'], data['pages'][page_id]['height']),
            )
            for page_id, page_text_blocks in text_blocks_by_page.items()
        ]

        document = Document(
            pages=pages,
            filename=filename,
        )

        return document

# Convert the loaded JSON (`data`) into a Document, then write it out as JSON and as text.
# NOTE(review): this calls the private `_convert_data` directly; whether the base class
# exposes a public entry point for this is not visible here — confirm.
extractor = AdobeAPIExtractor()

doc = extractor._convert_data(data, filename="test")
doc.save_json("test_doc.json")
doc.save_text("test_doc.txt")

## postprocess subscripts

In [6]:
# merge textblocks with same path
# - set bounding box to the enclosing bounding box of all spans
# - if a bounding box contains subscript, mark it somehow
# - can we also do this for superscript?        

In [37]:
from collections import Counter
from copy import deepcopy

class AdobeTextStylingPostProcessor:
    """Merge text blocks that share a path and mark inline styling in their text.

    Blocks on the same page with an identical `path` are merged into a single
    block. Text from blocks carrying styling attributes is wrapped in
    ``<sub>``/``<u>`` markers, and superscript text is removed.
    """

    @staticmethod
    def _classify_text_block_styling(text_block: "TextBlock"):
        """Return "subscript", "underline", "superscript", or None for a block.

        The decision is based on the block's `custom_attributes`: a negative
        ``BaselineShift`` means subscript, ``TextDecorationType == "Underline"``
        means underline and ``TextPosition == "Sup"`` means superscript, checked
        in that order.
        """
        attributes = text_block.custom_attributes
        if not attributes:
            return None

        # Treat a missing or null shift as no shift.
        if (attributes.get("BaselineShift") or 0) < 0:
            return "subscript"
        if attributes.get("TextDecorationType") == "Underline":
            return "underline"
        if attributes.get("TextPosition") == "Sup":
            return "superscript"
        return None

    @staticmethod
    def _add_text_styling_markers(text: str, styling: str):
        """Wrap the non-whitespace core of `text` in markers for `styling`.

        Leading and trailing whitespace stays outside the markers so that word
        spacing survives when lines are concatenated. Superscript text is
        dropped, keeping only its surrounding whitespace. Text that is empty or
        whitespace-only, or whose styling is unrecognised, is returned unchanged.
        """
        if styling not in ("subscript", "superscript", "underline"):
            return text

        core = text.strip()
        if not core:
            # Nothing to mark. Computing leading and trailing whitespace separately
            # would count the same characters twice and emit empty tags.
            return text

        leading = text[: len(text) - len(text.lstrip())]
        trailing = text[len(text.rstrip()):]

        if styling == "subscript":
            return f"{leading}<sub>{core}</sub>{trailing}"
        if styling == "underline":
            return f"{leading}<u>{core}</u>{trailing}"
        # superscript: remove the text itself
        return f"{leading + trailing}"

    def merge_text_blocks(self, text_blocks: "List[TextBlock]") -> "TextBlock":
        """Merge several text blocks into one.

        The merged block gets the bounding box enclosing all input blocks, the
        first block's path, and the first block's id suffixed with "_merged".
        The first line of each block is appended to the last line of the text
        merged so far; its remaining lines are added as new lines.

        Raises:
            ValueError: if `text_blocks` is empty.
        """
        if not text_blocks:
            raise ValueError("merge_text_blocks requires at least one text block.")

        all_coords = [tuple(text_block.coords) for text_block in text_blocks]
        merged_coords = [
            # x0, y0, x1, y1
            min(c[0] for c in all_coords),
            min(c[1] for c in all_coords),
            max(c[2] for c in all_coords),
            max(c[3] for c in all_coords),
        ]

        merged_block_text = []

        for text_block in text_blocks:
            block_styling = self._classify_text_block_styling(text_block)
            new_block_text = [self._add_text_styling_markers(line, block_styling) for line in text_block.text]

            if not new_block_text:
                # A block without lines contributes no text (its coords still count).
                continue

            if not merged_block_text:
                merged_block_text = new_block_text
            else:
                merged_block_text[-1] = merged_block_text[-1] + new_block_text[0]
                merged_block_text += new_block_text[1:]

        return TextBlock(
            text=merged_block_text,
            text_block_id=text_blocks[0].text_block_id + "_merged",
            coords=merged_coords,
            path=text_blocks[0].path
        )

    def process(self, document: "Document") -> "Document":
        """Return a deep copy of `document` with same-path blocks merged per page.

        The merged block takes the position of the first block of its group.
        Blocks with other paths are always kept, including those that sit
        between two blocks of the group. The input document is not modified.
        """
        new_document = deepcopy(document)

        for page in new_document.pages:
            path_counts = Counter(tuple(block.path) for block in page.text_blocks)
            duplicated_paths = [path for path, count in path_counts.items() if count > 1]

            for path in duplicated_paths:
                merge_idxs = [
                    idx for idx, block in enumerate(page.text_blocks) if tuple(block.path) == path
                ]
                merged_text_block = self.merge_text_blocks(
                    [page.text_blocks[idx] for idx in merge_idxs]
                )
                first_idx = merge_idxs[0]
                merge_idx_set = set(merge_idxs)

                # Rebuild the list by index membership rather than slicing from the
                # first to the last match, which would drop unrelated blocks between them.
                page.text_blocks = [
                    merged_text_block if idx == first_idx else block
                    for idx, block in enumerate(page.text_blocks)
                    if idx == first_idx or idx not in merge_idx_set
                ]

        return new_document
                
post_processor = AdobeTextStylingPostProcessor()
new_doc = post_processor.process(doc)

In [38]:
# Display the post-processed document via its `to_string()` method.
new_doc.to_string()

'Message From Hon. Tillman Thomas Prime Minister of Grenada\nGRENADA LIKE MANY OTHER DEVELOPING SMALL ISLAND STATES HAS NOW\nformulated an ambitious National Energy Policy (NEP). This Policy is very urgent and timely now that the global energy market is again vulnerable to the high volatility of oil prices. Soaring prices for petroleum and petroleum-related products as a result of the variable weather patterns in Europe and North America, the forecasted rising demand for these commodities in the manufacturing sectors in the USA, Europe and the BRIC nations, renewed concerns over nuclear safety after the Fukushima incident, as well as the political instability of oil producing and oil exporting countries in North Africa and the Middle East warrant that Grenada needs to provide suitable options in transitioning to a low carbon economy based on the utilization of its indigenous alternative sources of energy.The efficient use of energy must also be considered a priority as Grenada seeks to

In [None]:

# Inspect the last page of the un-merged document: which block paths occur more than once?
page = doc.pages[-1]

path_counts = Counter(tuple(block.path) for block in page.text_blocks)

duplicated_paths = [
    block_path
    for block_path, occurrences in path_counts.items()
    if occurrences > 1
]
duplicated_paths

[('Document', 'L[4]', 'LI[4]', 'LBody')]

In [14]:
# `path` was not defined by any earlier cell, so this cell raised NameError on a
# fresh kernel. Use the first duplicated path found on `page`.
path = duplicated_paths[0]

# Pair up (indices, blocks) of every text block on the page that has this path.
list(zip(*[(idx, block) for idx, block in enumerate(page.text_blocks) if tuple(block.path) == path]))

[(10, 11, 12),
 (TextBlock(text=['Soaring oil prices in 2007 and 2008 reached a record high of US$147 a barrel in July ', '2008 and helped to precipitate a global economic recession, which dampened but did ', 'not stop the historically-sustained growth of energy investment, energy consumption ', 'and CO'], text_block_id='p7_b18', coords=[191.8011932373047, 515.6309967041016, 550.7972717285156, 570.6089935302734], type='LBody', path=['Document', 'L[4]', 'LI[4]', 'LBody'], custom_attributes=None),
  TextBlock(text=['2 '], text_block_id='p7_b19', coords=[222.7332000732422, 562.4591064453125, 229.2132110595703, 573.0700988769531], type='StyleSpan', path=['Document', 'L[4]', 'LI[4]', 'LBody'], custom_attributes={'BaselineShift': -3}),
  TextBlock(text=['emissions. '], text_block_id='p7_b20', coords=[229.6842041015625, 557.6309967041016, 275.0261535644531, 570.6000061035156], type='LBody', path=['Document', 'L[4]', 'LI[4]', 'LBody'], custom_attributes=None))]