In [3]:
!pip install pdfservices-sdk==1.0.1 pypdf2

Collecting pypdf2
  Downloading PyPDF2-1.26.0.tar.gz (77 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.6/77.6 KB[0m [31m986.6 kB/s[0m eta [36m0:00:00[0m31m1.4 MB/s[0m eta [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Using legacy 'setup.py install' for pypdf2, since package 'wheel' is not installed.
Installing collected packages: pypdf2
  Running setup.py install for pypdf2 ... [?25ldone
[?25hSuccessfully installed pypdf2-1.26.0


In [6]:
# %load_ext autoreload
# %autoreload 2

import json
import re
from typing import List, Tuple
from collections import defaultdict

from extract.document import TextBlock, Page, Document
from extract.extract import DocumentTextExtractor, AdobeAPIExtractor

In [5]:
# NOTE(review): the path argument was missing here, which is a SyntaxError.
# "temp-adobe.json" is the data file used by the extractor cell further down;
# confirm that this is the file this cell was meant to inspect.
with open("temp-adobe.json", "r", encoding="utf-8") as f:
    data = json.load(f)


In [11]:
# Input locations: the JSON data handed to the extractor below and the
# credentials file used to construct it.
DATA_PATH = "temp-adobe.json"
CREDENTIALS_PATH = "./pdfservices-credentials.json"

extractor = AdobeAPIExtractor(credentials_path=CREDENTIALS_PATH)

# Build the document from the JSON data. "_" is passed as the PDF filename
# (presumably a placeholder because no real PDF is involved here; confirm).
doc = extractor.data_to_document(DATA_PATH, pdf_filename="_")


## Post-process text styling (subscripts, superscripts, underlines)

In [12]:
from collections import Counter
from copy import deepcopy

class AdobeTextStylingPostProcessor:
    """Merge text blocks that share a path and mark up their inline styling.

    On every page, text blocks whose ``path`` is identical are combined into a
    single block. While combining, the lines of each block are wrapped in
    ``<sub>`` / ``<u>`` markers according to the block's ``custom_attributes``;
    text classified as superscript is removed.
    """

    @staticmethod
    def _classify_text_block_styling(text_block: TextBlock):
        """Classify the styling of a text block from its custom attributes.

        Args:
            text_block: block whose ``custom_attributes`` mapping is inspected.

        Returns:
            ``"subscript"`` when ``BaselineShift`` is negative, ``"underline"``
            when ``TextDecorationType`` is ``"Underline"``, ``"superscript"``
            when ``TextPosition`` is ``"Sup"``, otherwise ``None``. The checks
            are applied in that order and the first match wins.
        """
        attributes = text_block.custom_attributes
        if not attributes:
            return None

        # A BaselineShift that is present but None is treated like a missing
        # value, so the comparison below cannot raise TypeError.
        if (attributes.get("BaselineShift") or 0) < 0:
            return "subscript"
        elif attributes.get("TextDecorationType") == "Underline":
            return "underline"
        elif attributes.get("TextPosition") == "Sup":
            return "superscript"
        else:
            return None

    @staticmethod
    def _add_text_styling_markers(text: str, styling: str):
        """Wrap ``text`` in the marker for ``styling``, keeping outer spaces.

        Args:
            text: one line of block text.
            styling: a value returned by ``_classify_text_block_styling``.

        Returns:
            The text with ``<sub>``/``<u>`` tags placed around its stripped
            content and the leading/trailing spaces kept outside the tags.
            Superscript content is dropped and only the surrounding spaces are
            kept (presumably to remove footnote markers; confirm). Text that
            is empty or whitespace-only, and text with any other styling, is
            returned unchanged.
        """
        stripped_text = text.strip()
        if not stripped_text:
            # Nothing to mark up; avoids emitting empty tags and counting the
            # same whitespace as both leading and trailing.
            return text

        leading_spaces = " " * (len(text) - len(text.lstrip(' ')))
        trailing_spaces = " " * (len(text) - len(text.rstrip(' ')))

        if styling == "subscript":
            return f"{leading_spaces}<sub>{stripped_text}</sub>{trailing_spaces}"
        elif styling == "superscript":
            return f"{leading_spaces + trailing_spaces}"
        elif styling == "underline":
            return f"{leading_spaces}<u>{stripped_text}</u>{trailing_spaces}"
        else:
            return text

    def merge_text_blocks(self, text_blocks: List[TextBlock]) -> TextBlock:
        """Combine several text blocks into one styled block.

        The first line of each following block is appended to the last line
        collected so far; its remaining lines are added as new lines.

        Args:
            text_blocks: blocks to merge, in reading order. Must not be empty.

        Returns:
            A new ``TextBlock`` whose coords are the bounding box of all
            inputs, whose id is the first block's id plus ``"_merged"``, and
            whose path is the first block's path.

        Raises:
            ValueError: if ``text_blocks`` is empty.
        """
        if not text_blocks:
            raise ValueError("merge_text_blocks requires at least one text block")

        all_coords = [tuple(text_block.coords) for text_block in text_blocks]
        merged_coords = [
            # x0, y0, x1, y1
            min(c[0] for c in all_coords),
            min(c[1] for c in all_coords),
            max(c[2] for c in all_coords),
            max(c[3] for c in all_coords),
        ]

        merged_block_text = []

        for text_block in text_blocks:
            block_styling = self._classify_text_block_styling(text_block)
            new_block_text = [
                self._add_text_styling_markers(line, block_styling)
                for line in text_block.text
            ]

            if not new_block_text:
                # A block without lines contributes no text.
                continue

            if not merged_block_text:
                merged_block_text = new_block_text
            else:
                # Continue the last collected line with the first new line.
                merged_block_text[-1] = merged_block_text[-1] + new_block_text[0]
                merged_block_text += new_block_text[1:]

        return TextBlock(
            text=merged_block_text,
            text_block_id=text_blocks[0].text_block_id + "_merged",
            coords=merged_coords,
            path=text_blocks[0].path
        )

    def process(self, document: Document) -> Document:
        """Return a copy of ``document`` with same-path blocks merged per page.

        The input document is deep-copied and left unmodified. On each page,
        all blocks sharing a path are replaced by one merged block at the
        position of the first of them; every other block is kept, including
        blocks that lie between two blocks with the same path.

        Args:
            document: document whose pages expose a ``text_blocks`` list.

        Returns:
            The processed copy.
        """
        new_document = deepcopy(document)

        for page in new_document.pages:
            # Group blocks by path; dict insertion order keeps each group at
            # the position where its path first appears.
            blocks_by_path = {}
            for block in page.text_blocks:
                blocks_by_path.setdefault(tuple(block.path), []).append(block)

            if len(blocks_by_path) == len(page.text_blocks):
                # Every path is unique on this page: nothing to merge.
                continue

            page.text_blocks = [
                self.merge_text_blocks(group) if len(group) > 1 else group[0]
                for group in blocks_by_path.values()
            ]

        return new_document
                
# Run the post-processor on the extracted document. `process` works on a deep
# copy, so `doc` itself is left unchanged and the result lives in `new_doc`.
post_processor = AdobeTextStylingPostProcessor()
new_doc = post_processor.process(doc)

In [13]:
# Display the text of the post-processed document to check the styling markers.
# NOTE(review): this shows the whole document; consider slicing the result to
# keep the saved output short.
new_doc.to_string()

'Message From Hon. Tillman Thomas Prime Minister of Grenada\nGRENADA LIKE MANY OTHER DEVELOPING SMALL ISLAND STATES HAS NOW\nformulated an ambitious National Energy Policy (NEP). This Policy is very urgent and timely now that the global energy market is again vulnerable to the high volatility of oil prices. Soaring prices for petroleum and petroleum-related products as a result of the variable weather patterns in Europe and North America, the forecasted rising demand for these commodities in the manufacturing sectors in the USA, Europe and the BRIC nations, renewed concerns over nuclear safety after the Fukushima incident, as well as the political instability of oil producing and oil exporting countries in North Africa and the Middle East warrant that Grenada needs to provide suitable options in transitioning to a low carbon economy based on the utilization of its indigenous alternative sources of energy.The efficient use of energy must also be considered a priority as Grenada seeks to

In [None]:

# Reproduce the duplicate-path detection on the last page of the unprocessed
# document to see which paths would be merged.
page = doc.pages[-1]

# Count how many text blocks share each path (paths are turned into tuples so
# they can be used as Counter keys).
path_counts = Counter(
    [tuple(block.path) for block in page.text_blocks]
)

# Keep only the paths that occur more than once on this page.
duplicated_paths = [path for path, count in path_counts.items() if count > 1]
duplicated_paths

[('Document', 'L[4]', 'LI[4]', 'LBody')]

In [14]:
# `path` was not defined at notebook level (comprehension variables do not leak),
# so this cell failed on a fresh kernel. Bind it explicitly to the first
# duplicated path computed from `path_counts`.
path = duplicated_paths[0]

# Split the matching (index, block) pairs into a tuple of indices and a tuple of blocks.
list(zip(*[(idx, block) for idx, block in enumerate(page.text_blocks) if tuple(block.path) == path]))

[(10, 11, 12),
 (TextBlock(text=['Soaring oil prices in 2007 and 2008 reached a record high of US$147 a barrel in July ', '2008 and helped to precipitate a global economic recession, which dampened but did ', 'not stop the historically-sustained growth of energy investment, energy consumption ', 'and CO'], text_block_id='p7_b18', coords=[191.8011932373047, 515.6309967041016, 550.7972717285156, 570.6089935302734], type='LBody', path=['Document', 'L[4]', 'LI[4]', 'LBody'], custom_attributes=None),
  TextBlock(text=['2 '], text_block_id='p7_b19', coords=[222.7332000732422, 562.4591064453125, 229.2132110595703, 573.0700988769531], type='StyleSpan', path=['Document', 'L[4]', 'LI[4]', 'LBody'], custom_attributes={'BaselineShift': -3}),
  TextBlock(text=['emissions. '], text_block_id='p7_b20', coords=[229.6842041015625, 557.6309967041016, 275.0261535644531, 570.6000061035156], type='LBody', path=['Document', 'L[4]', 'LI[4]', 'LBody'], custom_attributes=None))]