In [287]:
# Input / output selection. Exactly one (pdf_path, output_pdf_path) pair is active;
# the other candidates stay commented out. The "acuron-label" pair used to be
# assigned as well, but it was immediately overwritten by the pair at the bottom,
# so it is now disabled like the rest.

# pdf_path = "test/pdf/dc/form/Ceridian-2-EC-SDS-v2.1.pdf"
# output_pdf_path = "test/target/dc/form/Ceridian-2-EC-SDS-v2.1.pdf"

# pdf_path = "test/pdf/dc/form/acuron-label.pdf"
# output_pdf_path = "test/target/dc/form/acuron-label.pdf"

# pdf_path = "test/pdf/dc/form/Blue_Book_2023_web.pdf"
# output_pdf_path = "test/target/dc/form/Blue_Book_2023_web.pdf"

# pdf_path = "test/pdf/dc/form/Ceridian-2-EC-Supplemental-Label.pdf"
# output_pdf_path = "test/target/dc/form/Ceridian-2-EC-Supplemental-Label.pdf"

pdf_path = "test/pdf/dc/form/OMDXE11749.pdf"
output_pdf_path = "test/target/dc/form/OMDXE11749.pdf"

In [288]:
import fitz
import json
import pandas as pd
import numpy as np
from pprint import pprint
from collections import defaultdict
from typing import Iterable


Desired features

- The page width and height
- The bbox info
- The text
- The font size
- The font family
- Is bold, italic, or underlined
<!-- - Line height -->
- The color
<!-- - Text direction -->
  
Flags
- Title 1
- Title 2
- Title 3
- Text


In [289]:
# Open the PDF at `pdf_path`; the resulting `doc` is iterated, drawn on and saved by later cells.
doc = fitz.open(pdf_path)

In [290]:
# Crop box of page 0. Its width/height are the only page dimensions kept, and
# later cells use them when normalising coordinates of texts from any page.
page_rect = doc.page_cropbox(0)
doc_width, doc_height = page_rect.width, page_rect.height

doc_width, doc_height

(612.0, 792.0)

In [291]:
def flags_decomposer(flags):
    """Turn a span's font-flag bitmask into a comma-separated description.

    Bits 0, 1 and 4 add "superscript", "italic" and "bold" when set.
    Bit 2 chooses between "serifed" and "sans", bit 3 between "monospaced"
    and "proportional", so those two positions are always present.
    """
    names = []
    if flags & 1:
        names.append("superscript")
    if flags & 2:
        names.append("italic")
    names.append("serifed" if flags & 4 else "sans")
    names.append("monospaced" if flags & 8 else "proportional")
    if flags & 16:
        names.append("bold")
    return ", ".join(names)

In [292]:
class TextSpan():
    """A group of span dicts from one page that is treated as a single text.

    The span dicts are read through the keys "text", "font", "size", "bbox",
    "color" and "flags". Aggregate properties (font, size, color, is_bold,
    is_italic) report the value that covers the most characters in the group.
    """

    def __init__(self, page_index: int, spans: Iterable = None):
        """Store the page index and the spans.

        Args:
            page_index: index of the page the spans come from.
            spans: the span dicts of this text. When omitted, a new empty list
                is created for this instance (a `[]` default would be shared by
                every instance created without spans).
        """
        self._page_index = page_index
        self._spans = [] if spans is None else spans

    @property
    def page_index(self):
        return self._page_index

    @property
    def spans(self):
        return self._spans

    @spans.setter
    def spans(self, spans):
        self._spans = spans

    def _dominant(self, value_of, default):
        """Return the `value_of(span)` result backed by the most characters.

        Every span votes with `len(span["text"])`. On a tie the value seen
        first wins. `default` is returned when there are no spans.
        """
        weights = defaultdict(int)
        for span in self.spans:
            weights[value_of(span)] += len(span["text"])
        return max(weights, key=weights.get) if weights else default

    def __repr__(self):
        # Whitespace-only spans are dropped; the remaining span texts are
        # concatenated without any separator and the result is stripped.
        pieces = [span["text"] for span in self.spans if span["text"].strip()]
        return "".join(pieces).strip()

    def __str__(self):
        return self.__repr__()

    @property
    def text(self):
        """The concatenated text of the group (same string as `repr`)."""
        return self.__repr__()

    @property
    def font(self):
        """Font name covering the most characters, or "" without spans."""
        return self._dominant(lambda span: span["font"], "")

    @property
    def size(self):
        """Font size covering the most characters, or 0 without spans."""
        return self._dominant(lambda span: span["size"], 0)

    @property
    def bbox(self):
        """Bounding box (x0, y0, x1, y1) enclosing all span bboxes."""
        # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
        x0, y0, x1, y1 = np.inf, np.inf, 0, 0
        for span in self.spans:
            x0 = min(x0, span["bbox"][0])
            y0 = min(y0, span["bbox"][1])
            x1 = max(x1, span["bbox"][2])
            y1 = max(y1, span["bbox"][3])
        return (x0, y0, x1, y1)

    @property
    def color(self):
        """Color value covering the most characters, or 0 without spans."""
        return self._dominant(lambda span: span["color"], 0)

    @property
    def is_bold(self):
        """True when most characters come from spans flagged as bold."""
        return self._dominant(
            lambda span: "bold" in flags_decomposer(span["flags"]), False
        )

    @property
    def is_italic(self):
        """True when most characters come from spans flagged as italic."""
        return self._dominant(
            lambda span: "italic" in flags_decomposer(span["flags"]), False
        )
    
    

    

In [293]:
def within_range(value, target, tolerance):
    """Tell whether `value` lies in [target - tolerance, target + tolerance] (bounds included)."""
    lower, upper = target - tolerance, target + tolerance
    return lower <= value <= upper

def is_same_font_family(font_a, font_b):
    """Compare two font names by the part that precedes their first "-"."""
    family_a = font_a.partition("-")[0]
    family_b = font_b.partition("-")[0]
    return family_a == family_b


In [294]:
def merge_spans(spans, page_index):
    """Group a page's spans, in the order given, into `TextSpan` objects.

    Every span after the first is compared with the most recent non-blank span
    of the group currently being built. It is then either appended to that
    group or used to start a new group.

    Args:
        spans: sequence of span dicts; the keys read here are "text", "bbox",
            "font" and "flags".
        page_index: page index stored on every `TextSpan` that is created.

    Returns:
        A list of `TextSpan` objects; empty when `spans` is empty.
    """
    texts = []

    if len(spans) == 0:
        return texts

    text = TextSpan(page_index, [spans[0]])
    for span in spans[1:]:
        # Reference span for the comparison: the last non-blank span of the
        # current group. If the group holds only blank spans, fall back to the
        # group's own last span (not to the last span of the whole page).
        last_span = text.spans[-1]
        for s in reversed(text.spans):
            if s["text"].strip():
                last_span = s
                break

        span_rect = fitz.Rect(span["bbox"])
        last_span_rect = fitz.Rect(last_span["bbox"])

        # Gaps between the new span and the reference span, in bbox units.
        vertical_distance = abs(span_rect.y0 - last_span_rect.y1)
        horizontal_distance = abs(span_rect.x0 - last_span_rect.x1)
        top_distance = abs(span_rect.y0 - last_span_rect.y0)
        left_distance = abs(span_rect.x0 - last_span_rect.x0)

        text_flag = flags_decomposer(span["flags"])

        # True = merge into the current group, False = start a new group.
        # The rules below run in order and later rules can override earlier ones.
        flag = False

        # Same line, directly adjacent, and (almost) the same height.
        if top_distance <= 2 and horizontal_distance <= 2:
            flag = True

            if not within_range(span_rect.height, last_span_rect.height, 0.2):
                flag = False

        # Same left edge, directly below, similar height and same font family.
        # Note: when this branch is entered it overwrites the result of the rule above.
        if left_distance <= 2 and vertical_distance <= 2:
            flag = True

            if not within_range(span_rect.height, last_span_rect.height, 1):
                flag = False

            if not is_same_font_family(span["font"], last_span["font"]):
                flag = False

        # Roughly the same line (tops within 80% of the reference height),
        # touching horizontally, same font family.
        if top_distance <= last_span_rect.height * 0.8 \
            and horizontal_distance <= 0.1 \
            and is_same_font_family(span["font"], last_span["font"]):
            flag = True

        # Starts right where the reference span ends vertically, but is offset
        # horizontally by more than 10, and uses exactly the same font.
        if vertical_distance <= 1 and horizontal_distance > 10 and span["font"] == last_span["font"]:
            flag = True

        # Superscript spans are always attached to the current group.
        if 'superscript' in text_flag:
            flag = True

        if flag:
            text.spans.append(span)
        else:
            texts.append(text)
            text = TextSpan(page_index, [span])

    # The group still being built when the loop ends is part of the result too.
    texts.append(text)

    return texts

In [295]:
# pages_texts[i] is the list of merged TextSpan objects of page i.
pages_texts = []

for page_index, page in enumerate(doc):
    # Extract the page structure once (it used to be extracted twice per page).
    blocks = page.get_text("dict")["blocks"]

    # Flatten blocks -> lines -> spans; a block or line lacking the key contributes nothing.
    spans = [
        span
        for block in blocks
        for line in block.get("lines") or []
        for span in line.get("spans") or []
    ]

    texts = merge_spans(spans, page_index)
    pages_texts.append(texts)

    # Debug overlay: outline every span and label it with the index of the
    # merged text it belongs to, using one random colour per merged text.
    for text_index, text in enumerate(texts):
        color = list(np.random.rand(3,))

        for span in text.spans:
            rect = fitz.Rect(span["bbox"])
            # Put the label slightly left of and below the span's top-left corner.
            point = rect.tl
            point.x -= 10
            point.y += 8

            page.draw_rect(rect, color=color, width=0.6, overlay=True, stroke_opacity=0.5)
            page.insert_text(point, f"{text_index}", color=color, fontname="helvetica-bold", fontsize=8, overlay=True)


In [296]:
# Dominant font size of every merged text, across all pages.
all_text_sizes = [
    text.size
    for page_texts in pages_texts
    for text in page_texts
]

# Plain (unweighted) average over the merged texts; is_title compares sizes against it.
mean_text_size = np.mean(all_text_sizes)
mean_text_size

9.01799873080208

In [297]:
# Lower-case substrings whose presence in a font name is taken to mean a heavy weight.
# ("black" used to be listed three times; each keyword is now listed once.)
# NOTE(review): "extra" and "ultra" would also match names such as "ExtraLight" /
# "UltraLight" — confirm that this is intended.
BOLD_FONTS_KEYWORDS = ["bold", "black", "heavy", "extra", "ultra"]

def is_bold_font(font):
    """Return True when the lower-cased font name contains any bold keyword.

    Args:
        font: font name string, e.g. the "font" value of a span.
    """
    font = font.lower()
    return any(keyword in font for keyword in BOLD_FONTS_KEYWORDS)


In [298]:
def is_title(text, size, font, mean_size=None):
    """Heuristically decide whether a text looks like a title.

    Args:
        text: the text content.
        size: the font size of the text.
        font: the font name; only used to detect a bold face via `is_bold_font`.
        mean_size: reference font size. When omitted, the notebook-level
            `mean_text_size` is used, which is the previous behaviour.

    Returns:
        False for blank texts and texts of at most 3 characters.
        False for texts that contain "." or ":" unless they are entirely upper case.
        True when `size` is at least 1.2 times the reference size.
        True when `size` is at least 1.02 times the reference size and the font is bold.
        False otherwise.
    """
    if text.strip() == "" or len(text) <= 3:
        return False

    # Sentence-like punctuation disqualifies a text, except in all-caps texts.
    if ("." in text or ":" in text) and text != text.upper():
        return False

    if mean_size is None:
        mean_size = mean_text_size

    # Clearly larger than the average text.
    if size >= mean_size * 1.2:
        return True

    # Slightly larger than the average text and set in a bold face.
    if size >= mean_size * 1.02 and is_bold_font(font):
        return True

    return False
    

In [299]:
# Outline in red every merged text that is_title() classifies as a title, and
# print its bounding box. The stray `all_text_sizes.append(...)` that used to be
# here was removed: it grew that list on every run, after the mean had already
# been computed from it.
for page_index, page in enumerate(doc):
    page_texts = pages_texts[page_index]

    for text in page_texts:
        if is_title(text.text, text.size, text.font):
            bbox = text.bbox
            print(bbox)
            page.draw_rect(fitz.Rect(bbox), color=(1, 0, 0), width=1.2, overlay=True, stroke_opacity=0.9)

(216.62399291992188, 274.5906677246094, 394.4100036621094, 292.3989562988281)
(184.99046325683594, 315.5779724121094, 426.00225830078125, 333.3862609863281)
(191.50900268554688, 355.1468200683594, 268.25738525390625, 368.5529479980469)
(286.1858215332031, 355.1468200683594, 340.21600341796875, 368.5529479980469)
(358.1288146972656, 355.1468200683594, 419.4604187011719, 368.5529479980469)
(135.96397399902344, 265.5522766113281, 151.90426635742188, 376.49798583984375)
(271.2760009765625, 415.1278076171875, 339.6861267089844, 428.533935546875)
(245.82138061523438, 434.11883544921875, 365.2064208984375, 447.52496337890625)
(174.95567321777344, 461.0490417480469, 436.0224609375, 474.4551696777344)
(170.53369140625, 471.99053955078125, 440.49615478515625, 485.39666748046875)
(271.3890075683594, 538.856201171875, 367.7718505859375, 561.1163940429688)
(186.69000244140625, 587.9277954101562, 424.2789306640625, 601.333984375)
(173.70791625976562, 598.8693237304688, 437.3003845214844, 612.2755126

In [300]:
# Write `doc` to `output_pdf_path`; earlier cells drew rectangles and index labels on its pages.
doc.save(output_pdf_path)

In [301]:
# Inspect the raw span dicts behind the first merged text of the first page.
pages_texts[0][0].spans

[{'size': 24.0,
  'flags': 20,
  'font': 'Arial-BoldMT',
  'color': 0,
  'ascender': 0.9052734375,
  'descender': -0.2119140625,
  'text': 'X9 1000 and X9 1100 Combines (North',
  'origin': (83.56529998779297, 125.00799560546875),
  'bbox': (83.56529998779297,
   103.28143310546875,
   526.4190673828125,
   130.09393310546875)},
 {'size': 24.0,
  'flags': 20,
  'font': 'Arial-BoldMT',
  'color': 0,
  'ascender': 0.9052734375,
  'descender': -0.2119140625,
  'text': 'American Edition)',
  'origin': (201.6549072265625, 150.97119140625),
  'bbox': (201.6549072265625,
   129.24462890625,
   408.33587646484375,
   156.05712890625)},
 {'size': 11.999899864196777,
  'flags': 20,
  'font': 'Arial-BoldMT',
  'color': 0,
  'ascender': 0.9052734375,
  'descender': -0.2119140625,
  'text': '(Serial No. 825001 -XXXXXX)',
  'origin': (222.97300720214844, 167.01702880859375),
  'bbox': (222.97300720214844,
   156.15383911132812,
   387.04412841796875,
   169.5599822998047)}]

In [302]:
def normalize(value, range):
    """Rescale `value` linearly so that range[0] maps to 0 and range[1] maps to 1.

    Values outside the range map outside [0, 1]; nothing is clamped.
    """
    offset = value - range[0]
    extent = range[1] - range[0]
    return offset / extent

In [303]:
# Hand-picked training samples as (page index, text index on that page, label).
# The labels used are 'h1', 'h2', 'h3' and 't'.
# The indices only make sense for the merge result of the currently loaded PDF.
labelled_positions = [
    (1, 0, 'h2'),
    (1, 120, 'h1'),
    (1, 115, 'h2'),
    (1, 116, 't'),

    (2, 8, 'h2'),

    (3, 0, 'h2'),
    (3, 2, 'h2'),
    (3, 3, 'h3'),
    (3, 4, 't'),
    (3, 5, 'h3'),

    (5, 17, 'h3'),
    (5, 25, 'h3'),
    (5, 33, 'h1'),

    (16, 0, 'h2'),
    (16, 21, 'h1'),

    (440, 50, 'h2'),
    (440, 51, 't'),
    (440, 52, 't'),
    (440, 53, 't'),
    (440, 54, 't'),
    (440, 69, 't'),

    (441, 0, 'h2'),
    (441, 35, 'h2'),
    (441, 36, 't'),
    (441, 62, 't'),
    (441, 72, 't'),

    (448, 0, 'h2'),
    (448, 7, 'h2'),
    (448, 13, 'h2'),
    (448, 16, 't'),
    (448, 17, 't'),
    (448, 22, 'h1'),

    # Disabled selections carried over from the original cell:
    # (1, 13, 'h1'),
    # (1, 17, 'h2'),
    # (2, 0, 'h1'),
    # (2, 1, 'h2'),
    # (2, 6, 'h2'),
    # (2, 8, 'h2'),
    # (2, 10, 't'),
    # (2, 13, 'h2'),
    # (2, 15, 'h2'),
    # (2, 16, 't'),
    # (2, 17, 't'),
    # (2, 18, 't'),
]

# Each entry becomes a two-element list: [TextSpan, label].
selected_texts = []
for page_no, text_no, label in labelled_positions:
    selected_texts.append([pages_texts[page_no][text_no], label])

pprint(selected_texts)

[[Trademarks, 'h2'],
 [Introduction, 'h1'],
 [Foreword, 'h2'],
 [READ THIS MANUAL carefully to learn how to operateand service your machine correctly. Failure to do socould result in personal injury or equipment damage.This manual and safety signs on your machine may also,
  't'],
 [A Message to Our Customers, 'h2'],
 [Download Instructions, 'h2'],
 [Emissions Performance and Tampering, 'h2'],
 [Operation and Maintenance, 'h3'],
 [The engine, including the emissions control system,shall be operated, used, and maintained in accordancewith the instructions provided in this manual to maintainthe emissions performance of the engine within therequirements applicable to the engine's category/certification.,
  't'],
 [Tampering, 'h3'],
 [Safety Signs, 'h3'],
 [Operator’s Station, 'h3'],
 [Contents, 'h1'],
 [Recognize Safety Information, 'h2'],
 [Safety, 'h1'],
 [Footnotes Description Page, 'h2'],
 [(A)*, 't'],
 [For improved straw quality in dry crops and grainquality, use lower speed.,
  't'

In [304]:
# Write one comma-separated row per labelled text to training.csv (no header row).
# Column order: text length, size relative to the mean text size, x0, y0, x1, y1
# (each divided by the page-0 width/height), color, is_bold, is_italic,
# page width, page height, label.
with open("training.csv", "w") as f:
    for text, flag in selected_texts:
        rect = fitz.Rect(text.bbox)

        text_length = len(text.text)

        width_range = [0, doc_width]
        height_range = [0, doc_height]
        x0 = normalize(rect.x0, width_range)
        y0 = normalize(rect.y0, height_range)
        x1 = normalize(rect.x1, width_range)
        y1 = normalize(rect.y1, height_range)

        size = normalize(text.size, [0, mean_text_size])

        row = [
            text_length,
            size,
            x0, y0, x1, y1,
            text.color,
            int(text.is_bold), int(text.is_italic),
            doc_width, doc_height,
            flag,
        ]
        print(*row, sep=",", file=f)
