In [301]:
# Sample documents found under test/pdf/dc/form/. Set PDF_NAME to any of them
# (or to another file in that folder) to run the notebook against it.
SAMPLE_PDFS = (
    "Ceridian-2-EC-SDS-v2.1.pdf",
    "acuron-label.pdf",
    "Blue_Book_2023_web.pdf",
    "Ceridian-2-EC-Supplemental-Label.pdf",
    "OMDXE11749.pdf",
)
PDF_NAME = "acuron-label.pdf"

# Both paths are derived from the same file name so the input and the annotated
# output can never point at different documents.
pdf_path = f"test/pdf/dc/form/{PDF_NAME}"
output_pdf_path = f"test/target/dc/form/{PDF_NAME}"

In [302]:
import fitz
import pandas as pd
import numpy as np
from pprint import pprint
from collections import defaultdict
from typing import Iterable


In [303]:
# Open the PDF at pdf_path; `doc` is reused by the cells below.
doc = fitz.open(pdf_path)

In [304]:
# Crop box of the first page, looked up once.
first_page_cropbox = doc.page_cropbox(0)

# Use the rectangle's own width/height instead of its x1/y1 corner so the values
# stay correct when the crop box does not start at (0, 0).
page_width = first_page_cropbox.width
page_height = first_page_cropbox.height

page_width, page_height

(612.0, 792.0)

In [305]:
# Materialise every page of the document once so later cells can index them.
pages = list(doc)

## Helper functions & classes

### Functions

In [306]:
def is_within_range(value, target, tolerance):
    """Return True when ``value`` lies in ``[target - tolerance, target + tolerance]``."""
    lower_bound = target - tolerance
    upper_bound = target + tolerance
    return lower_bound <= value <= upper_bound

def is_same_font_family(font_a, font_b):
    """Return True when both font names share the part before their first ``-``."""
    family_a, _, _ = font_a.partition("-")
    family_b, _, _ = font_b.partition("-")
    return family_a == family_b

def flags_decomposer(flags):
    """Make font flags human readable.

    Each of the five low bits maps to a label. The "serifed" and "monospaced"
    bits also contribute a label ("sans" / "proportional") when they are clear.
    """
    # (bit mask, label when the bit is set, label when it is clear or None)
    bit_labels = (
        (1, "superscript", None),
        (2, "italic", None),
        (4, "serifed", "sans"),
        (8, "monospaced", "proportional"),
        (16, "bold", None),
    )

    names = []
    for mask, when_set, when_clear in bit_labels:
        label = when_set if flags & mask else when_clear
        if label is not None:
            names.append(label)
    return ", ".join(names)

### TextLine

In [307]:
class TextLine(list):
    """A list of spans that belong to one line of text.

    On top of the plain list behaviour it exposes the dominant font and size
    (weighted by character count) and the bounding box enclosing all spans.
    """

    def __init__(self, spans: list = []):
        # The default list is only copied by list.__init__, never mutated.
        super().__init__(spans)

    def _weighted_mode(self, key, fallback):
        """Return the ``key(span)`` value covering the most characters, or ``fallback`` if empty."""
        tally = defaultdict(int)
        for span in self:
            tally[key(span)] += len(span.text)
        if not tally:
            return fallback
        return max(tally, key=tally.get)

    @property
    def text(self):
        """The list representation of the spans."""
        return repr(self)

    @property
    def font(self):
        """Font name used by the most characters; ``""`` for an empty line."""
        return self._weighted_mode(lambda span: span.font, "")

    @property
    def size(self):
        """Font size used by the most characters; ``0`` for an empty line."""
        return self._weighted_mode(lambda span: span.size, 0)

    @property
    def bbox(self):
        """``(x0, y0, x1, y1)`` enclosing every span; raises ValueError when empty."""
        left = min(span.bbox[0] for span in self)
        top = min(span.bbox[1] for span in self)
        right = max(span.bbox[2] for span in self)
        bottom = max(span.bbox[3] for span in self)
        return (left, top, right, bottom)

    @property
    def rect(self):
        """The bounding box as a ``fitz.Rect``."""
        return fitz.Rect(*self.bbox)

    @property
    def line_height(self):
        """Height of the line's bounding rectangle."""
        return self.rect.height
    

### TextSegment

In [308]:
class TextSegment(list):
    """A list of spans treated as one block of text.

    Besides the plain list behaviour, a segment exposes aggregate typography
    (dominant font, size and span height, each weighted by character count),
    its bounding box, its spans grouped into lines, and links to the segments
    before and after it.
    """

    _previous_segment = None
    _next_segment = None

    def __init__(self, spans: list = []):
        # The default list is only copied by list.__init__, never mutated.
        super().__init__(spans)

    def _weighted_mode(self, key, fallback):
        """Return the ``key(span)`` value covering the most characters, or ``fallback`` if empty."""
        tally = defaultdict(int)
        for span in self:
            tally[key(span)] += len(span.text)
        if not tally:
            return fallback
        return max(tally, key=tally.get)

    @property
    def text(self):
        """The list representation of the spans."""
        return repr(self)

    @property
    def font(self):
        """Font name used by the most characters; ``""`` for an empty segment."""
        return self._weighted_mode(lambda span: span.font, "")

    @property
    def size(self):
        """Font size used by the most characters; ``0`` for an empty segment."""
        return self._weighted_mode(lambda span: span.size, 0)

    @property
    def bbox(self):
        """``(x0, y0, x1, y1)`` enclosing every span; raises ValueError when empty."""
        left = min(span.bbox[0] for span in self)
        top = min(span.bbox[1] for span in self)
        right = max(span.bbox[2] for span in self)
        bottom = max(span.bbox[3] for span in self)
        return (left, top, right, bottom)

    @property
    def rect(self):
        """The bounding box as a ``fitz.Rect``."""
        return fitz.Rect(*self.bbox)

    @property
    def line_height(self):
        """Span rectangle height shared by the most characters; ``0`` when empty."""
        return self._weighted_mode(lambda span: span.rect.height, 0)

    @property
    def page_index(self):
        """Page index of the first span; raises IndexError when empty."""
        first_span = self[0]
        return first_span.page_index

    @property
    def previous_segment(self):
        """The segment linked before this one, or None."""
        return self._previous_segment

    @previous_segment.setter
    def previous_segment(self, segment):
        self._previous_segment = segment

    @property
    def next_segment(self):
        """The segment linked after this one, or None."""
        return self._next_segment

    @next_segment.setter
    def next_segment(self, segment):
        self._next_segment = segment

    @property
    def has_consistent_font_family(self):
        """True when every span shares the font family of the dominant font."""
        dominant_font = self.font
        return all(is_same_font_family(span.font, dominant_font) for span in self)

    @property
    def is_bold(self):
        """True when every span is bold (also True for an empty segment)."""
        return all(span.is_bold for span in self)

    @property
    def number_of_lines(self):
        """How many distinct lines the spans fall on."""
        return len(self.lines)

    @property
    def lines(self):
        """Spans grouped into ``TextLine`` objects by (page index, line number), in first-seen order."""
        spans_by_line = defaultdict(list)
        for span in self:
            spans_by_line[f"{span.page_index}-{span.line_number}"].append(span)
        return [TextLine(members) for members in spans_by_line.values()]
    

### TextSpan

In [309]:
class TextSpan():
    """Read-only wrapper around one span dictionary plus its position in the document.

    The constructor copies the ``text``, ``size``, ``flags``, ``font``, ``color``,
    ``bbox`` and ``origin`` entries of ``span`` and stores the page index and line
    number supplied by the caller.
    """

    # Class-level defaults; every instance overwrites them in __init__.
    _text = ""
    _size = 0
    _flag = 0
    _font = ""
    _color = ""
    _bbox = (0, 0, 0, 0)
    _origin = (0, 0)

    _page_index = 0
    _line_number = 0

    # Substrings of a lower-cased font name that mark the font as bold.
    # NOTE(review): "extra" and "ultra" would also match names such as
    # "ExtraLight" / "UltraLight" - confirm this is acceptable.
    _BOLD_FONT_KEYWORDS = ("bold", "black", "heavy", "extra", "ultra")

    def __init__(self, span, page_index, line_number):
        """Store the fields of ``span`` together with ``page_index`` and ``line_number``."""
        self._text = span["text"]
        self._size = span["size"]
        self._flag = span["flags"]
        self._font = span["font"]
        self._color = span["color"]
        self._bbox = span["bbox"]
        self._origin = span["origin"]

        self._page_index = page_index
        self._line_number = line_number

    def __repr__(self):
        """The span text with surrounding whitespace removed."""
        return self._text.strip()

    def __str__(self):
        return self.__repr__()

    @property
    def text(self):
        """The raw span text (not stripped)."""
        return self._text

    @property
    def size(self):
        return self._size

    @property
    def flag(self):
        """The span's ``flags`` value."""
        return self._flag

    @property
    def font(self):
        return self._font

    @property
    def color(self):
        return self._color

    @property
    def bbox(self):
        return self._bbox

    @property
    def rect(self):
        """The bounding box as a ``fitz.Rect``."""
        return fitz.Rect(self._bbox)

    @property
    def origin(self):
        return self._origin

    @property
    def page_index(self):
        return self._page_index

    @property
    def line_number(self):
        return self._line_number

    @property
    def is_bold(self):
        """True when the font name contains a bold keyword or the bold flag bit is set.

        The flag check used to sit after an unconditional ``return False`` and so
        never ran. It is now the fallback for fonts whose name carries no keyword.
        """
        font = self.font.lower()
        if any(keyword in font for keyword in self._BOLD_FONT_KEYWORDS):
            return True

        # Bit 4 of the flags is the "bold" bit.
        return bool(self.flag & 2 ** 4)

## Transform Spans

In [310]:
# Flatten the document into one list of TextSpan objects in reading order.
all_spans = []

for page_index, page in enumerate(doc):
    # Running line counter; restarts at 0 on every page and spans all blocks of the page.
    line_number = 0

    for block in page.get_text("dict")["blocks"] or []:
        # Blocks without a "lines" entry contribute nothing.
        for line in block.get("lines") or []:
            all_spans.extend(
                TextSpan(raw_span, page_index, line_number) for raw_span in line["spans"]
            )
            line_number += 1


## Body text information

In [311]:
# Weighted histograms of span attributes; the heaviest entry of each one is taken
# as the document's "body text" style.
all_text_sizes = defaultdict(int)
all_text_fonts = defaultdict(int)
all_text_fonts_families = defaultdict(int)
all_text_line_heights = defaultdict(int)

for span in all_spans:
    char_count = len(span.text)

    # Spans shorter than 20 characters are ignored.
    if char_count < 20:
        continue

    # Quadratic weighting, y = (0.01 * x)^2, so long spans dominate.
    # Plot: https://www.desmos.com/calculator/8hnjw2c8i7
    weight = (char_count * 0.01) ** 2

    for histogram, key in (
        (all_text_sizes, span.size),
        (all_text_fonts, span.font),
        (all_text_fonts_families, span.font.split("-")[0]),
        (all_text_line_heights, span.rect.height),
    ):
        histogram[key] += weight


body_text_size = max(all_text_sizes, key=all_text_sizes.get)
body_text_font = max(all_text_fonts, key=all_text_fonts.get)
all_text_fonts_family = max(all_text_fonts_families, key=all_text_fonts_families.get)
body_text_line_height = max(all_text_line_heights, key=all_text_line_heights.get)


(body_text_size, body_text_font, all_text_fonts_family, body_text_line_height)

(7.636900424957275, 'FrutigerLTStd-Roman', 'FrutigerLTStd', 9.539276123046875)

In [312]:
# Font families ordered by accumulated weight (heaviest first), shown next to the raw weights.
sorted(all_text_fonts_families, key=all_text_fonts_families.get, reverse=True), all_text_fonts_families

(['FrutigerLTStd', 'HelveticaNeueLTPro', 'OCRBStd'],
 defaultdict(int,
             {'HelveticaNeueLTPro': 15.860600000000003,
              'FrutigerLTStd': 703.4048999999991,
              'OCRBStd': 0.0529}))

In [313]:
# Group consecutive spans into TextSegments. Each span is either appended to the
# segment currently being built or closes it and starts a new one; closed segments
# are collected in all_segments and chained through previous_segment / next_segment.
# NOTE(review): all_spans[0] raises IndexError when no spans were extracted.
all_segments = []

segment = TextSegment([all_spans[0]])

# NOTE(review): span_index is never read in this cell.
for span_index, span in enumerate(all_spans[1:]):
    last_span = segment[-1]
    # Last line of the segment under construction (segment.lines regroups the spans on each access).
    last_line = segment.lines[-1]

    print("\n\n")


    # Use the last span with non-whitespace text as the comparison anchor;
    # last_span stays segment[-1] when every span in the segment is blank.
    for s in reversed(segment):
        if s.text.strip():
            last_span = s
            break



    # Debug trace: the lines collected so far.
    print(segment.lines)

    # Typography of the candidate span compared with the segment's last line.
    is_same_font_fam = is_same_font_family(span.font, last_line.font)
    is_same_font = span.font == last_line.font  # NOTE(review): not read anywhere else in this cell
    is_same_size = is_within_range(span.size, last_line.size, 0.2)  # sizes may differ by at most 0.2

    # Gaps between the candidate span and the anchor span / last line.
    vertical_distance = span.rect.y0 - last_line.rect.y1  # NOTE(review): not read anywhere else in this cell
    horizontal_distance = span.rect.x0 - last_span.rect.x1  # anchor's right edge to the span's left edge
    left_distance = span.rect.x0 - last_span.rect.x0  # NOTE(review): not read anywhere else in this cell

    # Gap between the bottom of the whole segment and the top of the candidate span.
    vertical_distance_to_segment = span.rect.y0 - segment.rect.y1
    vertical_distance_to_segment_abs = abs(vertical_distance_to_segment)  # NOTE(review): not read anywhere else in this cell

    # Same line means same line number on the same page as the anchor span.
    is_on_same_line = span.line_number == last_span.line_number and span.page_index == last_span.page_index
    # On the same line and starting at most 2 units right of the anchor's right edge
    # (presumably PDF points - TODO confirm).
    is_right_next_to = is_on_same_line and horizontal_distance <= 2
    # On another line, starting no more than 2 units below the segment's bottom edge,
    # overlapping it by at most half of the smaller of the two heights, and with the
    # horizontal gap at most half the width of both the last line and the span.
    is_below_next_to = not is_on_same_line and \
        vertical_distance_to_segment <= 2 and \
        vertical_distance_to_segment >= -min(span.rect.height * 0.5, last_line.rect.height * 0.5) and \
        horizontal_distance <= last_line.rect.width * 0.5 and \
        horizontal_distance <= span.rect.width * 0.5
        


    if_merge = False # True when the span should join the current segment
    # Debug trace: candidate span, anchor span and the same-line verdict.
    print(span, last_span, sep=' | ')

    print(is_on_same_line)

    # Spans directly to the right on the same line always merge.
    if is_right_next_to:
        print("right next to")
        if_merge = True

    # Spans directly below merge by default; each check that follows can veto the merge.
    if is_below_next_to:
        print("below next to")
        if_merge = True

        # Veto: a bold span narrower than a non-bold anchor.
        if span.is_bold and not last_span.is_bold and span.rect.width < last_span.rect.width:
            print("bold and not bold")
            if_merge = False

        # Veto: a non-bold span wider than a bold anchor.
        if not span.is_bold and last_span.is_bold and span.rect.width > last_span.rect.width:
            print("not bold and bold")
            if_merge = False

        # If height differs too much, don't merge
        # (tolerance is 40% of the smaller of the span's and the anchor span's heights).
        if not is_within_range(span.rect.height, last_line.rect.height, min(span.rect.height * 0.4, last_span.rect.height * 0.4)):
            print("not same height")
            if_merge = False

        # Veto: same font family but a different size.
        if is_same_font_fam and not is_same_size:
            print("not same size")
            if_merge = False

        # Veto: different font family.
        if not is_same_font_fam:
            print("not same font family")
            if_merge = False


    if if_merge:
        segment.append(span)
    else: 
        # Close the current segment and start a new one, linking the two in both directions.
        all_segments.append(segment)
        new_segment = TextSegment([span])

        segment.next_segment = new_segment
        new_segment.previous_segment = segment

        segment = new_segment


# The segment still being built when the loop ends has not been stored yet.
all_segments.append(segment)





[[PULL HERE TO OPEN]]
Sale, use and distribution of this product in Nassau and Suffolk Counties in the State of New York is prohibited. | PULL HERE TO OPEN
False



[[Sale, use and distribution of this product in Nassau and Suffolk Counties in the State of New York is prohibited.]]
A Herbicide for Control of Annual Grass and Broadleaf Weeds in Field Corn, | Sale, use and distribution of this product in Nassau and Suffolk Counties in the State of New York is prohibited.
False



[[A Herbicide for Control of Annual Grass and Broadleaf Weeds in Field Corn,]]
Seed Corn, Silage Corn, Sweet Corn and Yellow Popcorn | A Herbicide for Control of Annual Grass and Broadleaf Weeds in Field Corn,
False
below next to



[[A Herbicide for Control of Annual Grass and Broadleaf Weeds in Field Corn,], [Seed Corn, Silage Corn, Sweet Corn and Yellow Popcorn]]
Active Ingredients | Seed Corn, Silage Corn, Sweet Corn and Yellow Popcorn
False



[[Active Ingredients]]
: | Active Ingredients
True
right next

In [314]:
def is_title(segment):
    """Heuristically decide whether ``segment`` is a title.

    The segment is compared with the notebook-level body text statistics
    (``body_text_font``, ``body_text_size``, ``body_text_line_height``). The
    rules run in order and the first one that applies decides. Each decision
    prints its reason.

    Returns False when:
      * the spans do not share one font family,
      * the segment has more than 3 lines,
      * it uses the body font family at a size not larger than the body size,
      * its line height does not exceed the body line height.

    Returns True when:
      * it is bold and at least 5% taller than body text,
      * it is bold, at least 2% taller, and uses a different font family,
      * it is at least 20% taller than body text,
      * it is a single bold line (any height above body text).

    Anything else returns False.
    """
    size = segment.size
    height = segment.line_height
    is_consistent = segment.has_consistent_font_family
    is_bold = segment.is_bold
    number_of_lines = segment.number_of_lines

    print(segment, height, body_text_line_height, is_bold, segment.font, number_of_lines)

    if not is_consistent:
        print("not consistent")
        return False

    if number_of_lines > 3:
        print("number of lines too many", number_of_lines)
        return False

    shares_body_family = is_same_font_family(segment.font, body_text_font)

    if shares_body_family and size <= body_text_size:
        print("same with body font family and size and size smaller")
        return False

    if height <= body_text_line_height:
        print("height too small")
        return False

    if is_bold and height >= body_text_line_height * 1.05:
        print("is bold and height big")
        return True

    if is_bold and height >= body_text_line_height * 1.02 and not shares_body_family:
        print("is bold and height relatively big and font different")
        return True

    if height >= body_text_line_height * 1.2:
        print("height big")
        return True

    # This rule used to read the unrelated global `line_number` and never returned,
    # so matching segments fell through to the "uncatched reason" rejection.
    if number_of_lines == 1 and is_bold and height >= body_text_line_height * 1.0:
        print("one line and bold and height big")
        return True

    print("uncatched reason")
    return False
    

In [315]:
# Annotate the pages: every span gets a thin outline and its segment number, and
# segments classified as titles get an extra thick outline.
# A fixed seed keeps the colours, and therefore the saved PDF, the same on every run.
# NOTE(review): re-running this cell draws on top of the earlier annotations.
color_rng = np.random.default_rng(0)

for segment_index, segment in enumerate(all_segments):
    # One random RGB colour per segment, components in [0, 1).
    color = color_rng.random(3).tolist()
    segment_page = pages[segment.page_index]

    if is_title(segment):
        segment_page.draw_rect(segment.rect, color=color, width=2, overlay=True, stroke_opacity=0.8)

    for span in segment:
        # A span is drawn on its own page, which may differ from the segment's first page.
        span_page = pages[span.page_index]
        span_rect = span.rect

        # Put the segment number just left of the span's top-left corner.
        label_point = span_rect.tl
        label_point.x -= 10
        label_point.y += 8

        span_page.draw_rect(span_rect, color=color, width=0.6, overlay=True, stroke_opacity=0.5)
        span_page.insert_text(label_point, f"{segment_index}", color=color, fontname="helvetica-bold", fontsize=8, overlay=True)


[PULL HERE TO OPEN] 6.646202087402344 9.539276123046875 False HelveticaNeueLTPro-Roman 1
height too small
[Sale, use and distribution of this product in Nassau and Suffolk Counties in the State of New York is prohibited.] 9.946640014648438 9.539276123046875 False HelveticaNeueLTPro-Bd 1
uncatched reason
[A Herbicide for Control of Annual Grass and Broadleaf Weeds in Field Corn,, Seed Corn, Silage Corn, Sweet Corn and Yellow Popcorn] 11.39764404296875 9.539276123046875 False HelveticaNeueLTPro-Roman 2
uncatched reason
[Active Ingredients, :, S, -Metolachlor: (CAS No. 87392-12-9) . . . ., . . . . . ., . ., . . . . . . . . . . . . . . . . . . . . . . . ., . . . . ., ., . 23.40%, Atrazine, *, : (CAS No. 1912-24-9), . ., . ., . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ., 10.93%, Mesotrione: (CAS No. 104206-82-8) . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 2.60%, Bicyclopyrone: (CAS No. 352010-68-5), . . .

In [316]:
# Write the document, including the rectangles and labels drawn above, to output_pdf_path.
doc.save(output_pdf_path)