In [1364]:
# Name of the PDF under test. The input file and the annotated output share
# this name, so switching documents only requires changing this one value.
# Other sample files that have been used with this notebook:
#   "acuron-label.pdf"
#   "Blue_Book_2023_web.pdf"
#   "Ceridian-2-EC-Supplemental-Label.pdf"
#   "OMDXE11749.pdf"
pdf_name = "Ceridian-2-EC-SDS-v2.1.pdf"

# Source PDF that is read, and the location the annotated copy is written to.
pdf_path = f"test/pdf/dc/form/{pdf_name}"
output_pdf_path = f"test/target/dc/form/{pdf_name}"

In [1365]:
from collections import defaultdict
from pathlib import Path
from pprint import pprint
from typing import Iterable

import fitz
import numpy as np
import pandas as pd


In [1366]:
# Open the source PDF; `doc` is the document handle the following cells read
# pages from, draw on, and finally save.
doc = fitz.open(pdf_path)

In [1367]:
# Crop box of the first page only; other pages are not inspected here.
first_page_cropbox = doc.page_cropbox(0)

# Use the rectangle's extent rather than its x1 / y1 corner so the values are
# still the page size when the crop box does not start at (0, 0).
page_width = first_page_cropbox.width
page_height = first_page_cropbox.height

page_width, page_height

(612.0, 792.0)

In [1368]:
# Keep one page object per page so later cells can look pages up by index.
pages = [page for page in doc]

## Helper functions & classes

### Functions

In [1369]:
def is_within_range(value, target, tolerance):
    """Return True when `value` lies in [target - tolerance, target + tolerance]."""
    lower_bound = target - tolerance
    upper_bound = target + tolerance
    return lower_bound <= value <= upper_bound

def is_same_font_family(font_a, font_b):
    """Tell whether two font names share the same text before their first hyphen."""
    family_a, _, _ = font_a.partition("-")
    family_b, _, _ = font_b.partition("-")
    return family_a == family_b

def flags_decomposer(flags):
    """Make font flags human readable.

    Bits 0, 1 and 4 add "superscript", "italic" and "bold" when set; bits 2
    and 3 always contribute one of two labels ("serifed"/"sans" and
    "monospaced"/"proportional"). The labels are joined with ", ".
    """
    labels = []
    if flags & 1:
        labels.append("superscript")
    if flags & 2:
        labels.append("italic")
    labels.append("serifed" if flags & 4 else "sans")
    labels.append("monospaced" if flags & 8 else "proportional")
    if flags & 16:
        labels.append("bold")
    return ", ".join(labels)

### TextSegment

In [1370]:
class TextSegment(list):
    """A list of text spans that are treated as one visual unit.

    The segment itself is a plain ``list`` of span objects. The properties
    below read ``text``, ``font``, ``size``, ``bbox``, ``rect`` and
    ``page_index`` from those spans. Segments can additionally be linked to
    their neighbours through ``previous_segment`` / ``next_segment``.
    """

    # Class-level defaults; the setters below store per-instance values.
    _previous_segment = None
    _next_segment = None

    def __init__(self, spans: "list | None" = None):
        """Create a segment from `spans` (an empty segment when omitted)."""
        # `None` replaces the former shared mutable `[]` default argument.
        super().__init__(spans if spans is not None else ())

    def _dominant(self, value_of, default):
        """Return the span value covering the most characters, or `default`.

        `value_of` maps a span to the value being tallied. Each span votes
        with ``len(span.text)``; on a tie the value seen first wins.
        """
        weights = defaultdict(int)
        for span in self:
            weights[value_of(span)] += len(span.text)
        return max(weights, key=weights.get) if weights else default

    @property
    def text(self):
        """The list representation of the segment (bracketed span reprs)."""
        return self.__repr__()

    @property
    def font(self):
        """Font name covering the most characters; "" for an empty segment."""
        return self._dominant(lambda span: span.font, "")

    @property
    def size(self):
        """Font size covering the most characters; 0 for an empty segment."""
        return self._dominant(lambda span: span.size, 0)

    @property
    def bbox(self):
        """Smallest (x0, y0, x1, y1) box enclosing every span.

        Raises:
            ValueError: if the segment contains no spans.
        """
        if not self:
            raise ValueError("cannot compute the bounding box of an empty TextSegment")
        x0 = min(span.bbox[0] for span in self)
        y0 = min(span.bbox[1] for span in self)
        x1 = max(span.bbox[2] for span in self)
        y1 = max(span.bbox[3] for span in self)
        return (x0, y0, x1, y1)

    @property
    def rect(self):
        """The bounding box as a ``fitz.Rect``."""
        return fitz.Rect(*self.bbox)

    @property
    def line_height(self):
        """Span rect height covering the most characters; 0 when empty."""
        return self._dominant(lambda span: span.rect.height, 0)

    @property
    def page_index(self):
        """Page index of the first span (raises IndexError when empty)."""
        return self[0].page_index

    @property
    def previous_segment(self):
        """The segment linked before this one, or None."""
        return self._previous_segment

    @previous_segment.setter
    def previous_segment(self, segment):
        self._previous_segment = segment

    @property
    def next_segment(self):
        """The segment linked after this one, or None."""
        return self._next_segment

    @next_segment.setter
    def next_segment(self, segment):
        self._next_segment = segment

        

### TextSpan

In [1371]:
class TextSpan():
    """Read-only view of one span dictionary, tagged with its page and line.

    The constructor copies the "text", "size", "flags", "font", "color",
    "bbox" and "origin" entries of `span` and stores the given page index and
    line number next to them.
    """

    # Class-level fallbacks; __init__ overwrites every one of them per instance.
    _text = ""
    _size = 0
    _flag = 0
    _font = ""
    _color = ""
    _bbox = (0, 0, 0, 0)
    _origin = (0, 0)

    _page_index = 0
    _line_number = 0

    def __init__(self, span, page_index, line_number):
        # Where the span sits in the document.
        self._page_index = page_index
        self._line_number = line_number

        # Values copied from the span dictionary.
        self._text = span["text"]
        self._font = span["font"]
        self._size = span["size"]
        self._flag = span["flags"]
        self._color = span["color"]
        self._bbox = span["bbox"]
        self._origin = span["origin"]

    def __repr__(self):
        # The representation is the text without surrounding whitespace.
        stripped = self._text.strip()
        return stripped

    def __str__(self):
        return repr(self)

    @property
    def text(self):
        """The span text exactly as stored (not stripped)."""
        return self._text

    @property
    def size(self):
        """The "size" entry of the span dictionary."""
        return self._size

    @property
    def flag(self):
        """The "flags" entry of the span dictionary."""
        return self._flag

    @property
    def font(self):
        """The "font" entry of the span dictionary."""
        return self._font

    @property
    def color(self):
        """The "color" entry of the span dictionary."""
        return self._color

    @property
    def bbox(self):
        """The "bbox" entry of the span dictionary."""
        return self._bbox

    @property
    def rect(self):
        """The bounding box wrapped in a ``fitz.Rect``."""
        return fitz.Rect(self.bbox)

    @property
    def origin(self):
        """The "origin" entry of the span dictionary."""
        return self._origin

    @property
    def page_index(self):
        """Page index passed to the constructor."""
        return self._page_index

    @property
    def line_number(self):
        """Line number passed to the constructor."""
        return self._line_number

    @property
    def is_bold(self):
        """Bit 4 of the flags: 16 when set, 0 otherwise (use for truthiness)."""
        return self._flag & 16

## Transform Spans

In [1372]:
# Flatten every page into one list of TextSpan objects, in reading order of
# the extracted dictionary (page -> block -> line -> span).
all_spans = []

for page_index, page in enumerate(doc):
    # Running line counter, restarted on every page.
    line_number = 0

    for block in page.get_text("dict")["blocks"] or []:
        # Blocks without a "lines" entry contribute nothing.
        for line in block.get("lines") or []:
            all_spans.extend(
                TextSpan(span, page_index, line_number) for span in line["spans"]
            )
            line_number += 1


## Body text information

In [1373]:
# Histograms of font size, font name and span height over the whole document,
# where every span counts with the number of characters it holds.
all_text_sizes = defaultdict(int)
all_text_fonts = defaultdict(int)
all_text_line_heights = defaultdict(int)

for span in all_spans:
    char_count = len(span.text)
    all_text_sizes[span.size] += char_count
    all_text_fonts[span.font] += char_count
    all_text_line_heights[span.rect.height] += char_count

# The value covering the most characters is taken as the body-text value.
body_text_size, body_text_font, body_text_line_height = (
    max(histogram, key=histogram.get)
    for histogram in (all_text_sizes, all_text_fonts, all_text_line_heights)
)

body_text_size, body_text_font, body_text_line_height

(9.0, 'AkzidenzGroteskBE-Regula', 10.746002197265625)

In [1374]:
# Group consecutive spans into segments.
#
# A span joins the current segment when it either continues the same line
# ("right next to") or starts directly underneath the segment on the same page
# ("below next to"); otherwise it opens a new segment. Segments are linked to
# their neighbours through previous_segment / next_segment.

# Set to False to silence the per-span merge trace printed by this cell.
trace_merges = True
trace = print if trace_merges else (lambda *args, **kwargs: None)

all_segments = []
segment = None

for span in all_spans:
    if segment is None:
        # The first span opens the first segment; there is nothing to compare.
        segment = TextSegment([span])
        continue

    # Compare against the last non-empty span of the current segment, falling
    # back to the very last span when every span is whitespace only.
    last_span = next((s for s in reversed(segment) if s.text.strip()), segment[-1])

    span_rect = span.rect
    last_rect = last_span.rect
    segment_rect = segment.rect

    # Gap between the right edge of the previous span and this span's left edge.
    horizontal_distance = span_rect.x0 - last_rect.x1
    # Gap between the bottom of the segment so far and the top of this span.
    vertical_distance_to_segment = span_rect.y0 - segment_rect.y1

    is_on_same_page = span.page_index == last_span.page_index
    is_on_same_line = is_on_same_page and span.line_number == last_span.line_number
    is_right_next_to = is_on_same_line and horizontal_distance <= 2
    # A segment must stay on one page (TextSegment.page_index and bbox assume
    # it), so a span on another page never counts as "below".
    is_below_next_to = (
        is_on_same_page
        and not is_on_same_line
        and -3 <= vertical_distance_to_segment <= 2
        and horizontal_distance <= last_rect.width * 0.5
        and horizontal_distance <= span_rect.width * 0.5
    )

    should_merge = False
    trace(span)

    if is_right_next_to:
        trace("right next to")
        should_merge = True
    elif is_below_next_to:
        trace("below next to")
        should_merge = True

        # A shorter bold line under regular text is not merged.
        if span.is_bold and not last_span.is_bold and span_rect.width < last_rect.width:
            trace("bold and not bold")
            should_merge = False

        # A wider regular line under bold text is not merged either.
        if not span.is_bold and last_span.is_bold and span_rect.width > last_rect.width:
            trace("not bold and bold")
            should_merge = False

        # If height differs too much, don't merge
        if not is_within_range(span_rect.height, last_rect.height, segment.line_height * 0.6):
            trace("not same height")
            should_merge = False

    if should_merge:
        segment.append(span)
    else:
        # Close the current segment and start a new one, linking the two.
        all_segments.append(segment)
        new_segment = TextSegment([span])

        segment.next_segment = new_segment
        new_segment.previous_segment = segment

        segment = new_segment

# Flush the segment that was still open when the spans ran out (if any).
if segment is not None:
    all_segments.append(segment)


Atticus, LLC
below next to
bold and not bold
940 NW Cary Parkway, Suite 200
below next to
not bold and bold
Cary, NC 27513
below next to
1
1. IDENTIFICATION
PRODUCT NAME:
DESCRIPTION:
EPA REG. NO.:
COMPANY
IDENTIFICATION:
below next to
Ceridian 2 EC
A liquid herbicide.
91234-154
Atticus, LLC
940 NW Cary Parkway, Suite 200
below next to
not bold and bold
Cary, NC 27513
below next to
2. HAZARD IDENTIFICATION
May cause an allergic skin reaction
below next to
not bold and bold
(H317)
right next to
same line
Causes skin and eye irritation
below next to
(H315+H320)
right next to
same line
Suspected of causing cancer
below next to
(H351)
right next to
same line
Harmful if swallowed or if inhaled
below next to
(H302+H332)
right next to
same line
Combustible liquid
below next to
(H227)
right next to
same line
Harmful to aquatic life
below next to
(H402)
right next to
same line
HAZARD CLASSIFICATION
Health Hazards
below next to
Category
Sensitization-Skin
below next to
not bold and bold
1
Eye Da

In [1375]:
# Draw every span's rectangle and label it with the index of its segment.
# A fixed seed makes each run paint the same colour for the same segment, so
# the annotated PDF is reproducible.
color_rng = np.random.default_rng(0)

for segment_index, segment in enumerate(all_segments):
    # One random RGB colour per segment, as plain floats in [0, 1).
    color = color_rng.random(3).tolist()

    # Uncomment to outline the whole segment as well:
    # pages[segment.page_index].draw_rect(segment.rect, color=color, width=1.2, overlay=True, stroke_opacity=0.5)

    for span in segment:
        page = pages[span.page_index]
        rect = span.rect
        # Label anchor: shifted 10 left and 8 down from the span's top-left corner.
        label_point = fitz.Point(rect.x0 - 10, rect.y0 + 8)

        page.draw_rect(rect, color=color, width=0.6, overlay=True, stroke_opacity=0.5)
        page.insert_text(label_point, f"{segment_index}", color=color, fontname="helvetica-bold", fontsize=8, overlay=True)


In [1376]:
# Create the target directory if it is missing (saving fails otherwise), then
# write the annotated copy of the document.
Path(output_pdf_path).parent.mkdir(parents=True, exist_ok=True)
doc.save(output_pdf_path)