In [2619]:
# Select the document to process by changing PDF_NAME only. The input and
# output paths are both derived from it, so they can never point at
# different documents and no earlier assignment is silently overwritten.
#
# File names used with this notebook so far:
#   "Ceridian-2-EC-SDS-v2.1.pdf"
#   "acuron-label.pdf"
#   "Blue_Book_2023_web.pdf"
#   "Ceridian-2-EC-Supplemental-Label.pdf"
PDF_NAME = "Blue_Book_2023_web.pdf"

pdf_path = f"test/pdf/dc/form/{PDF_NAME}"  # PDF that is read
output_pdf_path = f"test/target/dc/form/{PDF_NAME}"  # annotated copy that is written

In [2620]:
import fitz
import json
import pandas as pd
import numpy as np
from pprint import pprint
from collections import defaultdict
from typing import Iterable


In [2621]:
# Open the PDF at pdf_path with PyMuPDF; `doc` is the document object the
# following cells iterate over, draw on and finally save.
doc = fitz.open(pdf_path)

In [2622]:
# Display the crop box of the first page (page index 0) as the cell output.
doc.page_cropbox(0)

Rect(0.0, 0.0, 585.0, 756.0)

In [2623]:
class TextSpan():
    """A group of PDF text spans that is treated as one piece of text.

    Every span is a mapping that provides at least the keys read by this
    class: ``"text"`` (str), ``"font"``, ``"size"`` and ``"bbox"`` (a
    sequence ``(x0, y0, x1, y1)``).
    """

    def __init__(self, page_index: int, spans: Iterable = ()):
        """Create a group for ``page_index`` holding ``spans``.

        The spans are copied into a new list, so every instance owns its own
        mutable list (no list is shared through a default argument) and
        ``.spans.append(...)`` always works, whatever iterable was passed.
        """
        self._page_index = page_index
        self._spans = list(spans)

    @property
    def page_index(self):
        """Index of the page the spans belong to, as given to the constructor."""
        return self._page_index

    @property
    def spans(self):
        """The list of span mappings in this group (mutable, owned by the instance)."""
        return self._spans

    @spans.setter
    def spans(self, spans):
        # Same copy-into-a-list rule as the constructor.
        self._spans = list(spans)

    def __repr__(self):
        # The representation is simply the concatenated span texts.
        return "".join(span["text"] for span in self._spans)

    def __str__(self):
        return self.__repr__()

    @property
    def text(self):
        """Concatenation of the ``"text"`` of all spans, in list order."""
        return self.__repr__()

    def _dominant(self, key, default):
        """Return the ``span[key]`` value that covers the most characters.

        Each span votes with ``len(span["text"])``; ties go to the value seen
        first. ``default`` is returned when no span has been counted.
        """
        weights = defaultdict(int)
        for span in self._spans:
            weights[span[key]] += len(span["text"])
        return max(weights, key=weights.get) if weights else default

    @property
    def font(self):
        """Font covering the most characters, or ``""`` without spans."""
        return self._dominant("font", "")

    @property
    def size(self):
        """Font size covering the most characters, or ``0`` without spans."""
        return self._dominant("size", 0)

    @property
    def bbox(self):
        """Smallest ``(x0, y0, x1, y1)`` box enclosing every span's bbox.

        Taking min/max over the span boxes themselves (instead of seeding the
        maxima with 0) keeps the result correct for negative coordinates and
        avoids ``np.Inf``, which no longer exists in NumPy 2.x. A group
        without spans yields the degenerate box ``(0.0, 0.0, 0.0, 0.0)``.
        """
        boxes = [span["bbox"] for span in self._spans]
        if not boxes:
            return (0.0, 0.0, 0.0, 0.0)
        return (
            min(box[0] for box in boxes),
            min(box[1] for box in boxes),
            max(box[2] for box in boxes),
            max(box[3] for box in boxes),
        )

In [2624]:
def within_range(value, target, tolerance):
    """Return True when ``value`` is at most ``tolerance`` away from ``target``.

    Both ends of the interval ``[target - tolerance, target + tolerance]``
    are inclusive.
    """
    lower_bound = target - tolerance
    upper_bound = target + tolerance
    return lower_bound <= value <= upper_bound

def is_same_font_family(font_a, font_b):
    """Return True when both font names share the part before the first ``-``.

    A name without ``-`` is compared as a whole, so ``"Arial"`` and
    ``"Arial-Bold"`` count as the same family.
    """
    family_a, _, _ = font_a.partition("-")
    family_b, _, _ = font_b.partition("-")
    return family_a == family_b

In [2625]:
def merge_spans(spans, page_index):
    """Group consecutive spans of one page into ``TextSpan`` objects.

    The spans are walked in the given order. Each span is compared with the
    most recent non-empty span of the group currently being built and is
    either appended to that group or starts a new one.

    Args:
        spans: Sequence of span mappings; the keys ``"bbox"``, ``"font"`` and
            ``"text"`` are read here.
        page_index: Page index stored on every created ``TextSpan``.

    Returns:
        List of ``TextSpan`` objects; empty when ``spans`` is empty.
    """
    texts = []

    if len(spans) == 0:
        return texts

    text = TextSpan(page_index, [spans[0]])
    for span in spans[1:]:
        # Reference span: the last non-empty span of the current group. When
        # the group only holds whitespace so far, fall back to its own last
        # span - not to the last span of the whole page, which has nothing to
        # do with the position reached so far.
        last_span = text.spans[-1]
        for s in reversed(text.spans):
            if s["text"].strip():
                last_span = s
                break

        span_rect = fitz.Rect(span["bbox"])
        last_span_rect = fitz.Rect(last_span["bbox"])

        # Gaps between the new span and the reference span.
        vertical_distance = abs(span_rect.y0 - last_span_rect.y1)    # top of new vs bottom of last
        horizontal_distance = abs(span_rect.x0 - last_span_rect.x1)  # left of new vs right of last
        top_distance = abs(span_rect.y0 - last_span_rect.y0)         # difference of top edges
        left_distance = abs(span_rect.x0 - last_span_rect.x0)        # difference of left edges

        # True for merge and False for not merge. The rules below run in
        # order, so a later rule can overrule the decision of an earlier one.
        flag = False

        # If on the same line and the distance is small
        if top_distance <= 2 and horizontal_distance <= 2:
            flag = True

            if not within_range(span_rect.height, last_span_rect.height, 0.2):
                flag = False

        # If on the same column and the distance is small
        if left_distance <= 2 and vertical_distance <= 2:
            flag = True

            if not within_range(span_rect.height, last_span_rect.height, 1):
                flag = False

            if not is_same_font_family(span["font"], last_span["font"]):
                flag = False

        # If on the same line and the distance is relatively small, but has the same font
        if top_distance <= last_span_rect.height * 0.8 \
            and horizontal_distance <= 0.1 \
            and is_same_font_family(span["font"], last_span["font"]):
            flag = True

        # If vertically beside each other and have the same font
        if vertical_distance <= 1 and horizontal_distance > 10 and span["font"] == last_span["font"]:
            flag = True

        if flag:
            text.spans.append(span)
        else:
            # Close the current group and start a new one with this span.
            texts.append(text)
            text = TextSpan(page_index, [span])

    # The group still being built when the loop ends is part of the result.
    texts.append(text)

    return texts

In [2626]:
# pages_texts[i] holds the merged TextSpan groups of page i.
pages_texts = []

for page_index, page in enumerate(doc):
    # Extract the page structure once (the extraction used to be run twice per
    # page) and flatten blocks -> lines -> spans into a single span list.
    # Blocks without a "lines" entry and lines without "spans" contribute nothing.
    blocks = page.get_text("dict")["blocks"]
    spans = [
        span
        for block in blocks
        for line in block.get("lines") or []
        for span in line.get("spans") or []
    ]

    texts = merge_spans(spans, page_index)

    pages_texts.append(texts)

    # Debug overlay: outline every span and label it with the index of the
    # group it was merged into, using one random colour per group.
    for text_index, text in enumerate(texts):
        color = list(np.random.rand(3,))

        for span in text.spans:
            rect = fitz.Rect(span["bbox"])
            # Place the label slightly left of / below the span's top-left corner.
            point = rect.tl
            point.x -= 10
            point.y += 8

            page.draw_rect(rect, color=color, width=0.6, overlay=True, stroke_opacity=0.5)
            page.insert_text(point, f"{text_index}", color=color, fontname="helvetica-bold", fontsize=8, overlay=True)


In [None]:
# Dominant font size of every merged text on every page, in page order.
all_text_sizes = [
    text.size
    for page_texts in pages_texts
    for text in page_texts
]

# Unweighted mean over all merged texts; displayed as the cell output and used
# as the reference size for title detection.
mean_text_size = np.mean(all_text_sizes)
mean_text_size

8.671246054271856

In [None]:
# Lower-case substrings of a font name that indicate a heavy weight
# (each keyword listed once).
BOLD_FONTS_KEYWORDS = ["bold", "black", "heavy", "extra", "ultra"]

# "extra"/"ultra" only intensify the word that follows them: "UltraLight" or
# "ExtraCondensed" are not bold, while a bare "Ultra" or "ExtraBold" is.
_INTENSIFIER_KEYWORDS = ("extra", "ultra")
_NON_BOLD_STYLES = (
    "light", "thin", "condensed", "compressed", "narrow", "expanded", "extended",
)


def is_bold_font(font):
    """Return True when the font name suggests a bold (heavy) weight.

    The match is case-insensitive and substring based. An intensifier keyword
    ("extra", "ultra") counts only when it is not directly followed - ignoring
    spaces, hyphens and underscores - by a non-bold style word such as
    "light" or "condensed".

    Args:
        font: Font name, e.g. ``"AkzidenzGroteskBE-Bold"``.

    Returns:
        bool: True if any keyword of ``BOLD_FONTS_KEYWORDS`` matches.
    """
    font = font.lower()
    for keyword in BOLD_FONTS_KEYWORDS:
        start = font.find(keyword)
        while start != -1:
            if keyword not in _INTENSIFIER_KEYWORDS:
                return True
            following = font[start + len(keyword):].lstrip(" -_")
            if not following.startswith(_NON_BOLD_STYLES):
                return True
            # This occurrence qualified a non-bold style; look for another one.
            start = font.find(keyword, start + 1)
    return False


In [None]:
def is_title(text, size, font, mean_size=None):
    """Heuristically decide whether a merged text looks like a title.

    Every decision except the "too short" rejection is printed for debugging;
    the printed thresholds are built from the same constants the comparisons
    use, so the log cannot disagree with the logic.

    Args:
        text: Text content to classify.
        size: Font size of the text.
        font: Font name of the text (passed to ``is_bold_font``).
        mean_size: Reference font size. When omitted, the notebook-level
            ``mean_text_size`` is used.

    Returns:
        bool: True when the text is considered a title.
    """
    large_ratio = 1.2   # this much larger than the mean is a title on its own
    bold_ratio = 1.02   # slightly larger than the mean is enough when bold

    # Blank or very short texts are never titles.
    if text.strip() == "" or len(text) <= 3:
        return False

    # Sentence-like punctuation rules a text out unless it is all upper case.
    if ("." in text or ":" in text) and text != text.upper():
        print(text, "invalid chars")
        return False

    # Resolve the reference size only once it is actually needed.
    if mean_size is None:
        mean_size = mean_text_size

    if size >= mean_size * large_ratio:
        print(text, size, font, f"size >= mean_text_size * {large_ratio}")
        return True

    if size >= mean_size * bold_ratio and is_bold_font(font):
        print(text, size, font, f"size >= mean_text_size * {bold_ratio} and is_bold")
        return True

    print(text, "not title")
    return False
    

In [None]:
# Outline every merged text that is classified as a title with a red rectangle.
# The cell only reads pages_texts / mean_text_size: it no longer appends to
# all_text_sizes, which grew that list on every run without affecting the mean.
for page_index, page in enumerate(doc):
    page_texts = pages_texts[page_index]

    for text in page_texts:
        if not is_title(text.text, text.size, text.font):
            continue

        print(text.bbox)
        rect = fitz.Rect(text.bbox)
        page.draw_rect(rect, color=(1, 0, 0), width=1.2, overlay=True, stroke_opacity=0.9)

Manufactured for:Atticus, LLC940 NW Cary Parkway, Suite 200  Cary, NC 27513 invalid chars
1. IDENTIFICATION 10.0 AkzidenzGroteskBE-Bold size >= mean_span_size * 0.08 and is_consistent and is_bold
(31.0, 209.22999572753906, 122.92298889160156, 221.0)
PRODUCT NAME: 9.0 AkzidenzGroteskBE-Bold size >= mean_span_size * 0.08 and is_consistent and is_bold
(27.0, 229.40699768066406, 104.08590698242188, 240.0)
DESCRIPTION: 8.954887390136719 AkzidenzGroteskBE-Bold size >= mean_span_size * 0.08 and is_consistent and is_bold
(27.0, 244.15699768066406, 92.55979919433594, 254.75)
EPA REG. NO.: 9.0 AkzidenzGroteskBE-Bold size >= mean_span_size * 0.08 and is_consistent and is_bold
(27.0, 258.9070129394531, 90.58679962158203, 269.5)
COMPANY IDENTIFICATION: 8.954887390136719 AkzidenzGroteskBE-Bold size >= mean_span_size * 0.08 and is_consistent and is_bold
(27.0, 275.6070251464844, 101.85560607910156, 297.0)
Ceridian 2 EC 9.0 AkzidenzGroteskBE-Bold size >= mean_span_size * 0.08 and is_consistent and is_

In [None]:
# Write the document, including the rectangles and index labels drawn on its
# pages by the cells above, to output_pdf_path.
doc.save(output_pdf_path)