In [1123]:
# Input PDF to analyse (pdf_path) and where the annotated copy is saved
# (output_pdf_path). Keep exactly one pair active; the commented-out pairs
# are alternative test documents that can be swapped in.
# pdf_path = "test/pdf/dc/form/Ceridian-2-EC-SDS-v2.1.pdf"
# output_pdf_path = "test/target/dc/form/Ceridian-2-EC-SDS-v2.1.pdf"

# pdf_path = "test/pdf/dc/form/acuron-label.pdf"
# output_pdf_path = "test/target/dc/form/acuron-label.pdf"

pdf_path = "test/pdf/dc/form/Blue_Book_2023_web.pdf"
output_pdf_path = "test/target/dc/form/Blue_Book_2023_web.pdf"

# pdf_path = "test/pdf/dc/form/Ceridian-2-EC-Supplemental-Label.pdf"
# output_pdf_path = "test/target/dc/form/Ceridian-2-EC-Supplemental-Label.pdf"

In [1124]:
import fitz
import json
import pandas as pd
import numpy as np
from pprint import pprint
from collections import defaultdict
from typing import Iterable


In [1125]:
# Open the selected PDF; `doc` is reused by the cells below for text
# extraction, overlay drawing and saving.
doc = fitz.open(pdf_path)

In [1126]:
# Display the crop box of the first page (index 0).
doc.page_cropbox(0)

Rect(0.0, 0.0, 585.0, 756.0)

In [1127]:
class TextSpan():
    """A group of PDF spans on one page that is treated as a single piece of text.

    Each span is expected to be a dict providing the keys this class reads:
    "text" (str), "font" (str), "size" (number) and "bbox" (x0, y0, x1, y1).

    Args:
        page_index: Index of the page the spans belong to.
        spans: Initial spans. They are copied into a new list, so every
            instance owns its own list and appending to ``.spans`` never
            leaks into another instance or into the caller's list.
    """

    def __init__(self, page_index: int, spans: Iterable = ()):
        self._page_index = page_index
        # An immutable default plus an explicit copy avoids the shared
        # mutable-default pitfall (``.spans.append`` is used by callers).
        self._spans = list(spans)

    @property
    def page_index(self):
        """Index of the page this text belongs to."""
        return self._page_index

    @property
    def spans(self):
        """The list of span dicts making up this text (mutable)."""
        return self._spans

    @spans.setter
    def spans(self, spans):
        self._spans = list(spans)

    @property
    def text(self):
        """Concatenation of the "text" of every span, in order."""
        return "".join(span["text"] for span in self.spans)

    def __repr__(self):
        return self.text

    def __str__(self):
        return self.text

    @property
    def font(self):
        """Font covering the most characters, or "" when there are no spans."""
        font_counts = defaultdict(int)
        for span in self.spans:
            font_counts[span["font"]] += len(span["text"])
        return max(font_counts, key=font_counts.get) if font_counts else ""

    @property
    def size(self):
        """Font size covering the most characters, or 0 when there are no spans."""
        size_counts = defaultdict(int)
        for span in self.spans:
            size_counts[span["size"]] += len(span["text"])
        return max(size_counts, key=size_counts.get) if size_counts else 0

    @property
    def bbox(self):
        """Smallest (x0, y0, x1, y1) rectangle enclosing every span.

        Returns an all-zero rectangle when there are no spans, rather than
        a rectangle containing infinities.
        """
        boxes = [span["bbox"] for span in self.spans]
        if not boxes:
            return (0.0, 0.0, 0.0, 0.0)
        # True min/max over the spans: no artificial floor of 0 on x1/y1,
        # so negative coordinates are handled correctly.
        return (
            min(box[0] for box in boxes),
            min(box[1] for box in boxes),
            max(box[2] for box in boxes),
            max(box[3] for box in boxes),
        )

In [1128]:
def within_range(value, target, tolerance):
    """Return True when `value` lies in [target - tolerance, target + tolerance].

    Both ends of the interval are inclusive.
    """
    lower_bound = target - tolerance
    upper_bound = target + tolerance
    return lower_bound <= value <= upper_bound

In [1129]:
def merge_spans(spans, page_index):
    """Group consecutive spans of one page into TextSpan objects.

    Spans are walked in order. A span joins the current group when it has the
    same font as the group's reference span and either a similar bounding-box
    height (within 1 unit) or matching horizontal edges (within 2 units);
    otherwise the current group is closed and a new one is started.

    Args:
        spans: Sequence of span dicts with at least "text", "font" and "bbox".
        page_index: Page index stored on every TextSpan that is created.

    Returns:
        A list of TextSpan objects, empty when `spans` is empty.
    """
    if len(spans) == 0:
        return []

    texts = []
    text = TextSpan(page_index, [spans[0]])
    for span in spans[1:]:
        # Reference span: the last non-empty span of the current group. When
        # the group only holds whitespace so far, fall back to its most recent
        # span -- not to the last span of the whole page, which is unrelated
        # to the group being built.
        last_span = next(
            (s for s in reversed(text.spans) if s["text"].strip()),
            text.spans[-1],
        )

        span_rect = fitz.Rect(span["bbox"])
        last_span_rect = fitz.Rect(last_span["bbox"])

        same_font = span["font"] == last_span["font"]
        similar_height = within_range(span_rect.height, last_span_rect.height, 1)
        # NOTE(review): this compares the new span's RIGHT edge with the
        # reference span's LEFT edge; kept as in the original -- confirm that
        # this direction (rather than x0 vs. previous x1) is intended.
        edges_match = within_range(span_rect.x1, last_span_rect.x0, 2)

        if same_font and (similar_height or edges_match):
            text.spans.append(span)
        else:
            texts.append(text)
            text = TextSpan(page_index, [span])

    # Close the group that was still open when the input ran out.
    texts.append(text)

    return texts

In [1130]:
# Merge the spans of every page into TextSpan groups and draw a debug overlay:
# each group gets its own colour, and every span is outlined and labelled with
# the index of the group it belongs to.
# NOTE(review): the overlay is drawn into `doc` itself, so re-running this cell
# without re-opening the document presumably also extracts the labels drawn by
# the previous run -- re-run from the `fitz.open` cell.
pages_texts = []

# Seeded generator so the overlay colours are identical on every run.
color_rng = np.random.default_rng(0)

for page_index, page in enumerate(doc):
    # Extract once per page and flatten blocks -> lines -> spans. Blocks
    # without a "lines" entry (and lines without "spans") contribute nothing.
    blocks = page.get_text("dict")["blocks"]
    spans = [
        span
        for block in blocks
        for line in block.get("lines") or []
        for span in line.get("spans") or []
    ]

    texts = merge_spans(spans, page_index)
    pages_texts.append(texts)

    for text_index, text in enumerate(texts):
        # One RGB colour (components in [0, 1)) per text group.
        color = color_rng.random(3).tolist()

        for span in text.spans:
            rect = fitz.Rect(span["bbox"])
            # Put the label slightly left of and below the span's top-left corner.
            point = rect.tl
            point.x -= 10
            point.y += 8

            page.draw_rect(rect, color=color, width=0.6, overlay=True, stroke_opacity=0.5)
            page.insert_text(point, f"{text_index}", color=color, fontname="helvetica-bold", fontsize=8, overlay=True)


In [None]:
# Dominant font size of every merged text group, in page order.
all_text_sizes = [
    text.size
    for page_texts in pages_texts
    for text in page_texts
]

# Unweighted mean over text groups; is_title compares sizes against this baseline.
mean_text_size = np.mean(all_text_sizes)
mean_text_size

8.364539177014455

In [None]:
# Lower-case substrings of a font name that indicate a heavy weight.
BOLD_FONTS_KEYWORDS = ["bold", "black", "heavy", "extra", "ultra"]

# "extra"/"ultra" are only weight modifiers: on their own they also occur in
# light faces (ExtraLight, UltraLight, UltraThin), which must not count as bold.
_AMBIGUOUS_WEIGHT_KEYWORDS = ("extra", "ultra")
_LIGHT_WEIGHT_KEYWORDS = ("light", "thin", "hairline")


def is_bold_font(font):
    """Return True when the font name suggests a bold/heavy weight.

    The match is a case-insensitive substring test against
    BOLD_FONTS_KEYWORDS. A name that matches only through "extra"/"ultra"
    and also names a light weight (e.g. "HelveticaNeue-UltraLight") is
    rejected.

    Args:
        font: Font name as reported for a span.

    Returns:
        bool: True for heavy-weight font names, False otherwise.
    """
    name = font.lower()
    matched = [keyword for keyword in BOLD_FONTS_KEYWORDS if keyword in name]
    if not matched:
        return False

    only_ambiguous = all(keyword in _AMBIGUOUS_WEIGHT_KEYWORDS for keyword in matched)
    if only_ambiguous and any(light in name for light in _LIGHT_WEIGHT_KEYWORDS):
        return False
    return True


In [None]:
def is_title(text, size, font):
    is_bold = is_bold_font(font)
    
    if any(char == '.' or char == ':' for char in text):
        print(text, "invalid chars")
        return False
    
    if size >= mean_text_size * 1.2:
        print(text, size, font, "size >= mean_span_size * 1.5")
        return True
    
    if size >= mean_text_size * 1.02 and is_bold:
        print(text, size, font, "size >= mean_span_size * 0.08 and is_consistent and is_bold")
        return True
    
    # if text == text.upper() and is_bold and not any(not char.isalpha() for char in text):
    #     print(text, "text is all uppercase and is consistent")
    #     return True
    
    # print(text, "not title")
    return False
    

In [None]:
# Outline every text group classified as a title with a red rectangle.
# This cell only reads `pages_texts`; it no longer appends to
# `all_text_sizes`, so re-running it does not skew the size statistics.
for page_index, page in enumerate(doc):
    for text in pages_texts[page_index]:
        if is_title(text.text, text.size, text.font):
            print(text.bbox)
            title_rect = fitz.Rect(text.bbox)
            page.draw_rect(title_rect, color=(1, 0, 0), width=1.2, overlay=True, stroke_opacity=0.9)

CROP PROTECTIONGUIDE 42.99850082397461 Helvetica-Bold size >= mean_span_size * 1.5
(96.49839782714844, 80.13922119140625, 506.90057373046875, 168.15711975097656)
2023 153.56619262695312 HelveticaNeue-Bold size >= mean_span_size * 1.5
(122.46839904785156, 155.9453582763672, 480.89190673828125, 338.9962463378906)
Published by:Alberta Wheat Commission 200, 6815 - 8 street NE Calgary, Alberta Canada T2E 7H7 invalid chars
Administration: Sandra Fields Communications: Megan Evans Copy editor: Ian Doig Design: Tommy Wilson Production layout: GrainsWest Publications Society  Additional advisory committe members: Nevin Rosaasen, Rick Taillieu Cover image: George Clayton Photography invalid chars
In co-operation with the agro-chemical industryWith support from Alberta Barley, Alberta Canola Producers Commission and  Alberta Pulse Growers 9.5 Charter-Bold size >= mean_span_size * 0.08 and is_consistent and is_bold
(47.999969482421875, 372.3302001953125, 393.1256103515625, 416.92523193359375)
Copy

In [None]:
# Write the document, with the rectangles and labels drawn on its pages by the
# cells above, to the configured output path.
doc.save(output_pdf_path)