In [29]:
import cv2
import pytesseract
import numpy as np
from pdf2image import convert_from_path

WHITELIST = """ !\\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]`abcdefghijklmnopqrstuvwxyz{|}"""
BLACKLIST = """~_^"""

In [30]:
pdf_path = "resources/test-entries/pdfs/1987-page468.pdf"
tiff_path = "/home/midge/Downloads/1987-page468.tiff"
config=f"--psm 6 -c tessedit_char_whitelist={WHITELIST} -c tessedit_char_blacklist={BLACKLIST}"

In [31]:
def convert_pdf_to_tiff(pdf_path, tiff_path):
    images = convert_from_path(
        pdf_path,
        grayscale=True,
        fmt='tiff',
        dpi=500,
    )

    images[0].save(
        tiff_path,
        save_all=True,
        append_images=images[1:],
        compression='tiff_lzw'
    )

    return cv2.imread(tiff_path)

def sharpen_image(image):
    sharpen_kernel = np.array([
        [-1, -1, -1],
        [-1,  9, -1],
        [-1, -1, -1]
    ])

    return cv2.filter2D(image, -1, sharpen_kernel)

def split_page(image):
    height, width, _ = image.shape
    middle = (width // 2)

    left_col = image[:-100, 125:middle]
    right_col = image[:-100, middle:-110]

    return left_col, right_col

def process_half(image):
    # Grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Binarization
    ret, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)

    # Draw the fake-boxes
    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (200, 20))
    dilation = cv2.dilate(thresh, rect_kernel, iterations=1)

    # Draw the bounding boxes based on the fake ones
    image2 = image.copy()
    contours, hierarchy = cv2.findContours(dilation, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
    cv2.drawContours(image2, contours, -1, (0, 255, 0), 3)

    # Return the boxed image
    return image2

def extract_text_from_processed_image(image):
    return pytesseract.image_to_string(image, lang="eng", config=config).replace("|", "1")

In [32]:
image = convert_pdf_to_tiff(pdf_path, tiff_path)
sharpened_image = sharpen_image(image)
left, right = split_page(sharpened_image)
left, right = process_half(left), process_half(right)

print(extract_text_from_processed_image(left))
print(extract_text_from_processed_image(right))

458 SPRINGBORN GROUP, INC
SPRECKELS TECHNICAL RESOURCE LABORATORY
See Amstar Corp
SPRENGNETHER INSTRUMENTS, INC
Sce Dyneer Corporation
S385 SPRINGBORN GROUP, INC, One Springborn Center, PO (gic)
Box J, Enfield, CT 06082. Tel: 203-749-8371; Telex: 443-6041; FAX:
203-749-7533
Chmn & Chief Exec Officer Dr Robert C Springborn; Pres & Chief
Operating Officer R J Springborn
Professional Staff: 106 (Doctorates: 9)—Analytical chemistry,
biochemistry, chemical engineering, ecotoxicology, environmental
chemistry, pathology, pharmacology, physical chemistry, toxicology,
veterinary pathology. Technicians & Auxiliaries: 110
Fields of R&D: Contract research and development and product
development in polymers, chemicals, and related products, including
analysis, polymer characterization, physical testing, engineering design,
synthesis, compounding, specialty manufacturing, market analysis, and-
toxicology; occupational and environmental health and safety services;
consumer producls testing and inspec