### Load libraries

We'll use:
- `pdf2image` to convert the pdf file to jpg images at 500dpi resolution
- `argparse` to parse command-line arguments
- `cv2` (OpenCV) to do image processing routines
- `pytesseract` for out-of-the-box OCR (LSTM-based, popular with good docs)
- `numpy` for standard array manipulation
- `json` for exporting the final dictionary
- `re` for any parsing of text needed with regular expressions

In [1]:
from pdf2image import convert_from_path
import argparse
import cv2
import pytesseract
import numpy as np
import json
import re

### Helper functions

#### Extract contours with a standard image-processing routine

This procedure is like fancy low-pass filtering to bring large items into relief in a way that is friendly to document parsing.

In [2]:
def get_contours(gray_img, gaussian_blur_dims: tuple, kernel_dims: tuple):
    """Extract the external contours of a grayscale image, ordered left to right.

    The image is Gaussian-blurred, binarised with an inverted Otsu
    threshold and dilated once with a rectangular kernel, so that nearby
    marks merge into larger blobs before the contours are traced.

    Args:
        gray_img: Grayscale image handed to ``cv2.GaussianBlur``.
        gaussian_blur_dims: Kernel size passed to ``cv2.GaussianBlur``.
        kernel_dims: Size of the rectangular structuring element used for
            the dilation.

    Returns:
        A list of contours sorted by the x-coordinate of each contour's
        bounding rectangle.
    """
    blurred = cv2.GaussianBlur(gray_img, gaussian_blur_dims, 0)
    _, binary = cv2.threshold(
        blurred, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )
    structuring_element = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_dims)
    dilated = cv2.dilate(binary, structuring_element, iterations=1)
    found = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # Depending on the OpenCV version, findContours returns either
    # (contours, hierarchy) or (image, contours, hierarchy).
    contours = found[0] if len(found) == 2 else found[1]
    return sorted(contours, key=lambda contour: cv2.boundingRect(contour)[0])

#### Picking out vertical lines to demarcate columns

In [3]:
def grab_vertical_line_bounding_boxes(raw_contours):
    line_contours = [cv2.boundingRect(c) for c in raw_contours if \
                     cv2.boundingRect(c)[2] < 30 and \
                     cv2.boundingRect(c)[3] > 200 and \
                     cv2.boundingRect(c)[0] > 250]
    return(line_contours)

def grab_vertical_line_bounding_box(roi):
    """Return the bounding box of the tallest vertical line found in ``roi``.

    Contours are extracted from ``roi`` with ``get_contours`` and reduced to
    their bounding boxes; a box qualifies as a line when its width is below
    30, its height is above 200 and its left edge lies beyond x = 300
    (coordinates are relative to ``roi``).

    Args:
        roi: Grayscale image region to search.

    Returns:
        The ``(x, y, w, h)`` box with the greatest height among the
        qualifying boxes (the first one in contour order on ties).

    Raises:
        IndexError: If no contour in ``roi`` satisfies the line constraints.
    """
    raw_contours = get_contours(roi, (5, 21), (11, 21))
    # Compute each bounding rectangle once instead of once per condition.
    boxes = [cv2.boundingRect(c) for c in raw_contours]
    line_boxes = [
        box for box in boxes
        if box[2] < 30 and box[3] > 200 and box[0] > 300
    ]
    if not line_boxes:
        # Same exception type as the former bare ``line_boxes[0]``, but with
        # a message that says what actually went wrong.
        raise IndexError("no vertical column-separator line found in roi")
    return max(line_boxes, key=lambda box: box[3])

#### Helper for determining whether the center of a narrow line contour demarcating columns lies inside a larger block detected in the first pass

In [4]:
def point_in_contour(point, contour):
    """Tell whether ``point`` lies inside a bounding box, edges included.

    Args:
        point: ``(x, y)`` coordinates.
        contour: Bounding box given as ``(x, y, w, h)``.

    Returns:
        True when the point is within the box horizontally and vertically
        (points exactly on an edge count as inside), otherwise False.
    """
    return (
        contour[0] <= point[0] <= contour[0] + contour[2]
        and contour[1] <= point[1] <= contour[1] + contour[3]
    )


#### Helper for splitting existing wide roi if vertical line lies inside it

In [5]:
def split_list_item_rois_into_cols(list_item_rois):
    """Cut each ROI into a left and a right part at its vertical line.

    Args:
        list_item_rois: Iterable of 2-D image arrays; each is searched with
            ``grab_vertical_line_bounding_box``.

    Returns:
        A list with one dict per ROI: ``"left"`` holds the columns before
        the x-coordinate of the detected line's bounding box and ``"right"``
        holds the columns from that x-coordinate onward.
    """
    columns = []
    for block in list_item_rois:
        split_x = grab_vertical_line_bounding_box(block)[0]
        left_part, right_part = block[:, :split_x], block[:, split_x:]
        columns.append({"left": left_part, "right": right_part})
    return columns

#### For parsing names/professions using post-processing

In [6]:
def parse_name_list(tess_output: str):
    """Turn OCR text of a "name, profession" list into a list of dicts.

    Processing steps:
      1. Split the text into lines, trim surrounding whitespace (this also
         removes the form-feed character OCR output may end with) and drop
         empty lines.
      2. Drop all-uppercase lines, which are treated as category headers.
      3. Treat a line without a comma as the continuation of the previous
         entry and join it to that entry with a single space.
      4. Split each entry at its first comma into name and profession.

    Args:
        tess_output: Raw text as returned by the OCR step.

    Returns:
        A list of ``{"name": ..., "profession": ...}`` dicts in reading
        order. An entry that contains no comma at all gets an empty
        profession instead of raising.
    """
    # Strip before filtering so whitespace-only lines are discarded rather
    # than glued onto the preceding entry.
    stripped = (line.strip() for line in tess_output.split("\n"))
    lines = [line for line in stripped if line and not line.isupper()]

    entries = []
    for line in lines:
        if "," in line or not entries:
            entries.append(line)
        else:
            # Wrapped line: keep the words apart when re-joining.
            entries[-1] = entries[-1] + " " + line

    parsed = []
    for entry in entries:
        # partition never fails, keeps everything after the first comma and
        # tolerates a missing space after the comma.
        name, _, profession = entry.partition(",")
        parsed.append({"name": name.strip(), "profession": profession.strip()})
    return parsed

### Demonstration using sample page, loading a small number of select pages from the original pdf

In [7]:
# Insert your own path
sample_path = 'book_excerpt.pdf'
# Rasterise the PDF pages at 500 dpi
images = convert_from_path(sample_path, dpi=500)

# Take the first page and reorder its RGB channels to OpenCV's BGR layout
img = cv2.cvtColor(np.array(images[0]), cv2.COLOR_RGB2BGR)

#### Detect how the page is divided horizontally

In [8]:
# Grayscale copy of the page, used for all contour detection below
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Heavy blur/dilation so that whole text blocks merge into single contours
raw_wide_cntrs = get_contours(gray, gaussian_blur_dims=(101, 31), kernel_dims=(101, 31))
# Keep only the bounding boxes wider than half the page
wide_contours = [
    box for box in map(cv2.boundingRect, raw_wide_cntrs)
    if box[2] > img.shape[1] / 2
]
# Crop each wide block out of the colour image
wide_rois = [img[top:top + height, left:left + width]
             for left, top, width, height in wide_contours]

#### Within those horizontal divisions, which ones have vertical lines in them and which do not?

Detect this by looking for vertical lines near the center of the page (subject to size constraints), then checking whether the center of each such line falls inside each of the ROIs.

In [9]:
# Detect the vertical lines that demarcate columns: a lighter blur/dilation
# than the wide-block pass, followed by the tall-and-thin bounding-box filter
raw_line_cntrs = get_contours(gray, gaussian_blur_dims=(5, 21), kernel_dims=(11, 21))
line_contours = grab_vertical_line_bounding_boxes(raw_contours=raw_line_cntrs)

In [10]:
# Center point of every detected vertical line
line_contour_centers = [(x + w/2, y + h/2) for x, y, w, h in line_contours]

# A wide block that contains a line center holds list items that need
# column splitting; every other wide block is treated as a paragraph
list_item_contours = [
    wc for wc in wide_contours
    if any(point_in_contour(lcc, wc) for lcc in line_contour_centers)
]
paragraph_contours = [
    wc for wc in wide_contours
    if not any(point_in_contour(lcc, wc) for lcc in line_contour_centers)
]

# Crop both groups out of the grayscale page
list_item_rois = [gray[top:top + height, left:left + width]
                  for left, top, width, height in list_item_contours]
paragraph_rois = [gray[top:top + height, left:left + width]
                  for left, top, width, height in paragraph_contours]

#### Do OCR on detected paragraph ROIs

In [11]:
# Run OCR on every paragraph region using the "enm" language model
paragraph_texts = [
    pytesseract.image_to_string(region, lang="enm")
    for region in paragraph_rois
]

#### Do further splitting, then OCR and postprocessing on columns

In [12]:
# Split every list-item ROI at its vertical line, OCR the left column and
# then the right column, and join the two texts with a newline
split_list_item_rois = split_list_item_rois_into_cols(list_item_rois)

list_item_texts = []
for split_roi in split_list_item_rois:
    list_item_texts.append("\n".join(
        pytesseract.image_to_string(split_roi[side], lang="enm")
        for side in ("left", "right")
    ))

In [13]:
# Post-process the OCR text of each list block into name/profession records
inhabitants = list(map(parse_name_list, list_item_texts))

In [14]:
# Collect the paragraph texts and the parsed list entries for this page
parsed_data = dict(
    page=4,  # first page of excerpt used is 4
    text_block=paragraph_texts,
    data=inhabitants,
)

In [15]:
# Serialise the collected data and write it to data.json
with open('data.json', 'w') as fp:
    fp.write(json.dumps(parsed_data))