In [3]:
import cv2
import numpy as np
import argparse
import imutils
import pdfrw
import os

def scaleImg(img, new_width=1275, new_height=1675):
    """Resize an image to a fixed pixel size, ignoring its aspect ratio.

    Args:
        img: Image array exposing ``shape`` (rows, columns, ...), e.g. the
            value returned by ``cv2.imread``.
        new_width: Target width in pixels. Defaults to 1275, the value that
            was previously hard-coded.
        new_height: Target height in pixels. Defaults to 1675, the value that
            was previously hard-coded.

    Returns:
        The resized image produced by ``cv2.resize`` using ``INTER_AREA``
        interpolation.

    Raises:
        ValueError: If ``img`` is None (``cv2.imread`` returns None when a
            file cannot be read).
    """
    if img is None:
        raise ValueError("scaleImg received no image (was the file readable?)")

    # Passing the target size directly is equivalent to deriving fx/fy from
    # the current size, and guarantees the exact output dimensions.
    target_size = (int(new_width), int(new_height))
    return cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)


# PDF dictionary keys / values used while walking the form annotations.
_ANNOT_KEY = '/Annots'
_FIELD_NAME_KEY = '/T'
_RECT_KEY = '/Rect'
_SUBTYPE_KEY = '/Subtype'
_WIDGET_SUBTYPE = '/Widget'
_PARENT_KEY = '/Parent'
_FIELD_TYPE_KEY = '/FT'
_CHECKBOX_TYPE = '/Btn'

# Field name whose boxes are left untouched when converting to pixels and
# which is never drawn on the debug overlay.
_SKIPPED_FIELD = 'Reset'


def _normalized_rect(rect, page_width, page_height):
    """Return the /Rect values as fractions of the page width/height."""
    # Use the position from enumerate() rather than rect.index(value):
    # index() returns the FIRST match, so a rectangle containing repeated
    # values (e.g. x0 == y0) would be divided by the wrong dimension.
    return [
        float(value) / (page_width if position in (0, 2) else page_height)
        for position, value in enumerate(rect)
    ]


def _add_widget_box(page_boxes, field, rect, page_width, page_height):
    """Append one normalized box to ``page_boxes`` under the name in ``field``."""
    # [1:-1] drops the first and last character of the raw /T value
    # (presumably the PDF string delimiters, e.g. the parentheses).
    name = field[_FIELD_NAME_KEY][1:-1]
    kind = 'checkbox' if field[_FIELD_TYPE_KEY] == _CHECKBOX_TYPE else 'text'
    page_boxes.setdefault(name, []).append(
        {'box': _normalized_rect(rect, page_width, page_height), 'type': kind}
    )


def getPdfBoxes(pdf_path, img_paths, skip_corners=False):
    """Extract form-field bounding boxes from a PDF, in image pixel coordinates.

    Every ``/Widget`` annotation on every page is recorded under its field
    name. A widget is recorded under its parent's ``/T`` name (typed by the
    parent's ``/FT``) when the parent has one, and also under its own ``/T``
    name (typed by its own ``/FT``) when it has one. Boxes are first
    normalized by the page size, then converted to pixels using the size of
    the image at the same position in ``img_paths``.

    As a side effect the function prints diagnostic output and may write a
    debug overlay to ``boundingBoxes.jpg`` in the current directory.

    Args:
        pdf_path: Path to the template PDF.
        img_paths: Paths of the page images, in page order; each one is read
            with ``cv2.imread``.
        skip_corners: Accepted for backward compatibility; not read anywhere
            in this function.

    Returns:
        A list with one dict per PDF page, mapping field name to a list of
        ``{'box': [...], 'type': 'checkbox' | 'text'}`` entries. For pages
        that have a matching image, ``box`` is ``[left, top, right, bottom]``
        in image pixels (y measured from the top of the image). Pages without
        a matching image, and the ``'Reset'`` field, keep the normalized
        ``[x0, y0, x1, y1]`` values in PDF orientation. The original header
        comment pointed to /csio-forms/oaf1.json as an example of the output.

    Raises:
        ValueError: If one of ``img_paths`` cannot be read as an image.
    """
    template_pdf = pdfrw.PdfReader(pdf_path)

    # MediaBox[2] / MediaBox[3] of the first page are used as the width and
    # height of every page.
    # NOTE(review): assumes the MediaBox origin is (0, 0) and that all pages
    # share the first page's size -- confirm for other templates.
    first_media_box = template_pdf.pages[0].MediaBox
    page_width = float(first_media_box[2])
    page_height = float(first_media_box[3])

    bounding_boxes = []
    for page in template_pdf.pages:
        page_boxes = {}
        bounding_boxes.append(page_boxes)

        # A page without form fields has no /Annots entry (lookup gives None).
        for annotation in page[_ANNOT_KEY] or []:
            if annotation[_SUBTYPE_KEY] != _WIDGET_SUBTYPE:
                continue

            rect = annotation[_RECT_KEY]
            parent = annotation[_PARENT_KEY]
            if parent and parent[_FIELD_NAME_KEY]:
                _add_widget_box(page_boxes, parent, rect, page_width, page_height)
            if annotation[_FIELD_NAME_KEY]:
                _add_widget_box(page_boxes, annotation, rect, page_width, page_height)

    # Convert normalized boxes to pixels. zip() stops at the shorter of the
    # two sequences, so surplus images or surplus pages are simply left alone
    # instead of raising IndexError.
    for page_boxes, img_path in zip(bounding_boxes, img_paths):
        form = cv2.imread(img_path)
        if form is None:
            raise ValueError("Could not read page image: %s" % img_path)
        height, width = form.shape[:2]
        print(width)
        print(height)

        for key, entries in page_boxes.items():
            if key == _SKIPPED_FIELD:
                continue
            for entry in entries:
                x0, y0, x1, y1 = entry['box'][:4]
                # PDF y grows upwards from the bottom of the page, image y
                # grows downwards from the top, hence the flip.
                entry['box'] = [
                    x0 * width,
                    height - y1 * height,
                    x1 * width,
                    height - y0 * height,
                ]
    print(bounding_boxes)

    # Debug overlay.
    # NOTE(review): this draws the boxes of page index 1 on top of the FIRST
    # image, and that image is rescaled by scaleImg() after the boxes were
    # computed for its original size -- behaviour kept as-is, confirm it is
    # intended. The overlay is skipped when either input is missing.
    if img_paths and len(bounding_boxes) > 1:
        form = scaleImg(cv2.imread(img_paths[0]))
        for key, entries in bounding_boxes[1].items():
            if key == _SKIPPED_FIELD:
                continue
            for entry in entries:
                left, top, right, bottom = (int(value) for value in entry['box'][:4])
                cv2.rectangle(form, (left, bottom), (right, top), (100, 166, 189), 3)
        cv2.imwrite("boundingBoxes.jpg", form)

    return bounding_boxes

# Run the extraction for the OAF1 template using the page images stored in
# ./oafpbm. os.listdir() returns names in arbitrary order while getPdfBoxes()
# pairs images with PDF pages by position, so the paths are sorted first.
# NOTE(review): the sort is lexicographic -- page numbers in the file names
# need to be zero-padded for this to match page order.
dirpath = os.getcwd()
fieldBounds = getPdfBoxes(
    'OAF1.pdf',
    sorted(
        'oafpbm/' + name
        for name in os.listdir(dirpath + '/oafpbm/')
        if name.endswith('.jpg')
    ),
    skip_corners=True,
)

1275
1650
1275
1650
[{'Reset': [{'box': [0.026951633986928104, 0.9593611111111111, 0.16387581699346407, 0.9826388888888888], 'type': 'checkbox'}], 'Insurance Company': [{'box': [206.34208333333333, 286.08749999999986, 658.7479166666667, 318.44583333333344], 'type': 'text'}], 'BROKER/AGENT': [{'box': [851.09375, 285.04583333333335, 1233.1958333333332, 317.4041666666669], 'type': 'text'}]}, {'A1limit': [{'box': [543.30625, 220.86666666666656, 649.6062499999999, 242.29791666666665], 'type': 'text'}], '1PREM1': [{'box': [655.5124999999999, 220.86666666666656, 738.1895833333333, 244.48750000000018], 'type': 'text'}], '1PREM2': [{'box': [655.5124999999999, 244.48750000000018, 738.1895833333333, 268.11041666666665], 'type': 'text'}], '1PREM3': [{'box': [655.5124999999999, 271.9333333333334, 738.1895833333333, 295.55416666666656], 'type': 'text'}], '9ir1': [{'box': [166.06125, 337.27083333333326, 183.50416666666663, 353.82916666666665], 'type': 'checkbox'}], 'A1limit1': [{'box': [590.552083333