In [15]:
import cv2
import numpy as np

def sort_contours(cnts, method="left-to-right"):
#     print (cnts)
    # initialize the reverse flag and sort index
    reverse = False
    i = 0

    # handle if we need to sort in reverse
    if method == "right-to-left" or method == "bottom-to-top":
        reverse = True

    # handle if we are sorting against the y-coordinate rather than
    # the x-coordinate of the bounding box
    if method == "top-to-bottom" or method == "bottom-to-top":
        i = 1

    # construct the list of bounding boxes and sort them from top to
    # bottom
    boundingBoxes = [cv2.boundingRect(c) for c in cnts]
    (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
                                        key=lambda b: b[1][i], reverse=reverse))

    # return the list of sorted contours and bounding boxes
    return (cnts, boundingBoxes)

def box_extraction(img_for_box_extraction_path, cropped_dir_path):

    img = cv2.imread(img_for_box_extraction_path, 0)  # Read the image
    (thresh, img_bin) = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)  # Thresholding the image
    img_bin = 255-img_bin  # Invert the image

#     cv2.imwrite("Image_bin.jpg",img_bin)
   
    # Defining a kernel length
    kernel_length = np.array(img).shape[1]//110
     
    # A verticle kernel of (1 X kernel_length), which will detect all the verticle lines from the image.
    verticle_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_length))
    # A horizontal kernel of (kernel_length X 1), which will help to detect all the horizontal line from the image.
    hori_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_length, 1))
    # A kernel of (3 X 3) ones.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))

    # Morphological operation to detect verticle lines from an image
    img_temp1 = cv2.erode(img_bin, verticle_kernel, iterations=4)
    verticle_lines_img = cv2.dilate(img_temp1, verticle_kernel, iterations=4)
#     cv2.imwrite("verticle_lines.jpg",verticle_lines_img)

    # Morphological operation to detect horizontal lines from an image
    img_temp2 = cv2.erode(img_bin, hori_kernel, iterations=4)
    horizontal_lines_img = cv2.dilate(img_temp2, hori_kernel, iterations=4)
#     cv2.imwrite("horizontal_lines.jpg",horizontal_lines_img)

    # Weighting parameters, this will decide the quantity of an image to be added to make a new image.
    alpha = 0.1
    beta = 1.0 - alpha
    
    # This function helps to add two image with specific weight parameter to get a third image as summation of two image.
    img_final_bin = cv2.addWeighted(verticle_lines_img, alpha, horizontal_lines_img, beta, 0.0)
    img_final_bin = cv2.erode(~img_final_bin, kernel, iterations=4)
    (thresh, img_final_bin) = cv2.threshold(img_final_bin, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # For Debugging
    # Enable this line to see verticle and horizontal lines in the image which is used to find boxes
    cv2.imwrite("img_final_bin.jpg",img_final_bin)
    # Find contours for image, which will detect all the boxes
    contours, hierarchy = cv2.findContours(
        img_final_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    # Sort all the contours by top to bottom.
    (contours, boundingBoxes) = sort_contours(contours, method="top-to-bottom")

    idx = 0
    for c in contours:
        # Returns the location and width,height for every contour
        x, y, w, h = cv2.boundingRect(c)
        
        #counts to beautify the new lines
        count = 0
        
        # If the box height is greater then 20, widht is >80, then only save it as a box in "cropped/" folder.
        if (w > 80 and h > 20) and w > 3*h:
            idx += 1
            new_img = img[y:y+h, x:x+w]
            
            print (str(idx) + " LB "+ " the x coordinate is " + str(x) + " and the y coordinate is " + str(y))
            print (str(idx) + " RB "+ " the x coordinate is " + str(x + w) + " and the y coordinate is " + str(y))
            print (str(idx) + " LT "+ " the x coordinate is " + str(x) + " and the y coordinate is " + str(y + h))
            print (str(idx) + " RT "+ " the x coordinate is " + str(x + w) + " and the y coordinate is " + str(y + h))
            count = 1
            
            cv2.imwrite(cropped_dir_path+str(idx) + '.png', new_img)
        
        if (count == 1):
            print()
        
    # For Debugging
    # Enable this line to see all contours.
    cv2.drawContours(img, contours, -1, (0, 0, 255), 3)
    cv2.imwrite("img_contour1.jpg", img)


box_extraction("testing.jpg", "./cache/")

TypeError: unsupported operand type(s) for -: 'int' and 'NoneType'

In [2]:
import pdfquery
pdf = pdfquery.PDFQuery('auto_loss_notice.pdf')
pdf.load()

text = pdf.extract([('last_name', ':in_bbox("45,35,1320,1000")')])

print(pdf.tree.write('samplepdf1.xml', pretty_print=True))
# print (text)



None


In [71]:
import pdfrw

def getPdfBoxes(pdf_path, img_paths, skip_corners=False):
    ANNOT_KEY = '/Annots'
    ANNOT_FIELD_KEY = '/T'
    ANNOT_VAL_KEY = '/V'
    ANNOT_RECT_KEY = '/Rect'
    SUBTYPE_KEY = '/Subtype'
    WIDGET_SUBTYPE_KEY = '/Widget'
    PARENT_KEY = '/Parent'
    FIELD_TYPE_KEY = '/FT'
    CHECKBOX_KEY = '/Btn'
    BOX_KEY = '/Rect'
    SIZE_KEY = '/Size'

    bounding_boxes = []

    template_pdf = pdfrw.PdfReader(pdf_path)
    template_pdf.Root.AcroForm.update(pdfrw.PdfDict(NeedAppearances=pdfrw.PdfObject('true')))

    # IDK what this is think it's width
    width = template_pdf[SIZE_KEY]

    for i in range(len(template_pdf.pages)):
        bounding_boxes.append({})
        annotations = template_pdf.pages[i][ANNOT_KEY]

        for annotation in annotations:
            if annotation[SUBTYPE_KEY] == WIDGET_SUBTYPE_KEY:
                if annotation[PARENT_KEY] and annotation[PARENT_KEY][ANNOT_FIELD_KEY]:
                    if annotation[PARENT_KEY][ANNOT_FIELD_KEY][1:-1] not in bounding_boxes[i].keys():
                        bounding_boxes[i][annotation[PARENT_KEY][ANNOT_FIELD_KEY][1:-1]] = []

                    box = []

                    type = 'checkbox' if annotation[PARENT_KEY][FIELD_TYPE_KEY] == CHECKBOX_KEY else 'text'

                    for point in annotation[BOX_KEY]:
                        box.append(float(point) / float(width))

                    bounding_boxes[i][annotation[PARENT_KEY][ANNOT_FIELD_KEY][1:-1]].append({'box': box, 'type': type})

                if annotation[ANNOT_FIELD_KEY]:
                    if annotation[ANNOT_FIELD_KEY][1:-1] not in bounding_boxes[i].keys():
                        bounding_boxes[i][annotation[ANNOT_FIELD_KEY][1:-1]] = []

                    box = []

                    type = 'checkbox' if annotation[FIELD_TYPE_KEY] == CHECKBOX_KEY else 'text'

                    for point in annotation[BOX_KEY]:
                        box.append(float(point) / float(width))

                    bounding_boxes[i][annotation[ANNOT_FIELD_KEY][1:-1]].append({'box': box, 'type': type})

    scale = 3.92
    offset_x = 0.009
    offset_y = 0.016

    for i in range(1):
        form = fitToForm(cv2.imread("1.jpg"), skip_corners=skip_corners)
        width = len(form[0])
        height = len(form)
        for key in bounding_boxes[i].keys():
            for j, box in enumerate(bounding_boxes[i][key]):
                if key == 'Reset':
                    continue

                box['box'][1], box['box'][3] = box['box'][3], box['box'][1]

                box['box'][0] = max((box['box'][0] + offset_x) * scale, 0)
                box['box'][1] = max(height / width - (box['box'][1] + offset_y) * scale, 0)
                box['box'][2] = min((box['box'][2] + offset_x) * scale, 1)
                box['box'][3] = min(height / width - (box['box'][3] + offset_y) * scale, height / width)

                left_x = box['box'][0]
                right_x = box['box'][2]
                top_y = box['box'][1]
                bot_y = box['box'][3]

                bounding_boxes[i][key][j]['box'] = [(left_x, top_y), (right_x, bot_y)]

    print (bounding_boxes)

getPdfBoxes("auto_loss_notice.pdf","")

NameError: name 'fitToForm' is not defined