In [131]:
import PyPDF2
from PIL import Image
from io import BytesIO
import base64
import json
import requests
import cv2
import numpy as np
from collections import OrderedDict
import os

# Image Extraction and OCR

## code

In [146]:
def extract_image_from_page(page, pdf_source, page_n):
    """Save every embedded image of a PDF page to disk and return the last one.

    Images are written to ``extracted_images/<pdf_source without '.pdf'>/`` as
    ``page_<page_n>_<xobject name>.<ext>``; the directory is created if needed.

    Args:
        page: PyPDF2 page object whose ``/Resources`` are inspected.
        pdf_source: Name of the source PDF; used only to build the output
            directory name.
        page_n: Page number; used only to build the output file prefix.

    Returns:
        A ``PIL.Image`` for the last image XObject that was extracted, or
        ``None`` when the page holds no image with a handled filter.
    """
    image_name_prefix = 'page_' + str(page_n) + '_'
    dest_dir = os.path.join('extracted_images', pdf_source.replace('.pdf', ''))
    dest_path = os.path.join(dest_dir, image_name_prefix)
    os.makedirs(dest_dir, exist_ok=True)

    # Use the page that was passed in, not a notebook-level global.
    resources = page['/Resources']
    if '/XObject' not in resources:
        return None
    x_objects = resources['/XObject'].getObject()

    img = None
    for obj in x_objects:
        x_object = x_objects[obj]
        if x_object['/Subtype'] != '/Image':
            continue
        size = (x_object['/Width'], x_object['/Height'])
        data = x_object.getData()
        # NOTE(review): every non-RGB colour space is treated as palette mode
        # "P"; grayscale/CMYK images may need a different mode -- confirm.
        mode = "RGB" if x_object['/ColorSpace'] == '/DeviceRGB' else "P"
        image_filter = x_object['/Filter']
        if image_filter == '/FlateDecode':
            # Data is raw pixels: build the image and store it as PNG.
            img = Image.frombytes(mode, size, data)
            img.save(dest_path + obj[1:] + ".png")
        elif image_filter == '/DCTDecode' or image_filter == '/JPXDecode':
            # Data is already an encoded JPEG / JPEG 2000 file: write it as-is.
            extension = ".jpg" if image_filter == '/DCTDecode' else ".jp2"
            with open(dest_path + obj[1:] + extension, "wb") as out_file:
                out_file.write(data)
            # Return a PIL image (not a closed file handle) so callers can
            # treat every result uniformly, e.g. call ``.save``.
            img = Image.open(BytesIO(data))
    return img

def query_vision_ocr(image_str, merge_boxes=True, include_merged_components=True, as_json=True, max_pix_size=None):
    """POST an image to the vision OCR service and return its answer.

    Args:
        image_str: Either the image payload itself or a URL to the image.
            Strings longer than 500 characters are sent under the ``image``
            key, shorter ones under the ``url`` key.
        merge_boxes: Sent to the service as ``mergeBoxes``.
        include_merged_components: Sent to the service as
            ``includeMergedComponents``.
        as_json: When true, return the decoded JSON body; otherwise return the
            raw ``requests`` response object.
        max_pix_size: Currently unused; kept for interface compatibility.

    Returns:
        The parsed JSON body when ``as_json`` is true, else the response object.
    """
    api_entry_point = 'http://vision-ocr.dev.allenai.org/v1/ocr'
    header = {'Content-Type': 'application/json'}

    request_data = {
        'mergeBoxes': merge_boxes,
        'includeMergedComponents': include_merged_components
    }

    # Length heuristic: a long string is taken to be image data, a short one a URL.
    payload_key = 'image' if len(image_str) > 500 else 'url'
    request_data[payload_key] = image_str

    json_data = json.dumps(request_data)
    response = requests.post(api_entry_point, data=json_data, headers=header)
    # Judge success by status code; the textual reason phrase is not reliable.
    if not response.ok:
        print(response.reason)
    if not as_json:
        # Do not attempt to decode the body when the caller wants the raw
        # response; an error page is often not valid JSON.
        return response
    return response.json()

In [147]:
def point_to_tuple(box):
    """Return the values of the mapping ``box`` as a tuple, ordered by sorted key."""
    ordered_items = sorted(box.items())
    return tuple(value for _key, value in ordered_items)

def get_bbox_tuples(detection):
    """Lazily map ``point_to_tuple`` over the entries of ``detection['rectangle']``.

    Returns a ``map`` iterator, so the result can be consumed only once.
    """
    rectangle_points = detection['rectangle']
    return map(point_to_tuple, rectangle_points)

def draw_detections(pil_image, detections):
    """Return a copy of ``pil_image`` with one rectangle drawn per detection.

    Args:
        pil_image: Source PIL image; it is copied into a NumPy array and is
            not modified.
        detections: Iterable of detections; each must yield exactly two points
            from ``get_bbox_tuples`` (presumably opposite corners of the
            box -- verify against the OCR response format).

    Returns:
        A new PIL image in the same mode as ``pil_image``.
    """
    canvas = np.array(pil_image)
    for box in detections:
        lr, ul = get_bbox_tuples(box)
        cv2.rectangle(canvas, ul, lr, color=(100, 100, 100), thickness=1)

    # Rebuild the image in the input's own mode instead of forcing 'P', which
    # misinterprets RGB / grayscale arrays.
    annotated = Image.fromarray(canvas, pil_image.mode)
    if pil_image.mode == 'P':
        # Palette indices are meaningless without the original palette.
        palette = pil_image.getpalette()
        if palette is not None:
            annotated.putpalette(palette)
    return annotated

## run

In [148]:
pdf_name = 'dec_test.pdf'
page_n = 30
# Open the PDF in a context manager so the file handle is released once the
# page image has been extracted (PyPDF2 reads from the stream on demand, so the
# extraction must happen while the file is still open).
with open(pdf_name, "rb") as pdf_file:
    pdf_doc = PyPDF2.PdfFileReader(pdf_file)
    test_page = pdf_doc.getPage(page_n)
    img_to_disp = extract_image_from_page(test_page, pdf_name, page_n)

In [86]:
# Serialize the extracted image to PNG in an in-memory buffer, then
# base64-encode the bytes so the image can travel inside a JSON request.
sbuffer = BytesIO()
img_to_disp.save(sbuffer, format="PNG")
b64_image = base64.b64encode(sbuffer.getvalue())
# b64encode returns bytes; decode to str because json.dumps cannot serialize bytes.
str_image = b64_image.decode()

In [96]:
# Send the base64 image to the OCR service with mergeBoxes and
# includeMergedComponents both set to False; the result is the parsed JSON body.
response = query_vision_ocr(str_image, merge_boxes=False, include_merged_components=False)

In [97]:
# Print the 'value' field of every detection in the OCR response, one per line.
for detection in response['detections']:
    print(detection['value'])

1
2
Spring 1 and Spring 2 were the same. Then, Spring 1 was pushed together a
little and clamped in place. Spring 2 was pushed together a lot and
clamped
Which spring has more stored energy?
CA
Spring 1
Spring
22
B
O
Both springs have the same energy
D
ou cannot tell unless you know what the springs are made of.


In [99]:
# img_to_disp

In [130]:
# draw_detections(img_to_disp, response['detections'])