## Google Vision OCR using repository object URL

In [327]:
# Import statements
import io
import os
import subprocess
import json
import requests

# Imports the Google Cloud client library
from google.cloud import vision
from google.cloud.vision import types
from google.protobuf.json_format import MessageToDict

In [328]:
# Outfile name getter

def get_outfile_name(url: str) -> str:
    """Return the base name for the OCR output files of a repository object.

    The pid is read from the percent-encoded object identifier in the URL
    (``harleyspiller%3A<pid>``): everything between ``%3A`` and the next
    ``/``, or the end of the string when no ``/`` follows.

    Args:
        url: Object URL, e.g.
            ``https://host/islandora/object/harleyspiller%3A940/datastream/JP2/view``.

    Returns:
        The name in the form ``harleyspiller_<pid>``.

    Raises:
        ValueError: If ``url`` does not contain the ``%3A`` separator.
    """

    # Anchor on the encoded ':' rather than a fixed character offset, so the
    # result does not depend on the length of the host name or URL prefix.
    marker = "%3A"
    marker_at = url.find(marker)
    if marker_at == -1:
        raise ValueError("No '%3A' pid separator found in URL: " + url)

    # The pid runs up to the next path separator, or to the end of the URL.
    start = marker_at + len(marker)
    end = url.find('/', start)
    if end == -1:
        end = len(url)

    # Return adjoined outfile name
    return "harleyspiller_" + url[start:end]

In [329]:
# OCR function

def convert(image_url: str, path: str) -> None:
    """Download a JP2 image, convert it to a resized PNG and OCR it.

    The image is fetched from ``image_url`` and written to the current
    working directory, decoded to PNG with ``opj_decompress``, resized with
    ImageMagick's ``convert``, and then handed to ``save_image``, which
    writes the .json and .txt OCR results into ``path``. The intermediate
    .jp2/.png files are removed afterwards, even if a step fails.

    Args:
        image_url: URL of the .jp2 datastream of a repository object.
        path: Directory in which the OCR output files are saved.

    Raises:
        requests.HTTPError: If the download returns an HTTP error status.
        subprocess.CalledProcessError: If either conversion command exits
            with a non-zero status.
    """

    # Get name
    name = get_outfile_name(image_url)
    jp2_file = name + ".jp2"
    png_file = name + ".png"
    resized_file = name + "_resize.png"

    try:
        # Retrieves image data from URL; fail here rather than writing an
        # HTML error page to disk as if it were a JP2 image.
        print("[Retrieving image from URL]")
        response = requests.get(image_url)
        response.raise_for_status()

        # Writes the downloaded .jp2 data to a temporary file
        with open(jp2_file, "wb") as handler:
            handler.write(response.content)

        # Decode the JP2 to PNG with OpenJPEG, then resize with ImageMagick.
        # check=True stops the pipeline as soon as a tool fails.
        print("[Converting jp2 image to resized PNG]")
        subprocess.run(["opj_decompress",
                        "-i",
                        jp2_file,
                        "-OutFor",
                        "PNG",
                        "-o",
                        png_file],
                       check=True)

        subprocess.run(["convert",
                        png_file,
                        "-resize", "1500",
                        resized_file],
                       check=True)

        print("[Finished converting to resized PNG.]")

        # Save the data in .json and .txt formats
        save_image(resized_file, name, path)
    finally:
        # Remove the temporary PNG and JP2 files from disk. Some of them may
        # not exist if an earlier step failed.
        for temp_file in (png_file, resized_file, jp2_file):
            if os.path.exists(temp_file):
                os.remove(temp_file)

In [330]:
# Save data function

def save_image(file_name: str, outfile_name: str, path: str) -> None:
    """Run Google Vision text detection on a PNG and save the results.

    Two files are written into ``path``:
    ``<outfile_name>_annotated_ocr.json`` (the full API response as JSON) and
    ``<outfile_name>_OCR.txt`` (the description of the first text annotation,
    or an empty file when no text was detected).

    Relies on the module-level ``client`` (a Vision ``ImageAnnotatorClient``)
    having been created before this function is called.

    Args:
        file_name: Path of the PNG image to send to the API.
        outfile_name: Base name used for both output files.
        path: Output directory; a trailing slash is added when missing. An
            empty string writes to the current working directory.
    """

    # Loads the image into memory
    with open(file_name, 'rb') as image_file:
        content = image_file.read()

    # Constructs image object and gets text
    image = vision.types.Image(content=content)
    response = client.text_detection(image=image)
    texts = response.text_annotations
    # NOTE(review): response.error is not inspected here; an API-side failure
    # would be saved as-is in the JSON and produce an empty .txt - confirm
    # whether that is the intended behaviour.

    # Store response as a dictionary
    response_dict = MessageToDict(response)

    # Ensure a non-empty path ends with a slash (an empty path must stay
    # empty, otherwise the files would be written to the filesystem root).
    if path and not path.endswith('/'):
        path = path + '/'

    # Convert to json
    json_name = outfile_name + "_annotated_ocr.json"
    with open(path + json_name, "w", encoding="utf-8") as out_file:
        json.dump(response_dict, out_file)
    print("[Saved as: " + json_name + " in " + path + "]")

    # Convert to .txt
    if texts:
        text = ('\n"{}"'.format(texts[0].description))
    else:
        text = ""
    txt_name = outfile_name + "_OCR.txt"
    # Explicit UTF-8: OCR output may contain characters that the platform's
    # default encoding cannot represent.
    with open(path + txt_name, "w", encoding="utf-8") as text_file:
        text_file.write(text)
    print("[Saved as: " + txt_name + " in " + path + "]")

In [331]:
# Batch OCR function

def ocr(path: str) -> None:
    """Run OCR on every item listed in all_pages.txt that is not yet done.

    Progress is tracked by line count: the number of lines already in
    finished_pages.txt is the number of leading lines of all_pages.txt that
    are skipped. After each item is processed its line is appended to
    finished_pages.txt, so an interrupted run resumes where it stopped.
    Both files are read from the current working directory.

    Args:
        path: Directory in which the .json and .txt OCR outputs are saved.
    """

    # Find starting image. A missing progress file means nothing has been
    # processed yet.
    try:
        with open('finished_pages.txt') as finished:
            startpoint = sum(1 for _ in finished)
    except FileNotFoundError:
        startpoint = 0
    count = startpoint
    print("Starting at PID #: ", startpoint)

    # Open file
    with open("all_pages.txt", "r") as f:
        # Skip already processed images
        for _ in range(startpoint):
            f.readline()
        # Loop through rest
        for line in f:
            # Get string on line
            pid = line.strip()

            # Get id number and create url to jp2.
            # Drops the first 14 characters of the line - presumably the
            # "harleyspiller:" prefix; verify against all_pages.txt.
            id_num = pid[14:]
            full_url = "https://collections.digital.utsc.utoronto.ca/islandora/object/harleyspiller%3A" + id_num + "/datastream/JP2/view"

            # Print statements and convert() function call
            print("____________________________")
            print("PID #: {}: {}".format(count, full_url))
            convert(full_url, path)

            # Record the item only after convert() returned without raising
            with open('finished_pages.txt', 'a') as g:
                g.write(pid + '\n')

            count += 1

In [None]:
# Main Block
# Creates the Vision client and starts the batch OCR run.

# Instantiates a client from Google Vision.
# `client` must stay a module-level name: save_image() reads it as a global.
client = vision.ImageAnnotatorClient()

# Run ocr() with .json/.txt outputs saving to path
# (ocr() reads all_pages.txt and finished_pages.txt from the current
# working directory).
path = '/home/dsu/notebooks/ocr_processing/Data/ocr_processing'
ocr(path)

Starting at PID #:  0
____________________________
PID #: 0: https://collections.digital.utsc.utoronto.ca/islandora/object/harleyspiller%3A940/datastream/JP2/view
[Retrieving image from URL]
[Converting jp2 image to resized PNG]
[Finished converting to resized PNG.]
[Saved as: harleyspiller_940_annotated_ocr.json in /home/dsu/notebooks/ocr_processing/Data/ocr_processing/]
[Saved as: harleyspiller_940_OCR.txt in /home/dsu/notebooks/ocr_processing/Data/ocr_processing/]
____________________________
PID #: 1: https://collections.digital.utsc.utoronto.ca/islandora/object/harleyspiller%3A941/datastream/JP2/view
[Retrieving image from URL]
[Converting jp2 image to resized PNG]
[Finished converting to resized PNG.]
[Saved as: harleyspiller_941_annotated_ocr.json in /home/dsu/notebooks/ocr_processing/Data/ocr_processing/]
[Saved as: harleyspiller_941_OCR.txt in /home/dsu/notebooks/ocr_processing/Data/ocr_processing/]
____________________________
PID #: 2: https://collections.digital.utsc.utoron