In [33]:
# NOTE(review): `!env` dumps every environment variable into the saved cell
# output, including JUPYTERHUB_API_TOKEN and JPY_API_TOKEN. Clear this output
# (and consider rotating those tokens) before sharing or committing the notebook.
!env

USER=dsu
MPLBACKEND=module://ipykernel.pylab.backend_inline
JUPYTERHUB_HOST=
JUPYTERHUB_USER=dsu
HOME=/home/dsu
JUPYTERHUB_OAUTH_CALLBACK_URL=/user/dsu/oauth_callback
JUPYTERHUB_API_URL=http://127.0.0.1:8081/hub/api
PAGER=cat
GOOGLE_APPLICATION_CREDENTIALS=/etc/google-cloud-service-keys/harley-spiller-957b8c666407.json
JUPYTERHUB_CLIENT_ID=jupyterhub-user-dsu
TERM=xterm-color
PATH=/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/opt/anaconda3/bin:/opt/anaconda3/condabin
LANG=en_CA.UTF-8
SHELL=/bin/bash
JUPYTERHUB_ADMIN_ACCESS=1
JUPYTERHUB_SERVICE_PREFIX=/user/dsu/
GIT_PAGER=cat
OPENCV_IO_ENABLE_JASPER=TRUE
JUPYTERHUB_API_TOKEN=afc7004570f04f2fb0c6653533c60b20
CLICOLOR=1
PWD=/home/dsu/notebooks/ocr_processing
JUPYTERHUB_BASE_URL=/
JPY_PARENT_PID=5673
JPY_API_TOKEN=afc7004570f04f2fb0c6653533c60b20


In [34]:
# Import statements
import io
import os
import subprocess
import json

# Imports the Google Cloud client library
from google.cloud import vision
from google.cloud.vision import types
from google.protobuf.json_format import MessageToDict

In [35]:
# Step 1: Pre-processing the image and converting to .png for OCR.
         
def pre_process(path: str, script_dir: str = "/home/dsu/notebooks/ocr_processing") -> str:
    """Clean an image with the ``textcleaner`` script and save it as a PNG.

    The cleaned image is written to ``out.png`` in the same folder as ``path``.

    Args:
        path: Path of the source image. Relative paths are resolved against the
            current working directory before anything else happens.
        script_dir: Directory that contains the executable ``textcleaner``
            script; the script is also run with this as its working directory.

    Returns:
        Absolute path of the generated ``out.png``.

    Raises:
        subprocess.CalledProcessError: If ``textcleaner`` exits with a non-zero
            status. Failing here prevents a stale ``out.png`` from an earlier
            run being picked up and OCR'd as if it belonged to ``path``.
    """
    # Resolve up front so that the child process (which runs in script_dir) and
    # the caller both refer to the same files.
    path = os.path.abspath(path)

    # Folder Name
    folder = os.path.dirname(path)
    print("Woking on", folder)

    p_file = os.path.join(folder, "out.png")

    # Pre-processing using the textcleaner script (ImageMagick based).
    # The working directory is set for the child only (cwd=...) instead of
    # calling os.chdir(), which would change it for the whole kernel and break
    # any relative path still being traversed by the caller.
    subprocess.run(
        [os.path.join(script_dir, "textcleaner"),
         "-g", "-e", "normalize", "-f", "30", "-o", "12", "-s", "2",
         path, p_file],
        cwd=script_dir,
        check=True,
    )
    print("[Pre-processed Image]")
    return p_file

In [36]:
# Step 2: Running the Google Vision API to detect text on the pre-processed image. 

def post_process(path: str, p_file: str) -> None:
    """Run Google Vision text detection on a pre-processed image and save the result.

    Two files are written into the folder that contains ``path``:

    * ``annotated_ocr.json`` - the API response converted to a dictionary.
    * ``OCR.txt`` - the description of the first text annotation, wrapped in
      double quotes and preceded by a newline, or an empty file when the
      response has no text annotations.

    Uses the module-level ``client`` to call the API.

    Args:
        path: Path of the original image; only its folder is used, as the
            output location.
        p_file: Path of the pre-processed image that is sent to the API.
    """
    # Folder Name
    folder = os.path.dirname(path)

    # Loads the image into memory
    with open(p_file, "rb") as image_file:
        content = image_file.read()

    print("[Running OCR]")

    image = vision.types.Image(content=content)
    response = client.text_detection(image=image)
    texts = response.text_annotations

    # Save the full response as JSON. The context manager closes the file even
    # if serialisation fails part-way through.
    j_file = os.path.join(folder, "annotated_ocr.json")
    with open(j_file, "w", encoding="utf-8") as out_file:
        json.dump(MessageToDict(response), out_file)
    print("[Saved as .json]")

    # Save the detected text.
    if len(texts) != 0:
        text = '\n"{}"'.format(texts[0].description)
    else:
        text = ""
    o_file = os.path.join(folder, "OCR.txt")
    # Explicit UTF-8 so non-ASCII OCR text does not depend on the kernel locale.
    with open(o_file, "w", encoding="utf-8") as txt_file:
        txt_file.write(text)
    print("[Saved as .txt]")

In [37]:
# OCR Processing

def ocr(path: str) -> None:
    """Run the OCR pipeline on every JPEG/TIFF image found under ``path``.

    If ``path`` is a file whose extension is ``.jpg``, ``.tif`` or ``.tiff``
    (case-insensitive), it is pre-processed with ``pre_process`` and the result
    is passed to ``post_process``, which saves the detected text as ``.json``
    and ``.txt``. If ``path`` is a directory, its entries are visited
    recursively in sorted order. Anything else is ignored.

    Args:
        path: File or directory at which the traversal is rooted.
    """
    # Base case: a single file.
    if os.path.isfile(path):
        # Compare the real extension rather than searching for a substring, so
        # names such as "notes.jpg.txt" are skipped and "SCAN.TIF" is accepted.
        extension = os.path.splitext(path)[1].lower()
        if extension in (".jpg", ".tif", ".tiff"):
            # Pre-processing the image using textcleaner
            p_file = pre_process(path)
            # Post-processing the image using Google Vision API
            post_process(path, p_file)
            # NOTE: the pre-processed file (p_file) is not removed afterwards.

    # Recursive case: a directory.
    elif os.path.isdir(path):
        # Sorted so that repeated runs visit entries in the same order.
        for entry in sorted(os.listdir(path)):
            ocr(os.path.join(path, entry))

In [38]:
# Main Block
    
# Instantiates the Vision API client. It must stay a module-level name:
# post_process() looks up `client` as a global when it calls text_detection().
client = vision.ImageAnnotatorClient()

# Root of the directory tree to process; ocr() recurses through it.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path = "/home/dsu/notebooks/Spiller_Menu_Workflow/NEW-Harley-Spiller-Downloads/may-2020-output/failed-cropped-0624_original"
ocr(path)

Woking on /home/dsu/notebooks/Spiller_Menu_Workflow/NEW-Harley-Spiller-Downloads/may-2020-output/failed-cropped-0624_original/CA_UTSC_006-1-4-26-2OS/26-2-62/4
[Pre-processed Image]
[Running OCR]
[Saved as .json]
[Saved as .txt]
Woking on /home/dsu/notebooks/Spiller_Menu_Workflow/NEW-Harley-Spiller-Downloads/may-2020-output/failed-cropped-0624_original/CA_UTSC_006-1-4-26-2OS/26-2-62/3
[Pre-processed Image]
[Running OCR]
[Saved as .json]
[Saved as .txt]
Woking on /home/dsu/notebooks/Spiller_Menu_Workflow/NEW-Harley-Spiller-Downloads/may-2020-output/failed-cropped-0624_original/CA_UTSC_006-1-4-26-2OS/26-2-62/2
[Pre-processed Image]
[Running OCR]
[Saved as .json]
[Saved as .txt]
Woking on /home/dsu/notebooks/Spiller_Menu_Workflow/NEW-Harley-Spiller-Downloads/may-2020-output/failed-cropped-0624_original/CA_UTSC_006-1-4-26-2OS/26-2-62/1
[Pre-processed Image]
[Running OCR]
[Saved as .json]
[Saved as .txt]
Woking on /home/dsu/notebooks/Spiller_Menu_Workflow/NEW-Harley-Spiller-Downloads/may-202