# OCR

> API details.

In [None]:
#default_exp core
#hide
%load_ext autoreload
%autoreload 2

#from nbdev.showdoc import *



In [None]:
#export
# Standard library imports
#from importlib import resources
from pathlib import Path
import json
import pprint


# Third party imports
from PIL import Image, ImageDraw, ImageFont
from PIL.ImageFont import FreeTypeFont
from textwrap import wrap


# Read a text file and create a clear image of the text on a page.
# Read a text file and create various distorted, transformed or distressed-looking pages.
# Read an OCR json file from Google and create a clear image of text on a page.
# Read an OCR json file from Google and write a text file of selected areas.

# A4 page dimensions in pixels, keyed by resolution in dpi.
# Each value is a (width, height) tuple for a portrait page.
A4_pixel_size = {
    2880: (23811, 33676),
    2400: (19843, 28063),
    1440: (11906, 16838),
    1200: (9921, 14032),
    720: (5953, 8419),
    600: (4960, 7016),
    300: (2480, 3508),
    150: (1240, 1754),
    96: (794, 1123),
    72: (595, 842),
}


In [None]:
#export 
def _text_width(font, text):
    """Return the rendered width of *text* in pixels for *font*.

    ``FreeTypeFont.getsize`` was deprecated in Pillow 9.2 and removed in
    Pillow 10. ``getbbox(text)[2]`` is the documented replacement and yields
    the same value, so it is preferred; ``getsize`` is only used as a
    fallback for very old Pillow versions that lack ``getbbox``.
    """
    if hasattr(font, 'getbbox'):
        return font.getbbox(text)[2]
    return font.getsize(text)[0]


def text_wrap(text, font, max_width):
    """Wrap a single line of text to a maximum pixel width.

    This lets text that is wider than the image be displayed nicely.

    @params:
        text: str
            text to wrap (one logical line, no newlines expected)
        font: obj
            font used to measure the text
        max_width: int
            maximum width, in pixels, of each returned sub-string
    @return
        lines: list[str]
            list of sub-strings. When the text already fits it is returned
            unchanged as the only element. Otherwise each wrapped line is
            built from whole words and keeps the trailing space that
            followed its last word; a single word wider than ``max_width``
            is emitted on a line of its own, unbroken.
    """
    # If the text already fits there is nothing to split.
    if _text_width(font, text) <= max_width:
        return [text]

    # Split the line on spaces to get words.
    words = text.split(' ')
    lines = []
    i = 0
    while i < len(words):
        line = ''
        # Keep appending words while the line still fits the width.
        while i < len(words) and _text_width(font, line + words[i]) <= max_width:
            line += words[i] + ' '
            i += 1
        if not line:
            # Not even one word fits: emit it as-is so the loop always
            # makes progress instead of spinning forever.
            line = words[i]
            i += 1
        lines.append(line)
    return lines

In [None]:
#export
def get_max_line_height(lines, font):
    """Return the largest rendered height among ``lines`` for ``font``.

    Each line is measured with ``get_line_height``. An empty ``lines``
    sequence (e.g. from an empty text file) yields 0 instead of raising
    ``ValueError`` from ``max()``.
    """
    return max((get_line_height(line, font) for line in lines), default=0)


def get_line_height(line, font):
    """Return the height of a line of text.

    The height is the bottom edge (index 3) of the bounding box of the
    rendered mask. ``getbbox()`` returns ``None`` when the mask contains no
    ink (an empty or whitespace-only line), in which case 0 is returned
    rather than failing on ``None[3]``.
    """
    bbox = font.getmask(line).getbbox()
    if bbox is None:
        return 0
    return bbox[3]


def get_line_width(line, font):
    """Return the width of a line of text.

    The width is the right edge (index 2) of the bounding box of the
    rendered mask. ``getbbox()`` returns ``None`` when the mask contains no
    ink (an empty or whitespace-only line), in which case 0 is returned
    rather than failing on ``None[2]``.
    """
    bbox = font.getmask(line).getbbox()
    if bbox is None:
        return 0
    return bbox[2]


def get_A4_image(resolution=150):
    """Create a blank white RGB image the size of an A4 page.

    @params:
        resolution: int
            page resolution in dpi; must be one of the keys of
            ``A4_pixel_size``. Defaults to 150, the value that was
            previously hard-coded.
    @return
        A new ``PIL.Image.Image`` in mode 'RGB', filled with white.
    @raises
        ValueError: if ``resolution`` has no entry in ``A4_pixel_size``.
    """
    try:
        page_size = A4_pixel_size[resolution]
    except KeyError:
        supported = ', '.join(str(dpi) for dpi in sorted(A4_pixel_size))
        raise ValueError(
            f'Unsupported resolution {resolution!r} dpi; choose one of: {supported}'
        ) from None

    # Create a page sized image
    return Image.new(mode='RGB', size=page_size, color='white')

def print_opl(seq):
    """Print every item of ``seq`` on its own line ("one per line")."""
    for item in seq:
        print(item)


In [None]:
#export 
def create_image(img, lines, font, H_MARGIN=40, V_MARGIN=40, TEXT_COLOR='black'):
    """Draw ``lines`` of text onto ``img`` and return the same image.

    @params:
        img: image to draw on (modified in place)
        lines: sequence of strings, one per output row
        font: font used both for drawing and for measuring the rows
        H_MARGIN: x position, in pixels, at which every row starts
        V_MARGIN: top margin in pixels; the first row is placed at
            ``V_MARGIN`` plus the font's descent
        TEXT_COLOR: fill colour passed to the drawing call
    @return
        ``img`` itself, with the text drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    # Every row uses the same pitch: the height of the tallest line.
    row_pitch = get_max_line_height(lines, font)
    _, descent = font.getmetrics()
    first_row_y = V_MARGIN + descent

    for row, line in enumerate(lines):
        position = (H_MARGIN, first_row_y + row * row_pitch)
        canvas.text(position, line, font=font, fill=TEXT_COLOR)

    return img

def get_wrapped_text(text_file, font, printable_width):
    """Read a UTF-8 text file and return its lines wrapped to a pixel width.

    The file is split into lines, each line is passed through ``text_wrap``
    with ``font`` and ``printable_width``, and the resulting pieces are
    returned as one flat list in file order.
    """
    with open(text_file, 'r', encoding='utf-8') as infile:
        source_lines = infile.read().splitlines()
        return [
            piece
            for source_line in source_lines
            for piece in text_wrap(source_line, font, printable_width)
        ]

def image_permutations(text_files, fonts, printable_width):
    """Return one info tuple per (text file, font) combination.

    Each tuple is ``(text_file, font, printable_width, image_name)`` where
    ``image_name`` is the text file's stem followed by the font file's stem
    and the font size, e.g. ``mytext_GentiumPlus-R_10``. Tuples are ordered
    by text file first, then by font.
    """
    return [
        (
            text_file,
            font,
            printable_width,
            # Add font name and size to the output filename.
            f'{text_file.stem}_{Path(font.path).stem}_{font.size}',
        )
        for text_file in text_files
        for font in fonts
    ]


In [None]:
#export
# TODO Get Text, Font and size as arguments to the script.
# Generate one page image per (text file, font, size) combination and save it
# in every requested file type.
img = get_A4_image()

# Set paths to texts.
# Raw string: "\W" and "\T" are invalid escape sequences in a normal string
# literal (a warning today, an error in future Python versions).
base_path = Path(r"D:\Work\Test\Webbs")
text_path = base_path / "texts"
out_folder = base_path / "Input"

# Set path to fonts
font_path = base_path / "fonts"
#font_file = font_path / "GentiumPlus-R.ttf"

left_margin = 40
top_margin = 40
right_margin = 40
bottom_margin = 40

# Get image size in pixels
img_width, img_height = img.size

x_right_margin = img_width - right_margin
y_bottom_margin = img_height - bottom_margin

printable_width = x_right_margin - left_margin

file_types = ['pdf','png']
filenames = ['bkq-bkqNT.txt','bsq-Bassa02.txt','rbt-psa2-3.txt']
font_filenames = ['GentiumPlus-R.ttf','GentiumPlus-I.ttf','arial.ttf','ariali.ttf',]

text_files = [text_path / filename for filename in filenames]

# Set fonts and sizes.
font_sizes = [10,24] #12,16,18,24]
font_files = [font_path / font for font in font_filenames]

fonts = [ImageFont.truetype(str(font_file), font_size) for font_file in font_files for font_size in font_sizes]

#print('These are the text files:')
#print_opl(text_files)
#print('\nThese are the fonts:')
#print_opl(font_filenames)
#print('\nThese are the font sizes:')
#print_opl(font_sizes)
#print()

image_infos = image_permutations(text_files,fonts,printable_width)

#print(f'These are the {len(image_infos)} image files requested:')
#print_opl([image[3] for image in image_infos])
#print_opl(image_infos)

skip_existing = True

# Saving fails if the destination folder is missing, so create it up front.
out_folder.mkdir(parents=True, exist_ok=True)

for image_info in image_infos:
    text_file, font, printable_width, img_name = image_info

    # Rendering a page is the expensive step, so it is deferred until the
    # first output file that actually has to be written.
    text_img = None

    for file_type in file_types:
        out_file_name = img_name + '.' + file_type
        out_file = out_folder / out_file_name
        already_exists = out_file.exists()
        print(f'Processing {out_file.name}. Exists? : {already_exists}')

        if already_exists:
            if skip_existing:
                print(f'{str(out_file)} already exists, skipping.')
                continue
            print(f'{str(out_file)} already exists, overwriting.')

        if text_img is None:
            wrapped_text = get_wrapped_text(text_file,font,printable_width)
            text_img = create_image(get_A4_image(),wrapped_text,font)

        text_img.save(out_file)
      



Processing bkq-bkqNT_GentiumPlus-R_10.pdf. Exists? : True
D:\Work\Test\Webbs\Input\bkq-bkqNT_GentiumPlus-R_10.pdf already exists, overwriting.
Processing bkq-bkqNT_GentiumPlus-R_10.png. Exists? : True
D:\Work\Test\Webbs\Input\bkq-bkqNT_GentiumPlus-R_10.png already exists, overwriting.
Processing bkq-bkqNT_GentiumPlus-R_24.pdf. Exists? : True
D:\Work\Test\Webbs\Input\bkq-bkqNT_GentiumPlus-R_24.pdf already exists, overwriting.
Processing bkq-bkqNT_GentiumPlus-R_24.png. Exists? : True
D:\Work\Test\Webbs\Input\bkq-bkqNT_GentiumPlus-R_24.png already exists, overwriting.
Processing bkq-bkqNT_GentiumPlus-I_10.pdf. Exists? : True
D:\Work\Test\Webbs\Input\bkq-bkqNT_GentiumPlus-I_10.pdf already exists, overwriting.
Processing bkq-bkqNT_GentiumPlus-I_10.png. Exists? : True
D:\Work\Test\Webbs\Input\bkq-bkqNT_GentiumPlus-I_10.png already exists, overwriting.
Processing bkq-bkqNT_GentiumPlus-I_24.pdf. Exists? : True
D:\Work\Test\Webbs\Input\bkq-bkqNT_GentiumPlus-I_24.pdf already exists, overwriting.

In [None]:
# Folders holding the OCR results of each provider.
# Raw strings: "\W", "\T", "\I" and "\S" are invalid escape sequences in a
# normal string literal (a warning today, an error in future Python versions).
results_path = Path(r"D:\Work\Test\Webbs\Input")
ms_output_path = results_path / "Microsoft"
gcp_output_path = results_path / "Google"
aws_output_path = results_path / "Amazon"

selected_input_folder = Path(r'D:\Work\Test\Webbs\SelectedInput')


In [None]:
from google.cloud import storage

# Set the credentials path in this process's own environment.
# A `!SET ...` shell escape only changes a short-lived subshell, so the client
# library in this kernel never sees the variable and fails with
# DefaultCredentialsError.
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"D:\CryptoKeys\DavidBaines_OCR1_83942beb5ed5.json"

def list_buckets():
    """Return a list of the storage buckets visible to the default credentials.

    No credentials are passed to the client, so the client library looks
    them up in the environment. Listing the buckets performs an
    authenticated API request.
    """
    client = storage.Client()
    return list(client.list_buckets())

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a storage bucket and print a confirmation.

    @params:
        bucket_name: str
            name of the target bucket, e.g. "your-bucket-name"
        source_file_name: str
            path of the local file to upload, e.g. "local/path/to/file"
        destination_blob_name: str
            name of the object to create in the bucket,
            e.g. "storage-object-name"
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # Use the parameter, not a global named `source_file`: the function used
    # to work only when the caller's loop variable happened to have that name.
    blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

def async_detect_document(gcs_source_uri, gcs_destination_uri,
                          mime_type='application/pdf', batch_size=2,
                          timeout=420):
    """OCR with PDF/TIFF as source files on GCS.

    Starts an asynchronous document text detection request, waits for it to
    finish, lists the JSON output files written under the destination
    prefix and prints the full text of the first page of the first file.

    @params:
        gcs_source_uri: str
            ``gs://`` URI of the input document
        gcs_destination_uri: str
            ``gs://bucket/prefix`` URI under which the output is written
        mime_type: str
            input type; supported values are 'application/pdf' and
            'image/tiff'
        batch_size: int
            how many pages are grouped into each json output file
        timeout: int
            seconds to wait for the operation to finish
    @raises
        ValueError: if ``gcs_destination_uri`` is not ``gs://bucket/prefix``.
        RuntimeError: if no output file is found under the destination.
    """
    import json
    import re

    # Validate the destination before any cloud call, so a malformed URI
    # fails immediately instead of after the whole OCR job has run.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            'gcs_destination_uri must look like gs://<bucket>/<prefix>, '
            f'got {gcs_destination_uri!r}'
        )
    bucket_name, prefix = match.groups()

    from google.cloud import vision
    from google.cloud import storage

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=timeout)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    if not blob_list:
        raise RuntimeError(
            f'No output files found under {gcs_destination_uri!r}')

    # Process the first output file from GCS. It contains the first
    # `batch_size` pages of the input file.
    output = blob_list[0]

    # download_as_bytes replaces the deprecated download_as_string alias;
    # json.loads accepts bytes directly.
    response = json.loads(output.download_as_bytes())

    # The actual response for the first page of the input file.
    first_page_response = response['responses'][0]
    # NOTE(review): presumably the key is absent when no text was detected on
    # the page - confirm against the Vision API response format.
    annotation = first_page_response.get('fullTextAnnotation')
    if annotation is None:
        print('No text detected on the first page.')
        return

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print('Full text:\n')
    print(annotation['text'])
    
# List the buckets; this makes an authenticated API request, so it fails when
# no credentials can be found.
list_buckets()




DefaultCredentialsError: Could not automatically determine credentials. Please set GOOGLE_APPLICATION_CREDENTIALS or explicitly create credentials and re-run the application. For more information, please see https://cloud.google.com/docs/authentication/getting-started

In [None]:
# Upload every generated page image to the bucket.
bucket_name = "ocr_test_bucket_2345"
destination_prefix = "Input_data"

# `saved_files` was never defined in any cell. Rebuild it from the same
# names the generation loop writes: one file per image info and file type.
saved_files = [
    out_folder / f'{image_info[3]}.{file_type}'
    for image_info in image_infos
    for file_type in file_types
]

for source_file in saved_files:
    # Give each file its own object name; a single shared blob name would make
    # every upload overwrite the previous one.
    destination_blob_name = f"{destination_prefix}/{source_file.name}"
    upload_blob(bucket_name,str(source_file),destination_blob_name)


In [None]:
# NOTE(review): notebook2script is not imported in any visible cell -
# presumably it needs `from nbdev.export import notebook2script`; confirm.
notebook2script()