In this notebook, we use the Google Cloud Vision API to extract the text from the page images of the Heidelberg Collection in our DFKV dataset.

In [81]:
import io
import os
import glob
import pandas as pd
# Imports the Google Cloud client library
from google.cloud import vision
from tqdm import tqdm
from IPython.display import Image
from collections import OrderedDict
import math
import json
import shutil
# Importing Image class from PIL module
from PIL import Image

In [5]:
# Tell the Google client library where your own credentials key file lives
# (replace the placeholder path below before running)
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": "absolute/path/to/your/key.json"})

In [82]:
def resize_image(url):
    """Crop the bottom twentieth off an image file, overwriting it in place.

    Despite its name, this function does not rescale anything: it keeps the
    full width and the top 19/20 of the height, then saves the result back
    to ``url``.

    Args:
        url: Path of the image file to crop. The file at this path is
            replaced by the cropped version.

    Note:
        The operation is not idempotent: every call removes another 1/20 of
        the image's current height.
    """
    # The context manager guarantees the source file handle is released,
    # even if cropping or saving raises.
    with Image.open(url) as im:
        width, height = im.size

        # Crop box is (left, upper, right, lower): full width, top 19/20
        cropped = im.crop((0, 0, width, 19 * height / 20))

        # Write the cropped image over the original file
        cropped.save(url)

In [85]:
# Collect every JPEG page of the Heidelberg data, in sorted path order
heidelberg_pages = glob.glob("./data/heidelberg_data/*.jpg")
heidelberg_pages.sort()

In [84]:
# Cut off the bottom strip of every page image, which contains its URL.
# WARNING: resize_image overwrites each file, so re-running this cell crops
# the already-cropped images a second time.
for page_path in tqdm(heidelberg_pages):
    resize_image(page_path)

100%|██████████| 5122/5122 [09:02<00:00,  9.44it/s]


In [88]:
# Shared API client, created on first use so that looping over thousands of
# pages does not build a new client for every single image.
_VISION_CLIENT = None


def _get_vision_client():
    """Return the shared ImageAnnotatorClient, creating it on the first call."""
    global _VISION_CLIENT
    if _VISION_CLIENT is None:
        _VISION_CLIENT = vision.ImageAnnotatorClient()
    return _VISION_CLIENT


def _doc_id_from_path(path):
    """Return the document id: the second "_"-separated field of the file name."""
    fields = os.path.basename(path).split("_")
    if len(fields) < 2:
        raise IndexError(
            "cannot extract a document id from {!r}: expected a file name "
            "of the form '<prefix>_<doc_id>...'".format(path))
    return fields[1]


def detect_text(path, client=None):
    """Detects text in the file.

    Args:
        path: Path of the image file to OCR. The file name must contain at
            least one underscore; the field after the first underscore is
            used as the document id.
        client: Optional client exposing ``document_text_detection``. When
            omitted, a module-level ImageAnnotatorClient is created once and
            reused across calls.

    Returns:
        A ``(doc_id, text)`` tuple. ``text`` is the description of the first
        text annotation, or an empty string when the response contains no
        text annotations (e.g. a blank page).

    Raises:
        IndexError: If no document id can be extracted from the file name.
        Exception: If the API response carries an error message.
    """
    # Parse the id first so a badly named file fails before any API call
    doc_id = _doc_id_from_path(path)

    if client is None:
        client = _get_vision_client()

    # Read the image
    with open(path, 'rb') as image_file:
        content = image_file.read()

    # OCR
    image = vision.Image(content=content)
    response = client.document_text_detection(image=image)

    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))

    # No annotations means no text was found: return an empty text instead
    # of crashing on texts[0]
    texts = response.text_annotations
    text = texts[0].description if texts else ""

    # return doc_id and text
    return doc_id, text

In [96]:
# Run OCR on every page and accumulate the text per document in a dictionary
# (pages of the same document are concatenated in page order)
all_texts_heidelberg = {}
for page_path in tqdm(heidelberg_pages):
    doc_id, page_text = detect_text(page_path)
    if doc_id in all_texts_heidelberg:
        all_texts_heidelberg[doc_id] += page_text
    else:
        all_texts_heidelberg[doc_id] = page_text

100%|██████████| 5122/5122 [1:23:42<00:00,  1.02it/s]  


In [97]:
# Serialize the dictionary and write it to a JSON file
with open('./data/complete_texts_heidelberg.json', 'w') as json_file:
    json_file.write(json.dumps(all_texts_heidelberg))