In this notebook, we use the Google Cloud Vision API to extract the text from the images of the Heidelberg Collection in our DFKV dataset.

In [1]:
import io
import os
import glob
import pandas as pd
# Imports the Google Cloud client library
from google.cloud import vision
from collections import Counter
from tqdm import tqdm
from IPython.display import Image
from collections import OrderedDict
import math
import json
import shutil
import re
import langid
from spacy.lang.de.stop_words import STOP_WORDS as de_stop
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
# Importing Image class from PIL module
from PIL import Image

In [5]:
# Point GOOGLE_APPLICATION_CREDENTIALS at your own JSON key file.
# The value below is only a placeholder: replace it before running the OCR cells,
# and do not commit a real key (or its path) to version control.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="absolute/path/to/your/key.json"

In [3]:
def resize_image(url):
    """Crop the bottom twentieth off an image and overwrite the file in place.

    Despite its name, this function does not rescale anything: it keeps the
    full width and the top 19/20 of the height, then saves the result back to
    the same path.

    Args:
        url: Path of the image file to crop. The file is overwritten.

    Note:
        The operation is destructive and not idempotent: every call removes
        another twentieth of the current height.
    """
    # Image.open leaves the underlying file open; the context manager makes
    # sure the handle is released even if cropping or saving fails.
    with Image.open(url) as im:
        # Size of the original image in pixels
        width, height = im.size

        # Box is (left, upper, right, lower): full width, top 19/20 of the height.
        # crop() returns a new image and does not modify `im`.
        im1 = im.crop((0, 0, width, 19 * height / 20))

        # Overwrite the original file with the cropped image
        im1.save(url)

In [6]:
# Paths of all JPEG pages of the Heidelberg collection, sorted as plain strings
heidelberg_pages = glob.glob("./data/heidelberg_data/*.jpg")
heidelberg_pages.sort()

In [84]:
# Crop off the bottom strip of every page image, which contains its URL.
# NOTE: resize_image overwrites each file, so running this cell again crops the
# already-cropped images a second time.
for page_path in tqdm(heidelberg_pages):
    resize_image(page_path)

100%|██████████| 5122/5122 [09:02<00:00,  9.44it/s]


In [7]:
def detect_text(path, client=None):
    """Run OCR on one page image and return its identifiers and text.

    The document id and the page number are parsed from the file name, which
    must look like ``<prefix>_<doc_id>_<page>.<ext>``.

    Args:
        path: Path to the image file.
        client: Optional ``vision.ImageAnnotatorClient`` to reuse across calls.
            When omitted, a new client is created for this call.

    Returns:
        A ``(doc_id, page, text)`` tuple of strings. ``text`` is the empty
        string when the response contains no text annotation.

    Raises:
        Exception: If the API response carries an error message.
        IndexError: If the file name does not contain two underscores.
    """
    if client is None:
        client = vision.ImageAnnotatorClient()

    # Read the raw image bytes
    with io.open(path, 'rb') as image_file:
        content = image_file.read()

    # OCR
    image = vision.Image(content=content)
    response = client.document_text_detection(image=image)

    # Check for an API error before touching the annotations
    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))

    # The description of the first annotation is used as the page text.
    # The annotation list can be empty (e.g. nothing was recognised on the
    # page); return an empty text instead of failing with an IndexError.
    texts = response.text_annotations
    text = texts[0].description if texts else ""

    # Parse the identifiers from the file name only, so the result does not
    # depend on how many directories (or which separator) precede it.
    name_parts = os.path.basename(path).split("_")
    doc_id = name_parts[1]
    page = name_parts[2].split(".")[0]
    return doc_id, page, text

In [9]:
# OCR every page and store its text in a dictionary keyed by "<doc_id>_<page>"
all_texts_pages_heidelberg = {}
for page_path in tqdm(heidelberg_pages):
    doc_id, page_no, page_text = detect_text(page_path)
    all_texts_pages_heidelberg["_".join((doc_id, page_no))] = page_text

# If a data entry has several pages, concatenate them into one text per document.
# NOTE: pages are appended in processing order, i.e. the string-sorted order of
# heidelberg_pages, so a page named "10" comes before a page named "2".
all_texts_heidelberg = {}
for page_key, page_text in all_texts_pages_heidelberg.items():
    doc_id = page_key.split("_")[0]
    all_texts_heidelberg[doc_id] = all_texts_heidelberg.get(doc_id, "") + page_text

100%|██████████| 6584/6584 [1:49:29<00:00,  1.00it/s]  


In [10]:
# Save both dictionaries as JSON files: one text per document, and one text per page
json_outputs = (
    ('./data/complete_texts_heidelberg.json', all_texts_heidelberg),
    ('./data/complete_texts_page_heidelberg.json', all_texts_pages_heidelberg),
)
for json_path, texts in json_outputs:
    with open(json_path, 'w') as fp:
        json.dump(texts, fp)

## Cleaning and tokenizing

We will now clean the obtained texts.

In [2]:
# Load the per-document texts from the JSON file
with open('./data/complete_texts_heidelberg.json') as fp:
    data = json.loads(fp.read())

In [6]:
# Split the texts by language: whatever langid labels "fr" goes to `french`,
# every other label (not only "de") goes to `german`
german = {}
french = {}

for doc_id, raw_text in tqdm(data.items()):
    # Flatten line breaks once; the flattened text is both classified and stored
    flat_text = raw_text.replace("\n", " ")
    lang_code = langid.classify(flat_text)[0]
    target = french if lang_code == "fr" else german
    target[doc_id] = flat_text

100%|██████████| 999/999 [00:10<00:00, 94.69it/s] 


In [8]:
# List of characters to remove from the texts: one list entry per character of
# the string below, in the same order
to_remove = list("\n-',;:.!?’()\"%#$&*+=/…>@_[]")

In [9]:
def tokenize(doc):
    """Turn a text into a bag of words.

    Every character listed in ``to_remove`` is replaced by a space, the text
    is lowercased and split on single spaces, and the remaining words are
    filtered and counted.

    Args:
        doc: The text to tokenize.

    Returns:
        A ``Counter`` mapping each kept word to its number of occurrences.
        A word is kept when it is longer than two characters, is not a German
        or French stop word, and contains no digit, no "http", no "^" and
        no "<".
    """
    # Replace the special characters by spaces
    cleaned = doc
    for char in to_remove:
        cleaned = cleaned.replace(char, " ")

    def keep(word):
        # Stop words of either language are dropped
        if word in de_stop or word in fr_stop:
            return False
        # Too short (this also rejects the empty strings produced by split)
        if len(word) <= 2:
            return False
        # URL fragments and words containing a weird character
        if "http" in word or "^" in word or "<" in word:
            return False
        # Words containing a digit
        return not any(ch.isdigit() for ch in word)

    # Count the kept words of the lowercased text, in order of appearance
    counts = Counter()
    for word in cleaned.lower().split(" "):
        if keep(word):
            counts[word] += 1
    return counts

In [10]:
# Tokenize each German text: {doc_id: Counter of its words}
dict_token_docs = {doc_id: tokenize(text) for doc_id, text in german.items()}

In [11]:
# Vocabulary: every distinct word that occurs in at least one document
vocab = set()
for token_counts in dict_token_docs.values():
    vocab.update(token_counts.keys())

In [12]:
# Turn the vocabulary into a sorted list (Python's default string ordering)
vocab = sorted(vocab)

In [13]:
# Start with an empty list; it will hold the words actually written to the vocab document
new_vocab = list()

In [14]:
# Create vocabulary document
# Contains the list of words, one word per line, in `vocab` order.
# `new_vocab` ends up holding exactly the words that were written, so a word's
# position in `new_vocab` matches its line in the file.
new_vocab = []  # rebuilt here so that re-running this cell does not append duplicates
unwritable = []
# 'w' truncates the file on open. UTF-8 is requested explicitly so that
# non-ASCII letters are written the same way whatever the platform default is.
with open('data/vocab.de_dfkv.txt', 'w', encoding='utf-8') as f:
    for word in vocab:
        try:
            f.write(str(word) + "\n")
        except UnicodeEncodeError:
            # The word cannot be encoded: leave it out of the vocabulary
            unwritable.append(word)
        else:
            new_vocab.append(word)

# Unwritable words are dropped from `vocab` only after the loop: removing items
# from a list while iterating over it makes the loop skip the following word.
if unwritable:
    vocab = [word for word in vocab if word not in unwritable]

In [15]:
# Create the docword document
# Word ids are the 1-based positions of the words in `new_vocab`. They are looked
# up in a dictionary built once, instead of scanning the list for every entry.
word_ids = {}
for position, word in enumerate(new_vocab, start=1):
    # setdefault keeps the first position of a word, like list.index() does
    word_ids.setdefault(word, position)

skipped_entries = 0
# 'w' truncates the file on open
with open('data/docword.de_dfkv.txt', 'w', encoding='utf-8') as f:
    # Only words that have an id are written below, so only those are counted
    total_tokens = sum(n for c in dict_token_docs.values() for w, n in c.items() if w in word_ids)
    # Header lines :
    # Number of documents
    # Number of unique words
    # Number of words in total
    # NOTE(review): if the consumer of this file expects the UCI bag-of-words
    # layout, the third header line should be the number of entry lines (NNZ)
    # rather than the total word count - confirm before relying on it.
    f.writelines([str(len(dict_token_docs.keys())) + "\n",  str(len(new_vocab))+ "\n", str(total_tokens) + "\n"])
    for k, v in tqdm(sorted(dict_token_docs.items())):
        for w, n in v.items():
            word_id = word_ids.get(w)
            if word_id is None:
                # Word missing from the vocabulary: skip this entry only and
                # keep writing the other words of the document
                skipped_entries += 1
                continue
            # Each line consists of :
            # DOC_ID WORD_ID WORD_COUNT
            f.write(str(k) + " " + str(word_id) + " " + str(n) + "\n")

if skipped_entries:
    print("Skipped {} (document, word) entries whose word is not in new_vocab".format(skipped_entries))

100%|██████████| 998/998 [05:15<00:00,  3.17it/s]
