# OCR

In [32]:
import pytesseract #https://pypi.org/project/pytesseract/
from jiwer import cer #https://pypi.org/project/jiwer/
from pdf2image import convert_from_path
import os
from PIL import Image
from nltk.metrics.distance import edit_distance
from tika import parser
import re
from xml.etree.ElementTree import Element, SubElement, ElementTree
from bs4 import BeautifulSoup
import spacy

## Printed

### Pytesseract
If you have a file without embedded text, you need to run OCR on it first. State of the art at the moment is tesseract so that's what we're using here.

#### PDF
Note that this PDF has embedded text, but we throw it away and run pytesseract on it for demonstration purposes.

In [5]:
# PDF that is rasterised and OCR'd in the next cell.
filePath = 'data/ocr_data/szg-005_1925_5__698_d.pdf'

# Break the path into directory, file name, stem and extension.
path = os.path.dirname(filePath)
fileName = os.path.basename(filePath)
fileBaseName, fileExtension = os.path.splitext(fileName)

# Render the PDF; the result is iterated page by page below.
doc = convert_from_path(filePath)

In [None]:
# OCR each rendered page with tesseract's "deu" language data and print it
# together with its zero-based page index.
for page_number, page_data in enumerate(doc):
    txt = pytesseract.image_to_string(page_data, lang="deu")
    print(f"Page # {page_number} - {txt}")

#### Image
Images (usually) don't have embedded OCR, so here we don't have to throw anything away and simply run pytesseract on a jpg.

In [10]:
# Open the scan in a context manager so its file handle is released as soon as OCR is done.
with Image.open('data/ocr_data/grs-002_1984_076_0017.jpg') as page_image:
    text = pytesseract.image_to_string(page_image, lang='deu')

# Collapse every run of whitespace (newlines, repeated spaces, ...) into a single space and
# trim both ends, so that page layout does not count as character errors when comparing with the GT.
text = re.sub(r"\s+", " ", text).strip()

Let's see what it says:

In [11]:
# Bare expression: the notebook renders the OCR result as this cell's output.
text

'Bei Sulzer in Winterthur kommt die Ausweispflicht Was wird kontrolliert? Als man vernahm, dass im Laufe dieses Jahres beim Sulzer-Konzern in Winterthur die Ausweispflicht mittels einer maschinenlesbaren Karte eingeführt werden soll, da dachte man als Gewerkschafter unwillkürlich an die vielen Möglichkeiten der Kontrolle der Arbeitnehmer, welche via elektronische Datenverarbeitung gegeben sind. «Neue Kontrollen mit neuen Technologien» heisst ja auch eine Broschüre, die der Österreichi- sche, der Luxemburgische und der Schweizerische Gewerkschaftsbund miteinander herausgebracht haben!. Darin wird gezeigt, was mit einem maschinenlesbaren Personalausweis im Betrieb alles gemacht werden kann. Die Redaktion der «Gewerkschaftlichen Rundschau» fand es des- halb für angebracht, sich in Winterthur beim Schweizerischen Metall- und Uhrenarbeitnehmer-Verband (SMUV) und bei der Sulzer-Betriebs- kommission, die der Einführung des Ausweises zugestimmt hat, zu er- kundigen, was mit dem kommenden Auswe

Looks good! But how do we know whether the OCR is actually accurate? One evaluation metric is the so-called "character error rate" (CER). We don't have a true "ground truth" to compare against, so we'll use the E-Periodica OCR as GT and compare the tesseract result to it.

### Evaluation

In [21]:
# Extract the text layer embedded in the PDF; it serves as the reference transcription.
parsed = parser.from_file('data/ocr_data/grs-002_1984_076_0017.pdf')
if parsed["content"] is None:
    # Fail with a clear message instead of an AttributeError on None.split below.
    raise ValueError("No text could be extracted from the PDF; cannot build the reference text.")

# Keep the non-empty lines, each trimmed of surrounding whitespace, and join them with spaces.
contents = [x.strip() for x in parsed["content"].split("\n") if x != ""]
article = " ".join(contents)

# Drop "¬" marks (line-break hyphenation, "Bindestriche") together with the whitespace that
# follows them, so words split across lines are joined again.
article = re.sub(r"¬\s+", "", article)
# Collapse all runs of whitespace into one space and trim both ends.
article = re.sub(r"\s+", " ", article).strip()
# Remove every backslash from the text.
article = article.replace("\\", "")

In [23]:
# Evaluation needs a ground truth, i.e. a (near-)perfect transcription.
# None is available here, so the cleaned E-Periodica text stands in for it.
gt = article

In [25]:
# Bare expression: show the reference text for a visual comparison with `text`.
gt

'Bei Sulzer in Winterthur kommt die Ausweispflicht Was wird kontrolliert? Als man vernahm, dass im Laufe dieses Jahres beim Sulzer-Konzern in Winterthur die Ausweispflicht mittels einer maschinenlesbaren Karte eingeführt werden soll, da dachte man als Gewerkschafter unwillkürlich an die vielen Möglichkeiten der Kontrolle der Arbeitnehmer, welche via elektronische Datenverarbeitung gegeben sind. «Neue Kontrollen mit neuen Technologien» heisst ja auch eine Broschüre, die der Österreichi- sche, der Luxemburgische und der Schweizerische Gewerkschaftsbund miteinander herausgebracht haben!. Darin wird gezeigt, was mit einem maschinenlesbaren Personalausweis im Betrieb alles gemacht werden kann. Die Redaktion der «Gewerkschaftlichen Rundschau» fand es des- halb für angebracht, sich in Winterthur beim Schweizerischen Metall- und Uhrenarbeitnehmer-Verband (SMUV) und bei der Sulzer-Betriebs- kommission, die der Einführung des Ausweises zugestimmt hat, zu er- kundigen, was mit dem kommenden Auswe

In [27]:
# Character error rate between the reference (`gt`, first argument) and the
# tesseract output (`text`, second argument); 0.0 means the two strings are identical.
error = cer(gt, text)
error

0.0

This isn't a great example because tesseract and ABBYY reader agreed perfectly... hmm

## Post-correction with edit distance

In [None]:
# TODO: not implemented yet -- post-correct the OCR output using edit_distance (imported from nltk above).

## Handwritten
This is much trickier, and depends entirely on your data.

Here we begin with some botanical images, where just certain parts of the image contain nicely written labels.

In [28]:
# Run kraken three times on the same image, once per recognition model (the *_de, *_la and
# unsuffixed FoNDUE-GD_v2 models). Each call takes the input image and the output text file
# via `-i`, segments the page (`segment -bl`) and then recognises the text (`ocr -m <model>`).
# NOTE(review): the input is Z-000033489.jpg but the outputs are named Z-000099226_* --
# confirm that this ID mismatch is intended.
!kraken -i data/ocr_data/Z-000033489.jpg data/ocr_data/output/Z-000099226_fondue_gd_v2_de.txt segment -bl ocr -m FoNDUE-GD_v2_de.mlmodel
!kraken -i data/ocr_data/Z-000033489.jpg data/ocr_data/output/Z-000099226_fondue_gd_v2_la.txt segment -bl ocr -m FoNDUE-GD_v2_la.mlmodel
!kraken -i data/ocr_data/Z-000033489.jpg data/ocr_data/output/Z-000099226_fondue_gd_v2.txt segment -bl ocr -m FoNDUE-GD_v2.mlmodel

Loading ANN /home/genta/Documents/notebooks_cs/.venv/lib/python3.8/site-packages/kraken/blla.mlmodel	[0m[32m✓[0m
Loading ANN FoNDUE-GD_v2_de.mlmodel	[0m[32m✓[0m
Segmenting	[0m[32m✓[0m
[2KProcessing [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [35m100%[0m [35m15/15[0m [36m0:00:00[0m [33m0:00:02[0mm [33m0:00:02[0m
[?25hWriting recognition results for data/ocr_data/Z-000033489.jpg	[0m[32m✓[0m
Loading ANN /home/genta/Documents/notebooks_cs/.venv/lib/python3.8/site-packages/kraken/blla.mlmodel	[0m[32m✓[0m
Loading ANN FoNDUE-GD_v2_la.mlmodel	[0m[32m✓[0m
Segmenting	[0m[32m✓[0m
[2KProcessing [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [35m100%[0m [35m15/15[0m [36m0:00:00[0m [33m0:00:02[0mm [33m0:00:02[0m
[?25hWriting recognition results for data/ocr_data/Z-000033489.jpg	[0m[32m✓[0m
Loading ANN /home/genta/Documents/notebooks_cs/.venv/lib/python3.8/site-packages/kraken/blla.mlmodel	[0m[32m✓[0m
Loading ANN FoNDUE-GD_v2.mlmodel	[0m[3

That didn't work as well as the printed OCR, but the result is fairly readable.

On the other hand, here we have some notary pages from the Archief Amsterdam, in English. Although it may seem that this is hastily written, given the other documents in their archive this is actually quite nice handwriting.

In [None]:
# Same kraken pipeline (segment, then OCR) on a notary page, using the McCATMuS model.
# NOTE(review): this input sits directly under data/, while the other inputs in this notebook
# are under data/ocr_data/ -- confirm the path.
!kraken -i "data/d837ae03-b2c5-6b6d-e053-b784100acdee_en.jpg" "data/ocr_data/output/d837ae03-b2c5-6b6d-e053-b784100acdee_en_McCATMuS_nfd_nofix_V1.txt" segment -bl ocr -m McCATMuS_nfd_nofix_V1.mlmodel

This didn't work very well, in large part due to the fact that the model was not trained on this handwriting.

## text to TEI xml with spacy NER
The humanities use XML files a lot, and NER works quite nicely with a typical OCR file structure. Here we take a text file, convert it to a TEI XML file, and then run NER on that file to tag the named entities.

In [None]:
#!curl -o output.xml -F upload=@grs-002_1984_76__277_d.txt https://teigarage.tei-c.org/ege-webservice/Conversions/txt%3Atext%3Aplain/odt%3Aapplication%3Avnd.oasis.opendocument.text/TEI%3Atext%3Axml/conversion

In [None]:
# All three files share one directory and one document id: the OCR text is the input,
# the plain TEI file is the first output and the NER-tagged TEI file the second.
ocr_dir = "data/ocr_data"
doc_id = "grs-002_1984_76__277_d"

txt_file = f"{ocr_dir}/{doc_id}.txt"
output_file = f"{ocr_dir}/output/{doc_id}_tei.xml"
output_file_ner = f"{ocr_dir}/output/{doc_id}_tei_ner.xml"

In [33]:
# Load the spaCy pipeline "de_core_news_lg" once; code further down uses it through the global `nlp`.
nlp = spacy.load("de_core_news_lg")

  from .autonotebook import tqdm as notebook_tqdm


In [39]:
def create_tei_from_txt(txt_file, output_file, paragraph_delimiter="\n", page_delimiter="\n\n",):
    """Convert a plain-text file into a simple TEI-style XML file.

    The text is split into pages on ``page_delimiter`` and every page into
    paragraphs on ``paragraph_delimiter``. Each page becomes a ``<pb>`` element
    under ``<text>/<body>`` and each non-blank paragraph a ``<p>`` inside it.

    Args:
        txt_file: Path of the UTF-8 encoded input text file.
        output_file: Path the XML document is written to (UTF-8, with XML declaration).
        paragraph_delimiter: String separating paragraphs within a page.
        page_delimiter: String separating pages.

    Returns:
        None. The XML file is written and a confirmation line is printed.
    """
    #TODO add line breaks?
    with open(txt_file, 'r', encoding='utf-8') as f:
        text = f.read()

    # One list of paragraph strings per page.
    pages = [page.split(paragraph_delimiter) for page in text.split(page_delimiter)]

    # NOTE(review): the structure is kept as it was, but TEI proper expects a <TEI> root with
    # <teiHeader> and <text> as siblings, and <pb/> is normally an empty milestone -- confirm
    # whether strictly valid TEI is required downstream.
    tei = Element('teiHeader')  # root
    body = SubElement(SubElement(tei, 'text'), 'body')

    for page in pages:
        p_page = SubElement(body, "pb")
        for paragraph in page:
            if not paragraph.strip():
                # Blank lines (e.g. a trailing newline) would only yield empty <p/> elements.
                continue
            SubElement(p_page, 'p').text = paragraph

    # Generate the output XML file
    ElementTree(tei).write(output_file, encoding='utf-8', xml_declaration=True)

    print(f"TEI file created: {output_file}")

In [40]:
# Step 1: turn the OCR text file into the plain TEI XML file.
create_tei_from_txt(txt_file, output_file)

TEI file created: data/ocr_data/output/grs-002_1984_76__277_d_tei.xml


In [41]:
def create_ner_tei_from_tei(input_file, output_file):
    """Wrap the named entities of an XML file in elements and save the result.

    Every text node of ``input_file`` is run through the module-level spaCy
    pipeline ``nlp``. Each entity it reports is wrapped in an element named
    after the entity label (see ``label_to_tag``; labels without a mapping are
    used verbatim as the element name). The prettified document is written to
    ``output_file`` as UTF-8.

    Args:
        input_file: Path of the UTF-8 encoded XML file to annotate.
        output_file: Path the annotated XML file is written to.

    Returns:
        None.
    """
    # NOTE(review): element names are kept as they were; TEI itself calls the person
    # element <persName> and has no <monetaryAmount> -- confirm the intended vocabulary.
    label_to_tag = {
        "PER": "perName",
        "ORG": "orgName",
        "GPE": "placeName",
        "LOC": "placeName",
        "MONEY": "monetaryAmount",
    }

    with open(input_file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), "xml")

    # find_all returns a list, so replacing nodes while looping over it is safe.
    for entry in soup.find_all(string=True):
        source_text = str(entry)
        doc = nlp(source_text)
        if not doc.ents:
            continue  # nothing to tag; leave the text node untouched

        # Rebuild the node as a sequence of plain-text pieces and entity elements, always
        # slicing the *unmodified* source text. Compared with splicing markup into a string
        # and re-parsing it as HTML, this keeps the character offsets valid for every entity,
        # keeps the character that follows an entity, preserves the case of the element names,
        # and leaves escaping of "<" and "&" to the serializer.
        pieces = []
        cursor = 0
        for ent in doc.ents:
            if ent.start_char < cursor:
                continue  # defensive: ignore a span overlapping one already emitted
            if ent.start_char > cursor:
                pieces.append(soup.new_string(source_text[cursor:ent.start_char]))
            element = soup.new_tag(label_to_tag.get(ent.label_, ent.label_))
            element.string = source_text[ent.start_char:ent.end_char]
            pieces.append(element)
            cursor = ent.end_char
        if cursor < len(source_text):
            pieces.append(soup.new_string(source_text[cursor:]))

        # Put the pieces in front of the original node (in order), then drop the original.
        for piece in pieces:
            entry.insert_before(piece)
        entry.extract()

    # Write with an explicit encoding so umlauts survive regardless of the platform default.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(soup.prettify())

In [42]:
# Step 2: tag the named entities in the plain TEI file and save the annotated copy.
create_ner_tei_from_tei(output_file, output_file_ner)