This notebook evaluates the post-corrected text using word error rates. We treat the text from Ash and NCKP as ground truth. This evaluation mainly focuses on the capability of neuspell to correct individual words. We get the samples by:
    * Find all terms with High quality and low quality, and moderate qualit in Edition 1 1771, Edition 7 1842.
    * From these terms, if the number of words of a term are the same between high quality text, and low quality one, then this term will be added as a sample.
    * All texts from the sample will be normalised (lowercase, remove punctuation).



In [None]:
# Get the samples
# Treat the text from Ash and NCKP as ground truth
# Find all terms with High quality and low quality, and moderate qualit in Edition 1 1771, Edition 7 1842.
# From these terms, if the number of words of a term are the same between high quality text, and low quality one, then this term will be added as a sample.

In [5]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import Namespace

hto = Namespace("https://w3id.org/hto#")

sparql = SPARQLWrapper(
    "http://query.frances-ai.com/hto"
)
sparql.setReturnFormat(JSON)

In [33]:
import re

NON_AZ_REGEXP = re.compile("[^a-z]")
def normalize(word):
    """
    Normalize a word by converting it to lower-case and removing all
    characters that are not 'a',...,'z'.

    :param word: Word to normalize
    :type word: str or unicode
    :return: normalized word
    :rtype word: str or unicode
    """
    return re.sub(NON_AZ_REGEXP, '', word.lower())


def normalize_text(text):
    all_words = text.split()
    all_normalised_words = []
    for word in all_words:
        all_normalised_words.append(normalize(word))
    return ' '.join(all_normalised_words)

In [34]:
def get_samples():
    sample_info_list = []
    sparql.setQuery("""
    PREFIX hto: <https://w3id.org/hto#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?term_uri ?mq_text ?hq_text ?lq_text WHERE {
        ?term_uri a ?term_type;
            hto:name ?name;
            hto:startsAtPage ?startPage;
            hto:hasOriginalDescription ?hq_desc, ?lq_desc, ?mq_desc.
        ?hq_desc hto:text ?hq_text;
            hto:hasTextQuality hto:High.
  		?mq_desc hto:text ?mq_text;
            hto:hasTextQuality hto:Moderate.
  		?lq_desc hto:text ?lq_text;
            hto:hasTextQuality hto:Low.
  		# Calculate word count for high-quality description
    	BIND (STRLEN(REPLACE(?hq_text, "\\\\S", " ")) AS ?high_length)

    	# Calculate word count for low-quality description
    	BIND (STRLEN(REPLACE(?lq_text, "\\\\S", " ")) AS ?low_length)
      	# Ensure that the word count matches between high and low-quality descriptions
        FILTER (?term_type = hto:ArticleTermRecord || ?term_type = hto:TopicTermRecord)
  		# Ensure that the word count matches between high and low-quality descriptions
    	FILTER (?high_length = ?low_length)
        ?vol a hto:Volume;
            hto:hadMember ?startPage.
        ?edition a hto:Edition;
            hto:hadMember ?vol;
            hto:yearPublished ?year_published.
        FILTER (?year_published = 1771 || ?year_published = 1842)
        }
    """
                    )

    try:
        ret = sparql.queryAndConvert()
        for r in ret["results"]["bindings"]:
            sample_info_list.append({
                "term_uri": r["term_uri"]["value"],
                "hq_text": normalize_text(r["hq_text"]["value"]),
                "mq_text": normalize_text(r["mq_text"]["value"]),
                "lq_text": normalize_text(r["lq_text"]["value"])
            })
    except Exception as e:
        print(e)

    return sample_info_list


In [35]:
samples = get_samples()

In [36]:
len(samples)

1923

In [12]:
from jiwer import wer


def calculate_wer(high_quality_text: str, low_quality_text: str) -> float:
    """
    Calculate the Word Error Rate (WER) between high-quality and low-quality text.

    Args:
    - high_quality_text: The ground truth high-quality text.
    - low_quality_text: The transcription to compare against the ground truth.

    Returns:
    - The Word Error Rate (WER) as a floating-point number.
    """
    # Calculate the Word Error Rate using jiwer
    error_rate = wer(high_quality_text, low_quality_text)

    return error_rate

In [38]:
for sample in samples:
    lq_wer = calculate_wer(sample["hq_text"], sample["lq_text"])
    mq_wer = calculate_wer(sample["hq_text"], sample["mq_text"])
    sample["lq_wer"] = lq_wer
    sample["mq_wer"] = mq_wer

In [39]:
improved_terms = [sample for sample in samples if sample["mq_wer"] < sample["lq_wer"]]

In [40]:
wrongly_correct_terms = [sample for sample in samples if sample["mq_wer"] > sample["lq_wer"]]

In [41]:
len(wrongly_correct_terms)

788

In [42]:
len(improved_terms)

126

In [46]:
print(f"number of terms which have been successfully corrected: {len(improved_terms)}, the success rate: {len(improved_terms) / len(samples)}")

number of terms which have been successfully corrected: 126, the success rate: 0.0655226209048362


In [48]:
print(f"number of terms which have been wrongly corrected: {len(wrongly_correct_terms)}, the fail rate: {len(wrongly_correct_terms) / len(samples)}")

number of terms which have been wrongly corrected: 788, the fail rate: 0.40977639105564223
