In [115]:
import re
from pathlib import Path
import nltk
# import jaro

from machine.corpora import FileParatextProjectSettingsParser, UsfmFileText
from machine.tokenization import WhitespaceTokenizer, LatinWordTokenizer

class WhitespaceSlashTokenizer(WhitespaceTokenizer):
    """Whitespace tokenizer that additionally treats a backslash as a separator.

    Overrides the base class's per-character whitespace test so that a
    backslash splits tokens just like a space does. This lets a USFM marker
    glued to a word (e.g. ``word\\w``) come out as two separate tokens.
    """

    def _is_whitespace(self, c: str) -> bool:
        # The backslash is the only addition; everything else defers to the base class.
        if c == "\\":
            return True
        return super()._is_whitespace(c)

In [116]:
'''Code from https://github.com/richmilne/JaroWinkler/blob/master/jaro/jaro.py'''

def jaro(s1, s2):
    """Compute the Jaro similarity of two sequences and their half-transposition count.

    The arguments may be any indexable sequences with a length (strings, lists
    of tokens, ...); elements are compared with ``==``.

    Returns a tuple ``(similarity, half_transposes)``:
      * ``similarity`` is the mean of (matches / len(shorter)),
        (matches / len(longer)) and ((matches - half_transposes // 2) / matches).
      * ``half_transposes`` is the number of matched elements of the shorter
        sequence whose partner, reading both sequences' matched elements in
        order, is a different element. Half of it is the transposition count.

    When either sequence is empty, or nothing matches, a diagnostic is printed
    together with both sequences (shorter first) and ``(0, 0)`` is returned.
    """
    # Work with the shorter sequence on the left, whichever order the caller used.
    shorter, longer = (s2, s1) if len(s2) < len(s1) else (s1, s2)
    n_short = len(shorter)
    n_long = len(longer)

    if n_short == 0:
        print("empty sequence")
        print(shorter)
        print(longer)
        return 0, 0

    # Elements only count as matching when they are at most `window` positions apart.
    window = max((n_long // 2) - 1, 0)
    short_flags = [False] * n_short
    long_flags = [False] * n_long
    match_count = 0

    for pos, item in enumerate(shorter):
        lo = max(pos - window, 0)
        hi = min(pos + window, n_long - 1)
        for k in range(lo, hi + 1):
            if long_flags[k]:
                continue
            if item == longer[k]:
                # Greedily claim the first free partner inside the window.
                short_flags[pos] = long_flags[k] = True
                match_count += 1
                break

    if match_count == 0:
        print("no matches")
        print(shorter)
        print(longer)
        return 0, 0

    # Walk both sequences' matched elements in order; every position where the
    # two disagree is one "half transposition".
    matched_short = (shorter[k] for k, hit in enumerate(short_flags) if hit)
    matched_long = (longer[k] for k, hit in enumerate(long_flags) if hit)
    half_transposes = sum(1 for a, b in zip(matched_short, matched_long) if a != b)

    similarity = (
        match_count / n_short
        + match_count / n_long
        + (match_count - half_transposes // 2) / match_count
        ) / 3
    return similarity, half_transposes

In [117]:
# Evaluation inputs for this run.
# gold sentences: constructed "goal" files, ideally based on the source project but typically based on the target project
# predicted sentences: translated output, based on the source project
# Paratext project whose settings (stylesheet, encoding, versification, name) are used to parse both files below.
project_path = Path("test_S/Paratext/projects/DHH94")
# For tpi_aps, the difference between the goal file and the original target is that embeds are moved
# to the end of their ScriptureRefs.
# NOTE(review): the paths below point at spa_zpu files, so the tpi_aps remark may be left over
# from an earlier run -- confirm it still applies.
book = "PSA"
# Only segments whose chapter number is in this set are scored (the gold file only contains these chapters).
chapters = {9, 10}
gold_file_path = Path("zzz_USFM_eval/aa_gold_standard/spa_zpu_19PSAzpuAT_9_10.SFM")
pred_file_path = Path("zzz_USFM_eval/spa_zpu/19PSA_eflomal.SFM")

In [118]:
# Project settings supply the stylesheet, encoding, versification and project
# name used to parse both USFM files.
settings = FileParatextProjectSettingsParser(project_path).parse()


def _read_usfm(usfm_path):
    """Open one USFM file of `book` using the shared project settings.

    Both flags are passed as True exactly as the evaluation requires.
    NOTE(review): presumably they keep USFM markers in the segment text and
    include non-verse text -- confirm against the machine.py documentation.
    """
    return UsfmFileText(
        settings.stylesheet,
        settings.encoding,
        book,
        usfm_path,
        settings.versification,
        include_markers=True,
        include_all_text=True,
        project=settings.name,
    )


gold_file_text = _read_usfm(gold_file_path)
pred_file_text = _read_usfm(pred_file_path)

'''Preprocess'''
# The gold standard file only contains the chapters of interest, so each file
# is filtered on its own before the two are paired up.
ignore = ["r", "rem"]


def _is_scored(ref):
    """Return True when a segment with this reference takes part in the evaluation.

    A segment is dropped when the last element of its reference path is one of
    the `ignore` markers, or when its chapter is not in `chapters`.
    """
    in_ignored_marker = len(ref.path) > 0 and ref.path[-1].name in ignore
    return not in_ignored_marker and ref.chapter_num in chapters


# Capture (ref, text) while iterating so each segment is read exactly once.
gold_pairs = [(row.ref, row.text) for row in gold_file_text if _is_scored(row.ref)]
all_vrefs = [ref for ref, _ in gold_pairs]
gold_sents = [text for _, text in gold_pairs]

pred_pairs = [(row.ref, row.text) for row in pred_file_text if _is_scored(row.ref)]
pred_sents = [text for _, text in pred_pairs]
pv = [ref for ref, _ in pred_pairs]

# Markers whose content is an "embed" (notes, cross references, figures,
# alternate/published verse and chapter numbers) and is stripped before scoring.
alts = f"({'|'.join(['f', 'x', 'fig', 'va', 'vp', 'ca'])})"
# Matches one embed from its opening marker through its closing marker:
#   * `\b` stops a short name from matching the prefix of a longer marker
#     (e.g. `\x` inside `\xt`, `\f` inside `\fq`), which would otherwise start
#     a bogus embed and swallow text up to some unrelated closing marker;
#   * `\1` requires the closing marker to be the same as the opening one, so
#     an embed that contains a different closing marker is removed as a whole
#     rather than cut short.
embeds_re = re.compile(fr"\\{alts}\b.*?\\\1\*")

# assume gold and pred have the exact same ScriptureRefs, as they should be based off the same original file
# zip() below stops at the shorter input, so a length mismatch would silently
# drop segments and pair the rest with the wrong partner. Fail loudly instead
# of reporting metrics for misaligned data.
# NOTE(review): `pv` holds the pred refs; a per-segment comparison against
# `all_vrefs` would be a stronger check if ScriptureRef equality is reliable.
if len(gold_sents) != len(pred_sents):
    raise ValueError(
        f"gold has {len(gold_sents)} segments but pred has {len(pred_sents)}; "
        "they must align one-to-one before scoring"
    )

tokenizer = WhitespaceSlashTokenizer() # handles 'word\w' case
# tokenizer = LatinWordTokenizer()
vrefs = []           # refs of the segments that are scored
gold_sent_toks = []  # gold tokens per scored segment
pred_sent_toks = []  # pred tokens per scored segment
num_markers = [] # number of USFM markers at each vref
for ref, gs, ps in zip(all_vrefs, gold_sents, pred_sents):
    # discard embeds, as of now they will always be at the end so they will only inflate the metrics
    gs = embeds_re.sub("", gs)
    ps = embeds_re.sub("", ps)

    # Every marker starts with a backslash, so counting backslashes counts markers.
    marker_count = gs.count("\\")
    if marker_count == 0:
        # Only segments whose gold text still contains a marker are scored.
        continue

    vrefs.append(ref)
    gold_sent_toks.append(list(tokenizer.tokenize(gs)))
    pred_sent_toks.append(list(tokenizer.tokenize(ps)))
    num_markers.append(marker_count)

'''
Levenshtein distance
raw score: num of insertions/substitutions/deletions/transpositions
'''
# Token-level edit distance (transpositions allowed) for every scored segment.
dists = [
    nltk.edit_distance(gold_toks, pred_toks, transpositions=True)
    for gold_toks, pred_toks in zip(gold_sent_toks, pred_sent_toks)
]

# Normalised views of the same distances: per 10 gold tokens and per gold marker.
dists_per_10_tokens = [
    distance * 10 / len(gold_toks)
    for distance, gold_toks in zip(dists, gold_sent_toks)
]
dists_per_marker = [
    distance / marker_count
    for distance, marker_count in zip(dists, num_markers)
]

'''
Jaro similarity
raw score:
avg of:
1. % matches in s1 (m / len(s1))
2. % matches in s2 (m / len(s2))
3. % matches not transposed ((m-t)/m), t = # "half transposes" / 2

TODO: check symmetricality of Jaro similarity
'''
# jaro() returns (similarity, half_transposes) for one gold/pred token pair.
jaro_results = [
    jaro(gold_toks, pred_toks)
    for gold_toks, pred_toks in zip(gold_sent_toks, pred_sent_toks)
]
jaro_scores = [similarity for similarity, _ in jaro_results]
half_transposes = [half_count for _, half_count in jaro_results]

# Half-transposition counts normalised per 10 gold tokens and per gold marker.
ht_per_10_tokens = [
    half_count * 10 / len(gold_toks)
    for half_count, gold_toks in zip(half_transposes, gold_sent_toks)
]
ht_per_marker = [
    half_count / marker_count
    for half_count, marker_count in zip(half_transposes, num_markers)
]


no matches
['Salmo', '10', '(9b)']
['Biaschga,', 'q1', 'Xanaꞌ,', 'q1', 'bi', 'udegonoꞌ', 'dan', 'llun', 'beꞌnn', 'naꞌch', 'kaꞌ,', 'q1', 'ke', 'll‐lantegare,', 'q1', 'bwechga', 'kaꞌ', 'gonoꞌ', 'ke', "be'nn", "ka'", 'bi', 'nombiaꞌ', 'rwen,', 'q1', 'gan', 'lliw', 'llonoꞌ', 'yel', 'korchisen.']
no matches
['Yelwill-llo', 'Diozen', "nench'na", "gakrene'e", "lli'on"]
['Bechjga', 'ke', 'lleb', 'beꞌnn', 'ki,', 'q1', 'Xanaꞌ,', 'q1', 'benchga', 'ke', 'chejnieꞌ', 'beꞌnnki,', 'q1', 'leke', 'naken', 'nak', "ake'", 'zerawze', 'beꞌnn', 'naꞌche.']


In [119]:
# Corpus summary: marker counts and gold token counts of the scored segments.
summary_sent_lengths = [len(gold_toks) for gold_toks in gold_sent_toks]
summary_num_segments = len(vrefs)

print("number of markers", num_markers)
print("sent lengths", summary_sent_lengths)
print("avg markers", sum(num_markers) / summary_num_segments)
print("avg sent len", sum(summary_sent_lengths) / summary_num_segments)
print("\n")

# Per-segment edit-distance lists first, then one row of averages.
print("Edit distance")
print("edit distance", dists)
print("dist per 10 tokens", dists_per_10_tokens)
print("dist per marker", dists_per_marker)
print("\n")

# "avg ..." is the mean of the per-segment values; "overall avg ..." pools
# all segments before dividing (total distance over total tokens / markers).
ed_num_segments = len(vrefs)
ed_total_tokens = sum([len(gold_toks) for gold_toks in gold_sent_toks])
avg_dist = sum(dists) / ed_num_segments
avg_dist_per_10_tokens = sum(dists_per_10_tokens) / ed_num_segments
overall_dist_per_10_tokens = sum(dists) * 10 / ed_total_tokens
avg_dist_per_marker = sum(dists_per_marker) / ed_num_segments
overall_dist_per_marker = sum(dists) / sum(num_markers)

print("avg dist \
      avg dist per 10 tokens \
      overall avg dist per 10 tokens \
      avg dist per marker \
      overall avg dist per marker")
print(f"{avg_dist}\t \
      {avg_dist_per_10_tokens}\t \
      {overall_dist_per_10_tokens}\t \
      {avg_dist_per_marker}\t \
      {overall_dist_per_marker}")
print("\n")

# Per-segment Jaro lists first, then one row of averages.
# All "transposes" figures here are half-transposition counts as returned by jaro().
print("Jaro similarity")
print("jaro", jaro_scores)
print("half transposes", half_transposes)
# Label fixed: these values are normalised per 10 gold tokens, not per 10 markers.
print("transposes per 10 tokens", ht_per_10_tokens)
print("transposes per marker", ht_per_marker)
print("\n")

# "avg ..." is the mean of the per-segment values; "overall avg ..." pools
# all segments before dividing (total count over total tokens / markers).
jaro_num_segments = len(vrefs)
jaro_total_tokens = sum([len(gold_toks) for gold_toks in gold_sent_toks])
avg_jaro = sum(jaro_scores) / jaro_num_segments
avg_half_transposes = sum(half_transposes) / jaro_num_segments
avg_ht_per_10_tokens = sum(ht_per_10_tokens) / jaro_num_segments
overall_ht_per_10_tokens = sum(half_transposes) * 10 / jaro_total_tokens
avg_ht_per_marker = sum(ht_per_marker) / jaro_num_segments
overall_ht_per_marker = sum(half_transposes) / sum(num_markers)

print("avg jaro \
      avg transposes \
      avg t's per 10 tokens \
      overall avg t's per 10 tokens \
      avg t's per marker \
      overall avg t's per marker")
print(f"{avg_jaro}\t \
      {avg_half_transposes}\t \
      {avg_ht_per_10_tokens}\t \
      {overall_ht_per_10_tokens}\t \
      {avg_ht_per_marker}\t \
      {overall_ht_per_marker}")

number of markers [2, 3, 3, 2, 2, 4, 2, 1, 1, 4, 3, 2, 4, 4, 1, 2, 2, 1, 5, 3, 1, 2, 2, 2, 2, 1, 4, 2, 5, 3, 3, 3, 3, 2, 1]
sent lengths [19, 18, 19, 20, 22, 26, 17, 16, 16, 21, 20, 23, 23, 33, 30, 23, 17, 18, 30, 20, 13, 20, 18, 21, 21, 16, 26, 25, 36, 18, 16, 12, 38, 19, 18]
avg markers 2.4857142857142858
avg sent len 21.37142857142857


Edit distance
edit distance [18, 19, 21, 24, 20, 23, 16, 18, 17, 19, 21, 27, 30, 29, 28, 21, 23, 17, 30, 20, 17, 19, 20, 20, 21, 21, 30, 21, 33, 17, 15, 33, 33, 19, 25]
dist per 10 tokens [9.473684210526315, 10.555555555555555, 11.052631578947368, 12.0, 9.090909090909092, 8.846153846153847, 9.411764705882353, 11.25, 10.625, 9.047619047619047, 10.5, 11.73913043478261, 13.043478260869565, 8.787878787878787, 9.333333333333334, 9.130434782608695, 13.529411764705882, 9.444444444444445, 10.0, 10.0, 13.076923076923077, 9.5, 11.11111111111111, 9.523809523809524, 10.0, 13.125, 11.538461538461538, 8.4, 9.166666666666666, 9.444444444444445, 9.375, 27.5, 8.68421

In [None]:
# def get_sentences(book_path: Path, project_path: Optional[Path] = None) -> List[str]:
#     trg_settings = FileParatextProjectSettingsParser(book_path.parent if project_path is None else project_path).parse()
#     trg_file_text = UsfmFileText(
#         trg_settings.stylesheet,
#         trg_settings.encoding,
#         trg_settings.get_book_id(book_path.name),  # trg_settings.get_book_id(book_path.name)
#         book_path,
#         trg_settings.versification,
#         include_markers=True,
#         include_all_text=True,
#         project=trg_settings.name,
#     )
#     tokenizer = UsfmTokenizer(trg_settings.stylesheet)

#     trg_sents = []
#     for sent in trg_file_text:
#         if (len(sent.ref.path) > 0 and sent.ref.path[-1].name == "rem") or len(sent.text.strip()) == 0:
#             continue
#         trg_sents.append([""])

#         sent = sent.text.strip()
#         usfm_toks = tokenizer.tokenize(sent)
#         ignore_scope = None
#         for j, tok in enumerate(usfm_toks):
#             if ignore_scope is not None:
#                 if tok.type == UsfmTokenType.END and tok.marker[:-1] == ignore_scope.marker:
#                     ignore_scope = None
#             elif tok.type == UsfmTokenType.NOTE or (tok.type == UsfmTokenType.CHARACTER and tok.marker == "fig"):
#                 ignore_scope = tok
#             elif tok.type == UsfmTokenType.TEXT:
#                 trg_sents[-1][0] += tok.text

#     return trg_sents
