### KT sorting

In [None]:
from pathlib import Path
import silnlp.common.paratext
import json
from collections import defaultdict
from silnlp.common.corpus import load_corpus

# paths for generic Major KT lists
metadata_path = Path("silnlp/assets/Major-metadata.txt")
vrefs_path = Path("silnlp/assets/Major-vrefs.txt")

In [None]:
src_gloss_path = Path("silnlp/assets/fr-Major-glosses.txt") # en  fr
trg_gloss_path = Path("test_S/MT/terms/bcw-bcw_2024_02_21-Major-renderings.txt") # lmp-lmp_2024_02_16  bcw-bcw_2024_02_21
pair = "fr_bcw" # en_lmp  fr_bcw

proper_nouns = defaultdict(dict)
for i, (meta, vref, src_gloss, trg_gloss) in enumerate(zip(load_corpus(metadata_path), load_corpus(vrefs_path), load_corpus(src_gloss_path), load_corpus(trg_gloss_path))):
    term, pt_cat, sem_cat = meta.split("\t") # orig lang term, Paratext category (PN, FL, RE, etc.), semantic category (person, grasses, containers, etc.)
    instances = vref.split("\t") # all occurrences of the term
    src_glosses = src_gloss.split("\t") # all potential glosses for term
    trg_glosses = trg_gloss.split("\t")

    if pt_cat == "PN" and trg_glosses != [""]:
        proper_nouns[i]["glosses"] = (src_glosses, trg_glosses)
        proper_nouns[i]["instances"] = instances # might want to give this further structure, i.e. be a dict w/ book:chapter:[instances]

with open(f"zzz_PN_KTs/{pair}/KT_to_vrefs.json", "w", encoding="utf-8") as f:
    json.dump(proper_nouns, f, ensure_ascii=False, indent=4)

# Create verse-to-KTs dict
vref_to_KTs = defaultdict(list)
for i, pn_dict in proper_nouns.items():
    for vref in pn_dict["instances"]:
        vref_to_KTs[vref].append(i)
with open(f"zzz_PN_KTs/{pair}/vref_to_KTs.json", "w", encoding="utf-8") as f:
    json.dump(vref_to_KTs, f, ensure_ascii=False, indent=4)

### Fix KTs

In [None]:
from silnlp.common.corpus import load_corpus
from pathlib import Path
from machine.corpora import ScriptureRef
from silnlp.alignment.utils import compute_alignment_scores

pair = ""
book_name = "08RUT"
vrefs = [ScriptureRef.parse(ref) for ref in load_corpus(Path(f"zzz_PN_KTs/{pair}/{book_name}_vrefs.txt"))]
src_path = Path(f"zzz_PN_KTs/{pair}/{book_name}_src_sents.txt")
trg_path = Path(f"zzz_PN_KTs/{pair}/{book_name}_trg_sents.txt")

# always uses LatinWordTokenizer
sym_align_path = Path(f"zzz_PN_KTs/{pair}/{book_name}_sym-align.txt")
scores = compute_alignment_scores(src_path, trg_path, aligner_id="eflomal", sym_align_path=sym_align_path)

In [None]:
from machine.tokenization import LatinWordTokenizer
from machine.corpora import TextFileTextCorpus
from machine.scripture import VerseRef
import json
import nltk

# treat_apostrophe_as_single_quote=True didn't do anything
# confirmed that these have the same tokenization as the aligner (for Latin script)
# aligner uses LatinWordTokenizer + escape_spaces, nfc_normalize, lowercase from TextCorpus
src_lines = [line.segment for line in TextFileTextCorpus(src_path).tokenize(LatinWordTokenizer()).lowercase()]
trg_lines = [line.segment for line in TextFileTextCorpus(trg_path).tokenize(LatinWordTokenizer())]
src_lines_raw = load_corpus(src_path)
trg_lines_raw = load_corpus(trg_path)

align_lines = [[(lambda x: (int(x[0]), int(x[1])))(pair.split(":")[0].split("-")) for pair in line.split()] for line in load_corpus(sym_align_path)]

# # check for alignment coverage
# # not complete coverage, so can't assume anything about if specific words are aligned
# for i, (ref,src_line,trg_line,align_pairs) in enumerate(zip(vrefs, src_lines, trg_lines, align_lines)):
#     src_idxs = {pair[0] for pair in align_pairs}
#     trg_idxs = {pair[1] for pair in align_pairs}

#     print(i+1, ref)
#     print(f"unaligned SRC: {len(src_line) - len(src_idxs)}")
#     print(f"unaligned TRG: {len(trg_line) - len(trg_idxs)}")

pair = "en_lmp"
book = "RUT"
with open(f"zzz_PN_KTs/{pair}/vref_to_KTs.json", encoding="utf-8") as f:
    vref_to_KTs = json.load(f)
with open(f"zzz_PN_KTs/{pair}/KT_to_vrefs.json", encoding="utf-8") as f:
    KT_to_vrefs = json.load(f)

term_ids = set()
exp_vrefs = set()
for ref, ids in vref_to_KTs.items():
    if VerseRef.from_string(ref).book == book:
        term_ids.update(ids)
        exp_vrefs.add(ref)
src_terms = set()
trg_terms = set()
for id in term_ids:
    src_terms.update(KT_to_vrefs[str(id)]["glosses"][0])
    trg_terms.update(KT_to_vrefs[str(id)]["glosses"][1])
print(src_terms)

# found = defaultdict(list)
for ref,src_line,trg_line,align_pairs,trg_line_raw in zip(vrefs, src_lines, trg_lines, align_lines,trg_lines_raw):
    if str(ref.verse_ref) not in vref_to_KTs.keys():
        continue
    if ref.verse_num == 0 or ref.path[0].name != "": # the ScriptureRefs I'm testing with have an empty ScriptureElement in the path so is_verse doesn't work
        continue

    found = []
    for term_id in vref_to_KTs[str(ref.verse_ref)]:
        glosses = [gloss.lower() for gloss in KT_to_vrefs[str(term_id)]["glosses"][0]]
        min_dist = (0, 0, 100) # gloss idx of closest match, tok idx of closest match, distance
        for i, gloss in enumerate(glosses): # could adjust this to look at n-grams, where n is the number of words in the gloss
            for j, tok in enumerate(src_line):
                if (j, term_id) in found:
                    continue
                dist = nltk.edit_distance(gloss, tok) / len(tok)
                if dist < min_dist[2]:
                    min_dist = (i, j, dist)
        # print(glosses[min_dist[0]], src_line[min_dist[1]], min_dist[2])
        if min_dist[2] < .3:
            found.append((min_dist[1], term_id))
    # print(ref)
    # print(len(found), len(vref_to_KTs[str(ref.verse_ref)]))
    # print(found)
    # print(vref_to_KTs[str(ref.verse_ref)])
    # print(src_line)

    # replace word(s) in target text
    for src_idx, term_id in found:
        trg_idxs = [pair[1] for pair in align_pairs if pair[0] == src_idx]
        print(src_idx, trg_idxs)
        print(src_line[src_idx], [trg_line[idx] for idx in trg_idxs])
        print(trg_line_raw)
        print(trg_line)
        print("\n")


### Evaluation -- no good
* score orig --> no inline markers
* score orig --> only para markers

In [None]:
import sacrebleu
from pathlib import Path

out = list(load_corpus(Path("")))
ref = [list(load_corpus(Path("")))]

bleu = sacrebleu.corpus_bleu(out, ref, lowercase=True).score
spbleu = sacrebleu.corpus_bleu(out, ref, lowercase=True, tokenize="flores200").score
chrf = sacrebleu.corpus_chrf(out, ref, char_order=6, beta=3, remove_whitespace=True).score
chrfp = sacrebleu.corpus_chrf(out, ref, char_order=6, beta=3, word_order=1, remove_whitespace=True, eps_smoothing=True).score
chrfpp = sacrebleu.corpus_chrf(out, ref, char_order=6, beta=3, word_order=2, remove_whitespace=True, eps_smoothing=True).score
print(bleu, spbleu, chrf, chrfp, chrfpp)

### Construct Goal Files

In [None]:
from machine.corpora import (
    FileParatextProjectSettingsParser, 
    UsfmFileText, 
    UpdateUsfmParserHandler, 
    UsfmTokenizer, 
    UsfmTokenType, 
    parse_usfm, 
    UsfmParserState,
    UpdateUsfmBehavior
    )
from pathlib import Path

class ParagraphUpdateUsfmParserHandler(UpdateUsfmParserHandler):
    def _collect_tokens(self, state: UsfmParserState) -> None:
        self._tokens.extend(self._new_tokens)
        self._new_tokens.clear()
        while self._token_index <= state.index + state.special_token_count:
            if state.tokens[self._token_index].type == UsfmTokenType.PARAGRAPH and state.tokens[self._token_index].marker != "rem":
                num_text = 0
                rem_offset = 0
                for i in range(len(self._tokens) - 1, -1, -1):
                    if self._tokens[i].type == UsfmTokenType.TEXT:
                        num_text += 1
                    elif self._tokens[i].type == UsfmTokenType.PARAGRAPH and self._tokens[i].marker == "rem":
                        rem_offset += num_text + 1
                        num_text = 0
                    else:
                        break
                if num_text >= 2:
                    self._tokens.insert(-(rem_offset + num_text - 1), state.tokens[self._token_index])
                    self._token_index += 1
                    break # should this be continue instead? what situations are there where 
            self._tokens.append(state.tokens[self._token_index])
            self._token_index += 1

pair = "spa_zpu"
src_project = "DHH94"
src_file_suffix = "DHH94"
trg_project = "zpuAT_2025_01_15"
trg_file_suffix = "zpuAT"

book = "PSA"
book_name = f"19{book}"
trg_file_path = Path(f"test_S/Paratext/projects/{trg_project}/{book_name}{trg_file_suffix}.SFM")
out_file_path = Path(f"zzz_USFM/{pair}/{book}/{book_name}{trg_file_suffix}_goal.SFM")
trg_settings = FileParatextProjectSettingsParser(trg_file_path.parent).parse()
trg_file_text = UsfmFileText(
    trg_settings.stylesheet,
    trg_settings.encoding,
    trg_settings.get_book_id(trg_file_path.name),
    trg_file_path,
    trg_settings.versification,
    include_markers=True,
    include_all_text=True,
    project=trg_settings.name,
)

tokenizer = UsfmTokenizer(trg_settings.stylesheet)
sentence_toks = []
vrefs = []
for sent in trg_file_text:
    toks = tokenizer.tokenize(sent.text.strip())
    if len(toks) > 0:
        sentence_toks.append(toks)
        vrefs.append(sent.ref)

to_delete = ["fig"]
out_toks = []
for i, (toks, ref) in enumerate(zip(sentence_toks, vrefs)):
    out_toks.append([""])
    ignore_scope = None
    for j, tok in enumerate(toks):
        if ignore_scope is not None:
            if tok.type == UsfmTokenType.END and tok.marker[:-1] == ignore_scope.marker:
                ignore_scope = None
        elif tok.type == UsfmTokenType.NOTE or (tok.type == UsfmTokenType.CHARACTER and tok.marker in to_delete):
            ignore_scope = tok
        elif tok.type == UsfmTokenType.PARAGRAPH:
            out_toks[-1].append("")
        elif tok.type in [UsfmTokenType.TEXT, UsfmTokenType.CHARACTER, UsfmTokenType.END]:
            out_toks[-1][-1] += tok.to_usfm()

translated_rows = []
for ref, sent in zip(vrefs, out_toks):
    for segment in sent:
        translated_rows.append((ref, segment))

# Get note-type segments from src project
src_file_path = Path(f"test_S/Paratext/projects/{src_project}/{book_name}{src_file_suffix}.SFM")
src_settings = FileParatextProjectSettingsParser(src_file_path.parent).parse()
src_file_text = UsfmFileText(
    src_settings.stylesheet,
    src_settings.encoding,
    src_settings.get_book_id(src_file_path.name),
    src_file_path,
    src_settings.versification,
    include_markers=True,
    include_all_text=True,
    project=src_settings.name,
)
tokenizer = UsfmTokenizer(src_settings.stylesheet)
sentence_toks = [tokenizer.tokenize(sent.text.strip()) for sent in src_file_text]
vrefs = [s.ref for s in src_file_text]
ignored_segments = []
for i, (toks, ref) in enumerate(zip(sentence_toks, vrefs)):
    ignored_segment = ""
    ignore_scope = None
    for j, tok in enumerate(toks):
        if ignore_scope is not None:
            ignored_segment += tok.to_usfm()
            if tok.type == UsfmTokenType.END and tok.marker[:-1] == ignore_scope.marker:
                ignored_segments.append((ref, ignored_segment))
                ignored_segment = ""
                ignore_scope = None
        elif tok.type == UsfmTokenType.NOTE or (tok.type == UsfmTokenType.CHARACTER and tok.marker in to_delete):
            ignored_segment += tok.to_usfm()
            ignore_scope = tok

# Add any note-type segments back to the ends of their verses
past = 0 # number of ignored segments added as their own row
append = 0
segment_idx = 0
rows = []
for i, (ref, row_text) in enumerate(translated_rows):
    # insert into new row, is this possible for the real scenario, i.e. everything coming from the same project?
    while segment_idx < len(ignored_segments) and ignored_segments[segment_idx][0] < ref:
        rows.append(([ignored_segments[segment_idx][0]], ignored_segments[segment_idx][1]))
        segment_idx += 1
        past += 1
    # if inserting into a current row, it should only happen in the last row for each ScriptureRef
    if i == len(translated_rows) - 1 or translated_rows[i + 1][0] != ref:
        while (segment_idx < len(ignored_segments) and ignored_segments[segment_idx][0] == ref):
            row_text += ignored_segments[segment_idx][1]
            segment_idx += 1
            append += 1
    rows.append(([ref], row_text))
# add any remaining ignored segments
for segment in ignored_segments[segment_idx:]:
    rows.append(([segment[0]], segment[1]))
print(past, append)

'''Update file and write out'''
with open(trg_file_path, encoding=trg_settings.encoding) as f:
    usfm = f.read()
handler = ParagraphUpdateUsfmParserHandler(rows, behavior=UpdateUsfmBehavior.PREFER_NEW)
parse_usfm(usfm, handler, trg_settings.stylesheet, trg_settings.versification, preserve_whitespace=False)
usfm_out = handler.get_usfm(trg_settings.stylesheet)
with out_file_path.open("w", encoding=trg_settings.encoding) as f:
    f.write(usfm_out)



Find vref differences

In [1]:
from pathlib import Path

src_fpath = Path("test_S/Paratext/projects/DHH94/19PSADHH94.SFM")
trg_fpath = Path("test_S/Paratext/projects/zpuAT_2025_01_15/19PSAzpuAT.SFM")
src_out = Path("vrefs_src.txt")
trg_out = Path("vrefs_trg.txt")

# cp1252
with src_fpath.open(encoding="utf-8-sig") as f, src_out.open("w") as out:
    out.writelines([line.split(" ")[0].strip() + "\n" for line in f])
with trg_fpath.open(encoding="utf-8-sig") as f, trg_out.open("w") as out:
    out.writelines([line.split(" ")[0].strip() + "\n" for line in f])

Find vref differences -- machine.py

In [3]:
from pathlib import Path
from machine.corpora import FileParatextProjectSettingsParser, UsfmFileText

src_fpath = Path("test_S/Paratext/projects/DHH94/19PSADHH94.SFM")
trg_fpath = Path("test_S/Paratext/projects/zpuAT_2025_01_15/19PSAzpuAT.SFM")
src_out = Path("vrefs_src.txt")
trg_out = Path("vrefs_trg.txt")

src_settings = FileParatextProjectSettingsParser(src_fpath.parent).parse()
src_file_text = UsfmFileText(
    src_settings.stylesheet,
    src_settings.encoding,
    src_settings.get_book_id(src_fpath.name),
    src_fpath,
    src_settings.versification,
    include_markers=True,
    include_all_text=True,
    project=src_settings.name,
)
with src_out.open("w") as f:
    for sent in src_file_text:
        f.write(f"{sent.ref}\n")

trg_settings = FileParatextProjectSettingsParser(trg_fpath.parent).parse()
trg_file_text = UsfmFileText(
    trg_settings.stylesheet,
    trg_settings.encoding,
    trg_settings.get_book_id(trg_fpath.name),
    trg_fpath,
    trg_settings.versification,
    include_markers=True,
    include_all_text=True,
    project=trg_settings.name,
)
with trg_out.open("w") as f:
    for sent in trg_file_text:
        f.write(f"{sent.ref}\n")

### Print out all paragraph and character markers for a book
To use, set book, fpath, and out_path. fpath should be a path to a book in a Paratext project

In [30]:
from pathlib import Path
from machine.corpora import FileParatextProjectSettingsParser, UsfmFileText, UsfmTokenizer, UsfmTokenType

# this assumes fpath is a book in a Paratext project folder
src_fpath = Path("test_S/Paratext/projects/msSMBv0_2024_10_24/43LUKmsSMBv0.SFM")
trg_fpath = Path("test_S/Paratext/projects/NIrV/43LUKusNIRV14.SFM")
src_out = Path("markers_src.txt")
trg_out = Path("markers_trg.txt")

# file 1
settings = FileParatextProjectSettingsParser(src_fpath.parent).parse()
file_text = UsfmFileText(
    settings.stylesheet,
    settings.encoding,
    "",
    src_fpath,
    settings.versification,
    include_markers=True,
    include_all_text=True,
    project=settings.name,
)

to_delete = ["fig", "va", "vp"]
vrefs = []
usfm_markers = []
usfm_tokenizer = UsfmTokenizer(settings.stylesheet)
for sent in file_text:
    if len(sent.ref.path) > 0 and sent.ref.path[-1].name == "rem":
        continue

    vrefs.append(sent.ref)
    usfm_markers.append([])
    usfm_toks = usfm_tokenizer.tokenize(sent.text.strip())
    
    ignore_scope = None
    for j, tok in enumerate(usfm_toks):
        if ignore_scope is not None:
            if tok.type == UsfmTokenType.END and tok.marker[:-1] == ignore_scope.marker:
                ignore_scope = None
        elif tok.type == UsfmTokenType.NOTE or (tok.type == UsfmTokenType.CHARACTER and tok.marker in to_delete):
            ignore_scope = tok
        elif tok.type in [UsfmTokenType.PARAGRAPH, UsfmTokenType.CHARACTER, UsfmTokenType.END]:
            usfm_markers[-1].append(tok.marker)

with src_out.open("w", encoding=settings.encoding) as f:
    for ref, markers in zip(vrefs, usfm_markers):
        f.write(f"{ref} {markers}\n")

# file 2
settings = FileParatextProjectSettingsParser(trg_fpath.parent).parse()
file_text = UsfmFileText(
    settings.stylesheet,
    settings.encoding,
    "",
    trg_fpath,
    settings.versification,
    include_markers=True,
    include_all_text=True,
    project=settings.name,
)

vrefs = []
usfm_markers = []
usfm_tokenizer = UsfmTokenizer(settings.stylesheet)
for sent in file_text:
    if len(sent.ref.path) > 0 and sent.ref.path[-1].name == "rem":
        continue

    vrefs.append(sent.ref)
    usfm_markers.append([])
    usfm_toks = usfm_tokenizer.tokenize(sent.text.strip())
    
    ignore_scope = None
    for j, tok in enumerate(usfm_toks):
        if ignore_scope is not None:
            if tok.type == UsfmTokenType.END and tok.marker[:-1] == ignore_scope.marker:
                ignore_scope = None
        elif tok.type == UsfmTokenType.NOTE or (tok.type == UsfmTokenType.CHARACTER and tok.marker in to_delete):
            ignore_scope = tok
        elif tok.type in [UsfmTokenType.PARAGRAPH, UsfmTokenType.CHARACTER, UsfmTokenType.END]:
            usfm_markers[-1].append(tok.marker)

with trg_out.open("w", encoding=settings.encoding) as f:
    for ref, markers in zip(vrefs, usfm_markers):
        f.write(f"{ref} {markers}\n")

### make table of style/text types of usfm markers

In [20]:
# not covered: ChapterNumber
# not present: BackTranslation, TranslationNote
text_types = ["none", "Title", "Section", "VerseText", "NoteText", "Other", "BackTranslation", "TranslationNote"]

# not present: End, MilestoneEnd
style_types = ["none", "Character", "Note", "Paragraph", "End", "Milestone", "MilestoneEnd"]

matrix = [[[] for _ in style_types] for _ in text_types]
with open("usfm.sty") as f:
    usfm = [l.strip() for l in f.readlines()]

marker = ""
tt = 0
st = 0
for line in usfm:
    if line.startswith("\\Marker"):
        if len(marker) > 0:
            matrix[tt][st].append(marker)
        marker = line.split()[1]
    if line.startswith("\\TextType"):
        try:
            tt = text_types.index(line.split()[1])
        except:
            tt = 0
    if line.startswith("\\StyleType"):
        try:
            st = style_types.index(line.split()[1])
        except:
            st = 0

print("\t" + "\t".join(style_types))
for i, tt in enumerate(matrix):
    print(text_types[i] + "\t" + "\t".join([",".join(st) for st in tt]))

	none	Character	Note	Paragraph	End	Milestone	MilestoneEnd
none		v		imt2,c			
Title				mt,mt1,mt2,mt3,mt4,mte,mte1,mte2			
Section				ms,ms1,ms2,ms3,mr,s,s1,s2,s3,s4,sr,r,sp,sd,sd1,sd2,sd3,sd4,periph			
VerseText	b,li,li1,li2,li3,li4,lim,lim1,lim2,lim3,lim4,pub,toc,pref,intro,conc,glo,idx,maps,cov,spine	qs,th1,th2,th3,th4,th5,tc1,tc2,tc3,tc4,tc5,thc1,thc2,thc3,thc4,thc5,tcc1,tcc2,tcc3,tcc4,tcc5,thr1,thr2,thr3,thr4,thr5,tcr1,tcr2,tcr3,tcr4,tcr5,litl,lik,liv,liv1,liv2,liv3,liv4,liv5,qt,nd,tl,dc,bk,sig,pn,png,addpn,wj,k,sls,ord,add,rb,w,wh,wg,wa,ndx,wr		p,m,po,pr,cls,pmo,pm,pmc,pmr,pi,pi1,pi2,pi3,pc,mi,nb,q,q1,q2,q3,q4,qc,qr,qm,qm1,qm2,qm3,qd,d,tr,lh,lf,ph,ph1,ph2,ph3,phi,tr1,tr2,ps,psi			
NoteText		fr,ft,fk,fq,fqa,fl,fw,fp,fv,fdc,xo,xop,xt,xta,xk,xq,xot,xnt,xdc,xtSee,fs	f,fe,x				
Other	io,io1,io2,io3,io4,ili,ili1,ili2,ib,ie	ior,iqt,ca,va,vp,qac,fm,rq,no,it,bd,bdit,em,sc,sup,fig,jmp,pro,xtSeeAlso,zpa-xb,zpa-xc,zpa-xv		id,usfm,ide,h,h1,h2,h3,toc1,toc2,toc3,toca1,toca2,toca3,rem,sts,restore