### Alignment w/ more aligning data

In [None]:
from silnlp.common.corpus import load_corpus, write_corpus
from silnlp.alignment.config import get_aligner
from silnlp.alignment.machine_aligner import MachineAligner
from typing import List, Union
import tempfile
from pathlib import Path
from machine.tokenization import LatinWordTokenizer
from machine.corpora import escape_spaces, nfc_normalize, lowercase

# Tokenize and normalize sentences in the same way as the normal alignment process
def tokenize_for_alignment(sents: List[Union[str, List[str]]]) -> List[str]:
    """Turn sentences into space-joined, normalized token strings.

    Args:
        sents: Either raw sentence strings (tokenized here with
            ``LatinWordTokenizer``) or sentences that are already lists of
            tokens. The kind is decided from the first element.

    Returns:
        One string per input sentence: its tokens after ``escape_spaces``,
        ``nfc_normalize`` and ``lowercase``, joined with single spaces.
        An empty input yields an empty list.
    """
    # Guard the sents[0] probe below; an empty corpus is valid input.
    if not sents:
        return []
    if isinstance(sents[0], str):
        tokenizer = LatinWordTokenizer()
        sents = [tokenizer.tokenize(sent) for sent in sents]
    # Each processor maps a token sequence to a token sequence, so they can be chained per sentence.
    return [" ".join(lowercase(nfc_normalize(escape_spaces(toks)))) for toks in sents]

def align_sents(src_sents: List[str],
                trg_sents: List[str],
                aligner_id: str = "hmm",
                sym_align_path: Path = None,
                extra_train_data_src: Path = None,
                extra_train_data_trg: Path = None
                ) -> None:
    """Train an aligner and force-align ``src_sents`` with ``trg_sents``.

    All intermediate corpora are written to a temporary directory that is
    removed when the function returns. The aligner is trained on the sentences
    to align plus, when BOTH extra paths are given, the corpora loaded from
    them. The alignment itself is written by ``force_align`` to
    ``sym_align_path``.
    """
    with tempfile.TemporaryDirectory() as tmp_name:
        work_dir = Path(tmp_name)

        # Tokenization happens before normalization in the normal alignment process,
        # so normalization can be deferred until this point.
        align_src = tokenize_for_alignment(src_sents)
        align_trg = tokenize_for_alignment(trg_sents)

        # Corpora that will actually be aligned
        align_src_path = work_dir / "align.src.txt"
        align_trg_path = work_dir / "align.trg.txt"
        write_corpus(align_src_path, align_src)
        write_corpus(align_trg_path, align_trg)

        # Training corpora: the sentences to align, optionally followed by the extra data
        train_src, train_trg = align_src, align_trg
        if extra_train_data_src and extra_train_data_trg:
            train_src = train_src + tokenize_for_alignment(list(load_corpus(extra_train_data_src)))
            train_trg = train_trg + tokenize_for_alignment(list(load_corpus(extra_train_data_trg)))
        train_src_path = work_dir / "train.src.txt"
        train_trg_path = work_dir / "train.trg.txt"
        write_corpus(train_src_path, train_src)
        write_corpus(train_trg_path, train_trg)

        # Train on the training corpora, then align only the requested sentences
        aligner: MachineAligner = get_aligner(aligner_id, work_dir)
        aligner.train(train_src_path, train_trg_path)
        aligner.force_align(align_src_path, align_trg_path, sym_align_path)

### USFM marker preservation
* Extract footnotes and put them at the end
* Extract each instance of a marker and record its index
* Tokenize source sentences and match each marker to surrounding tokens based on their original indices
* Train aligner on all training data + translation, align translation to source
* Reinsert marker instances

In [None]:
from pathlib import Path

'''Define project values'''
# Placeholders to fill in before running the cells below.
# `pair` names the working directory under zzz_PN_KTs/; `project` / `trg_project` are the
# project folder names under test_S/Paratext/projects/; the suffixes are appended to the
# book file name (before ".SFM") of the source and target project respectively.
pair = ""
project = ""
file_suffix = ""
trg_project = ""
trg_file_suffix = ""

book = "JHN"
# NOTE(review): the "44" prefix is hard-coded — presumably the Paratext file number for JHN; update it together with `book`.
book_name = f"44{book}"
src_fpath = Path(f"test_S/Paratext/projects/{project}/{book_name}{file_suffix}.SFM")
# Aligner id; later cells branch on whether this is "eflomal" when parsing the alignment file.
aligner = "eflomal"
pair_dir = Path(f"zzz_PN_KTs/{pair}")
# Alignment file and output USFM both live in <pair_dir>/<book>/.
align_path = pair_dir / book / f"{book_name}_sym-align_{aligner}.txt"
out_fpath = pair_dir / book / f"{book_name}{trg_file_suffix}_out.SFM"

In [None]:
from machine.corpora import FileParatextProjectSettingsParser, UsfmFileText, ScriptureRef
from machine.tokenization import LatinWordTokenizer

# Load the source book with its markers kept inline in the row text
src_settings = FileParatextProjectSettingsParser(src_fpath.parent).parse()
src_file_text = UsfmFileText(
    src_settings.stylesheet,
    src_settings.encoding,
    book,
    src_fpath,
    src_settings.versification,
    # Author's notes on the (include_markers / include_all_text) combinations:
    # F/T gives notes their own rows, F/F gives just the main text,
    # T/F gives one ref per verse and all markers are inline
    include_markers=True,
    # T/T includes all intro and section titles (as does F/T), all other notes/markers inline
    include_all_text=True,
    project=src_settings.name,
)

# Collect the stripped text and reference of every row, skipping rows whose
# innermost path element is a "rem" marker
sentences, vrefs = [], []
for row in src_file_text:
    ref_path = row.ref.path
    is_rem_row = len(ref_path) > 0 and ref_path[-1].name == "rem"
    if not is_rem_row:
        sentences.append(row.text.strip())
        vrefs.append(row.ref)

# orig_sents = sentences.copy()
# After all the markers are removed, these lists will be a mapping of the indices of the remaining characters to their original indices
# orig_indices = [[i for i in range(len(sent))] for sent in sentences]

for ref, sent in zip(vrefs, sentences):
    print(ref, sent)

In [None]:
# '''Extract inline notes'''
# note_markers = ["f", "fm", "rq", "xtSeeAlso"] # + all in stylesheet w/ TextType:NoteText
# INLINE_NOTE = re.compile(r"\\({}) (.*?)\\\1\*".format("|".join(note_markers)))
# inline_notes = []
# for i, (ref, sent) in enumerate(zip(vrefs, sentences)):
#     idxs_to_remove = []
#     for match in INLINE_NOTE.finditer(sent):
#         marker, content = match.groups()
#         # Record index and length of note
#         idxs_to_remove += [j for j in range(match.start() + 1, match.start() + len(match[0]))]  # +1 from replacing the match w/ a space
#         # Record original index and segment info. At this point, orig_indices[i][match.start()] == match.start()
#         inline_notes.append((i, orig_indices[i][match.start()], len(match[0]), ref, marker, content))
#         sentences[i] = sentences[i].replace(match[0], " ", 1)
#     for idx in reversed(idxs_to_remove):
#         orig_indices[i].pop(idx)
# # Add notes to the list of sentences so they're translated separately
# for _, _, _, ref, marker, content in inline_notes:
#     # as long as the vref is right, can put at the end of the list so it doesn't mess with indices of the "index recovery" alg
#     sentences.append(content)
#     vrefs.append(ref)  # TODO: update ref for new sent
#     # seen_notes.append([])
#     orig_indices.append([i for i in range(len(content))])


# '''Extract all instances of markers'''
# INLINE_MARKER = re.compile(r"\\(\S+?(?:\*| ))")
# inline_markers = []
# for i, (ref, sent) in enumerate(zip(vrefs, sentences)):
#     idxs_to_remove = []
#     for match in INLINE_MARKER.finditer(sent):
#         marker, = match.groups()
#         # Record index and length of markers
#         idxs_to_remove += [j for j in range(match.start() + 1, match.start() + len(match[0]))] # +1 from replacing the match w/ a space
#         # Record original index and segment info
#         inline_markers.append((i, orig_indices[i][match.start()], len(match[0]), ref, match[0])) # not all of this is necessary
#         sentences[i] = sentences[i].replace(match[0], " ", 1)
#     for idx in reversed(idxs_to_remove):
#         orig_indices[i].pop(idx)

# print("sent_idx, orig_idx, len, ref, marker, content")
# for i in range(min(5,len(inline_notes))):
#     print(inline_notes[i])
# print("sent_idx, orig_idx, len, ref, marker")
# for i in range(min(5,len(inline_markers))):
#     print(inline_markers[i])

### Only deal with paragraph markers

In [None]:
from machine.corpora import UsfmTokenizer, UsfmTokenType

'''Parse sentences'''
tokenizer = UsfmTokenizer(src_settings.stylesheet)
sentence_toks = [tokenizer.tokenize(sent) for sent in sentences]

# Character markers whose whole scope (marker through matching end marker) is dropped
to_delete = ["fig"]
# (sentence index, character offset into the text-only sentence, marker USFM)
inline_markers = []
# markers_by_verse = [[] for _ in sentence_toks]
text_only_sents = [""] * len(sentence_toks)
sentence_text_toks = [[] for _ in sentence_toks]
for sent_idx, usfm_toks in enumerate(sentence_toks):
    ignore_scope = None
    for tok in usfm_toks:  # POSSIBLE TYPES: TEXT, PARAGRAPH, CHARACTER, NOTE, END
        if ignore_scope is not None:
            # Inside a note / deleted character scope: skip everything up to the matching end marker
            if tok.type == UsfmTokenType.END and tok.marker[:-1] == ignore_scope.marker:
                ignore_scope = None
            continue

        opens_ignored_scope = tok.type == UsfmTokenType.NOTE or (
            tok.type == UsfmTokenType.CHARACTER and tok.marker in to_delete
        )
        if opens_ignored_scope:
            ignore_scope = tok
            continue

        if tok.type == UsfmTokenType.PARAGRAPH:
            # len of text so far == start idx of next text tok; other markers can't break this
            # because the idx is relative to the text tokens only
            marker_offset = len(text_only_sents[sent_idx])
            inline_markers.append((sent_idx, marker_offset, tok.to_usfm()))
            # markers_by_verse[i].append(tok.to_usfm())
        elif tok.type == UsfmTokenType.TEXT:
            text_only_sents[sent_idx] += tok.text
            sentence_text_toks[sent_idx].append(tok)
        elif tok.type not in (UsfmTokenType.CHARACTER, UsfmTokenType.END):
            # Surface any token type that is not handled above
            print(tok.type, tok)

print("sent_idx, orig_idx, marker")
for marker in inline_markers:
    print(marker)
# for ref, markers in zip(vrefs, markers_by_verse):
#     if len(markers) > 0:
#         print(ref, markers)
# for ref, sent in zip(vrefs, text_only_sents):
#     print(ref, sent)

In [None]:
from machine.corpora import FileParatextProjectSettingsParser, UsfmFileText
from pathlib import Path

'''Translate sentences (use target sentences for now)'''

# Get target file and remove all markers
trg_file_path = Path(f"test_S/Paratext/projects/{trg_project}/{book_name}{trg_file_suffix}.SFM")
trg_settings = FileParatextProjectSettingsParser(trg_file_path.parent).parse()
trg_file_text = UsfmFileText(
    trg_settings.stylesheet,
    trg_settings.encoding,
    trg_settings.get_book_id(trg_file_path.name),
    trg_file_path,
    trg_settings.versification,
    include_markers=True,
    include_all_text=True,
    project=trg_settings.name,
)
tokenizer = UsfmTokenizer(trg_settings.stylesheet)
trg_sents = []
for row in trg_file_text:
    ref_path = row.ref.path
    if len(ref_path) > 0 and ref_path[-1].name == "rem":
        continue

    # Keep only TEXT tokens that are outside notes and outside to_delete character scopes
    text_parts = []
    ignore_scope = None
    for tok in tokenizer.tokenize(row.text.strip()):  # POSSIBLE TYPES: TEXT, PARAGRAPH, CHARACTER, NOTE, END
        if ignore_scope is not None:
            if tok.type == UsfmTokenType.END and tok.marker[:-1] == ignore_scope.marker:
                ignore_scope = None
            continue
        if tok.type == UsfmTokenType.NOTE or (tok.type == UsfmTokenType.CHARACTER and tok.marker in to_delete):
            ignore_scope = tok
            continue
        if tok.type == UsfmTokenType.TEXT:
            text_parts.append(tok.text)
    trg_sents.append("".join(text_parts))

for ref, sent in zip(vrefs, trg_sents):
    print(ref, sent)

In [None]:
from typing import Tuple, List

'''Tokenize sentences'''
tokenizer = LatinWordTokenizer()


def _ranges_and_tokens(sents):
    """Return (per-sentence token ranges, per-sentence token strings) for ``sents``."""
    all_ranges, all_toks = [], []
    for sent in sents:
        sent_ranges = list(tokenizer.tokenize_as_ranges(sent))
        all_ranges.append(sent_ranges)
        all_toks.append([sent[r.start : r.end] for r in sent_ranges])
    return all_ranges, all_toks


# TODO: force ranges to not cross boundaries by tokenizing each text token separately? or bound the alignment range some other way based on surrounding markers
# is there a simpler way to force markers to be reinserted in the same relative order?
word_tok_ranges, toks = _ranges_and_tokens(text_only_sents)

trg_word_tok_ranges, trg_toks = _ranges_and_tokens(trg_sents)

'''Match markers to the token closest to them'''
# TODO: need to disambiguate the order of markers in the case where there are multiple markers in a row
def get_toks_after_sequences(sequences: List[Tuple], tok_ranges: List[list] = None) -> List[int]:
    """Map each marker to the index of the word token it is attached to.

    The returned indices refer to the tokens of the marker-free sentences; the
    lookup uses character offsets into those same marker-free sentences.

    Args:
        sequences: Marker tuples whose first two elements are the sentence
            index and the character offset of the marker.
        tok_ranges: Per-sentence lists of token ranges (objects with a
            ``start`` attribute). Defaults to the module-level
            ``word_tok_ranges``.

    Returns:
        Exactly one token index per input sequence: the last token whose start
        is at or before the marker offset, or 0 when there is no such token
        (including when the sentence has no tokens at all).
    """
    if tok_ranges is None:
        tok_ranges = word_tok_ranges

    toks_after_seqs = []
    for sequence in sequences:
        sent_idx, start_idx = sequence[0], sequence[1]  # start_idx is relative to the text-only sentence
        sent_ranges = tok_ranges[sent_idx]
        # Default to 0 so that every marker gets an entry, even in a sentence without tokens;
        # otherwise the result would fall out of step with `sequences`.
        tok_idx = 0
        # this works fine but there's still risk of splitting words if the token goes across paragraph boundaries
        # this can be fixed by forcing tok boundaries to stay within usfm text toks
        for i in range(len(sent_ranges) - 1, 0, -1):
            if sent_ranges[i].start <= start_idx:
                tok_idx = i
                break
        toks_after_seqs.append(tok_idx)

    return toks_after_seqs
# For every entry of inline_markers, the index of the source word token it is attached to
toks_after_markers = get_toks_after_sequences(inline_markers)

'''Test alg'''
# The string below is disabled code (a bare string literal, never executed): it reinserts the
# markers into the source text itself and writes the result out, as a round-trip check.
'''out_sents = text_only_sents.copy()
for i in reversed(range(len(inline_markers))):
    sent_idx = inline_markers[i][0]
    insert_idx = word_tok_ranges[sent_idx][toks_after_markers[i]].start
    out_sents[sent_idx] = out_sents[sent_idx][:insert_idx] + inline_markers[i][2] + out_sents[sent_idx][insert_idx:]
rows = [([ref], sent) for ref, sent in zip(vrefs, out_sents)]
dest_updater = FileParatextProjectTextUpdater(src_fpath.parent)
usfm_out = dest_updater.update_usfm(
    src_file_text.id, rows, strip_all_text=True, prefer_existing_text=False
)
with open(f"zzz_PN_KTs/{pair}/{book}/test_para_idxs.SFM", "w", encoding=src_settings.encoding) as f:
    f.write(usfm_out)'''

# Debug output: token ranges and tokens of the first source/target sentence, then all marker token indices
print(word_tok_ranges[0])
print(toks[0])
print(trg_word_tok_ranges[0])
print(trg_toks[0])
print(toks_after_markers)
# for i, (sent_idx, _, marker) in enumerate(inline_markers):
#     out = toks[i][max(0,toks_after_markers[i]-3):min(len(toks[i]),toks_after_markers[i])]
#     if len(out) == 0:
#         print(inline_markers[i])
#     else:
#         print(out)

In [None]:
from silnlp.alignment.utils import compute_alignment_scores
from silnlp.common.corpus import write_corpus
'''Align sentences'''
# align_sents(toks, trg_toks, aligner, align_path) # Path(f"zzz_PN_KTs/tpi_aps/train.src.detok.txt"),Path(f"zzz_PN_KTs/tpi_aps/train.trg.detok.txt")

# eflomal
# The alignment is only computed when no alignment file exists yet; delete the file to force a re-run.
if not align_path.exists():
    src_align_path = pair_dir / book / "src_align.txt"
    trg_align_path = pair_dir / book / "trg_align.txt"
    # The corpora and the alignment file all go into <pair_dir>/<book>/; create it on first use
    # so the writes below do not fail on a fresh pair/book.
    align_path.parent.mkdir(parents=True, exist_ok=True)
    write_corpus(src_align_path, text_only_sents)  # orig sentences
    write_corpus(trg_align_path, trg_sents)
    compute_alignment_scores(src_align_path, trg_align_path, aligner, align_path)

In [None]:
from collections import defaultdict

from silnlp.common.corpus import load_corpus

# probably non-exhaustive, second of same-looking single closing quotes is an apostrophe, I think
# QUOTATION_MARKS = ["'", '"', "“", "”", "‘", "’", "ʼ", "<", ">"]

'''Decide where to reinsert markers'''
# Parse the alignment file into one list of (src_tok_idx, trg_tok_idx) pairs per sentence.
# eflomal entries look like "s-t"; for other aligners everything from the first ":" on is dropped first.
strip_after_colon = aligner != "eflomal"
align_lines = []
for align_line in load_corpus(align_path):
    sent_pairs = []
    for entry in align_line.split():
        if strip_after_colon:
            entry = entry.split(":")[0]
        idx_parts = entry.split("-")
        sent_pairs.append((int(idx_parts[0]), int(idx_parts[1])))
    align_lines.append(sent_pairs)

def num_align_crossings(sent_idx: int, src_idx: int, trg_idx: int) -> int:
    """Count alignment pairs that "cross the line" between (src_idx - .5) and (trg_idx - .5).

    A pair (s, t) of ``align_lines[sent_idx]`` crosses when exactly one of
    ``s < src_idx`` and ``t < trg_idx`` holds, i.e. its source side and target
    side fall on opposite sides of the proposed split.
    """
    return sum(1 for s, t in align_lines[sent_idx] if (s < src_idx) != (t < trg_idx))

# Get the index of the trg token that each sequence should be inserted before
def get_insert_indices(seqs_to_insert: List[Tuple], adj_tok_idxs: List[int]) -> List[int]:
    """Pick, for every marker, the index of the target token it should be inserted before.

    Reads the module-level ``toks``, ``trg_toks`` and ``align_lines`` and calls
    ``num_align_crossings``. Two strategies are tried per marker, in order:

    1. Punctuation anchor: if the source token just before the marker
       (offset -1) or at the marker (offset 0) contains no alphabetic character
       and is aligned to a target token that contains none either, the marker
       goes right after (offset -1) or right before (offset 0) that target token.
    2. Fewest crossings: for source offsets 0, 1 and 2, take the first aligned
       target token (moving on over unaligned source tokens) and keep the
       candidate with the fewest alignment crossings. The offset is then
       subtracted from the chosen target index, and the index is moved past a
       ",", ".", "!" or "?" found at that position.

    When neither strategy yields a candidate, the marker is placed at the end
    of the sentence. The frequency of each chosen offset is printed for
    inspection.

    Args:
        seqs_to_insert: Marker tuples whose first element is the sentence index.
        adj_tok_idxs: For each marker, the index of its adjacent source token.

    Returns:
        One non-negative target token index per marker. A value equal to the
        number of target tokens means "insert at the end of the sentence".
    """
    sentence_punct = (",", ".", "!", "?")
    punct_hyp_freqs = defaultdict(int)
    hyp_freqs = defaultdict(int)

    insert_indices = []
    for seq, adj_src_tok in zip(seqs_to_insert, adj_tok_idxs):
        sent_idx = seq[0]

        # Alternative approach
        # for each 'threshold' 0 to 5, for each start position offset in [0,1,2,-1,-2], for each trg token the src is aligned,
        # return the first pairing that matches the threshold
        # # hyps = [0, 1, 2, -1, -2]
        # hyps = [0, 1, -1, 2, -2]
        # max_crossings = 0
        # insert_idx = -1
        # while insert_idx == -1 and max_crossings < 5: # if there's a limit, just make a for loop
        #     for hyp in hyps:
        #         src_hyp = adj_src_tok + hyp

        #         # this will also filter out OOB hypotheses
        #         trg_hyps = [t for (s,t) in align_lines[sent_idx] if s == src_hyp]
        #         # print(len(trg_hyps))
        #         if len(trg_hyps) == 0:
        #             continue

        #         if hyp < 0:
        #             trg_hyps = reversed(trg_hyps)

        #         for i, trg_hyp in enumerate(trg_hyps, 1):
        #             if num_align_crossings(sent_idx, src_hyp, trg_hyp) <= max_crossings:
        #                 insert_idx = (hyp, i, trg_hyp) # hyp and i are just for tracking purposes, can just have trg_hyp later
        #                 break

        #         if insert_idx != -1:
        #             break
        #     max_crossings += 1

        # insert_indices.append(insert_idx) # could be -1, need to handle outside of function

        # Revision of original, checks first hypothesis of its original position plus one on either side and picks the best
        # # only 0,1,2 was close, checking for prev tokens seems to always be bad
        # hyps = [0, 1, 2]
        # best_hyp = (-1, None, len(align_lines[sent_idx]))
        # for hyp in hyps:
        #     src_hyp = adj_src_tok + hyp
        #     trg_hyps = [t for (s,t) in align_lines[sent_idx] if s == src_hyp]
        #     if len(trg_hyps) == 0:
        #         continue

        #     trg_hyp = trg_hyps[-1] if hyp < 0 else trg_hyps[0]
        #     num_crossings = num_align_crossings(sent_idx, src_hyp, trg_hyp)
        #     if num_crossings < best_hyp[2]:
        #         best_hyp = (trg_hyp, hyp, num_crossings)
        # if best_hyp[0] == -1:
        #     insert_indices.append(len(trg_toks[sent_idx])) # insert at the end of the sentence
        #     hyp_freqs["none"] += 1
        # else:
        #     insert_indices.append(best_hyp[0])
        #     hyp_freqs[best_hyp[1]] += 1

        # Original, checks first hypothesis of its original position and one tok previous and picks the better of the two
        # If the token on either side of a hypothesis is punctuation, use that
        # can also try a less extreme version where the punct hyps are still subject to having the least crossings, but they are still always checked first

        # --- Strategy 1: anchor on punctuation aligned to punctuation ---
        trg_hyp = -1
        punct_hyps = [-1, 0]
        for punct_hyp in punct_hyps:
            src_hyp = adj_src_tok + punct_hyp
            if src_hyp < 0 or src_hyp >= len(toks[sent_idx]):
                continue
            # only accept pairs where both the src and trg token are punct
            # can define more specifically what the punct tokens can look like later
            if len(toks[sent_idx][src_hyp]) > 0 and not any(c.isalpha() for c in toks[sent_idx][src_hyp]):
                # For the preceding token, prefer its last alignment; for the following token, its first
                align_pairs = reversed(align_lines[sent_idx]) if punct_hyp < 0 else align_lines[sent_idx]
                for s, t in align_pairs:
                    if s == src_hyp and not any(c.isalpha() for c in trg_toks[sent_idx][t]):
                        trg_hyp = t
                        break
            if trg_hyp != -1:
                # if this search gets expanded beyond [-1,0] can do insert_idx -= punct_hyp
                insert_idx = trg_hyp + 1 if punct_hyp < 0 else trg_hyp
                insert_indices.append(insert_idx)
                punct_hyp_freqs[punct_hyp] += 1
                break
        if trg_hyp != -1:
            continue

        # --- Strategy 2: candidate with the fewest alignment crossings ---
        # hyps = [0, -1] # offsets to test, original and previous
        hyps = [0, 1, 2]
        # hyps = [0, 1, -1, 2, -2]
        # hyps = [0, -1, 1, -2, 2]
        # hyps = [-1, 0, 1, -2, 2]
        # hyps = [0, 1, 2, -1, -2]
        best_hyp = (-1, None, len(align_lines[sent_idx]))
        checked = set()  # to prevent checking the same idx twice
        for hyp in hyps:
            # Materialized as a list because the pairs are scanned once per iteration of the
            # while loop below; a bare reversed() iterator would be empty after the first scan.
            align_pairs = list(reversed(align_lines[sent_idx])) if hyp < 0 else align_lines[sent_idx]
            src_hyp = adj_src_tok + hyp
            if src_hyp in checked:
                continue
            trg_hyp = -1
            while trg_hyp == -1 and src_hyp >= 0 and src_hyp < len(toks[sent_idx]):
                checked.add(src_hyp)
                trg_hyps = [t for (s, t) in align_pairs if s == src_hyp]
                if len(trg_hyps) > 0:
                    trg_hyp = trg_hyps[0]
                else:
                    # Unaligned source token: keep moving in the direction of the offset
                    src_hyp += -1 if hyp < 0 else 1
            if trg_hyp != -1:
                num_crossings = num_align_crossings(sent_idx, src_hyp, trg_hyp)
                if num_crossings < best_hyp[2]:
                    # replace hyp with src_hyp - adj_src_tok
                    # this is what I was trying to do before with offsetting the farther away hyps, but this way is worse
                    best_hyp = (trg_hyp, hyp, num_crossings)

        if best_hyp[0] == -1:
            insert_indices.append(len(trg_toks[sent_idx]))  # insert at the end of the sentence
            hyp_freqs["none"] += 1
        else:
            insert_idx = best_hyp[0]
            # do this before or after subtracting the offset? only subtract offset if not adjacent to punct? check before and after subtraction?
            if trg_toks[sent_idx][insert_idx] in sentence_punct:
                insert_idx += 1
            # subtracting hyp at the end may be bad in other cases, or make wrong answers worse/more confusing
            else:
                # Clamp at 0: a negative value would silently index from the END of the sentence,
                # both in the lookup below and when the caller maps the index to a token range.
                insert_idx = max(0, insert_idx - best_hyp[1])
                if insert_idx < len(trg_toks[sent_idx]) and trg_toks[sent_idx][insert_idx] in sentence_punct:
                    insert_idx += 1

            insert_indices.append(insert_idx)
            hyp_freqs[best_hyp[1]] += 1

    print(punct_hyp_freqs)
    print(hyp_freqs)

    # ret = []
    # for i, result in enumerate(insert_indices):
    #     if result == -1:
    #         print(i, vrefs[seqs_to_insert[i][0]], seqs_to_insert[i])
    #         # ret.append(-1)
    #         ret.append(len(trg_toks[seqs_to_insert[i][0]]))
    #     else:
    #         hyp, hyp_num, idx = result
    #         print(hyp, hyp_num) # how many tokens away from marker, how far away the first alignment w/ 0 crossings was
    #         ret.append(idx)

    return insert_indices

# For every entry of inline_markers, the index of the target token the marker is inserted before
trg_toks_after_markers = get_insert_indices(inline_markers, toks_after_markers)

# Source-side and target-side token indices, printed one after the other for comparison
print(toks_after_markers)
print(trg_toks_after_markers)

In [None]:
'''
TODO: right now, half the work for the insertion order is done when to_insert is filled out, and the other half is done when the markers are
being inserted (with reverse). Since there's already funky stuff going on in the order of the markers in to_insert (for disambiguation
for the same insertion idx), it would make more sense to just do all the ordering when to_insert is being filled out, i.e. the order of a list
in to_insert is the order they need to be inserted in
'''

'''Reinsert markers'''
to_insert = [[] for _ in vrefs]

# Collect the markers to be inserted as (character offset in the target sentence, marker)
for (sent_idx, _, marker), next_trg_tok in zip(inline_markers, trg_toks_after_markers):
    sent_ranges = trg_word_tok_ranges[sent_idx]
    if next_trg_tok < len(sent_ranges):
        insert_idx = sent_ranges[next_trg_tok].start
    else:  # TODO: this shouldn't happen
        insert_idx = len(trg_sents[sent_idx])
    to_insert[sent_idx].append((insert_idx, marker))  # "\n" + marker

# Order the markers of each sentence by offset. The sort is stable, so markers that share
# an offset keep the order in which they were collected (handles directly adjacent markers).
for sent_markers in to_insert:
    sent_markers.sort(key=lambda entry: entry[0])

# construct to_insert with the actual strings instead of indices so they can be inserted manually
# to do this, it would be nice to have the markers for a given sentence appear in the list in the order they appear
# for i, (ref, trg_sent, markers) in enumerate(zip(vrefs, trg_sents, to_insert)):
#     for marker in reversed(markers):

# original
# Insert the strings into the target sentences
# for sent_idx in range(len(trg_sents)):
#     for insert_idx, insert_str in reversed(to_insert[sent_idx]): # sorted(to_insert[sent_idx], key=lambda x: x[0], reverse=True)
#         trg_sents[sent_idx] = trg_sents[sent_idx][:insert_idx] + insert_str + trg_sents[sent_idx][insert_idx:]

# create rows for each paragraph marker with specific ScriptureRef paths
# Construct rows to update the USFM file with: each sentence is cut at its marker offsets,
# giving one row for the text before the first marker and one row per marker after that,
# all carrying the sentence's own ref.
rows = []
for ref, trg_sent, sent_markers in zip(vrefs, trg_sents, to_insert):
    head = trg_sent
    tail_segments = []
    # Cut from the right so that the earlier offsets stay valid
    for cut_idx, _marker in reversed(sent_markers):
        # ([ScriptureRef(ref.verse_ref, ref.path + [ScriptureElement(marker_idx, marker[1:-1])])], trg_sent[insert_idx:])
        tail_segments.append(head[cut_idx:])
        head = head[:cut_idx]
    rows.append(([ref], head))
    rows.extend(([ref], segment) for segment in reversed(tail_segments))

for ref, sent in rows:
    print(ref, sent)

In [None]:
from machine.corpora import UpdateUsfmParserHandler, parse_usfm, UsfmParserState, UpdateUsfmBehavior

'''class ParagraphUpdateUsfmParserHandler(UpdateUsfmParserHandler):
    def _collect_tokens(self, state: UsfmParserState) -> None:
        self._tokens.extend(self._new_tokens)
        self._new_tokens.clear()
        while self._token_index <= state.index + state.special_token_count:
            if state.tokens[self._token_index].type == UsfmTokenType.PARAGRAPH:
                num_text = 0
                for i in range(len(self._tokens) - 1, -1, -1):
                    if self._tokens[i].type == UsfmTokenType.TEXT:
                        num_text += 1
                    else:
                        break
                if num_text >= 2:
                    self._tokens.insert(-(num_text - 1), state.tokens[self._token_index])
                    self._token_index += 1
                    break # should this be continue instead? what situations are there where 
            self._tokens.append(state.tokens[self._token_index])
            self._token_index += 1'''

class ParagraphUpdateUsfmParserHandler(UpdateUsfmParserHandler):
    """Update handler that moves paragraph markers in between replacement text rows.

    Overrides ``_collect_tokens`` so that a paragraph marker (other than ``rem``)
    that arrives after two or more consecutive TEXT tokens in the output is
    inserted among those TEXT tokens instead of being appended after them.

    NOTE(review): relies on the private ``_tokens``, ``_new_tokens`` and
    ``_token_index`` attributes of ``UpdateUsfmParserHandler``.
    """
    def _collect_tokens(self, state: UsfmParserState) -> None:
        """Copy pending tokens to the output, repositioning paragraph markers.

        ``state.tokens`` is consumed from ``self._token_index`` up to and
        including ``state.index + state.special_token_count``.
        """
        # Flush the tokens queued in _new_tokens into the output first
        self._tokens.extend(self._new_tokens)
        self._new_tokens.clear()
        while self._token_index <= state.index + state.special_token_count:
            if state.tokens[self._token_index].type == UsfmTokenType.PARAGRAPH and state.tokens[self._token_index].marker != "rem":
                # Walk backwards over the tail of the output:
                #   num_text   = length of the run of TEXT tokens currently being counted
                #   rem_offset = number of trailing tokens that belong to "rem" paragraphs
                #                (each rem marker plus the TEXT tokens that follow it)
                num_text = 0
                rem_offset = 0
                for i in range(len(self._tokens) - 1, -1, -1):
                    if self._tokens[i].type == UsfmTokenType.TEXT:
                        num_text += 1
                    elif self._tokens[i].type == UsfmTokenType.PARAGRAPH and self._tokens[i].marker == "rem":
                        # The TEXT tokens counted so far were the rem's content; start a new run before it
                        rem_offset += num_text + 1
                        num_text = 0
                    else:
                        break
                if num_text >= 2:
                    # Insert the marker so that the last (num_text - 1) TEXT tokens of the run,
                    # and any trailing rem tokens, come after it
                    self._tokens.insert(-(rem_offset + num_text - 1), state.tokens[self._token_index])
                    self._token_index += 1
                    # NOTE(review): author was unsure whether this should be `continue`; `break` leaves any
                    # remaining tokens uncollected until _collect_tokens is called again
                    break # should this be continue instead? what situations are there where 
            # Default behaviour: append the token unchanged
            self._tokens.append(state.tokens[self._token_index])
            self._token_index += 1

'''Update USFM and write out'''
# preserve_whitespace=True doesn't change anything with markers on newlines but it does take care of the \vp\*vp somehow
# Source is read as "utf-8-sig"; the output is written with the project's configured encoding.
usfm = src_fpath.read_text(encoding="utf-8-sig")

# Run the source USFM through the handler, preferring the new row text
handler = ParagraphUpdateUsfmParserHandler(rows, behavior=UpdateUsfmBehavior.PREFER_NEW)
parse_usfm(usfm, handler, src_settings.stylesheet, src_settings.versification, preserve_whitespace=False)
usfm_out = handler.get_usfm(src_settings.stylesheet)

out_fpath.write_text(usfm_out, encoding=src_settings.encoding)