# Traverse LCP, then MFS

* Replace earlier masked-array strategy with regular arrays, using 0 to represent a null.
* Real data (offset of token within witness) is one-based.



In [2]:
from typing import List
from linsuffarr import SuffixArray
from linsuffarr import UNIT_BYTE
import pprint
import numpy as np
import re
from dataclasses import dataclass
from heapq import * # priority heap, https://docs.python.org/3/library/heapq.html
pp = pprint.PrettyPrinter(indent=2)
from bisect import bisect_right
from IPython.core.display import display, HTML
import networkx as nx
debug = True

  from IPython.core.display import display, HTML


In [3]:
sigla = ['w0', 'w1', 'w2', 'w3', 'w4', 'w5']
filenames = ['darwin/darwin1859.txt', 'darwin/darwin1860.txt', 'darwin/darwin1861.txt', 'darwin/darwin1866.txt', 'darwin/darwin1869.txt', 'darwin/darwin1872.txt']
# sigla = ['w0', 'w1', 'w2', 'w3']
# filenames = ['darwin/darwin1859.txt', 'darwin/darwin1860.txt', 'darwin/darwin1861.txt', 'darwin/darwin1866.txt']
# sigla = ['w0', 'w1']
# filenames = ['darwin1859.txt', 'darwin1860.txt']
# sigla = ['w0', 'w1', 'w2', 'w3', 'w4', 'w5']
# filenames = ['abc/abcd.txt', 'abc/abcda.txt', 'abc/abcdb.txt', 'abc/abcdc.txt', 'abc/abcdd.txt', 'abc/abcde.txt']
first_paragraph = 0
last_paragraph = 50
how_many_paragraphs = last_paragraph - first_paragraph
raw_data_dict = {}
for siglum, filename in zip(sigla, filenames):
    with open(filename) as f:
        lines = f.readlines()
        lines = [line for line in lines if line != '\n']
        raw_data_dict[siglum] = " ".join(lines[first_paragraph : last_paragraph])

In [4]:
def tokenize_witnesses(witness_strings: List[str]): # one string per witness
    '''Return list of witnesses, each represented by a list of tokens'''
    # TODO: handle punctuation, upper- vs lowercase
    witnesses = []
    for witness_string in witness_strings:
        # witness_tokens = witness_string.split()
        witness_tokens = re.findall(r'\w+\s*|\W+', witness_string)
        witness_tokens = [token.strip().lower() for token in witness_tokens]
        witnesses.append(witness_tokens)
    return witnesses

In [5]:
def create_token_array(witness_token_lists): # list of token lists per witness
    '''Create token array (single list, with separator " # " between witnesses'''
    token_array = [] # strings
    token_membership_array = [] # witness identifiers, same offsets as in token_array
    token_witness_offset_array = [] # one-based offset of token in witness
    last_witness_offset = len(witness_token_lists) - 1
    for index, witness_token_list in enumerate(witness_token_lists):
        token_array.extend(witness_token_list)
        for token_offset, token in enumerate(witness_token_list): # don't need enumerate, just len()
            token_witness_offset_array.append(token_offset)
        token_membership_array.extend([index for token in witness_token_list])
        if index < last_witness_offset:
            separator = " #" + str(index + 1) + " "
            token_array.append(separator)
            token_membership_array.append(separator)
            token_witness_offset_array.append(-1)
    return token_array, token_membership_array, token_witness_offset_array

In [6]:
witness_sigla = [key for key in raw_data_dict.keys()]
witnesses = tokenize_witnesses([value for value in raw_data_dict.values()]) # strings
# token_list

In [7]:
token_array, token_membership_array, token_witness_offset_array = create_token_array(witnesses)
if debug:
    print(f"{token_array=}")
    print(f"{token_membership_array=}")
    print(f"{token_witness_offset_array=}")

token_array=['when', 'we', 'look', 'to', 'the', 'individuals', 'of', 'the', 'same', 'variety', 'or', 'sub', '-', 'variety', 'of', 'our', 'older', 'cultivated', 'plants', 'and', 'animals', ',', 'one', 'of', 'the', 'first', 'points', 'which', 'strikes', 'us', ',', 'is', ',', 'that', 'they', 'generally', 'differ', 'much', 'more', 'from', 'each', 'other', ',', 'than', 'do', 'the', 'individuals', 'of', 'any', 'one', 'species', 'or', 'variety', 'in', 'a', 'state', 'of', 'nature', '.', 'when', 'we', 'reflect', 'on', 'the', 'vast', 'diversity', 'of', 'the', 'plants', 'and', 'animals', 'which', 'have', 'been', 'cultivated', ',', 'and', 'which', 'have', 'varied', 'during', 'all', 'ages', 'under', 'the', 'most', 'different', 'climates', 'and', 'treatment', ',', 'i', 'think', 'we', 'are', 'driven', 'to', 'conclude', 'that', 'this', 'greater', 'variability', 'is', 'simply', 'due', 'to', 'our', 'domestic', 'productions', 'having', 'been', 'raised', 'under', 'conditions', 'of', 'life', 'not', 'so', '

In [8]:
suffix_array = SuffixArray(token_array, unit=UNIT_BYTE)
# print(suffix_array)
# LCP=0 means that the block has nothing in common with the next one

In [9]:
lcp_array = suffix_array._LCP_values
if debug:
    print(lcp_array[:5])

array('i', [0, 0, 0, 0, 0])


In [10]:
# create Block dataclass
from dataclasses import dataclass
@dataclass(unsafe_hash=True)
class Block:
    token_count: int
    start_position: int # offset into suffix array (not into token array!)
    end_position: int # start and end position give number of occurrences
    all_start_positions: [] # compute after blocks have been completed
    witnesses: set
    witness_count: int # number of witnesses in which pattern occurs, omitted temporarily because requires further computation
    frequency: int # number of times pattern occurs in whole witness set (may be more than once in a witness), end_position - start_position + 1
    # how_created: int # debug

In [11]:
@dataclass
class Lcp_interval_candidate:
    lcp_start_offset: int
    lcp_interval_token_count: int
    lcp_end_offset: int = -1

In [12]:
def expand_prefix(prefix_to_expand:Lcp_interval_candidate):
    token_start_position = suffix_array.SA[prefix_to_expand.lcp_start_offset]
    token_count = prefix_to_expand.lcp_interval_token_count
    tokens = token_array[token_start_position: token_start_position + token_count]
    return tokens

In [13]:
def check_for_depth_and_repetition(_lcp_interval:Lcp_interval_candidate, _witness_count: int) -> bool:
    """Write a docstring someday

    Number of prefixes >= total number of witnesses
    Accumulate set of witness sigla for prefixes
    if:
        no witness occurs more than once, return True to keep this block
    else:
        return False
    """
#     print(f"Checking for depth and repetition for: {_lcp_interval=}")
#     print(f"Occurs {_lcp_interval.lcp_end_offset - _lcp_interval.lcp_start_offset + 1} times in witness set")
#     print(expand_prefix(_lcp_interval))
#     print()
    block_instance_count = _lcp_interval.lcp_end_offset - _lcp_interval.lcp_start_offset + 1
    if block_instance_count != _witness_count:
        return False
    else:
        witnesses_found = []
        for lcp_interval_item_offset in range(_lcp_interval.lcp_start_offset, _lcp_interval.lcp_end_offset + 1):
            token_position = suffix_array.SA[lcp_interval_item_offset] # point from prefix to suffix array position
            witness_siglum = token_membership_array[token_position] # point from token array position to witness identifier
            if witness_siglum in witnesses_found:
                return False
            else:
                witnesses_found.append(witness_siglum)
        return True

In [14]:
def create_blocks(_lcp_array: list):
    """Write a docstring someday

    Look at changes in length of LCP array
    Initial value is 0 or -1 because it's a comparison with previous, and first has no previous
    Next value is number of tokens shared with previous
    Exact length doesn't matter, but if it changes, new pattern:
        If it stays the same, take note but do nothing yet; it means that the pattern repeats
        No change for a while, then goes to 0:
            Number of repetitions plus 1, e.g., 5 5 5 0 = 4 instances of 5
            Once it changes to 0, we've seen complete pattern
        Changer to smaller means hidden, deeper block
        Changes to longer means ???
    """
    accumulator = [] # lcp positions (not values) since most recent 0
    frequent_sequences = [] # lcp intervals to be considered for mfs
    #
    # lcp value
    # if == 0 it's a new interval, so:
    #   1. if there is already an accumulation, commit (process) it
    #      "committing the buffer" means checking for repetition and depth
    #          if it passes check: store in mfs list
    #          otherwise throw it away
    #   2. clear buffer (accumulator) and begin accumulating new buffer with the new offset with 0 value
    # otherwise it isn't zero, so there must be a buffer in place, so add to it (for now)
    for offset, value in enumerate(_lcp_array):
        if not accumulator and value == 0: # if accumulator is empty and new value is 0, do nothing
            continue
        elif not accumulator: # accumulator is empty and new value is non-zero, so begin new accumulator
            accumulator.append(Lcp_interval_candidate(lcp_start_offset = offset - 1, lcp_interval_token_count = value))
        elif value > accumulator[-1].lcp_interval_token_count: # new interval, so add to accumulator and continue
            accumulator.append(Lcp_interval_candidate(lcp_start_offset = offset - 1, lcp_interval_token_count = value))
        elif value == accumulator[-1].lcp_interval_token_count: # same block as before, so do nothing
            continue
        else: # new value is less than top of accumulator, so pop everything that is higher
            # Positions in lcp array and suffix array coincide:
            #   The lcp array value is the length of the sequence
            #   The suffix array value is the start position of the sequence
            # Assume accumulator values (offsets into lcp array) point to [3, 6] and new value is 4, so:
            #   First: Pop pointer to 6 (length value in lcp array), store in frequent_sequences
            #   Second: Push new pointer to same position in lcp array, but change value in lcp array to 4
            while accumulator and accumulator[-1].lcp_interval_token_count > value:
                # Create pointer to last closed block that is not filtered (like frequent_sequences)
                newly_closed_block = accumulator.pop()
                newly_closed_block.lcp_end_offset = offset - 1
                if check_for_depth_and_repetition(newly_closed_block, len(witnesses)):
                    frequent_sequences.append([newly_closed_block.lcp_start_offset, newly_closed_block.lcp_end_offset, newly_closed_block.lcp_interval_token_count])
            # There are three options:
            #   1. there is content in the accumulator and latest value is not 0
            #   2. accumulator is empty and latest value is 0
            #   3. accumulator is empty and latest value is not 0
            # (the fourth logical combination, content in the accumulator and 0 value, cannot occur
            #     because a 0 value will empty the accumulator)
            if value > 0 and (not accumulator or accumulator[-1].lcp_interval_token_count != value):
                accumulator.append(Lcp_interval_candidate(lcp_start_offset = newly_closed_block.lcp_start_offset, lcp_interval_token_count = value))
    # End of lcp array; run through any residual accumulator values
    while accumulator:
        newly_closed_block = accumulator.pop()
        newly_closed_block.lcp_end_offset = len(_lcp_array) - 1
        if check_for_depth_and_repetition(newly_closed_block, len(witnesses)):
            frequent_sequences.append([newly_closed_block.lcp_start_offset, len(_lcp_array)-1, newly_closed_block.lcp_interval_token_count])
    return frequent_sequences

In [15]:
# frequent_sequences is a list of lists
# the embedded lists contain LCP indices
#   LCP indices point into LCP array, but same index also points into suffix array
#   value in LCP array points to prefix length (compared to previous one)
#   value in suffix array points into token array
frequent_sequences = create_blocks(lcp_array)
# print(len(frequent_sequences))
# pp.pprint(frequent_sequences[:5])

In [16]:
# To remove embedded prefixes:
#
# 1. Create dictionary with end position in witness 0 (arbitrarily) as key
# 2. Set value of key to longest sequence with that end position
# 3. Dictionary values will contain only longest frequent sequences, removing embedded ones,
#    as tuples if (length, [token start positions for all witnesses])

@dataclass
class LongestSequence:
    length: int
    witness_start_and_end: List[int]

def find_longest_sequences(_frequent_sequences, _suffix_array):
    _largest_blocks = {} # key is token end position, value is (length, [witness-start-positions])
    for frequent_sequence in _frequent_sequences:
        length = frequent_sequence[2]
        suffix_array_values = [_suffix_array.SA[i] for i in range(frequent_sequence[0], frequent_sequence[1] + 1)]
        token_end_position = min(suffix_array_values) + length # token end position for first witness
        if token_end_position not in _largest_blocks: # first block with this end position, so create new key
            _largest_blocks[token_end_position] = (length, sorted(suffix_array_values))
        else: # if new block is longer, replace old one with same key
            if length > _largest_blocks[token_end_position][0]:
                _largest_blocks[token_end_position] = (length, sorted(suffix_array_values))
    return _largest_blocks

largest_blocks = find_longest_sequences(frequent_sequences, suffix_array)
if debug:
    print(f"{largest_blocks=}")

largest_blocks={513: (18, [495, 13283, 26134, 39197, 53028, 67259]), 488: (8, [480, 13268, 26119, 39182, 53011, 67242]), 6092: (19, [6073, 18933, 31945, 45206, 59229, 73410]), 3735: (10, [3725, 16589, 29518, 42778, 56827, 71067]), 10367: (343, [10024, 22882, 35915, 49210, 63164, 77335]), 1071: (60, [1011, 13795, 26651, 39722, 53852, 68088]), 9715: (99, [9616, 22474, 35506, 48801, 62756, 76937]), 1394: (35, [1359, 14143, 27100, 40176, 54303, 68555]), 2607: (41, [2566, 15349, 28307, 41400, 55482, 69732]), 4088: (28, [4060, 16924, 29859, 43117, 57166, 71384]), 5276: (126, [5150, 18010, 30943, 44203, 58242, 72424]), 2208: (48, [2160, 14943, 27901, 40978, 55109, 69358]), 4884: (127, [4757, 17617, 30553, 43813, 57852, 72031]), 733: (25, [708, 13494, 26350, 39413, 53242, 67470]), 12092: (10, [12082, 24933, 37968, 51275, 65206, 79342]), 2218: (9, [2209, 14992, 27950, 41027, 55158, 69407]), 6995: (156, [6839, 19700, 32724, 45998, 60012, 74183]), 11418: (33, [11385, 24238, 37273, 50570, 64511, 7

In [17]:
# block_offsets_by_witness: list of lists holds sorted start offsets per witness (offsets are into global token array)
# witness_offsets_to_blocks: dictionary points from start offsets to blocks
# score_by_block: number of tokens placed or skipped if block is placed
# Beam search requires us, given an offset in a witness, to find the next block. We do
#   that by looking up the value in block_offsets_by_witness and then using that value
#   to retrieve the block key from witness_offsets_to_blocks
# Lookup in the list of lists is:
#   block_offsets_by_witness[witness_number][bisect_right(block_offsets_by_witness[witness_number], most_recent_offset_in_witness)]
# (See: https://www.geeksforgeeks.org/python-find-smallest-element-greater-than-k/)
# FIXME: traverse largest_blocks only once and add values for all witnesses in same pass
witness_count = len(witnesses)
block_offsets_by_witness = []
witness_offsets_to_blocks = {}
first_token_offset_in_block_by_witness = [] # only tokens in blocks
first_absolute_token_by_witness = [] # all tokens, whether in block or not
for i in range(witness_count):
    first_token_offset_in_block_by_witness.append(token_membership_array.index(i))
    # Score = number of tokens either placed or skipped (we don't care which)
    # Low score is best because it leaves the highest potential
    # NB: The name "score" seems to imply that higher is better, and the
    #   opposite is the case here. Rename the variable?
    # NB: High potential is paramount during beam search, but should the
    #   difference between placed and skip matter at a later stage? Or
    #   does placing more blocks (more tiers) take care of that?
    score_by_block = {}
    for i in range(witness_count):
        witness_offset_list = []
        for key, value in largest_blocks.items():
            witness_offset_list.append(value[1][i])
            witness_offsets_to_blocks[value[1][i]] = key
        witness_offset_list.sort()
        block_offsets_by_witness.append(witness_offset_list)
for i in range(witness_count):
    first_absolute_token_by_witness.append(token_membership_array.index(i))
for key, value in largest_blocks.items():
    # to determine number of tokens that will have been placed or skipped
    #   after placing block:
    #       matrix-subtract first_token_offset_by_witness from value[1]
    #       add witness_count * value[0] (to account for block length)
    #   key by block key, value is score
    differences = [x - y for x, y in zip(value[1], first_token_offset_in_block_by_witness)]
    if debug:
        print(differences)
    score = sum(differences) + witness_count * value[0]
    score_by_block[key] = score
if debug:
    print(f"{block_offsets_by_witness=}")
    print()
    witness_offsets_to_blocks = { key: witness_offsets_to_blocks[key] for key in sorted(witness_offsets_to_blocks.keys())}
    print(f"{witness_offsets_to_blocks=}")
    print()
    print(f"{first_token_offset_in_block_by_witness=}")
    print()
    print(f"{first_absolute_token_by_witness=}")
    print()
    print(f"{score_by_block=}")

[495, 492, 492, 497, 1009, 1009]
[480, 477, 477, 482, 992, 992]
[6073, 6142, 6303, 6506, 7210, 7160]
[3725, 3798, 3876, 4078, 4808, 4817]
[10024, 10091, 10273, 10510, 11145, 11085]
[1011, 1004, 1009, 1022, 1833, 1838]
[9616, 9683, 9864, 10101, 10737, 10687]
[1359, 1352, 1458, 1476, 2284, 2305]
[2566, 2558, 2665, 2700, 3463, 3482]
[4060, 4133, 4217, 4417, 5147, 5134]
[5150, 5219, 5301, 5503, 6223, 6174]
[2160, 2152, 2259, 2278, 3090, 3108]
[4757, 4826, 4911, 5113, 5833, 5781]
[708, 703, 708, 713, 1223, 1220]
[12082, 12142, 12326, 12575, 13187, 13092]
[2209, 2201, 2308, 2327, 3139, 3157]
[6839, 6909, 7082, 7298, 7993, 7933]
[11385, 11447, 11631, 11870, 12492, 12436]
[7619, 7688, 7864, 8099, 8793, 8732]
[7240, 7309, 7488, 7704, 8398, 8339]
[7609, 7678, 7854, 8087, 8783, 8722]
[9432, 9498, 9679, 9916, 10552, 10505]
[6481, 6551, 6728, 6926, 7621, 7554]
[11569, 11631, 11815, 12055, 12671, 12612]
[735, 730, 735, 740, 1250, 1247]
[4184, 4257, 4341, 4543, 5266, 5214]
[11281, 11342, 11526, 11765

In [18]:
dir()

['Block',
 'HTML',
 'In',
 'Lcp_interval_candidate',
 'List',
 'LongestSequence',
 'Out',
 'SuffixArray',
 'UNIT_BYTE',
 '_',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dh',
 '_i',
 '_i1',
 '_i10',
 '_i11',
 '_i12',
 '_i13',
 '_i14',
 '_i15',
 '_i16',
 '_i17',
 '_i18',
 '_i2',
 '_i3',
 '_i4',
 '_i5',
 '_i6',
 '_i7',
 '_i8',
 '_i9',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 '_pydevd_bundle',
 'bisect_right',
 'block_offsets_by_witness',
 'check_for_depth_and_repetition',
 'create_blocks',
 'create_token_array',
 'dataclass',
 'debug',
 'differences',
 'display',
 'exit',
 'expand_prefix',
 'f',
 'filename',
 'filenames',
 'find_longest_sequences',
 'first_absolute_token_by_witness',
 'first_paragraph',
 'first_token_offset_in_block_by_witness',
 'frequent_sequences',
 'get_ipython',
 'heapify',
 'heappop',
 'heappush',
 'heappushpop',
 'heapreplace',
 'how_many_paragraphs',
 'i',
 'key',
 'largest_blocks',
 'last_parag

In [19]:
if debug:
    for i in range(len(witnesses)):
        print(block_offsets_by_witness[i][:10])

[4, 31, 43, 60, 93, 104, 134, 146, 172, 184]
[12795, 12822, 12832, 12849, 12882, 12893, 12923, 12934, 12960, 12972]
[25646, 25673, 25683, 25700, 25733, 25744, 25774, 25785, 25811, 25823]
[38708, 38735, 38745, 38762, 38795, 38806, 38836, 38847, 38873, 38885]
[52026, 52052, 52062, 52080, 52111, 52121, 52151, 52162, 52187, 52200]
[66257, 66283, 66293, 66311, 66342, 66352, 66382, 66391, 66416, 66429]


In [20]:
# To perform beam search
#   Create single start option (at Start node, which is a fiction [there is no Start block]
#       created for the beam search)
#   Loop: for each BeamOption on current tier
#       Evaluate score for advancing in each witness and bringing others into alignment with it
#       For β lowest (!) scores create new BeamOption (this advances to next tier)
#           Score is count of tokens placed or skipped (!)
#           Favor lowest score because that has the greatest potential

In [21]:
@dataclass(order=True, frozen=True, eq=True) # heapqueue is priority queue, so requires comparison
class BeamOption:
    score: int
    path: tuple # path through sequence of blocks leading to current BeamOption

In [22]:
# Create initial BeamOption
initial = [BeamOption(score=0, path=())] # tier 0, one-item list
def perform_beam_search_step(beam_options=initial, beta=3):
    # print("New tier with " + str(len(beam_options)) + " beam options")
    new_options = [] # candidates for next tier
    finished_options = []
    for beam_option in beam_options:
        # ###
        # 2022-09-06
        # Three possibilities for an individual beam option:
        # 1. Option leads to new option
        # 2. Option is finished
        # 3. No new option but option isn't finished (transposition)
        # NB: We check each witness in the beam option, and if any witness
        #     raises an IndexError, the whole block cannot be advanced and
        #     is finished. (This is true because of our constraints: every
        #     block is a) full-depth and b) no repetition.)
        #
        # What to do:
        #
        # 1. Perform the bisect for each witness based on the head of the
        #    path of the current beam option. This returns an offset into
        #    the witness-specific list of block offsets. Initialize a
        #    counter to 0.
        # 2. Using the initial offsets returned by the bisect operation that
        #    we performed in step #1 (and never perform again) for each witness
        #    plus the counter (which we will increment if needed in the inner
        #    loop), check that next option for each witness. There are three
        #    possibilities for each counter value (over the entire witness group):
        #    a) If the next block (returned by this method) would overrun
        #       for any witness, it will overrun for all witnesses, so the
        #       beam option can be added to the finished list and we exit the
        #       outer loop (the one that processes the beam option).
        #    b) If the next block is a viable option, add it to the options and
        #       check the next witness within this same inner loop instance
        #       because in case of transposition different blocks will suggest
        #       different next blocks, all of which could be viable options.
        #       This ends the processing for that beam option.
        #    c) If we don't find any viable next option and don't overrun for
        #       any witness, increment the counter and replay step #2 (inner
        #       loop).
        # Exit condition: Eventually we either find a viable option or overrun.
        #
        # TODO: How should we implement this to terminate the correct loop in
        # the right place? For? While? Generator? For and while start with the
        # outer loop and work inward; with a generator we start with the inner
        # and work outward.
        # ###
        new_finished_option_check = False
        new_viable_option_check = 0
        counter = 0
        while True:
            for i in range(witness_count): # advance for each witness in turn
                if not beam_option.path: # path is empty only for initial state at tier 0
                    last_offset = -1 # NB: same for all witnesses, and not 0, which will break for witness 0
                else:
                    last_offset = largest_blocks[beam_option.path[0]][1][i]
                try:
                    next_offset = bisect_right(block_offsets_by_witness[i], last_offset)
                    next_value = block_offsets_by_witness[i][next_offset + counter]
                    next_block = witness_offsets_to_blocks[next_value] # find that next block to get its length
                    # would any witness pointer move backwards?
                    # perform matrix subtraction; if signs differ, there are items that move in opposite directions
                    # first option cannot be transposed, so accept it automatically
                    if (not beam_option.path) or (len(set([np.sign(x - y) for x, y in zip(largest_blocks[next_block][1], largest_blocks[beam_option.path[0]][1])])) == 1):
                        new_score = score_by_block[next_block] # accounts for all witnesses
                        # concatenate tuples with a +;  most recent first (for priority heap)
                        new_options.append(BeamOption(score=new_score, path=((next_block,) + beam_option.path)))
                        new_viable_option_check += 1
                    else:
                        continue
                        # print('Transposition detected for beam option:', beam_option)
                except IndexError: # we've gone as far as we can with this path
                    new_finished_option_check = True
                    finished_options.append(beam_option)
                    break # if one witness overruns, they all will, so this beam option is done
            if new_viable_option_check >= witness_count or new_finished_option_check:
                break
            counter += 1
    new_options = list(set(new_options)) # deduplicate
    heapify(new_options) # sort from low score to high (low score is best)
    if not new_options and not finished_options:
        # print(beam_options)
        raise Exception("This shouldn't happen: no new options and no finished options")
    else:
        return new_options[:beta], finished_options

In [23]:
options, _ = perform_beam_search_step()
finished = [] # options that cannot go further
while options: # no more options means that we're done
    # TODO: The beam size at the moment is a magic number; can we rationalize it?
    options, end_of_life = perform_beam_search_step(beam_options=options, beta=20)
    finished.extend(end_of_life) # add any options that cannot go further
    # print(len(options), len(finished))
finished = list(set(finished)) # TODO: Remove this because we'll sort later?
# TODO: Verify that better scores are better alignments (how??)

In [24]:
# finished holds beam options that cannot go further, with duplicates removed
# BeamOption.score counts tokens placed or skipped, which is correct for traversing, but
#   for evaluation we count only most tokens placed and sub-sort by fewest blocks
# Blocks know their length, so we sum the lengths of the finalists and keep only the highest
# NB: There could be more than one
finished.sort(reverse = True, key = lambda f: (sum([largest_blocks[b][0] for b in f.path]), -1 * len(f.path)))
if debug:
    for pos, f in enumerate(finished):
        print(pos, sum([largest_blocks[b][0] for b in f.path]), len(f.path))
        print(f)
# TODO: Adaptive beam width? We can't evaluate the consequences of a suboptimal
# first-pass alignment until we follow through to a full alignment. It could be
# that saving time by sacrificing a small improvement in the first pass won't
# affect the outcome because we'll inevitably and at no addition cost fix it
# later.

0 10880 477
BeamOption(score=80421, path=(12790, 12783, 12776, 12770, 12756, 12728, 12722, 12718, 12708, 12693, 12677, 12649, 12637, 12617, 12604, 12589, 12580, 12558, 12536, 12525, 12501, 12496, 12484, 12458, 12442, 12411, 12396, 12374, 12310, 12292, 12286, 12272, 12215, 12197, 12121, 12099, 12092, 12073, 12008, 11970, 11964, 11957, 11942, 11927, 11908, 11899, 11892, 11862, 11853, 11847, 11837, 11831, 11799, 11787, 11769, 11681, 11663, 11619, 11568, 11560, 11539, 11508, 11486, 11468, 11466, 11457, 11449, 11418, 11384, 11381, 11372, 11368, 11338, 11320, 11294, 11289, 11278, 11257, 11244, 11218, 11114, 11109, 11074, 10989, 10915, 10881, 10847, 10831, 10817, 10809, 10776, 10700, 10556, 10530, 10484, 10460, 10412, 10367, 10024, 9949, 9945, 9932, 9886, 9867, 9863, 9824, 9816, 9803, 9787, 9715, 9613, 9604, 9592, 9585, 9570, 9552, 9519, 9504, 9431, 9419, 9387, 9307, 9302, 9296, 9131, 9129, 9091, 9006, 8979, 8971, 8782, 8778, 8766, 8682, 8641, 8637, 8374, 8300, 8231, 8171, 8135, 8119, 8079, 8

In [25]:
# FIXME 2022-09-20:
# We find unaligned tokens by looking between blocks, as well as leading unaligned tokens.
# TODO:
#  1. We don't find unaligned tokens after the last block.
#  2. We don't test what happens if there are no unaligned tokens before the first block.
table_top = """
    <html>
        <head>
            <style type="text/css">
                table, tr, th, td {border: 1px solid black; border-collapse: collapse;}
                th, td {padding: 3px;}
                td:first-child {text-align: right;}
            </style></head><body><table><tr style="background-color: pink;"><th>Row</th>
    """ + '\n'.join(['<th style="border: 1px black solid; border-collapse: collapse; text-align: center;">w' + str(i) + '</th>' for i in range(len(witnesses))]) + '</tr>'
table_bottom = '</table></body></html>'

block0_start_positions = largest_blocks[finished[0].path[-1]][1]
# if debug:
#     print(block0_start_positions)
#     print(first_absolute_token_by_witness)
if block0_start_positions != first_absolute_token_by_witness:
    leading_unaligned_tokens = ['<td>' + " ".join(token_array[i: j+1]) + '</td>' for i, j in zip(first_absolute_token_by_witness, block0_start_positions)]
    leading_unaligned_row = '<tr style="background-color: lightgray; border: 1px black solid; border-collapse: collapse;"><td style="background-color: pink;">unaligned</td>' + "".join(leading_unaligned_tokens) + '</tr>'

rows = []
# Rows with aligned tokens are the same in all witness by definition
# The path contains largest_blocks keys, which represent the last token of
#   a block in witness 0
# The value of a block is a tuple, the first member of which is the length
# We can retrieve the aligned tokens by slicing them from the token_array
for index, end_token_offset in enumerate(finished[0].path[::-1]): # path is ordered from last to first
    # ###
    # Information for aligned block
    # This is the same for all witnesses, taken from witness 0
    # ###
    block_length = largest_blocks[end_token_offset][0]
    start_token_offset = end_token_offset - block_length
    tokens = token_array[start_token_offset: end_token_offset]
    # ###
    # Information for preceding non-aligned block
    # This is different for each witness
    #
    # Loop over witnesses using range(len(witnesses))
    # Get start token offset for aligned block for current witness
    # Get end token offset for preceding aligned block for current witness
    # Get tokens by slicing token array
    # ###
    if index > 0:
        current_block = largest_blocks[end_token_offset]
        preceding_block = largest_blocks[finished[0].path[::-1][index - 1]]
        unaligned_row = []
        unaligned_row.append('<tr style="background-color: lightgray; border: 1px black solid; border-collapse: collapse;"><td style="background-color: pink;">unaligned</td>')
        for i in range(len(witnesses)):
            unaligned_start_token_offset = preceding_block[1][i] + preceding_block[0]
            unaligned_end_token_offset = current_block[1][i] - 1
            unaligned_tokens = token_array[unaligned_start_token_offset: unaligned_end_token_offset + 1]
            unaligned_row.append('<td style="border: 1px black solid; border-collapse: collapse;">' + " ".join(unaligned_tokens) + '</td>')
        unaligned_row.append('</tr>')
        rows.append("".join(unaligned_row))
    # ###
    # Create aligned block
    # ###
    rows.append('<tr style="background-color: beige; border: 1px black solid; border-collapse: collapse;"><td style="background-color: pink; border: 1px black solid; border-collapse: collapse;">' + str(index) + ' (' + str(end_token_offset) + ')</td><td  style="border: 1px black solid; border-collapse: collapse;" colspan="' + str(len(witnesses)) + '">' + " ".join(tokens) + '</td></tr>')
table = table_top + leading_unaligned_row + "".join(rows) + table_bottom
with open('table-output.html', 'w') as f:
    f.write(table)
HTML(table)

Row,w0,w1,w2,w3,w4,w5
unaligned,when we look to the,when we look to the,when we look to the,causes of variability . when we look to the,causes of variability . when we compare the,causes of variability . when we compare the
0 (30),"the individuals of the same variety or sub - variety of our older cultivated plants and animals , one of the first points which strikes us","the individuals of the same variety or sub - variety of our older cultivated plants and animals , one of the first points which strikes us","the individuals of the same variety or sub - variety of our older cultivated plants and animals , one of the first points which strikes us","the individuals of the same variety or sub - variety of our older cultivated plants and animals , one of the first points which strikes us","the individuals of the same variety or sub - variety of our older cultivated plants and animals , one of the first points which strikes us","the individuals of the same variety or sub - variety of our older cultivated plants and animals , one of the first points which strikes us"
unaligned,",",",",",",",",,
1 (37),"is , that they generally differ","is , that they generally differ","is , that they generally differ","is , that they generally differ","is , that they generally differ","is , that they generally differ"
unaligned,"much more from each other ,",more from each other,more from each other,more from each other,from each other more,more from each other
2 (59),than do the individuals of any one species or variety in a state of nature .,than do the individuals of any one species or variety in a state of nature .,than do the individuals of any one species or variety in a state of nature .,than do the individuals of any one species or variety in a state of nature .,than do the individuals of any one species or variety in a state of nature .,than do the individuals of any one species or variety in a state of nature .
unaligned,when,when,when,when,and if,and if
3 (91),"we reflect on the vast diversity of the plants and animals which have been cultivated , and which have varied during all ages under the most different climates and treatment ,","we reflect on the vast diversity of the plants and animals which have been cultivated , and which have varied during all ages under the most different climates and treatment ,","we reflect on the vast diversity of the plants and animals which have been cultivated , and which have varied during all ages under the most different climates and treatment ,","we reflect on the vast diversity of the plants and animals which have been cultivated , and which have varied during all ages under the most different climates and treatment ,","we reflect on the vast diversity of the plants and animals which have been cultivated , and which have varied during all ages under the most different climates and treatment ,","we reflect on the vast diversity of the plants and animals which have been cultivated , and which have varied during all ages under the most different climates and treatment ,"
unaligned,i think,i think,i think,i think,,
4 (100),we are driven to conclude that this,we are driven to conclude that this,we are driven to conclude that this,we are driven to conclude that this,we are driven to conclude that this,we are driven to conclude that this


In [26]:
# Create hybrid graph
# Aligned blocks are a single node with (except if they are initial or last)
#   separate in- and out-edges for each witness.
# Unaligned blocks are separate nodes for each witness that contained all
#   unaligned cells at that location.
# The aligned blocks are treated as if they were in a variable graph.
# Unaligned blocks can then be processed to look for new alignments (made
#   possible because the first pass removed repetition. Initially we can
#   complete the graph after the initial pass and then traverse it to
#   process the unaligned portions. Eventually we should process them in
#   place, recursively, as we construct the graph.

# Initialize graph

graph = nx.DiGraph()

# Traverse aligned and unaligned intervals as we do with alignment table, above,
#   creating nodes and edges as we proceed.
# TODO: Currently we assume unaligned tokens at the beginning and no unaligned
#   tokens at the end. This is correct for our sample but not reliable for
#   arbitrary input.

In [28]:
graph

<networkx.classes.digraph.DiGraph at 0x119baf6a0>

In [27]:
# 2022-09-10
#
# Next steps:
#
# 1. Do we fix output table (currently it misses initial and final
#    unaligned tokens) or create a variant graph and use existing
#    CollateX code to derive an alignment table from the variant
#    graph?
# 2. Unaligned blocks may contain full-depth non-repeating tokens
#    because the first pass partitioning removed repetition. Should
#    we make our process a callable function and call it, recursively,
#    on the unaligned tokens between aligned blocks until it bottoms
#    out?
# 3. Even if we do #2, above, at some point we have no more full-depth
#    non-repeating alignments in one of our small sub-problems, and
#    our method has nothing to do. Should we call old CollateX to deal
#    with those (even though we give it none of what it likes to find)?
#    Old CollateX can deal with non-full-depth, which our new method
#    ignores.
#    NB: Old CollateX is not order-independent, which matters less with
#    smaller input, but it nonetheless contradicts our order-independent
#    orientation.
#    NB: The issue on non-aligned tokens is more problematic with more
#    witnesses, since more witnesses means less full-depth alignment.
# 4. Our first pass has false positives (see alignment #38 ". i may add
#    , that"). Can we reject these by allowing interconnected (not fully
#    independent) passes?