# Traverse LCP, then MFS

* Replace earlier masked-array strategy with regular arrays, using 0 to represent a null.
* Real data (offset of token within witness) is one-based.



In [536]:
from typing import List
from linsuffarr import SuffixArray
from linsuffarr import UNIT_BYTE
import pprint
import numpy as np
from dataclasses import dataclass
from heapq import * # priority heap, https://docs.python.org/3/library/heapq.html
pp = pprint.PrettyPrinter(indent=2)
from bisect import bisect_right
from IPython.core.display import display, HTML
debug = True

  from IPython.core.display import display, HTML


In [537]:
# sigla = ['w0', 'w1', 'w2', 'w3', 'w4', 'w5']
# filenames = ['darwin1859.txt', 'darwin1860.txt', 'darwin1861.txt', 'darwin1866.txt', 'darwin1869.txt', 'darwin1872.txt']
# sigla = ['w0', 'w1', 'w2', 'w3']
# filenames = ['darwin1859.txt', 'darwin1860.txt', 'darwin1861.txt', 'darwin1866.txt']
# sigla = ['w0', 'w1']
# filenames = ['darwin1859.txt', 'darwin1860.txt']
# Witness identifiers and the text file holding each witness, paired by position
sigla = ['w0', 'w1', 'w2', 'w3', 'w4']
filenames = ['abc/abcd.txt', 'abc/abcda.txt', 'abc/abcdb.txt', 'abc/abcdc.txt', 'abc/abcdd.txt']
# Slice bounds applied to the non-blank lines ("paragraphs") of every file
first_paragraph = 0
last_paragraph = 10
how_many_paragraphs = last_paragraph - first_paragraph
# siglum -> selected paragraphs of that witness joined into one string
raw_data_dict = {}
for siglum, filename in zip(sigla, filenames):
    with open(filename) as f:
        # drop lines that consist of a newline only
        lines = [line for line in f if line != '\n']
    raw_data_dict[siglum] = " ".join(lines[first_paragraph : last_paragraph])

In [538]:
def tokenize_witnesses(witness_strings: List[str]): # one string per witness
    '''Split every witness string on whitespace.

    Returns a list with one entry per witness, in input order; each entry is
    the list of whitespace-delimited tokens of that witness.
    '''
    # TODO: handle punctuation, upper- vs lowercase
    return [witness_string.split() for witness_string in witness_strings]

In [539]:
def create_token_array(witness_token_lists): # list of token lists per witness
    '''Flatten the witnesses into three parallel lists.

    Returns a tuple (token_array, token_membership_array,
    token_witness_offset_array) whose entries line up by position:

    * token_array: all tokens of all witnesses, with a unique separator
      string (" #1 ", " #2 ", ...) inserted between consecutive witnesses
    * token_membership_array: index of the witness that owns the token; the
      separator string itself at separator positions
    * token_witness_offset_array: offset of the token inside its witness,
      counted from 0 as produced here; -1 at separator positions

    No separator follows the last witness.
    '''
    token_array = [] # strings
    token_membership_array = [] # witness identifiers, same offsets as in token_array
    token_witness_offset_array = [] # offset of token in witness, same offsets as in token_array
    final_witness_index = len(witness_token_lists) - 1
    for index, witness_token_list in enumerate(witness_token_lists):
        witness_length = len(witness_token_list)
        token_array.extend(witness_token_list)
        token_witness_offset_array.extend(range(witness_length))
        token_membership_array.extend([index] * witness_length)
        if index == final_witness_index:
            continue # nothing to separate after the last witness
        separator = " #" + str(index + 1) + " "
        token_array.append(separator)
        token_membership_array.append(separator)
        token_witness_offset_array.append(-1)
    return token_array, token_membership_array, token_witness_offset_array

In [540]:
# Sigla and tokenized witnesses, both in the insertion order of raw_data_dict
witness_sigla = list(raw_data_dict)
witnesses = tokenize_witnesses(list(raw_data_dict.values())) # strings
# token_list

In [541]:
# Flatten the tokenized witnesses into three parallel lists (tokens, owning
# witness, offset inside the witness) and print each one for inspection
token_array, token_membership_array, token_witness_offset_array = create_token_array(witnesses)
print(f"{token_array=}")
print(f"{token_membership_array=}")
print(f"{token_witness_offset_array=}")

token_array=['Darwin', '1859', 'WHEN', 'we', 'look', 'to', 'the', 'individuals', 'of', 'the', 'same', 'variety', 'or', 'sub-variety', 'of', 'our', 'older', 'cultivated', 'plants', 'and', 'animals,', 'one', 'of', 'the', 'first', 'points', 'which', 'strikes', 'us,', 'is,', 'that', 'they', 'generally', 'differ', 'much', 'more', 'from', 'each', 'other,', 'than', 'do', 'the', 'individuals', 'of', 'any', 'one', 'species', 'or', 'variety', 'in', 'a', 'state', 'of', 'nature.', 'When', 'we', 'reflect', 'on', 'the', 'vast', 'diversity', 'of', 'the', 'plants', 'and', 'animals', 'which', 'have', 'been', 'cultivated,', 'and', 'which', 'have', 'varied', 'during', 'all', 'ages', 'under', 'the', 'most', 'different', 'climates', 'and', 'treatment,', 'I', 'think', 'we', 'are', 'driven', 'to', 'conclude', 'that', 'this', 'greater', 'variability', 'is', 'simply', 'due', 'to', 'our', 'domestic', 'productions', 'having', 'been', 'raised', 'under', 'conditions', 'of', 'life', 'not', 'so', 'uniform', 'as,', '

In [542]:
# Suffix array over the complete token array (all witnesses plus separators)
suffix_array = SuffixArray(token_array, unit=UNIT_BYTE)
# print(suffix_array)
# LCP=0 means that the block has nothing in common with the next one

In [543]:
# LCP values are read from an underscore-prefixed (non-public) attribute of the
# SuffixArray object
lcp_array = suffix_array._LCP_values
# notebook display of the first five LCP values
lcp_array[:5]

array('i', [0, 0, 0, 0, 0])

In [544]:
# create Block dataclass
from dataclasses import dataclass, field
@dataclass(unsafe_hash=True)
class Block:
    """A repeated token pattern found by traversing the LCP array.

    Instances are created with placeholder values and completed later, so
    every field is mutable. Equality compares all fields. The generated hash
    skips the two container fields (a list and a set are not hashable), which
    keeps hash() usable while staying consistent with equality.
    """
    token_count: int # length of the pattern in tokens
    start_position: int # offset into suffix array (not into token array!)
    end_position: int # start and end position give number of occurrences
    all_start_positions: list = field(hash=False) # token-array offsets; compute after blocks have been completed
    witnesses: set = field(hash=False) # identifiers of the witnesses in which the pattern occurs
    witness_count: int # number of witnesses in which pattern occurs, omitted temporarily because requires further computation
    frequency: int # number of times pattern occurs in whole witness set (may be more than once in a witness), end_position - start_position + 1
    # how_created: int # debug

In [545]:
def create_blocks_old (_lcp_array):
    '''Create blocks from an LCP array (older implementation).

    Each returned Block describes one repeated token pattern: its length in
    tokens (token_count) and the inclusive range of suffix-array offsets
    (start_position .. end_position) whose suffixes share that pattern.

    The first LCP value is skipped because it is a transition from a fake
    start value. Every other LCP value is compared with the length of the
    block on top of the stack of open blocks (an empty stack counts as
    length 0):

        LCP value equals the length on top of the stack
            * same pattern (repetition), or empty stack and LCP value 0;
              proceed to the next LCP value

        LCP value is longer than the length on top of the stack
            * a new, longer pattern starts; create a block that starts at the
              preceding offset (offset - 1) and push it

        LCP value is shorter than the length on top of the stack
            * pop and close every open block that is longer than the LCP value
            * if the LCP value is not 0 and the stack is now empty or holds a
              shorter block on top, a hidden (embedded) block exists: create
              and push a block that starts where the most recently closed
              block started

    Blocks still open after the last LCP value are closed at the last offset
    of the LCP array.

    Cases (occurrences are always one more than the number of repetitions;
    the skipped first value is not shown):

        5 5 2     --> 1 block of 5 occurs 3 times, 1 block of 2 occurs 4 times
        2 5 5 2   --> 1 block of 2 occurs 5 times, 1 block of 5 occurs 3 times
        5 5 0 2   --> 1 block of 5 occurs 3 times, 1 block of 2 occurs 2 times

    After traversal the all_start_positions, witnesses and witness_count
    properties are filled in from the module-level suffix_array and
    token_membership_array.

    _lcp_array: sequence of LCP values, one per suffix-array position.
    Returns the list of closed blocks in the order in which they were closed.
    '''
    from collections import deque # deque has faster append and pop than list

    def _close(block, end_position):
        # record where the block ends and how many suffixes it spans
        block.end_position = end_position
        block.frequency = block.end_position - block.start_position + 1
        _blocks.append(block)

    _blocks = []
    open_block_stack = deque()
    # traverse the array passed in by the caller, not a module-level one
    for offset, lcp in enumerate(_lcp_array):
        if offset == 0: # skip the first one, which is a transition from a fake start value
            continue
        peek_token_count = open_block_stack[-1].token_count if open_block_stack else 0
        if lcp == peek_token_count:
            continue # same pattern (happens with repetition), so do nothing
        if lcp > peek_token_count: # new prefix is longer than previous one, so start new pattern
            # end_position and frequency can be filled in only when a shorter value is encountered
            open_block_stack.append(Block(token_count = lcp, start_position = offset - 1, end_position = -1, all_start_positions = [], witnesses = set(), witness_count = -1, frequency = -1))
            continue
        # new prefix is shorter than previous one: close every open block that is longer
        while open_block_stack and open_block_stack[-1].token_count > lcp:
            _close(open_block_stack.pop(), offset - 1)
        if lcp > 0 and (not open_block_stack or open_block_stack[-1].token_count < lcp):
            # hidden block: it starts where the most recently closed block started
            open_block_stack.append(Block(token_count = lcp, start_position = _blocks[-1].start_position, end_position = -1, all_start_positions = [], witnesses = set(), witness_count = -1, frequency = -1))

    while open_block_stack: # pop anything left in open_block_stack
        _close(open_block_stack.pop(), len(_lcp_array) - 1)

    # add all_start_positions and then witness_count properties to blocks
    for _block in _blocks:
        # block start_position through end_position gives offsets of all start positions in suffix_array
        _block.all_start_positions = sorted([suffix_array.SA[x] for x in range(_block.start_position, _block.end_position + 1)])
        # use all start positions to find witness count
        _block.witnesses = set(token_membership_array[offset] for offset in _block.all_start_positions)
        _block.witness_count = len(_block.witnesses)
    return _blocks

In [546]:
@dataclass
class Lcp_interval_candidate:
    """An LCP interval that is open or has just been closed while traversing the LCP array."""
    lcp_start_offset: int # offset of the first entry of the interval; the same offset indexes the suffix array
    lcp_interval_token_count: int # number of tokens in the prefix shared by the interval
    lcp_end_offset: int = -1 # offset of the last entry of the interval; stays -1 until the interval is closed

In [547]:
def expand_prefix(prefix_to_expand:Lcp_interval_candidate):
    """Return the tokens of the prefix shared by an LCP interval.

    The suffix-array entry at the interval's start offset gives the position
    in the module-level token_array where the prefix begins; the interval's
    token count gives the number of tokens to take from there.
    """
    first_token = suffix_array.SA[prefix_to_expand.lcp_start_offset]
    return token_array[first_token: first_token + prefix_to_expand.lcp_interval_token_count]

In [548]:
def check_for_depth_and_repetition(_lcp_interval:Lcp_interval_candidate, _witness_count: int) -> bool:
    """Decide whether an LCP interval occurs exactly once in every witness.

    Depth: the interval must contain exactly _witness_count suffixes.
    Repetition: no two of those suffixes may start in the same witness.

    Witness membership is looked up through the module-level suffix_array
    (LCP offset -> token position) and token_membership_array (token
    position -> witness identifier).

    Returns True to keep the interval, False to discard it.
    """
    instance_count = _lcp_interval.lcp_end_offset - _lcp_interval.lcp_start_offset + 1
    if instance_count != _witness_count:
        return False
    seen_witnesses = set()
    for sa_offset in range(_lcp_interval.lcp_start_offset, _lcp_interval.lcp_end_offset + 1):
        # suffix array position -> token array position -> witness identifier
        witness_siglum = token_membership_array[suffix_array.SA[sa_offset]]
        if witness_siglum in seen_witnesses:
            return False # pattern repeats inside one witness
        seen_witnesses.add(witness_siglum)
    return True

In [549]:
def create_blocks(_lcp_array: list):
    """Collect the LCP intervals that occur exactly once in every witness.

    The LCP array is traversed once while a stack of open interval
    candidates is maintained (shortest at the bottom):

    * stack empty: a 0 value is ignored, any other value opens a candidate
      that starts at the preceding offset
    * value equal to the length on top of the stack: the pattern repeats,
      nothing to do
    * value longer than the top: a longer pattern begins, open a candidate
      that starts at the preceding offset
    * value shorter than the top: close every candidate that is longer than
      the value; if the value is not 0 and no candidate of exactly that
      length is now on top, open a candidate of that length which starts
      where the last closed candidate started (a hidden, shorter block)

    Candidates left open at the end are closed at the last LCP offset. Every
    closed candidate is kept only if check_for_depth_and_repetition accepts
    it for the number of witnesses in the module-level witnesses list.

    Returns a list of [lcp_start_offset, lcp_end_offset, token_count] lists.
    """
    open_candidates = [] # stack of Lcp_interval_candidate, most recently opened last
    frequent_sequences = [] # lcp intervals to be considered for mfs
    for offset, value in enumerate(_lcp_array):
        if not open_candidates:
            # nothing open: 0 means nothing is shared, anything else starts a candidate
            if value != 0:
                open_candidates.append(Lcp_interval_candidate(lcp_start_offset = offset - 1, lcp_interval_token_count = value))
            continue
        top_token_count = open_candidates[-1].lcp_interval_token_count
        if value == top_token_count: # same block as before
            continue
        if value > top_token_count: # longer prefix, so a new interval begins
            open_candidates.append(Lcp_interval_candidate(lcp_start_offset = offset - 1, lcp_interval_token_count = value))
            continue
        # value is shorter than the top of the stack: close everything that is longer
        while open_candidates and open_candidates[-1].lcp_interval_token_count > value:
            closed = open_candidates.pop()
            closed.lcp_end_offset = offset - 1
            if check_for_depth_and_repetition(closed, len(witnesses)):
                frequent_sequences.append([closed.lcp_start_offset, closed.lcp_end_offset, closed.lcp_interval_token_count])
        # a non-zero value without a matching open candidate is a hidden block;
        # it begins where the last closed candidate began
        if value > 0 and (not open_candidates or open_candidates[-1].lcp_interval_token_count != value):
            open_candidates.append(Lcp_interval_candidate(lcp_start_offset = closed.lcp_start_offset, lcp_interval_token_count = value))
    # End of lcp array; close whatever is still open at the last offset
    while open_candidates:
        closed = open_candidates.pop()
        closed.lcp_end_offset = len(_lcp_array) - 1
        if check_for_depth_and_repetition(closed, len(witnesses)):
            frequent_sequences.append([closed.lcp_start_offset, len(_lcp_array) - 1, closed.lcp_interval_token_count])
    return frequent_sequences

In [550]:
# frequent_sequences is a list of lists
# each embedded list is [lcp_start_offset, lcp_end_offset, token_count]
#   the two offsets point into the LCP array, but the same index also points into the suffix array
#   value in LCP array is the prefix length (compared to previous one)
#   value in suffix array points into token array
frequent_sequences = create_blocks(lcp_array)
# print(len(frequent_sequences))
# pp.pprint(frequent_sequences[:5])

In [551]:
# # A sequence like [40, 41, 42, 43] represents the same prefix. Each of these is a position in the LCP array that represents the length of the prefix.
# for i in range(40, 44):
#     print(f"{lcp_array[i]=}")
#     print(f"{token_array[suffix_array.SA[i]]=}")
# # Returns: 193, 30, 78, 0
# # The length of a block is the lowest value higher than 0, so in this case 30.
# # The suffix_array is in suffix_array.SA. Each of the four values is for a specific witness, so choose the first one arbitrarily, so 193.
# # Examine 193rd value in suffix array:
# print(f"{suffix_array.SA[193]=}")
# # This returns 3378. The suffix array value is a pointer into the token array. So:
# print(f"{token_array[3378]=}")
# # The blocks are in alphabetical order.
# # Look at part of token string
# " ".join(token_array[3368:3388])

In [552]:
# Look at results
# NB: This is debug output only
# print(f"{suffix_array._LCP_values=}")
# print(f"{suffix_array.SA=}")
# pp.pprint(witnesses)
#
# print("Values are lcp_start_offset, lcp_end_offset, and lcp_interval_token_count")
# print()
# largest_blocks = {} # key is token end position, value is (length, [witness-start-positions])
# for frequent_sequence in frequent_sequences:
# #     print(f"Before filtering: examining frequent sequence {frequent_sequence}")
#     length = frequent_sequence[2]
#     suffix_array_values = [suffix_array.SA[i] for i in range(frequent_sequence[0], frequent_sequence[1] + 1)]
#     tokens = [token_array[i] for i in range(suffix_array_values[0], suffix_array_values[0] + length)]
#     token_end_position = min(suffix_array_values) + length # token end position for first witness
#     if token_end_position not in largest_blocks: # first block with this end position, so create new key
#         largest_blocks[token_end_position] = (length, suffix_array_values)
#     else: # if new block is longer, replace old one with same key
#         if length > largest_blocks[token_end_position][0]:
#             largest_blocks[token_end_position] = (length, suffix_array_values)
# for b in frequent_sequences:
#     lcp_start_value = b[0]
#     token_start_position = suffix_array.SA[lcp_start_value]
#     token_count = b[2]
#     tokens = token_array[token_start_position: token_start_position + token_count]
#     print(b, tokens)

In [553]:
# print(suffix_array)

In [554]:
# To remove embedded prefixes:
#
# 1. Create dictionary with end position in witness 0 (arbitrarily) as key
# 2. Set value of key to longest sequence with that end position
# 3. Dictionary values will contain only longest frequent sequences, removing embedded ones,
#    as tuples of (length, [token start positions for all witnesses])

@dataclass
class LongestSequence:
    # NOTE(review): not referenced in the visible code; find_longest_sequences
    # stores plain (length, [start positions]) tuples instead -- confirm whether
    # this class is still needed
    length: int # presumably the number of tokens in the sequence -- TODO confirm
    witness_start_and_end: List[int] # presumably token positions per witness -- TODO confirm

def find_longest_sequences(_frequent_sequences, _suffix_array):
    """Keep only the longest frequent sequence for each end position.

    _frequent_sequences: lists of [lcp_start_offset, lcp_end_offset, length]
    _suffix_array: object whose SA attribute maps those offsets to token
        start positions

    A sequence is keyed by the end position of its earliest occurrence
    (smallest start position plus length). When several sequences share a
    key, the longest wins and the first one seen wins ties, which drops
    sequences embedded at the end of a longer one.

    Returns a dict: end position -> (length, sorted token start positions).
    """
    longest_by_end = {} # key is token end position, value is (length, [witness-start-positions])
    for sequence in _frequent_sequences:
        first_offset, last_offset, length = sequence[0], sequence[1], sequence[2]
        start_positions = sorted(_suffix_array.SA[i] for i in range(first_offset, last_offset + 1))
        end_key = min(start_positions) + length # token end position for first witness
        current = longest_by_end.get(end_key)
        # store if this end position is new or the new sequence is strictly longer
        if current is None or length > current[0]:
            longest_by_end[end_key] = (length, start_positions)
    return longest_by_end

# Reduce the frequent sequences to the longest one per end position and print the result
largest_blocks = find_longest_sequences(frequent_sequences, suffix_array)
print(f"{largest_blocks=}")

largest_blocks={154: (25, [129, 364, 599, 837, 1072]), 1: (1, [0, 236, 471, 706, 944]), 204: (11, [193, 428, 663, 901, 1131]), 219: (10, [209, 444, 679, 917, 1146]), 128: (5, [123, 358, 593, 831, 1066]), 4: (2, [2, 238, 473, 711, 949]), 54: (15, [39, 274, 509, 747, 984]), 84: (29, [55, 290, 525, 763, 1001]), 179: (8, [171, 406, 641, 879, 1110]), 28: (22, [6, 242, 477, 715, 952]), 122: (25, [97, 332, 567, 805, 1040]), 235: (15, [220, 455, 690, 928, 1156]), 93: (7, [86, 321, 556, 794, 1030]), 166: (11, [155, 390, 625, 863, 1097]), 191: (11, [180, 415, 650, 888, 1119]), 207: (2, [205, 440, 675, 913, 1143]), 34: (5, [29, 265, 500, 738, 975]), 38: (2, [36, 271, 506, 744, 980]), 36: (1, [35, 270, 505, 743, 983]), 169: (2, [167, 402, 637, 875, 1108]), 193: (2, [191, 426, 661, 899, 1144]), 96: (2, [94, 329, 564, 802, 1038])}


In [555]:
# block_offsets_by_witness: list of lists holds sorted start offsets per witness (offsets are into global token array)
# witness_offsets_to_blocks: dictionary points from start offsets to blocks
# score_by_block: number of tokens placed or skipped if block is placed
# Beam search requires us, given an offset in a witness, to find the next block. We do
#   that by looking up the value in block_offsets_by_witness and then using that value
#   to retrieve the block key from witness_offsets_to_blocks
# Lookup in the list of lists is:
#   block_offsets_by_witness[witness_number][bisect_right(block_offsets_by_witness[witness_number], most_recent_offset_in_witness)]
# (See: https://www.geeksforgeeks.org/python-find-smallest-element-greater-than-k/)
# FIXME: traverse largest_blocks only once and add values for all witnesses in same pass
witness_count = len(witnesses)
block_offsets_by_witness = [] # one sorted list of block start offsets per witness
witness_offsets_to_blocks = {} # block start offset (in any witness) -> key of the block in largest_blocks
first_token_offset_in_block_by_witness = [] # only tokens in blocks
first_absolute_token_by_witness = [] # all tokens, whether in block or not
# NOTE(review): both lists above are filled with the same expression (offset of
#   the witness's first token in the global token array); confirm whether the
#   first one was meant to be the first token that belongs to a block
for i in range(witness_count):
    first_token_offset_in_block_by_witness.append(token_membership_array.index(i))
# Score = number of tokens either placed or skipped (we don't care which)
# Low score is best because it leaves the highest potential
# NB: The name "score" seems to imply that higher is better, and the
#   opposite is the case here. Rename the variable?
# NB: High potential is paramount during beam search, but should the
#   difference between placed and skip matter at a later stage? Or
#   does placing more blocks (more tiers) take care of that?
score_by_block = {}
# Build the per-witness offset lists exactly once: one pass per witness, so
#   block_offsets_by_witness ends up with witness_count lists
for i in range(witness_count):
    witness_offset_list = []
    for key, value in largest_blocks.items():
        witness_offset_list.append(value[1][i])
        witness_offsets_to_blocks[value[1][i]] = key
    witness_offset_list.sort()
    block_offsets_by_witness.append(witness_offset_list)
for i in range(witness_count):
    first_absolute_token_by_witness.append(token_membership_array.index(i))
for key, value in largest_blocks.items():
    # to determine number of tokens that will have been placed or skipped
    #   after placing block:
    #       matrix-subtract first_token_offset_by_witness from value[1]
    #       add witness_count * value[0] (to account for block length)
    #   key by block key, value is score
    differences = [x - y for x, y in zip(value[1], first_token_offset_in_block_by_witness)]
    if debug:
        print(differences)
    score = sum(differences) + witness_count * value[0]
    score_by_block[key] = score
if debug:
    print(f"{block_offsets_by_witness=}")
    # re-key in ascending offset order so the printed dictionary is easier to read
    witness_offsets_to_blocks = { key: witness_offsets_to_blocks[key] for key in sorted(witness_offsets_to_blocks.keys())}
    print(f"{witness_offsets_to_blocks=}")
    print(f"{first_token_offset_in_block_by_witness=}")
    print(f"{first_absolute_token_by_witness=}")
    print(f"{score_by_block=}")

[129, 128, 128, 131, 128]
[0, 0, 0, 0, 0]
[193, 192, 192, 195, 187]
[209, 208, 208, 211, 202]
[123, 122, 122, 125, 122]
[2, 2, 2, 5, 5]
[39, 38, 38, 41, 40]
[55, 54, 54, 57, 57]
[171, 170, 170, 173, 166]
[6, 6, 6, 9, 8]
[97, 96, 96, 99, 96]
[220, 219, 219, 222, 212]
[86, 85, 85, 88, 86]
[155, 154, 154, 157, 153]
[180, 179, 179, 182, 175]
[205, 204, 204, 207, 199]
[29, 29, 29, 32, 31]
[36, 35, 35, 38, 36]
[35, 34, 34, 37, 39]
[167, 166, 166, 169, 164]
[191, 190, 190, 193, 200]
[94, 93, 93, 96, 94]
block_offsets_by_witness=[[0, 2, 6, 29, 35, 36, 39, 55, 86, 94, 97, 123, 129, 155, 167, 171, 180, 191, 193, 205, 209, 220], [236, 238, 242, 265, 270, 271, 274, 290, 321, 329, 332, 358, 364, 390, 402, 406, 415, 426, 428, 440, 444, 455], [471, 473, 477, 500, 505, 506, 509, 525, 556, 564, 567, 593, 599, 625, 637, 641, 650, 661, 663, 675, 679, 690], [706, 711, 715, 738, 743, 744, 747, 763, 794, 802, 805, 831, 837, 863, 875, 879, 888, 899, 901, 913, 917, 928], [944, 949, 952, 975, 980, 983, 984, 10

In [556]:
# To perform beam search
#   Create single start option (at Start node, which is a fiction [there is no Start block]
#       created for the beam search)
#   Loop: for each BeamOption on current tier
#       Evaluate score for advancing in each witness and bringing others into alignment with it
#       For β lowest (!) scores create new BeamOption (this advances to next tier)
#           Score is count of tokens placed or skipped (!)
#           Favor lowest score because that has the greatest potential

In [557]:
@dataclass(order=True, frozen=True, eq=True) # heapqueue is priority queue, so requires comparison
class BeamOption:
    """One candidate path in the beam search.

    order=True compares instances field by field in declaration order, so
    score is compared first and path second. frozen=True together with
    eq=True makes instances hashable, which the search relies on when it
    deduplicates options with set().
    """
    score: int # taken from score_by_block for the block at the head of path; lower is better
    path: tuple # path through sequence of blocks leading to current BeamOption

In [558]:
# Create initial BeamOption
initial = [BeamOption(score=0, path=())] # tier 0, one-item list
def perform_beam_search_step(beam_options=initial, beta=3):
    """Advance every beam option by one block and keep the beta best results.

    For each option and each witness, the next block after the option's most
    recent block (in that witness) is looked up. The extended path is kept
    unless placing the new block would move the witnesses in different
    directions relative to the previous block (a transposition). The very
    first block cannot be transposed, so it is always accepted.

    beam_options: options of the current tier; not modified
    beta: beam width, the maximum number of options returned for the next tier

    Returns (next_options, finished_options):
        next_options: up to beta distinct options, lowest score first
        finished_options: options that had no further block in some witness
            (an option is listed once for every witness in which it ran out)

    Reads the module-level witness_count, largest_blocks,
    block_offsets_by_witness, witness_offsets_to_blocks and score_by_block.
    """
    new_options = set() # candidates for next tier; a set, so duplicates collapse
    finished_options = []
    for beam_option in beam_options:
        # path is empty only for initial state at tier 0; most recent block comes first
        previous_block = beam_option.path[0] if beam_option.path else None
        for i in range(witness_count): # advance for each witness in turn
            if previous_block is None:
                last_offset = -1 # NB: same for all witnesses, and not 0, which will break for witness 0
            else:
                last_offset = largest_blocks[previous_block][1][i]
            witness_offsets = block_offsets_by_witness[i]
            next_position = bisect_right(witness_offsets, last_offset)
            if next_position >= len(witness_offsets): # we've gone as far as we can with this path
                finished_options.append(beam_option)
                continue
            next_block = witness_offsets_to_blocks[witness_offsets[next_position]]
            if previous_block is not None:
                # would any witness pointer move backwards?
                # subtract pairwise; if signs differ, there are items that move in opposite directions
                directions = {np.sign(x - y) for x, y in zip(largest_blocks[next_block][1], largest_blocks[previous_block][1])}
                if len(directions) != 1:
                    continue
            new_score = score_by_block[next_block] # accounts for all witnesses
            # concatenate tuples with a +; most recent first
            new_options.add(BeamOption(score=new_score, path=((next_block,) + beam_option.path)))
    # A heap only guarantees that its first element is the smallest, so slicing
    #   a heapified list does not give the lowest scores; sort fully instead
    #   (low score is best) and honour the requested beam width
    return sorted(new_options)[:beta], finished_options

In [559]:
# First step expands the start option; the finished list from this step is discarded
options, _ = perform_beam_search_step()
finished = [] # options that cannot go further
tier = 0
# Keep stepping until the beam is empty instead of for a fixed number of tiers:
#   a fixed count cuts the search short on long inputs and spins on an empty beam for short ones
while options:
    options, end_of_life = perform_beam_search_step(options)
    finished.extend(end_of_life) # add any options that cannot go further
    print(options, finished, tier)
    tier += 1
finished = list(set(finished)) # the same option may have been reported more than once
# TODO:
#   1. Verify the scores
# Options that cannot be extended are collected in finished (not thrown away)
# When options is empty, all options have overrun the end of the blocks, and best results will be in finished

[BeamOption(score=26, path=(4, 1))] [] 0
[BeamOption(score=145, path=(28, 4, 1))] [] 1
[BeamOption(score=175, path=(34, 28, 4, 1))] [] 2
[BeamOption(score=184, path=(36, 34, 28, 4, 1)), BeamOption(score=190, path=(38, 34, 28, 4, 1))] [] 3
[BeamOption(score=271, path=(54, 36, 34, 28, 4, 1)), BeamOption(score=271, path=(54, 38, 34, 28, 4, 1))] [] 4
[BeamOption(score=422, path=(84, 54, 36, 34, 28, 4, 1)), BeamOption(score=422, path=(84, 54, 38, 34, 28, 4, 1))] [] 5
[BeamOption(score=465, path=(93, 84, 54, 36, 34, 28, 4, 1)), BeamOption(score=465, path=(93, 84, 54, 38, 34, 28, 4, 1))] [] 6
[BeamOption(score=480, path=(96, 93, 84, 54, 36, 34, 28, 4, 1)), BeamOption(score=480, path=(96, 93, 84, 54, 38, 34, 28, 4, 1))] [] 7
[BeamOption(score=609, path=(122, 96, 93, 84, 54, 36, 34, 28, 4, 1)), BeamOption(score=609, path=(122, 96, 93, 84, 54, 38, 34, 28, 4, 1))] [] 8
[BeamOption(score=639, path=(128, 122, 96, 93, 84, 54, 36, 34, 28, 4, 1)), BeamOption(score=639, path=(128, 122, 96, 93, 84, 54, 

In [560]:
# finished holds the deduplicated beam options that cannot be extended any further
# BeamOption.score counts tokens placed or skipped, which is what we want while traversing,
#   but to pick a winner we rank by most tokens placed, breaking ties by fewest blocks
# Each block knows its own length, so tokens placed is the sum of block lengths along the path
# NB: More than one option may share the best value
def _tokens_placed(option):
    '''Return total length of all blocks on the path of a beam option'''
    return sum(largest_blocks[block][0] for block in option.path)

# Ascending sort on negated token count is the same stable ordering as a descending
#   sort on (token count, -block count)
finished.sort(key=lambda option: (-_tokens_placed(option), len(option.path)))
for option in finished: #diagnostic
    print(len(option.path), _tokens_placed(option))
print(finished[0])

20 209
19 199
19 198
BeamOption(score=1167, path=(235, 219, 207, 204, 191, 179, 169, 166, 154, 128, 122, 96, 93, 84, 54, 36, 34, 28, 4, 1))


In [590]:
from html import escape # token text is raw witness text; &, <, and > must not be read as markup

table_top = """
    <html>
        <head>
            <style type="text/css">
                table, tr, th, td {border: 1px solid black; border-collapse: collapse;}
                th, td {padding: 3px;}
                td:first-child {text-align: right;}
            </style></head><body><table><tr style="background-color: pink;"><th>Row</th>
    """ + '\n'.join(['<th style="border: 1px black solid; border-collapse: collapse; text-align: center;">w' + str(i) + '</th>' for i in range(len(witnesses))]) + '</tr>'
table_bottom = '</table></body></html>'
rows = []
# Rows with aligned tokens are the same in all witnesses by definition
# The path contains largest_blocks keys; a key is used below as the (exclusive)
#   end offset of the block in the token array
# The value of a block is a tuple: member 0 is the block length, and member 1
#   is indexed by witness to get the offset where the block starts in that witness
# We can retrieve the aligned tokens by slicing them from the token_array
best_path = finished[0].path[::-1] # path is ordered from last to first, so reverse it (once)
for index, end_token_offset in enumerate(best_path):
    # ###
    # Information for aligned block
    # This is the same for all witnesses, so slice it once using the path key
    # ###
    current_block = largest_blocks[end_token_offset]
    block_length = current_block[0]
    start_token_offset = end_token_offset - block_length
    tokens = token_array[start_token_offset: end_token_offset]
    # ###
    # Information for preceding non-aligned block
    # This is different for each witness
    #
    # For each witness, the unaligned tokens run from the end of the preceding
    #   aligned block up to (but not including) the start of the current one
    # ###
    if index > 0:
        preceding_block = largest_blocks[best_path[index - 1]]
        unaligned_row = []
        unaligned_row.append('<tr style="background-color: lightgray; border: 1px black solid; border-collapse: collapse;"><td style="background-color: pink;">unaligned</td>')
        for i in range(len(witnesses)):
            unaligned_start_token_offset = preceding_block[1][i] + preceding_block[0]
            unaligned_end_token_offset = current_block[1][i] # exclusive
            unaligned_tokens = token_array[unaligned_start_token_offset: unaligned_end_token_offset]
            unaligned_row.append('<td style="border: 1px black solid; border-collapse: collapse;">' + escape(" ".join(unaligned_tokens)) + '</td>')
        unaligned_row.append('</tr>')
        rows.append("".join(unaligned_row))
    # ###
    # Create aligned block
    # ###
    rows.append('<tr style="background-color: beige; border: 1px black solid; border-collapse: collapse;"><td style="background-color: pink; border: 1px black solid; border-collapse: collapse;">' + str(index) + '</td><td  style="border: 1px black solid; border-collapse: collapse;" colspan="' + str(len(witnesses)) + '">' + escape(" ".join(tokens)) + '</td></tr>')
display(HTML(table_top + "".join(rows) + table_bottom))
# print("".join(rows))

Row,w0,w1,w2,w3,w4
0,Darwin,Darwin,Darwin,Darwin,Darwin
unaligned,1859,1860,1861,1866 Causes of Variability.,1869 Causes of Variability.
1,WHEN we,WHEN we,WHEN we,WHEN we,WHEN we
unaligned,look to,look to,look to,look to,compare
2,"the individuals of the same variety or sub-variety of our older cultivated plants and animals, one of the first points which strikes","the individuals of the same variety or sub-variety of our older cultivated plants and animals, one of the first points which strikes","the individuals of the same variety or sub-variety of our older cultivated plants and animals, one of the first points which strikes","the individuals of the same variety or sub-variety of our older cultivated plants and animals, one of the first points which strikes","the individuals of the same variety or sub-variety of our older cultivated plants and animals, one of the first points which strikes"
unaligned,"us,","us,","us,","us,",us
3,"is, that they generally differ","is, that they generally differ","is, that they generally differ","is, that they generally differ","is, that they generally differ"
unaligned,much,,,,from each other
4,more,more,more,more,more
unaligned,"from each other,",from each other,from each other,from each other,


In [562]:
# from IPython.core.display import HTML
# table_top = """
#     <html>
#         <head>
#             <style type="text/css">
#                 table, tr, th, td {border: 1px solid black; border-collapse: collapse;}
#                 th, td {padding: 3px;}
#                 td:first-child {text-align: right;}
#                 .aligned { background-color: beige;}
#                 .nonaligned { background-color: lightgray;}
#             </style></head><body><table><tr><th>Row</th>
#     """ + '\n'.join(['<th>w' + str(i) + '</th>' for i in range(len(witnesses))]) + '</tr>'
# table_bottom = '</table></body></html>'
# table_contents = ''
# rows = []
# sorted_keys = sorted(largest_blocks.keys())
# #
# # Check for leading nonaligned tokens
# #
# first_block_offsets = [token_witness_offset_array[i] for i in sorted(largest_blocks[sorted_keys[0]][1])]
# leading_nonaligned_block = []
# leading_nonaligned_witness_count = 0
# leading_tokens = ''
# for witness_number in range(len(first_block_offsets)):
#     if first_block_offsets[witness_number] == 0:
#         content = '[None]'
#     else:
#         content = " ".join(witnesses[witness_number][0:first_block_offsets[witness_number]])
#         leading_nonaligned_witness_count += 1
#     leading_nonaligned_block.append('<td>' + content + '</td>')
# if leading_nonaligned_witness_count:
#     leading_tokens = '<tr class="nonaligned"><td>Nonaligned</td>' + "".join(leading_nonaligned_block) + '</tr>'
# #
# # Creates aligned rows (preceded by any non-aligned tokens)
# #
# row_number = -1
# for key_position, sorted_key in enumerate(sorted_keys):
#     block_data = largest_blocks[sorted_key]
#     block_length = block_data[0]
#     token_start_positions = sorted(block_data[1])
#     #
#     # Check for preceding non-aligned row
#     #
#     preceding_nonaligned_block = ''
#     if key_position != 0:
#         end_positions_of_previous_block = []
#         for start_position in largest_blocks[sorted_keys[key_position - 1]][1]:
#             end_positions_of_previous_block.append(start_position + largest_blocks[sorted_keys[key_position - 1]][0])
#         end_positions_of_previous_block.sort()
#         content = ['<tr class="nonaligned"><td>Nonaligned</td>']
#         for nonaligned_segment_group in zip(end_positions_of_previous_block, token_start_positions):
#             content.append('<td>' + " ".join(token_array[nonaligned_segment_group[0]:nonaligned_segment_group[1]]) + '</td>')
#         content.append('</td>')
#         preceding_nonaligned_block = "".join(content)
#     #
#     # Create aligned row
#     #
#     row = []
#     row_number += 1
#     row_start = '<tr class="aligned"><td>' + str(row_number) + '</td>'
#     row_end = '</tr>'
#     row.append(row_start)
#     contents = token_array[token_start_positions[0]:token_start_positions[0] + block_length]
#     row.append('<td colspan="' + str(len(token_start_positions)) + '">' + ' '.join(contents) + '</td>')
#     row.append(row_end)
#     rows.append(preceding_nonaligned_block)
#     rows.append(''.join(row))
# table_contents = '\n'.join(rows)
# #
# # Check for trailing nonaligned tokens, create row if needed
# #
# last_aligned_block = largest_blocks[sorted_keys[-1]]
# last_aligned_block_length = last_aligned_block[0]
# last_aligned_block_end_positions = [start_position + last_aligned_block_length - 1 for start_position in sorted(last_aligned_block[1])]
# witness_lengths = [len(witness) for witness in witnesses]
# last_aligned_token_pos = [token_witness_offset_array[i] for i in last_aligned_block_end_positions]
# trailing_unaligned_token_counts = [witness_lengths[i] - last_aligned_token_pos[i] - 1 for i in range(len(witnesses))]
#
# trailing_nonaligned_block = []
# trailing_nonaligned_witness_count = 0
# trailing_tokens = ''
# for witness_number, token_count in enumerate(trailing_unaligned_token_counts):
#     if token_count == 0:
#         content = "[None]"
#     else:
#         content = " ".join(witnesses[witness_number][-token_count])
#         trailing_nonaligned_witness_count += 1
#     trailing_nonaligned_block.append('<td>' + content + '</td>')
# if trailing_nonaligned_witness_count:
#     trailing_tokens = '<tr class="nonaligned"><td>Nonaligned</td>' + "".join(trailing_nonaligned_block) + '</tr>'
# #
# # Create and render table
# #
# # print(witnesses)
# HTML(table_top + leading_tokens + table_contents + trailing_tokens + table_bottom)
#
# # 2022-06-14
# #
# # Where we are today
# #
# # We are not checking for transpositions (using the beam search);
# #   instead we assume no transpositions (correct for our current test data)
# # We number and output aligned blocks correctly
# # We output nonaligned blocks correctly
# #
# # TODO
# # Reimplement beam search to check for transpositions

# RESUME HERE

1. We are not yet looking at a skip option, and we should.
1. We have duplicate states and we shouldn't. If moving more than one witness takes us to the same state, we should keep only one (arbitrarily).
1. We are not yet keeping track of our paths, so we can't reconstruct the best search result from start to finish

Ad 2: With our red and black cat example, moving from the initial "the" to red and to black produces the same score but different states, so keep both. With our current non-diverging Darwin example, all successors have not only the same score, but also the same state, so we should simplify (in this case, our beam would contain only one item).

In [563]:
# Dependency graph, with lots of simplifying assumptions
#
# Relations are from larger to smaller
# Each value in LCP array corresponds to position in suffix array,
#     which corresponds to position in token array,
#     which corresponds to position in witness arrays
# If there is already a path (chain) from A to B, do not create a direct edge
# Requires that the length of the next block be shorter than the current one
#   and that the start position in the next block be one less than that in the current block
# The witnesses for a dependent block must be a subset (possibly equivalent) of the source of the dependency
# dependencies = {}
# for block_position in range(len(sorted_blocks) - 1):
#     current_block = sorted_blocks[block_position]
#     next_block = sorted_blocks[block_position + 1]
#     if current_block.token_count > next_block.token_count \
#     and current_block.all_start_positions[0] == next_block.all_start_positions[0] - 1 \
#     and current_block.witnesses.issuperset(next_block.witnesses):
#         dependencies[block_position] = block_position + 1
# # print(dependencies)

# What to do next (in order)

- [x] Create dataclass for beam option \(initially score, witness, state\)
- [x] Remove witness identifier from beam option object; keep only score and state
- [x] Save state as tuple instead of list
- [x] Save history as tuple of tuples \(stack\), where most recent value is the current state
- [x] Deduplicate options where score plus most recent state are the same. Earlier history may be different, but subsequent history can't be different if the most recent state is the same. We keep **a** \(potential\) best path, but not **all** potential best paths of the same score.
- [x] Create visualization (vertical alignment table)
- [ ] Process skip-one options for each witness
- [ ] Deduplicate skip-one options before processing
- [ ] Make beam size variable, depending on nature and extent of options (currently keep β best results, choosing arbitrarily in case of ties; perhaps keep β best scores, which may have more than β options)



In [564]:
# Display the raw LCP values held in an underscore-prefixed attribute of suffix_array, for inspection
suffix_array._LCP_values

array('i', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 96, 96, 40, 0, 5, 0, 1, 1, 1, 1, 0, 151, 95, 9, 1, 24, 105, 105, 49, 0, 2, 83, 83, 27, 0, 14, 95, 95, 39, 0, 39, 39, 39, 8, 0, 24, 24, 24, 8, 0, 27, 108, 52, 1, 0, 3, 0, 2, 232, 176, 32, 0, 181, 125, 39, 0, 4, 185, 129, 43, 1, 33, 33, 33, 2, 0, 159, 103, 17, 8, 0, 160, 104, 18, 9, 0, 25, 106, 106, 50, 0, 60, 60, 4, 16, 0, 171, 115, 29, 20, 1, 9, 215, 159, 15, 1, 9, 122, 66, 15, 1, 57, 57, 1, 13, 1, 153, 97, 11, 2, 1, 165, 109, 23, 14, 0, 9, 9, 9, 9, 1, 170, 114, 28, 19, 0, 8, 214, 158, 14, 0, 62, 62, 6, 18, 1, 10, 191, 135, 49, 0, 61, 61, 5, 17, 0, 9, 148, 92, 6, 1, 8, 8, 8, 8, 0, 19, 19, 19, 3, 0, 10, 123, 67, 16, 0, 6, 75, 75, 19, 1, 9, 90, 90, 34, 1, 28, 28, 28, 0, 167, 111, 25, 16, 1, 31, 112, 56, 5, 1, 19, 132, 76, 25, 0, 50, 50, 50, 6, 0, 31, 31, 31, 0, 8, 77, 77, 21, 0, 16, 97, 97, 41, 0, 6, 6, 6, 6, 0, 38, 38, 38, 7, 0, 63, 63, 7, 19, 0, 30, 30, 30, 2, 0, 11, 80, 80, 24, 0, 154, 98, 12, 3, 0, 0, 6, 145, 89, 3, 0, 16, 129, 73, 22,

In [565]:
# Sample rows of interest (suffix array value, start of text, LCP value):
# 1072	'also,Ithin'	LCP=0
# 129	'also,Ithin'	LCP=25
# 364	'also,Ithin'	LCP=106
# 599	'also,Ithin'	LCP=106
# 837	'also,Ithin'	LCP=50

# Legend, followed by one blank line
print("Values are:\n  offset into suffix array,\n  value in suffix array (pointer to first token in token_array),\n  value in LCP array (length of prefix),\n  tokens (if any)", end="\n\n")
for offset, sa_item in enumerate(suffix_array.SA):
    lcp_value = suffix_array._LCP_values[offset]
    tokens = token_array[sa_item: sa_item + lcp_value]
    if 87 <= offset < 93: # report only suffix array positions 87 through 92
        # each report line is followed by a blank line
        print(offset, sa_item, lcp_value, " ".join(tokens), end="\n\n")

# 2022-06-17 RESUME HERE
# For each step in this interval check accumulator and frequent sequences

Values are:
  offset into suffix array,
  value in suffix array (pointer to first token in token_array),
  value in LCP array (length of prefix),
  tokens (if any)

87 1072 0 

88 129 25 also, I think, some probability in the view propounded by Andrew Knight, that this variability may be partly connected with excess of food. It seems

89 364 106 also, I think, some probability in the view propounded by Andrew Knight, that this variability may be partly connected with excess of food. It seems pretty clear that organic beings must be exposed during several generations to the new conditions of life to cause any appreciable amount of variation; and that when the organisation has once begun to vary, it generally continues to vary for many generations. No case is on record of a variable being ceasing to be variable under cultivation. Our oldest cultivated plants, such as wheat, still often yield new varieties: our oldest domesticated animals are still capable of rapid improvement or modifica

In [566]:
# def priority(block: Block) -> float:
#     '''Priority ranges from 0 to ∞

#     depth (number of witnesses) / (frequency * length)
#         modified (by trial and error) to weight the components
#     scale: # TODO: how can we set these in a generally meaningful way?
#         high depth (more witnesses) is most important
#         low frequency (less repetition) is next most important
#         high length (token count) is least important
#     higher numbers are better
#         distance between neighboring values is irrelevant; all that matters is order
#     '''
#     # score = pow(block.witness_count,4) / (pow(block.frequency,3) * block.token_count)
#     score = pow(block.witness_count,6)  * block.token_count / pow(block.frequency,3)
#     return score

In [567]:
# def sort_blocks_by_priority (_blocks: List[Block]) -> List[tuple]:
#     blocks_to_tuples = [(_block, index) for index, _block in enumerate(_blocks)]
#     return sorted(blocks_to_tuples, key=lambda x: priority(x[0]), reverse=True)

In [568]:
# prioritized_blocks = sort_blocks_by_priority(sorted_blocks) # sorted_blocks has been sorted for dependencies

In [569]:
# witness_sigla

In [570]:
# prioritized_blocks[:5]