In [1]:
# We will need to call create_blocks().
# From the blocks we create the dependency structure.
# Easiest is to start from a single witness and go from there.
# By going in token order there is a decent chance that the parent of an adjacent block already exists within the structure.

In [2]:
from typing import List
from linsuffarr import SuffixArray
from linsuffarr import UNIT_BYTE
import pprint
# Shared pretty-printer (2-space indent); referenced by the commented-out pp.pprint() debugging calls further down
pp = pprint.PrettyPrinter(indent=2)

In [3]:
# Witness selection: the commented-out lines hold the six-witness configuration,
# the active lines use only the first four witnesses
# sigla = ['w0', 'w1', 'w2', 'w3', 'w4', 'w5']
sigla = ['w0', 'w1', 'w2', 'w3']
# filenames = ['darwin1859.txt', 'darwin1860.txt', 'darwin1861.txt', 'darwin1866.txt', 'darwin1869.txt', 'darwin1872.txt']
filenames = ['darwin1859.txt', 'darwin1860.txt', 'darwin1861.txt', 'darwin1866.txt']

# Window of non-blank lines to keep (used as a slice: first inclusive, last exclusive)
first_paragraph = 0
last_paragraph = 3
how_many_paragraphs = last_paragraph - first_paragraph

# Map each siglum to the selected lines of its file, joined with single spaces
raw_data_dict = {}
for siglum, filename in zip(sigla, filenames):
    with open(filename) as f:
        lines = [line for line in f if line != '\n'] # skip lines that are just a newline
    raw_data_dict[siglum] = " ".join(lines[first_paragraph : last_paragraph])

In [4]:
def tokenize_witnesses(witness_strings: List[str]): # one string per witness
    '''Split every witness string on whitespace

    Returns a list with one token list per witness, in input order
    '''
    # TODO: handle punctuation, upper- vs lowercase
    return [witness_text.split() for witness_text in witness_strings]

In [5]:
def create_token_array(witness_token_lists): # list of token lists per witness
    '''Flatten the witnesses into three parallel lists

    Returns (token_array, token_membership_array, token_witness_offset_array):
        token_array: all tokens in witness order, with a separator token
            (" #1 ", " #2 ", ...) inserted between consecutive witnesses
        token_membership_array: for a real token, the index of its witness;
            for a separator, the separator string itself
        token_witness_offset_array: for a real token, its offset inside its
            own witness; for a separator, -1
    No separator is added after the last witness
    '''
    all_tokens = []
    memberships = []
    local_offsets = []
    final_witness = len(witness_token_lists) - 1
    for witness_number, tokens in enumerate(witness_token_lists):
        size = len(tokens)
        all_tokens += tokens
        memberships += [witness_number] * size
        local_offsets += range(size)
        if witness_number < final_witness:
            # separators are numbered from 1 so that each one is a unique token
            boundary = f" #{witness_number + 1} "
            all_tokens.append(boundary)
            memberships.append(boundary)
            local_offsets.append(-1)
    return all_tokens, memberships, local_offsets

In [6]:
# Sigla and token lists are parallel: both follow the insertion order of raw_data_dict
witness_sigla = list(raw_data_dict)
witnesses = tokenize_witnesses(list(raw_data_dict.values())) # one token list per witness string
# token_list

In [7]:
# Flatten the per-witness token lists into three parallel arrays:
# the tokens themselves, the witness each token belongs to, and each token's offset within its witness
token_array, token_membership_array, token_witness_offset_array = create_token_array(witnesses)
# print(f"{token_array=}")
# print(f"{token_membership_array=}")
# print(f"{token_witness_offset_array=}")

In [8]:
# Build the suffix array over the flattened token list
# NOTE(review): unit=UNIT_BYTE is passed together with an already-tokenized list; presumably this keeps
# linsuffarr from re-tokenizing the input — confirm against the linsuffarr documentation
suffix_array = SuffixArray(token_array, unit=UNIT_BYTE)
# print(suffix_array)
# LCP=0 means that the block has nothing in common with the next one

In [9]:
# LCP values that create_blocks() walks through
# NOTE(review): _LCP_values is an underscore-prefixed attribute of SuffixArray — confirm that relying on it is safe across linsuffarr versions
lcp_array = suffix_array._LCP_values
# lcp_array

In [10]:
# create Block dataclass
from dataclasses import dataclass, field
@dataclass(unsafe_hash=True)
class Block:
    '''A token sequence that occurs more than once in the token array

    Instances are created with placeholder values (-1, empty containers) for
    the properties that can be computed only after the block has been closed.

    The two container fields are excluded from the generated __hash__ because
    lists and sets are unhashable; with them included, hash(block) always
    raised TypeError. They still take part in equality comparison.
    '''
    token_count: int # length of the repeated sequence, in tokens
    start_position: int # offset into suffix array (not into token array!)
    end_position: int # start and end position give number of occurrences
    all_start_positions: list = field(hash=False) # offsets into token array; compute after blocks have been completed
    witnesses: set = field(hash=False) # witness identifiers in which the pattern occurs; compute after blocks have been completed
    witness_count: int # number of witnesses in which pattern occurs, omitted temporarily because requires further computation
    frequency: int # number of times pattern occurs in whole witness set (may be more than once in a witness), end_position - start_position + 1
    # how_created: int # debug

In [11]:
def create_blocks (_lcp_array, _suffix_array=None, _token_membership_array=None):
    '''Create blocks (repeated token sequences) from an LCP array

    Parameters:
        _lcp_array: LCP values, one per suffix-array row. Entry 0 is skipped
            (it is a transition from a fake start value); entry i > 0 is
            treated as the number of tokens shared by rows i - 1 and i.
        _suffix_array: object whose SA attribute maps a suffix-array row to an
            offset into the token array. Defaults to the notebook-level
            suffix_array, so existing one-argument calls keep working.
        _token_membership_array: maps a token-array offset to its witness.
            Defaults to the notebook-level token_membership_array.

    Returns:
        List of Block objects in the order in which they were closed, with
        end_position, frequency, all_start_positions, witnesses and
        witness_count filled in.

    Algorithm (stack of open blocks, compared against each lcp value; an
    empty stack counts as a zero-length block on top):

        lcp value equals block length at top of stack
            * same pattern (repetition), proceed to next lcp value

        lcp value is longer than block length at top of stack
            * open a new block that starts at the previous suffix-array row

        lcp value is shorter than block length at top of stack
            * pop and close every open block that is longer than the lcp value
            * if the lcp value is > 0 and the block now on top is shorter (or
              the stack is empty), there is a hidden block: open it, starting
              at the start position of the most recently closed block

        after the last lcp value
            * close whatever is still open

    Examples (occurrences are always one more than the number of repetitions):
        5 5 2   --> one block of 5 occurs 3 times, one block of 2 occurs 4 times
        2 5 5 2 --> one block of 2 occurs 5 times, one block of 5 occurs 3 times
        5 5 0 2 --> one block of 5 occurs 3 times, one block of 2 occurs 2 times
    '''
    from collections import deque # deque has faster append and pop than list

    # Fall back to the notebook-level globals when the caller does not supply them
    if _suffix_array is None:
        _suffix_array = suffix_array
    if _token_membership_array is None:
        _token_membership_array = token_membership_array

    def _open_block(_token_count, _start_position):
        '''Return a block whose end_position, frequency and witness data are still placeholders'''
        return Block(token_count = _token_count, start_position = _start_position, end_position = -1, all_start_positions = [], witnesses = (), witness_count = -1, frequency = -1)

    def _close_block(_block, _end_position):
        '''Fill in end_position and frequency (inclusive row count) and return the block'''
        _block.end_position = _end_position
        _block.frequency = _block.end_position - _block.start_position + 1
        return _block

    _blocks = []
    open_block_stack = deque()
    for offset, lcp in enumerate(_lcp_array): # use the argument, not the global lcp_array
        if offset == 0: # skip the first one, which is a transition from a fake start value
            continue
        peek_token_count = open_block_stack[-1].token_count if open_block_stack else 0
        if lcp == peek_token_count:
            continue # same pattern (happens with repetition), so do nothing
        if lcp > peek_token_count:
            # new prefix is longer than previous one, so start new pattern at the previous row;
            # end_position and frequency can be filled in only when a shorter lcp value turns up
            open_block_stack.append(_open_block(lcp, offset - 1))
            continue
        # new prefix is shorter than previous one: close all open blocks that are longer ...
        while open_block_stack and open_block_stack[-1].token_count > lcp:
            _blocks.append(_close_block(open_block_stack.pop(), offset - 1))
        # ... and open the hidden block, if any; it shares its start with the block closed last
        # (at least one block was closed above, so _blocks[-1] exists)
        if lcp > 0 and (not open_block_stack or open_block_stack[-1].token_count < lcp):
            open_block_stack.append(_open_block(lcp, _blocks[-1].start_position))

    while open_block_stack: # close anything left in open_block_stack
        _blocks.append(_close_block(open_block_stack.pop(), len(_lcp_array) - 1))

    # add all_start_positions and then witnesses and witness_count properties to blocks
    for _block in _blocks:
        # start_position through end_position gives the suffix-array rows of all occurrences
        _block.all_start_positions = sorted(_suffix_array.SA[row] for row in range(_block.start_position, _block.end_position + 1))
        # use all start positions to find the witnesses and their count
        _block.witnesses = set(_token_membership_array[token_offset] for token_offset in _block.all_start_positions)
        _block.witness_count = len(_block.witnesses)
    return _blocks

In [12]:
# Build the list of blocks from the LCP array
blocks = create_blocks(lcp_array)
# pp.pprint(blocks) # take a look

In [13]:
# Keep only blocks without repetition: as many occurrences as witnesses,
# i.e. exactly one occurrence in each witness that contains the block
blocks_we_care_about = list(filter(lambda candidate: candidate.witness_count == candidate.frequency, blocks))
# Order by each block's earliest offset in token_array
# (token order in w0 for blocks that occur in w0)
sorted_blocks = list(blocks_we_care_about)
sorted_blocks.sort(key=lambda candidate: candidate.all_start_positions[0])
# pp.pprint(sorted_blocks)

In [14]:
# for index, block in enumerate(sorted_blocks):
#     print('block', index, 'occurs', int(block.frequency / 2), 'time(s) per witness and contains',token_array[block.all_start_positions[0]:block.all_start_positions[0] + block.token_count])

In [15]:
# block.witnesses is a set of witnesses represented in the block
# pp.pprint(sorted_blocks[:5])

In [16]:
# Dependency graph, with lots of simplifying assumptions
#
# Relations are from larger to smaller, and only between neighbors in sorted_blocks
# If there is already a path (chain) from A to B, do not create a direct edge
# An edge from the current block to the next one is recorded only when:
#   * the next block is shorter than the current one
#   * the first start position of the next block is exactly one greater than that of the current block
#   * the witnesses of the next (dependent) block are a subset (possibly equivalent) of those of the current block
# Keys and values of dependencies are positions in sorted_blocks
dependencies = {}
for block_position, (current_block, next_block) in enumerate(zip(sorted_blocks, sorted_blocks[1:])):
    if current_block.token_count <= next_block.token_count:
        continue # next block is not shorter
    if current_block.all_start_positions[0] + 1 != next_block.all_start_positions[0]:
        continue # next block does not start one token later
    if not current_block.witnesses.issuperset(next_block.witnesses):
        continue # next block occurs in a witness that the current block does not
    dependencies[block_position] = block_position + 1
# print(dependencies)

In [17]:
# Do the following in the calling notebook (not here)
#
# We can skip a block iff we have already aligned (not just seen) a block that controls it
#
# Keep track of blocks already processed (bitarray)
# Traverse blocks in priority queue (not the same as the sorted order for identifying dependencies)
# When we process a block:
#     If marked as already processed in bitarray, move on
#     Otherwise mark its dependencies as processed recursively

In [18]:
def priority(block: Block) -> float:
    '''Score a block; priority ranges from 0 to ∞ and higher numbers are better

    score = witness_count ** 6 * token_count / frequency ** 3
        exponents chosen by trial and error to weight the components
    scale: # TODO: how can we set these in a generally meaningful way?
        depth (witness_count) carries the largest exponent
        frequency (repetition) divides the score, with the next largest exponent
        length (token_count) enters linearly
    distance between neighboring values is irrelevant; all that matters is order
    '''
    # earlier formula, kept for reference:
    # score = pow(block.witness_count,4) / (pow(block.frequency,3) * block.token_count)
    depth_weight = block.witness_count ** 6
    repetition_penalty = block.frequency ** 3
    return depth_weight * block.token_count / repetition_penalty

In [19]:
def sort_blocks_by_priority (_blocks: List[Block]) -> List[tuple]:
    '''Return (block, original index) tuples ordered from highest to lowest priority

    The index is the position of the block in the input list
    '''
    indexed_blocks = list(zip(_blocks, range(len(_blocks))))
    indexed_blocks.sort(key=lambda pair: priority(pair[0]), reverse=True)
    return indexed_blocks

In [20]:
# Each entry is (block, position in sorted_blocks); those positions are what the dependencies dict uses as keys and values
prioritized_blocks = sort_blocks_by_priority(sorted_blocks) # sorted_blocks has been sorted for dependencies

In [21]:
# prioritized_blocks[:5]

In [22]:
# len(blocks), len(sorted_blocks), len(prioritized_blocks)

In [23]:
# dependencies.clear()
# print(dependencies)

In [24]:
# Display the sigla of the loaded witnesses
witness_sigla

['w0', 'w1', 'w2', 'w3']