# Create blocks

To be imported into other notebooks

In [1]:
# Configure to show multiple values for development and debugging
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from typing import List
from linsuffarr import SuffixArray
from linsuffarr import UNIT_BYTE
import pprint
pp = pprint.PrettyPrinter(indent=2)

In [2]:
# dir(linsuffarr)

## Data

In [3]:
# two witnesses, with repetition and transposition

# Original example, single leaf node
# raw_data_dict = {
#     'w0' : '''the red and the black cat''',
#     'w1' : '''the black and the red cat'''
# }

In [4]:
# Different lengths
# w0 = '''the red and the black and blue cat'''
# w1 = '''the black and the red cat'''

In [5]:
# Three witnesses
# w0 = '''the red and the black cat'''
# w1 = '''the black and the red cat'''
# w2 = '''the black and red and the blue and green cat'''

In [6]:
# Three witnesses with transposition
# w0 = '''the red and the black cat'''
# w1 = '''the black and the red cat'''
# w2 = '''the black and red and the blue and green cat'''

In [7]:
# Adjacent transposition
# w1 = '''the red striped cat'''
# w2 = '''the striped red cat'''

In [8]:
# Two leaf nodes
# w1 = '''cat red black'''
# w2 = '''cat black red'''

In [9]:
# Branches meet in the middle at koala and then split again, with two leaf nodes
# w1 = """cat red black koala brown gray"""
# w2 = """cat black red koala gray brown"""

In [10]:
# Two split and rejoin
# w1 = '''the gray koala'''
# w2 = '''the brown koala'''

In [11]:
# medium example
# w0 = '''WHEN we look to the individuals of the same variety or sub-variety of
# our older cultivated plants and animals, one of the first points which strikes us, is,
# that they generally differ much more from each other, than do the individuals of any one
# species or variety in a state of nature.'''
# w1 = '''WHEN we look to the individuals of the same variety or sub-variety of
# our older cultivated plants and animals, one of the first points which strikes us, is,
# that they generally differ more from each other than do the individuals of any one
# species or variety in a state of nature.'''
# w2 = '''WHEN we look to the individuals of the same variety or sub-variety of
# our older cultivated plants and animals, one of the first points which strikes us, is,
# that they generally differ more from each other than do the individuals of any one
# species or variety in a state of nature.'''

In [12]:
# Larger example with three witnesses
# w0 = '''WHEN we look to the individuals of the same variety or sub-variety of
# our older cultivated plants and animals, one of the first points which strikes us, is,
# that they generally differ much more from each other, than do the individuals of any one
# species or variety in a state of nature. When we reflect on the vast diversity of the
# plants and animals which have been cultivated, and which have varied during all ages
# under the most different climates and treatment, I think we are driven to conclude that
# this greater variability is simply due to our domestic productions having been raised
# under conditions of life not so uniform as, and somewhat different from, those to which
# the parent-species have been exposed under nature. There is, also, I think, some
# probability in the view propounded by Andrew Knight, that this variability may be partly
# connected with excess of food. It seems pretty clear that organic beings must be exposed
# during several generations to the new conditions of life to cause any appreciable amount
# of variation; and that when the organisation has once begun to vary, it generally
# continues to vary for many generations. No case is on record of a variable being ceasing
# to be variable under cultivation. Our oldest cultivated plants, such as wheat, still
# often yield new varieties: our oldest domesticated animals are still capable of rapid
# improvement or modification.'''
# w1 = '''WHEN we look to the individuals of the same variety or sub-variety of
# our older cultivated plants and animals, one of the first points which strikes us, is,
# that they generally differ more from each other than do the individuals of any one
# species or variety in a state of nature. When we reflect on the vast diversity of the
# plants and animals which have been cultivated, and which have varied during all ages
# under the most different climates and treatment, I think we are driven to conclude that
# this great variability is simply due to our domestic productions having been raised
# under conditions of life not so uniform as, and somewhat different from, those to which
# the parent-species have been exposed under nature. There is also, I think, some
# probability in the view propounded by Andrew Knight, that this variability may be partly
# connected with excess of food. It seems pretty clear that organic beings must be exposed
# during several generations to the new conditions of life to cause any appreciable amount
# of variation; and that when the organisation has once begun to vary, it generally
# continues to vary for many generations. No case is on record of a variable being ceasing
# to be variable under cultivation. Our oldest cultivated plants, such as wheat, still
# often yield new varieties: our oldest domesticated animals are still capable of rapid
# improvement or modification.'''
# w2 = '''WHEN we look to the individuals of the same variety or sub-variety of
# our older cultivated plants and animals, one of the first points which strikes us, is,
# that they generally differ more from each other than do the individuals of any one
# species or variety in a state of nature. When we reflect on the vast diversity of the
# plants and animals which have been cultivated, and which have varied during all ages
# under the most different climates and treatment, I think we are driven to conclude that
# this great variability is simply due to our domestic productions having been raised
# under conditions of life not so uniform as, and somewhat different from, those to which
# the parent-species have been exposed under nature. There is also, I think, some
# probability in the view propounded by Andrew Knight, that this variability may be partly
# connected with excess of food. It seems pretty clear that organic beings must be exposed
# during several generations to the new conditions of life to cause any appreciable amount
# of variation; and that when the organisation has once begun to vary, it generally
# continues to vary for many generations. No case is on record of a variable being ceasing
# to be variable under cultivation. Our oldest cultivated plants, such as wheat, still
# often yield new varieties: our oldest domesticated animals are still capable of rapid
# improvement or modification.'''

In [13]:
# # Larger example with six witnesses
# raw_data_dict = {'w0' : '''WHEN we look to the individuals of the same variety or sub-variety of
# our older cultivated plants and animals, one of the first points which strikes us, is,
# that they generally differ much more from each other, than do the individuals of any one
# species or variety in a state of nature. When we reflect on the vast diversity of the
# plants and animals which have been cultivated, and which have varied during all ages
# under the most different climates and treatment, I think we are driven to conclude that
# this greater variability is simply due to our domestic productions having been raised
# under conditions of life not so uniform as, and somewhat different from, those to which
# the parent-species have been exposed under nature. There is, also, I think, some
# probability in the view propounded by Andrew Knight, that this variability may be partly
# connected with excess of food. It seems pretty clear that organic beings must be exposed
# during several generations to the new conditions of life to cause any appreciable amount
# of variation; and that when the organisation has once begun to vary, it generally
# continues to vary for many generations. No case is on record of a variable being ceasing
# to be variable under cultivation. Our oldest cultivated plants, such as wheat, still
# often yield new varieties: our oldest domesticated animals are still capable of rapid
# improvement or modification.''',
# 'w1' : '''WHEN we look to the individuals of the same variety or sub-variety of
# our older cultivated plants and animals, one of the first points which strikes us, is,
# that they generally differ more from each other than do the individuals of any one
# species or variety in a state of nature. When we reflect on the vast diversity of the
# plants and animals which have been cultivated, and which have varied during all ages
# under the most different climates and treatment, I think we are driven to conclude that
# this great variability is simply due to our domestic productions having been raised
# under conditions of life not so uniform as, and somewhat different from, those to which
# the parent-species have been exposed under nature. There is also, I think, some
# probability in the view propounded by Andrew Knight, that this variability may be partly
# connected with excess of food. It seems pretty clear that organic beings must be exposed
# during several generations to the new conditions of life to cause any appreciable amount
# of variation; and that when the organisation has once begun to vary, it generally
# continues to vary for many generations. No case is on record of a variable being ceasing
# to be variable under cultivation. Our oldest cultivated plants, such as wheat, still
# often yield new varieties: our oldest domesticated animals are still capable of rapid
# improvement or modification.''',
# 'w2' : '''WHEN we look to the individuals of the same variety or sub-variety of
# our older cultivated plants and animals, one of the first points which strikes us, is,
# that they generally differ more from each other than do the individuals of any one
# species or variety in a state of nature. When we reflect on the vast diversity of the
# plants and animals which have been cultivated, and which have varied during all ages
# under the most different climates and treatment, I think we are driven to conclude that
# this great variability is simply due to our domestic productions having been raised
# under conditions of life not so uniform as, and somewhat different from, those to which
# the parent-species have been exposed under nature. There is also, I think, some
# probability in the view propounded by Andrew Knight, that this variability may be partly
# connected with excess of food. It seems pretty clear that organic beings must be exposed
# during several generations to the new conditions of life to cause any appreciable amount
# of variation; and that when the organisation has once begun to vary, it generally
# continues to vary for many generations. No case is on record of a variable being ceasing
# to be variable under cultivation. Our oldest cultivated plants, such as wheat, still
# often yield new varieties: our oldest domesticated animals are still capable of rapid
# improvement or modification.''',
# 'w3' : '''Causes of Variability. WHEN we look to the individuals of the same
# variety or sub-variety of our older cultivated plants and animals, one of the first
# points which strikes us, is, that they generally differ more from each other than do the
# individuals of any one species or variety in a state of nature. When we reflect on the
# vast diversity of the plants and animals which have been cultivated, and which have
# varied during all ages under the most different climates and treatment, I think we are
# driven to conclude that this great variability is simply due to our domestic productions
# having been raised under conditions of life not so uniform as, and somewhat different
# from, those to which the parent-species have been exposed under nature. There is also, I
# think, some probability in the view propounded by Andrew Knight, that this variability
# may be partly connected with excess of food. It seems pretty clear that organic beings
# must be exposed during several generations to the new conditions of life to cause any
# appreciable amount of variation; and that, when the organisation has once begun to vary,
# it generally continues to vary for many generations. No case is on record of a variable
# being ceasing to be variable under cultivation. Our oldest cultivated plants, such as
# wheat, still often yield new varieties: our oldest domesticated animals are still
# capable of rapid improvement or modification.''',
# 'w4' : '''Causes of Variability. WHEN we compare the individuals of the same variety or
# sub-variety of our older cultivated plants and animals, one of the first points which
# strikes us is, that they generally differ from each other more than do the individuals
# of any one species or variety in a state of nature. And if we reflect on the vast
# diversity of the plants and animals which have been cultivated, and which have varied
# during all ages under the most different climates and treatment, we are driven to
# conclude that this great variability is due to our domestic productions having been
# raised under conditions of life not so uniform as, and somewhat different from, those
# to which the parent-species had been exposed under nature. There is also, I think,
# some probability in the view propounded by Andrew Knight, that this variability may
# be partly connected with excess of food. It seems clear that organic beings must be
# exposed during several generations to new conditions to cause any appreciable amount
# of variation; and that, when the organisation has once begun to vary, it generally
# continues varying for many generations. No case is on record of a variable organism
# ceasing to vary under cultivation. Our oldest cultivated plants, such as wheat, still
# yield new varieties: our oldest domesticated animals are still capable of rapid
# improvement or modification.''',
# 'w5' : '''Causes of Variability. WHEN we compare the individuals of the same variety or
# sub-variety of our older cultivated plants and animals, one of the first points which
# strikes us is, that they generally differ more from each other than do the individuals
# of any one species or variety in a state of nature. And if we reflect on the vast
# diversity of the plants and animals which have been cultivated, and which have varied
# during all ages under the most different climates and treatment, we are driven to conclude
# that this great variability is due to our domestic productions having been raised under
# conditions of life not so uniform as, and somewhat different from, those to which the
# parent-species had been exposed under nature. There is, also, some probability in the
# view propounded by Andrew Knight, that this variability may be partly connected with
# excess of food. It seems clear that organic beings must be exposed during several
# generations to new conditions to cause any great amount of variation; and that, when
# the organisation has once begun to vary, it generally continues varying for many
# generations. No case is on record of a variable organism ceasing to vary under cultivation.
# Our oldest cultivated plants, such as wheat, still yield new varieties: our oldest
# domesticated animals are still capable of rapid improvement or modification.'''}

In [14]:
# For discussion
# raw_data_dict = {
#     'w0': '''The red and the black cat''',
#     'w1': '''The black and the red cat''',
#     'w2': '''The big black and small green cat'''
#     }

In [15]:
# Load first chapter of six editions of the Origin of species from disk
# Each paragraph is a line, with blank lines between (which we filter out)
# NOTE: only three of the six editions are currently loaded; the full six-witness
# configuration is kept in the two commented-out lines immediately below
# sigla = ['w0', 'w1', 'w2', 'w3', 'w4', 'w5']
# filenames = ['darwin1859.txt', 'darwin1860.txt', 'darwin1861.txt', 'darwin1866.txt', 'darwin1869.txt', 'darwin1872.txt', ]
sigla = ['w0', 'w3', 'w4']
filenames = ['darwin1859.txt', 'darwin1866.txt', 'darwin1869.txt']
# Half-open paragraph range [first_paragraph, last_paragraph) taken from every file
first_paragraph = 0
last_paragraph = 2
how_many_paragraphs = last_paragraph - first_paragraph
# Maps siglum -> selected paragraphs of that witness joined into one string
raw_data_dict = {}
for siglum, filename in zip(sigla, filenames):
    # NOTE(review): open() is called without an encoding, so the platform default
    # is used; filenames are resolved relative to the current working directory
    with open(filename) as f:
        lines = f.readlines()
        # Drops only lines that are exactly '\n'; the kept lines retain their own line ending
        lines = [line for line in lines if line != '\n']
        raw_data_dict[siglum] = " ".join(lines[first_paragraph : last_paragraph])
# pp.pprint(raw_data_dict)
print(f"{how_many_paragraphs} paragraphs from {len(sigla)} witnesses")

2 paragraphs from 3 witnesses


## Work plan

1. Create token array (Python **list**)
1. Create suffix array
1. Create LCP (**longest common prefix**) array
1. Calculate LCP intervals
1. Create patterns



## Construct list of ngrams shared by witnesses

Find ngrams and positions in witnesses

### Tokenize witnesses

In [16]:
def tokenize_witnesses(witness_strings: List[str]): # one string per witness
    '''Split every witness string on whitespace

    Returns a list with one token list per witness, in the same order as the input.
    '''
    # TODO: handle punctuation, upper- vs lowercase
    return [witness_string.split() for witness_string in witness_strings]

In [17]:
# Sigla and token lists are parallel: both follow the insertion order of raw_data_dict
witness_sigla = list(raw_data_dict)
witnesses = tokenize_witnesses(list(raw_data_dict.values())) # one token list per witness string
# witnesses # take a look

In [18]:
def create_token_array(witness_token_lists): # list of token lists per witness
    '''Flatten all witnesses into a single token list, with a unique separator between witnesses

    Returns three parallel lists of equal length:
        token_array: tokens of all witnesses in order; a separator (" #1 ", " #2 ", ...)
            follows every witness except the last
        token_membership_array: index of the witness that owns each position
            (the separator string itself at separator positions)
        token_witness_offset_array: offset of each token within its own witness
            (-1 at separator positions)
    '''
    tokens = []
    memberships = []
    offsets_in_witness = []
    witness_total = len(witness_token_lists)
    for witness_number, witness_tokens in enumerate(witness_token_lists):
        size = len(witness_tokens)
        tokens += witness_tokens
        memberships += [witness_number] * size
        offsets_in_witness += range(size)
        if witness_number + 1 < witness_total: # no separator after the last witness
            marker = f" #{witness_number + 1} "
            tokens.append(marker)
            memberships.append(marker)
            offsets_in_witness.append(-1)
    return tokens, memberships, offsets_in_witness

In [19]:
# Three parallel lists: the tokens themselves, the witness each token belongs to,
# and each token's offset inside its own witness (separators are marked specially)
token_array, token_membership_array, token_witness_offset_array = create_token_array(witnesses)
# print(token_array) # take a look
# print(token_membership_array)
# print(token_witness_offset_array)
# list(zip(token_array, token_membership_array))

In [20]:
# def create_suffix_array(_token_array):
#     '''Add docstring'''
#     _suffixes = []
#     for index, tokens in enumerate(_token_array):
#         suffix = _token_array[index:]
#         _suffixes.append((suffix, index))
#     _suffixes.sort() # sort in place
#     return [x[1] for x in _suffixes]

In [21]:
# Build the suffix array over the token list with linsuffarr (replaces the
# hand-written create_suffix_array() that is commented out above)
# NOTE(review): UNIT_BYTE is passed although token_array is a list of word tokens —
# presumably so that linsuffarr does not re-tokenize the input; confirm against its docs
suffix_array = SuffixArray(token_array, unit=UNIT_BYTE)
# suffix_array = create_suffix_array(token_array)
# print(suffix_array)

In [22]:
# def create_lcp_array(_suffix_array, _token_array):
#     '''compute LCP array
#     which is a sequence of integers representing the number of tokens shared by consecutive alphabetically sorted suffixes
#     sequential pairs of values in suffix array, which are two offsets in the sorted suffixes
#     '''
#     _lcp_array = [0]
#     for i in range(0, len(_suffix_array) - 1): # for each pair of suffixes, retrieve list of tokens starting at that position
#         pair = _suffix_array[i:i+2] # for each pair of suffixes
#         suffix_1 = _token_array[pair[0]:] # tokens starting at first position
#         suffix_2 = _token_array[pair[1]:] # tokens starting at second position
#         # print(suffix_1, suffix_2) # diagnostic: verify that they're paired correctly
#         # pair the tokens up by position, return (number of matches, first non-match)
#         _lcp_value = next(filter(lambda t: t[1][0] != t[1][1], enumerate(zip(suffix_1, suffix_2))), min(len(suffix_1), len(suffix_2)))
#         _lcp_array.append(_lcp_value[0] if type(_lcp_value) == tuple else _lcp_value) # most are tuples, but some are just an integer
#     return _lcp_array

In [23]:
# Reuse the LCP values that linsuffarr stored on the suffix array object
# NOTE(review): _LCP_values is an underscore-prefixed (private) attribute, so it
# may change between linsuffarr releases — confirm when upgrading the library
lcp_array = suffix_array._LCP_values
# lcp_array = create_lcp_array(suffix_array, token_array)
print(lcp_array[:20]) # take a look

array('i', [0, 0, 0, 0, 0, 48, 0, 0, 15, 40, 0, 0, 55, 0, 5, 0, 0, 0, 82, 0])


In [24]:
# uncomment to verify the accuracy of the lcp array
# for offset in suffix_array:
#     print(token_array[offset: offset + 5])

In [25]:
# Use LCP array to calculate patterns (efficiently)
#   1. 0 means that whatever follows will have nothing in common with it
#   2. Repetition of same number means same pattern
#   3. Consecutive non-zero values identify how much of the pattern they have in common,
#      Counts are always +1, so there must be two instances of "the red", two of "the black", etc.

In [26]:
# create Block dataclass
from dataclasses import dataclass, field
@dataclass(unsafe_hash=True)
class Block:
    '''A token sequence that occurs more than once in the token array

    Instances are created incomplete by create_blocks() (placeholder values
    -1 / empty) and filled in as the LCP array is traversed.

    The list and set fields take part in equality but are excluded from the
    generated hash: both are unhashable, so including them made every call to
    hash() raise TypeError. Instances remain mutable, so their hash changes
    whenever one of the hashed integer fields is reassigned.
    '''
    token_count: int # length of the repeated sequence, in tokens
    start_position: int # offset into suffix array (not into token array!)
    end_position: int # start and end position give number of occurrences
    all_start_positions: List[int] = field(hash=False) # offsets into token array; compute after blocks have been completed
    witnesses: set = field(hash=False) # witness identifiers in which pattern occurs
    witness_count: int # number of witnesses in which pattern occurs, omitted temporarily because requires further computation
    frequency: int # number of times pattern occurs in whole witness set (may be more than once in a witness), end_position - start_position + 1
    # how_created: int # debug

In [27]:
def create_blocks(_lcp_array, _suffix_array=None, _token_membership_array=None):
    '''Create blocks (repeated token sequences) from an LCP array

    Parameters:
        _lcp_array: LCP values, one per suffix array position; the first value is
            a fake transition from a start value and is skipped
        _suffix_array: object whose SA attribute maps a suffix array position to a
            token array offset; defaults to the notebook-level suffix_array
        _token_membership_array: witness identifier for every token array offset;
            defaults to the notebook-level token_membership_array

    Returns:
        List of Block objects in the order in which they were closed, each with
        end_position, frequency, all_start_positions, witnesses and witness_count
        filled in.

    Each lcp value is compared to the length of the block on top of the stack
    (an empty stack counts as a block of length 0):

        lcp value equals block length at top of stack
            * same pattern (or empty stack and lcp value == 0): proceed to next lcp value

        lcp value is longer than block length at top of stack
            * create and push a new block that starts at the preceding suffix array position

        lcp value is shorter than block length at top of stack
            * pop every open block that is longer than the lcp value and append it to the result
            * if the block now on top of the stack has the same length as the lcp value, proceed
            * otherwise (and only if lcp value > 0) there is a hidden block: create and push a
              new block that starts at the start position of the most recently closed block

    Blocks are created in two places because they have different start positions.
    Anything left on the stack after the last lcp value is closed at the last position.

    Cases (occurrences are always one more than number of repetitions):
        5 5 2     --> 1 block of 5 occurs 3 times, 1 block of 2 occurs 4 times
        2 5 5 2   --> 1 block of 2 occurs 5 times, 1 block of 5 occurs 3 times
        5 5 0 2   --> 1 block of 5 occurs 3 times, 1 block of 2 occurs 2 times
        2 5 5 2 3 --> 1 block of 5 occurs 3 times, 1 block of 3 occurs 2 times, 1 block of 2 occurs 6 times
    '''
    from collections import deque # deque has faster append and pop than list

    # Use the notebook-level objects only when the caller did not supply their own
    if _suffix_array is None:
        _suffix_array = suffix_array
    if _token_membership_array is None:
        _token_membership_array = token_membership_array

    _blocks = []
    open_block_stack = deque()

    def _open_block(token_count, start_position):
        # end_position, frequency and the witness data are placeholders until the block is closed
        open_block_stack.append(Block(token_count = token_count, start_position = start_position, end_position = -1, all_start_positions = [], witnesses = set(), witness_count = -1, frequency = -1))

    def _close_top_block(end_position):
        # pop the block on top of the stack, record where it ends and how often it occurs
        block_being_modified = open_block_stack.pop()
        block_being_modified.end_position = end_position
        block_being_modified.frequency = end_position - block_being_modified.start_position + 1
        _blocks.append(block_being_modified)

    for offset, lcp in enumerate(_lcp_array):
        if offset == 0: # skip the first one, which is a transition from a fake start value
            continue
        # three situations: next one is same value, higher than last, or lower than last
        peek_token_count = open_block_stack[-1].token_count if open_block_stack else 0
        if lcp == peek_token_count:
            continue # same pattern (happens with repetition), so do nothing
        if lcp > peek_token_count: # new prefix is longer than previous one, so start new pattern
            # the pattern is shared with the preceding suffix, hence offset - 1
            _open_block(lcp, offset - 1)
            continue
        # new prefix is shorter than previous one: close every open block that is longer ...
        while open_block_stack and open_block_stack[-1].token_count > lcp:
            _close_top_block(offset - 1)
        # ... and open a hidden block if nothing of exactly this length is already open
        # (at least one block was closed above, so _blocks[-1] is the most recently closed one)
        if lcp > 0 and (not open_block_stack or open_block_stack[-1].token_count < lcp):
            _open_block(lcp, _blocks[-1].start_position)

    while open_block_stack: # close anything still open after the last lcp value
        _close_top_block(len(_lcp_array) - 1)

    # add all_start_positions and then witness_count properties to blocks
    for _block in _blocks:
        # start_position through end_position gives offsets of all start positions in suffix array
        _block.all_start_positions = sorted(_suffix_array.SA[x] for x in range(_block.start_position, _block.end_position + 1))
        # use all start positions to find witness count
        _block.witnesses = set(_token_membership_array[offset] for offset in _block.all_start_positions)
        _block.witness_count = len(_block.witnesses)
    return _blocks

In [28]:
# All repeated token sequences found through the LCP array, in the order in which they were closed
blocks = create_blocks(lcp_array)
# blocks # take a look

In [29]:
# diagnostic: are the blocks complete and correct
# print('Token array:', token_array)
# for block in blocks:
#     print(token_array[suffix_array[block.start_position]:suffix_array[block.start_position] + block.token_count])

---


## What to do

NOTE: This is a simplified greedy approach; will later be integrated into decision tree / graph

### Create data structure to store results

Eventually this will be a variant graph, but it's easier to build otherwise and construct the graph later. Interim
structure is a list of occurrences, which can be compared to a topological sort of a variant graph, and can be used
to construct a full variant graph later.

### Assign priority to blocks

1. Favor those with high witness count, high token count, low frequency (and no transpositions, which we don't know yet); sort blocks by descending priority
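1. Priority was originally defined as number of witnesses (depth) divided by (frequency * length) (er … we no longer remember why we selected this formula, but it seemed like a Good Idea); the `priority()` function below instead computes depth⁶ × length ÷ frequency³, so that longer blocks score higher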
1. Sort blocks by priority from higher to lower, break ties arbitrarily from beginning of alphabet (blocks are already sorted alphabetically)

NOTE: We do not take transpositions into consideration here, although ultimately they matter, because in the graph
approach we'll consider more than one possibility.

### Select highest-priority remaining block

1. Take all occurrences of current block

---


In [30]:
def priority(block: Block) -> float:
    '''Score a block for greedy selection; higher numbers are better

    Computed as witness_count ** 6 * token_count / frequency ** 3, i.e.
    depth * length / frequency with exponents chosen by trial and error to
    weight the components.
    scale: # TODO: how can we set these in a generally meaningful way?
        high depth (more witnesses) is most important
        low frequency (less repetition) is next most important
        high length (token count) is least important
    Distance between neighboring values is irrelevant; all that matters is order.
    '''
    # score = pow(block.witness_count,4) / (pow(block.frequency,3) * block.token_count)
    depth_weight = block.witness_count ** 6
    repetition_penalty = block.frequency ** 3
    return depth_weight * block.token_count / repetition_penalty

In [31]:
from typing import List
def sort_blocks_by_priority (_blocks: List[Block]) -> List[Block]:
    '''Return a new list of the blocks ordered from highest to lowest priority() score

    The input list is left untouched; blocks with equal scores keep their input order.
    '''
    ranked = list(_blocks)
    ranked.sort(key=priority, reverse=True)
    return ranked

In [32]:
# Blocks ordered from highest to lowest priority; positions in this list serve
# as block ids in the token-to-block mapping built further down
prioritized_blocks = sort_blocks_by_priority(blocks)
# take a look
# [(token_array[suffix_array[_block.start_position]:suffix_array[_block.start_position] + _block.token_count], priority(_block)) for _block in prioritized_blocks]

In [33]:
# Sanity check: number of blocks found versus number of LCP values, then the five highest-priority blocks
print(f"{len(prioritized_blocks)=}")
print(f"{len(lcp_array)=}")
prioritized_blocks[:5]

len(prioritized_blocks)=1063
len(lcp_array)=1916


[Block(token_count=174, start_position=1848, end_position=1849, all_start_positions=[178, 864], witnesses={0, 1}, witness_count=2, frequency=2),
 Block(token_count=173, start_position=1540, end_position=1541, all_start_positions=[179, 865], witnesses={0, 1}, witness_count=2, frequency=2),
 Block(token_count=172, start_position=1178, end_position=1179, all_start_positions=[180, 866], witnesses={0, 1}, witness_count=2, frequency=2),
 Block(token_count=171, start_position=722, end_position=723, all_start_positions=[181, 867], witnesses={0, 1}, witness_count=2, frequency=2),
 Block(token_count=170, start_position=1135, end_position=1136, all_start_positions=[182, 868], witnesses={0, 1}, witness_count=2, frequency=2)]

In [34]:
# For comparison: the first five blocks in their original (unprioritized) order
blocks[:5]

[Block(token_count=48, start_position=4, end_position=5, all_start_positions=[632, 1321], witnesses={0, 1}, witness_count=2, frequency=2),
 Block(token_count=40, start_position=8, end_position=9, all_start_positions=[137, 823], witnesses={0, 1}, witness_count=2, frequency=2),
 Block(token_count=15, start_position=7, end_position=9, all_start_positions=[137, 823, 1507], witnesses={0, 1, 2}, witness_count=3, frequency=3),
 Block(token_count=55, start_position=11, end_position=12, all_start_positions=[297, 983], witnesses={0, 1}, witness_count=2, frequency=2),
 Block(token_count=5, start_position=13, end_position=14, all_start_positions=[684, 1371], witnesses={1, 2}, witness_count=2, frequency=2)]

In [35]:
# maintain token-to-block multivalued dictionary (defaultdict, imported from collections below)
from collections import defaultdict

def add_token_to_block_mapping(token_to_block_dict: defaultdict, token_offset: int, block: int) -> defaultdict:
    '''Record that the token at token_offset is covered by the given block

    token_to_block_dict: mapping from token array offset to a list of block ids; updated in place
    token_offset: offset of the token in the token array
    block: id of the block, i.e. its index in the list passed to create_token_to_block_dict()

    Returns the same (mutated) dictionary so that calls can be chained.
    '''
    token_to_block_dict[token_offset].append(block)
    return token_to_block_dict

def create_token_to_block_dict(blocks: List["Block"]) -> defaultdict:
    '''Map every token array offset to the ids of all blocks that contain that token

    blocks: list of blocks; each block's id is its index in this list

    Returns a defaultdict(list) keyed by token array offset. Tokens that occur in
    no block have no key until they are looked up.
    '''
    token_to_block_dict = defaultdict(list) # mapping from token offset to list of ids of blocks that contain the token
    # iterate over the argument, not over a notebook-level variable, so that the
    # ids always refer to the list the caller passed in
    for index, block in enumerate(blocks):
        for block_start_position in block.all_start_positions:
            # every occurrence covers token_count consecutive tokens
            for block_token_offset in range(block.token_count):
                add_token_to_block_mapping(token_to_block_dict, block_start_position + block_token_offset, index)
    return token_to_block_dict

# Block ids stored in this mapping are indexes into prioritized_blocks
token_to_block_dict = create_token_to_block_dict(prioritized_blocks)

In [36]:
# diagnostic: dump the token array, the prioritized blocks, and for every token the blocks that contain it
import pprint
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(list(enumerate(token_array))) # enumerate() is lazy; materialize it so that the (offset, token) pairs are printed
pp.pprint(prioritized_blocks)
for token_position, block_ids in sorted(token_to_block_dict.items()):
    print("Token:", token_array[token_position] , "(", token_position , "in witness", token_membership_array[token_position] , ")")
    for block_id in block_ids:
        # block ids are indexes into prioritized_blocks (the list the mapping was built from), not into blocks
        print(block_id, ":", prioritized_blocks[block_id])
    print("====================")

