# linked list

We need insert, search and remove
Floor and ceiling can be done in the search function by returning a tuple of (result or None, value lower than key, value higher than key)



In [1]:
%load_ext autoreload
%autoreload 2

import pprint, typing
pp = pprint.PrettyPrinter(indent=2)

In [2]:
%run create-blocks.ipynb # data will be in prioritized_blocks
# pp.pprint(prioritized_blocks) # take a look
# pp.pprint(token_to_block_dict) # and another look
# print(token_array, len(token_array)) # and one more look

# Linked list of variant graph nodes kept in a partial order

Keys are variant graph nodes.
The floor and ceiling should contain not only the siglum of the token we want to insert but also of another witness.



In [3]:
# for index,node in enumerate(dumpNodes(vg)):
#     pp.pprint((index,node))

In [4]:
from functools import total_ordering

# Find all witnesses that are present on both items (ignore those on only one item)
#     If none, raise an error
#     If some, compare in arbitrary but consistent order
#         If first pair differs, return result
#         If first pair matches, check next pair

class ComparisonError(Exception):
    """Error carrying a human-readable message about a failed node comparison.

    The only use visible in this notebook is a commented-out ``raise`` for
    nodes that share no sigla.

    Parameters:
        message: text returned by ``str()`` on the exception
    """
    def __init__(self, message):
        # Forward to Exception so that ``args`` (and therefore pickling and
        # default traceback formatting) carry the message too.
        super().__init__(message)
        self.message = message
    def __str__(self):
        return self.message

# Sentinel offset stored for a siglum that was declared on a node (through the
# ``sigla`` argument) but has no real token position there.
magic_number = -2

@total_ordering
class VG_node():
    """Variant graph node, used as key in the linked list

    We don't care about the value in the list; the node contains all information.

    Every siglum is stored as an instance attribute whose value is a token
    position, or ``magic_number`` when the siglum was only declared.
    ``sigla()`` lists only the sigla that were given real values (keyword
    arguments at construction, or later item assignment).

    Note that ``block_id``, ``token_offset`` and ``_sigla`` live in the same
    instance dictionary, so ``values()`` and ``in`` see them as well.

    Because ``__eq__`` is defined without ``__hash__``, instances are unhashable.
    """
    def __init__(self, block_id=None, token_offset=None, sigla:list=None, **kwargs):
        """Create a node.

        Parameters:
            block_id: stored as-is on the node
            token_offset: stored as-is on the node
            sigla: sigla to pre-populate with ``magic_number`` (optional)
            **kwargs: siglum=token_position pairs holding the real values
        """
        self.__dict__.update({siglum: magic_number for siglum in (sigla or [])}) # default all to magic number
        self.__dict__.update(kwargs) # overwrite the real value
        self._sigla = list(kwargs) # only sigla with explicitly supplied values
        self.block_id = block_id
        self.token_offset = token_offset
    def __repr__(self):
        return "|".join(f"{siglum}:{self[siglum]}" for siglum in self.sigla())
    def __setitem__(self, key, value):
        # Register the siglum only once; re-assigning an existing siglum must
        # not duplicate it in sigla() (and therefore in __repr__).
        if key not in self._sigla:
            self._sigla.append(key)
        self.__dict__[key] = value
    def __getitem__(self, key):
        return self.__dict__[key]
    def __contains__(self, key):
        return key in self.__dict__
    def values(self):
        """Return all instance dictionary values (sigla and bookkeeping fields alike)."""
        return self.__dict__.values()
    def sigla(self):
        """Return the list of sigla that were given explicit values, in insertion order."""
        return self._sigla
    def sigla_non_magic_number(self):
        """Return the sigla from sigla() whose value is not ``magic_number``."""
        return [siglum for siglum in self.sigla() if self[siglum] != magic_number]

    # #####
    # TODO: remove find_shared_sigla() and remove dependency on it from comparisons below
    # #####
    def find_shared_sigla(self, other):
        """Return shared sigla for use in comparison

        The result is the sorted list of sigla that carry a non-magic value on
        both nodes; it is empty when the nodes have nothing in common.
        """
        own_sigla = set(self.sigla_non_magic_number())
        other_sigla = set(other.sigla_non_magic_number())
        return sorted(own_sigla & other_sigla)
    def __eq__(self, other):
        """Nodes are equal when they share at least one non-magic siglum, all
        shared values match, and the remaining values (in sigla() order) match too.

        Nodes without any shared siglum are never equal.
        """
        if not isinstance(other, VG_node):
            return NotImplemented # let Python fall back instead of raising AttributeError
        shared_sigla = self.find_shared_sigla(other) # there may or may not be shared non-magic sigla
        if not shared_sigla:
            return False
        if [self[siglum] for siglum in shared_sigla] != [other[siglum] for siglum in shared_sigla]:
            return False
        # now compare non-shared sigla in stable order
        own_rest = [self[siglum] for siglum in self.sigla() if siglum not in shared_sigla]
        other_rest = [other[siglum] for siglum in other.sigla() if siglum not in shared_sigla]
        return own_rest == other_rest
    def __lt__(self, other):
        """Compare on the shared non-magic sigla only (lexicographically, in sorted siglum order).

        NOTE(review): when there are no shared sigla this returns True in both
        directions ("was False" in an earlier version), so the ordering is only
        meaningful between nodes that share a witness.
        """
        if not isinstance(other, VG_node):
            return NotImplemented
        shared_sigla = self.find_shared_sigla(other)
        if not shared_sigla: # nothing to compare on
            return True # was False
        return [self[siglum] for siglum in shared_sigla] < [other[siglum] for siglum in shared_sigla]

In [5]:
from dataclasses import dataclass

@dataclass
class LinkedListNode: # access only indirectly, through LinkedList
    """One cell of the singly linked list: a key plus a link to the following cell."""
    next_node: 'LinkedListNode' # following cell, or None at the end of the list
    key: object # VG_node objects

    def is_tail(self):
        """Return True when no cell follows this one."""
        return self.next_node is None

class LinkedList:
    """Singly linked list whose keys (VG_node objects) are kept in a partial order.

    Keys are only comparable when they share a witness siglum, so searches
    take a filter that decides which list nodes take part in the comparison;
    nodes that fail the filter are stepped over.
    """

    # Sigla whose offsets the debug check in insert() verifies to be strictly
    # increasing along the list.
    ORDER_CHECK_SIGLA = ('w0', 'w1', 'w2', 'w3')

    def __init__(self):
        self.head = None

    def nodes(self): # internal nodes, just for debugging (use items instead)
        """Yield the internal list nodes from head to tail."""
        current_node = self.head
        while current_node is not None:
            # read the link before handing the node out
            following_node = current_node.next_node
            yield current_node
            current_node = following_node

    def items(self): # generator that returns keys of nodes
        return (node.key for node in self.nodes())

    def __len__(self): # call with len() function
        return sum(1 for _ in self.nodes())

    def insert(self, key):
        """Insert key directly after the last comparable node that is smaller than key.

        "Comparable" means the node's key shares at least one siglum with key.
        If no such node exists, key becomes the new head.

        Raises:
            Exception: if the node following the insertion point already holds
                an equal key, or if the debug order check finds a witness
                whose offsets no longer increase along the list.
        """
        if self.head is None: # list is empty; there is no head
            self.head = LinkedListNode(None, key)
            return

        filter_ = lambda node: node.key.find_shared_sigla(key)
        result_node, _ = self.search_with_filter(key, filter_)

        if result_node is None: # Only when new item should be prepended to current head
            self.head = LinkedListNode(self.head, key)
        elif result_node.next_node is not None and result_node.next_node.key == key:
            raise Exception("DuplicateKeyException. key " + str(key) + " is already in the list")
        else: # either no next node (end of list), or next node is higher
            result_node.next_node = LinkedListNode(result_node.next_node, key)

        # debug: check the order of the witnesses
        self._check_witness_order(key)

    def _check_witness_order(self, key):
        """Raise if, for any siglum in ORDER_CHECK_SIGLA, offsets fail to increase strictly along the list."""
        # -2 is below every offset that counts: nodes report only non-magic values
        last_offset = {siglum: -2 for siglum in self.ORDER_CHECK_SIGLA}
        for vg_node in self.items():
            non_magic_sigla = vg_node.sigla_non_magic_number()
            for siglum in self.ORDER_CHECK_SIGLA:
                if siglum not in non_magic_sigla:
                    continue
                if vg_node[siglum] <= last_offset[siglum]:
                    raise Exception("We want to place: "+str(key)+", " + siglum + " is out of order, list is: " + str(list(self.items())))
                last_offset[siglum] = vg_node[siglum]

    def remove(self, key):
        """Remove node with key as its key value

        Only nodes whose key shares a siglum with key are considered.

        Raises:
            Exception: if the list is empty, or if no comparable node holds an
                equal key.
        """
        if self.head is None:
            raise Exception("EmptyListException. key " + str(key) + " cannot be removed from an empty list.")

        filter_ = lambda node: node.key.find_shared_sigla(key)
        # floor: last comparable node below key (may be None)
        # candidate: first comparable node not below key (may be None)
        floor, candidate = self.search_with_filter(key, filter_)

        if candidate is None or candidate.key != key:
            # we're trying to remove something that doesn't exist, so raise exception
            raise Exception("Key " + str(key) + " is not in the list, so cannot remove!")

        if candidate is self.head: # no node points to candidate, so replace head
            self.head = candidate.next_node
            return

        # Nodes that failed the filter may sit between floor and candidate, so
        # walk to the node that actually links to candidate before unlinking.
        predecessor = floor if floor is not None else self.head
        while predecessor.next_node is not candidate:
            predecessor = predecessor.next_node
        predecessor.next_node = candidate.next_node

    def search(self, key):
        """Return the last node, scanning from the head, whose key is less than key.

        The scan stops at the first node whose key is not less than key.
        Returns None if the list is empty or the first node is not less than key."""
        floor, _ = self.search_with_filter(key, lambda node: True)
        return floor

    def find_floor_and_ceiling(self, key: "VG_node") -> "tuple[LinkedListNode | None, LinkedListNode | None]":
        """Find floor and ceiling for node to place
        Parameter:
            key: VG_node (only one non-magic value, the token position, for witness of interest)
            (Filter is not passed because it can be computed from the key)

        Return:
            (floor, ceiling) as list nodes (use .key to reach the VG_node); either may be None
            floor is the last candidate node, scanning from the head, whose key is
                less than key
            ceiling is the first candidate node whose key is not less than key

        Candidate nodes (floor and ceiling are picked from these) must have:
            1. A non-magic siglum shared with key
            2. More than one siglum in total

        TODO: Test cell for this method"""
        def has_witness_and_company(node):
            return node.key.find_shared_sigla(key) and len(node.key.sigla()) > 1

        return self.search_with_filter(key, has_witness_and_company)

    def search_with_filter(self, key, filter_):
        """Scan from the head, looking only at nodes for which filter_(node) is truthy.

        Returns:
            (floor, ceiling): floor is the last accepted node whose key is less
            than key (None if there is none); ceiling is the first accepted
            node whose key is not less than key (None if the scan reached the
            end of the list). Both are None for an empty list.
        """
        floor = None
        for candidate in self.nodes():
            # if the filter needs to know a domain specific thing it needs to ask for the key.
            if not filter_(candidate):
                continue
            if candidate.key < key:
                floor = candidate # keep going
            else:
                return floor, candidate
        return floor, None

    def items_floor_to_ceiling(self, floor: "LinkedListNode", ceiling: "LinkedListNode"):
        """Yield the key of floor, then the keys of the nodes strictly between floor and ceiling.

        floor must be a list node (not None). If ceiling is None, or is not
        reachable from floor, iteration runs to the end of the list.
        """
        yield floor.key
        if floor is ceiling:
            return
        # Identity comparison: list nodes are dataclasses, whose generated
        # equality would compare the whole remainder of the list.
        current_node = floor.next_node
        while current_node is not None and current_node is not ceiling:
            yield current_node.key
            current_node = current_node.next_node

    def __str__(self) -> str:
        """One line per item, skipping the first and last item: token string, then the item's sigla."""
        inner_items = list(self.items())[1:-1]
        return "\n".join(
            ','.join([token_array[item[item.sigla()[0]]], ','.join(item.sigla())])
            for item in inner_items
        )

    # test
# tail_value=99
# witness_sigla = ['w0', 'w1', 'w2', 'w3']
# start_node = VG_node(None, None, witness_sigla, w0=-1, w1=-1, w2=-1, w3=-1)
# end_node = VG_node(None, None, witness_sigla, w0=tail_value, w1=tail_value, w2=tail_value, w3=tail_value)

# print(f"{start_node > end_node=}")

# list = LinkedList()
# print(f"{len(list)=}")
# list.insert(start_node)

# print(f"{len(list)=}")
# for index, item in enumerate(list.items()):
#     print(f"{index=} {item=}")

# list.insert(end_node)

# print(f"{len(list)=}")
# for index, item in enumerate(list.items()):
#     print(f"{index=} {item=}")

In [6]:
# we can use token-string frequency (overall) to align rare tokens first within each block
# NOTE(review): token_array is not defined in this cell; presumably it comes from
# the %run of create-blocks.ipynb above -- confirm
from collections import Counter
all_token_frequencies = Counter(token_array) # token string -> number of occurrences
# exclude singletons (not in blocks)
token_frequencies = {k:v for k, v in all_token_frequencies.items() if v > 1}
# bitarray keeps track of which tokens we've already placed in the list
# (one bit per position in token_array, all initially 0)
from bitarray import bitarray
placed_tokens = bitarray(len(token_array))
placed_tokens.setall(0)
# print(placed_tokens) # take a look

In [7]:
# pp.pprint(token_frequencies) # take a look

# Skiplist setup code above, now create vg (variant graph) skiplist below

# Import code to generate SVG of skiplist from render_skiplist_as_svg.ipynb

In [8]:
def debug(vg: LinkedList) -> str:
    """Return the token strings of the list's inner items (first and last item dropped), joined by " : ".

    Each item is represented by the token found at the position stored under
    its first siglum.
    """
    inner_items = list(vg.items())[1:-1]
    token_strings = []
    for item in inner_items:
        first_siglum = item.sigla()[0]
        token_strings.append(token_array[item[first_siglum]])
    return " : ".join(token_strings)

In [9]:
# Build the variant graph list: start with two sentinel nodes that carry every
# witness, then place block tokens, then the tokens that are in no block.
# NOTE(review): the recorded output of this cell shows insert()'s order check
# raising ("w1 is out of order") after a merged node was re-inserted.
vg = LinkedList()
tail_value = len(token_array) + 1 # larger than any real token position
vg.insert(VG_node(None, None, witness_sigla, w0=-1, w1=-1, w2=-1, w3=-1))
vg.insert(VG_node(None, None, witness_sigla, w0=tail_value, w1=tail_value, w2=tail_value, w3=tail_value))

for index, prioritized_block in enumerate(prioritized_blocks):
#     if index: # limit number of blocks for testing; open the limit for production
        all_offsets = range(prioritized_block.token_count)
        # order offsets within the block by the overall frequency of the token
        # string, read from the block's first start position
        sorted_offsets = sorted(all_offsets, key=lambda x: token_frequencies[token_array[prioritized_block.all_start_positions[0] + x]])

        for sorted_offset in sorted_offsets: # process infrequent token strings first (less duplication)
            for start_position in prioritized_block.all_start_positions:
                # print(prioritized_block, debug(vg))
                witness_token_pos = sorted_offset + start_position
                if placed_tokens[witness_token_pos]:
                    continue # already placed, so look at the next token
                placed_tokens[witness_token_pos] = 1 # we're going to place it below
                current_witness = 'w' + str(token_membership_array[witness_token_pos])
                # NOTE(review): siglum_filter is assigned but not used anywhere below in this cell
                siglum_filter = lambda x: current_witness in x.sigla() and len(x.sigla()) > 1
                key_to_check = VG_node(index, sorted_offset, witness_sigla, **{current_witness: witness_token_pos})
                # NOTE(review): string_to_look_for is assigned but not used below;
                # the comparison further down recomputes token_array[witness_token_pos]
                string_to_look_for = token_array[witness_token_pos]

                floor_node, ceiling_node = vg.find_floor_and_ceiling(key_to_check)
                # debug
                #                 print(f"{key_to_check=} {token_array[key_to_check[key_to_check.sigla()[0]]]} {floor_node.key=} {ceiling_node.key=}")
                gen = vg.items_floor_to_ceiling(floor_node, ceiling_node)
                zzz = next(gen) # we have to skip the floor node
                current_slice = list(gen) # TODO: we listify only to check length; do something smarter
                if not len(current_slice): # floor and ceiling are adjacent, so insert between them
                    vg.insert(key_to_check)
                else: # slice not empty, so check each node until we find (or don't find) same string value
                    for current_node in current_slice: # current_node is a VG_node
                         # merge candidate: lacks the current witness and has the same token string
                         if current_witness not in current_node.sigla() and token_array[current_node[current_node.sigla()[0]]] == token_array[witness_token_pos]:
                            vg.remove(current_node) # found one; remove matching node by key
                            current_node[current_witness] = witness_token_pos # add new token to key
                            # debug
                            #                             print("Node after the merge: "+str(current_node))
                            vg.insert(current_node) # re-insert the updated key
                            break
                    else: # no merge candidate in the slice, so place the token as a node of its own
                        vg.insert(key_to_check)

# place tokens that are not parts of blocks
# (the isinstance check skips positions whose membership entry is not an int)
unplaced_tokens = [offset for offset in range(len(token_array))
                   if offset not in token_to_block_dict
                   and isinstance(token_membership_array[offset], int)]
for unplaced_token in unplaced_tokens:
    current_witness = 'w' + str(token_membership_array[unplaced_token])
    new_node = VG_node(None, None, witness_sigla, **{current_witness: unplaced_token})
    vg.insert(new_node)

Exception: We want to place: w1:264|w2:497|w3:733|w0:187, w1 is out of order, list is: [w0:-1|w1:-1|w2:-1|w3:-1, w0:178, w0:180, w0:181, w0:182, w0:183, w0:185, w0:186, w1:264|w2:497|w3:733|w0:187, w0:188, w0:190, w0:191, w0:192, w0:193, w0:194, w0:195, w0:198, w0:202, w0:203, w0:208, w0:209, w0:212, w0:213, w0:214, w0:215, w0:217, w0:218, w0:220, w0:223, w0:227, w0:229, w0:230, w0:232, w1:234|w2:467|w3:703, w1:235|w2:468|w3:704, w1:236|w2:469|w3:705, w1:237|w2:470|w3:706, w1:238|w2:471|w3:707, w1:239|w2:472|w3:708, w1:240|w2:473|w3:709, w1:241|w2:474|w3:710, w1:242|w2:475|w3:711, w1:243|w2:476|w3:712, w1:244|w2:477|w3:713, w1:245|w2:478|w3:714, w1:246|w2:479|w3:715, w1:247|w2:480|w3:716, w1:248|w2:481|w3:717, w1:249|w2:482|w3:718, w1:250|w2:483|w3:719, w1:251|w2:484|w3:720, w1:252|w2:485|w3:721, w1:253|w2:486|w3:722, w1:254|w2:487|w3:723, w1:255|w2:488|w3:724, w1:256|w2:489|w3:725, w1:257|w2:490|w3:726, w1:258|w2:491|w3:727, w1:259|w2:492|w3:728, w1:260|w2:493|w3:729, w1:261|w2:494|w3:730, w1:262|w2:495|w3:731, w1:263|w2:496|w3:732, w1:265|w2:498|w3:734, w1:266|w2:499|w3:735, w1:267|w2:500|w3:736, w1:268|w2:501|w3:737, w1:269|w2:502|w3:738, w1:270|w2:503|w3:739, w1:271|w2:504|w3:740, w1:272|w2:505|w3:741, w1:273|w2:506|w3:742, w1:274|w2:507|w3:743, w1:275|w2:508|w3:744, w1:276|w2:509|w3:745, w1:277|w2:510|w3:746, w1:278|w2:511|w3:747, w1:279|w2:512|w3:748, w1:280|w2:513|w3:749, w1:281|w2:514|w3:750, w1:282|w2:515|w3:751, w1:283|w2:516|w3:752, w1:284|w2:517|w3:753, w1:285|w2:518|w3:754, w1:286|w2:519|w3:755, w1:287|w2:520|w3:756, w1:288|w2:521|w3:757, w1:289|w2:522|w3:758, w1:290|w2:523|w3:759, w1:291|w2:524|w3:760, w1:292|w2:525|w3:761, w1:293|w2:526|w3:762, w1:294|w2:527|w3:763, w1:295|w2:528|w3:764, w1:296|w2:529|w3:765, w1:297|w2:530|w3:766, w1:298|w2:531|w3:767, w1:299|w2:532|w3:768, w1:300|w2:533|w3:769, w1:301|w2:534|w3:770, w1:302|w2:535|w3:771, w1:303|w2:536|w3:772, w1:304|w2:537|w3:773, w1:305|w2:538|w3:774, w1:306|w2:539|w3:775, w1:307|w2:540|w3:776, 
w1:308|w2:541|w3:777, w1:309|w2:542|w3:778, w1:310|w2:543|w3:779, w1:311|w2:544|w3:780, w1:312|w2:545|w3:781, w1:313|w2:546|w3:782, w1:314|w2:547|w3:783, w1:315|w2:548|w3:784, w1:316|w2:549|w3:785, w1:317|w2:550|w3:786, w1:318|w2:551|w3:787, w1:319|w2:552|w3:788, w1:320|w2:553|w3:789, w1:321|w2:554|w3:790, w1:322|w2:555|w3:791, w1:323|w2:556|w3:792, w1:324|w2:557|w3:793, w1:325|w2:558|w3:794, w1:326|w2:559|w3:795, w1:327|w2:560|w3:796, w1:328|w2:561|w3:797, w1:329|w2:562|w3:798, w1:330|w2:563|w3:799, w1:331|w2:564|w3:800, w1:332|w2:565|w3:801, w1:333|w2:566|w3:802, w1:334|w2:567|w3:803, w1:335|w2:568|w3:804, w1:336|w2:569|w3:805, w1:337|w2:570|w3:806, w1:338|w2:571|w3:807, w1:339|w2:572|w3:808, w1:340|w2:573|w3:809, w1:341|w2:574|w3:810, w1:342|w2:575|w3:811, w1:343|w2:576|w3:812, w1:344|w2:577|w3:813, w1:345|w2:578|w3:814, w1:346|w2:579|w3:815, w1:347|w2:580|w3:816, w1:348|w2:581|w3:817, w1:349|w2:582|w3:818, w1:350|w2:583|w3:819, w1:351|w2:584|w3:820, w1:352|w2:585|w3:821, w1:353|w2:586|w3:822, w1:354|w2:587|w3:823, w1:355|w2:588|w3:824, w1:356|w2:589|w3:825, w1:357|w2:590|w3:826, w1:358|w2:591|w3:827, w1:359|w2:592|w3:828, w1:360|w2:593|w3:829, w1:361|w2:594|w3:830, w1:362|w2:595|w3:831, w1:363|w2:596|w3:832, w1:364|w2:597|w3:833, w1:365|w2:598|w3:834, w1:366|w2:599|w3:835, w1:367|w2:600|w3:836, w1:368|w2:601|w3:837, w1:369|w2:602|w3:838, w1:370|w2:603|w3:839, w1:371|w2:604|w3:840, w1:372|w2:605|w3:841, w1:373|w2:606|w3:842, w1:374|w2:607|w3:843, w1:375|w2:608|w3:844, w1:376|w2:609|w3:845, w1:377|w2:610|w3:846, w1:378|w2:611|w3:847, w1:379|w2:612|w3:848, w1:380|w2:613|w3:849, w1:381|w2:614|w3:850, w1:382|w2:615|w3:851, w1:383|w2:616|w3:852, w1:384|w2:617|w3:853, w1:385|w2:618|w3:854, w1:386|w2:619|w3:855, w1:387|w2:620|w3:856, w1:388|w2:621|w3:857, w1:389|w2:622|w3:858, w1:390|w2:623|w3:859, w1:391|w2:624|w3:860, w1:392|w2:625|w3:861, w1:393|w2:626|w3:862, w1:394|w2:627|w3:863, w1:395|w2:628|w3:864, w1:396|w2:629|w3:865, w1:397|w2:630|w3:866, 
w1:398|w2:631|w3:867, w1:399|w2:632|w3:868, w1:400|w2:633|w3:869, w1:401|w2:634|w3:870, w1:402|w2:635|w3:871, w1:403|w2:636|w3:872, w1:404|w2:637|w3:873, w1:405|w2:638|w3:874, w1:406|w2:639|w3:875, w1:407|w2:640|w3:876, w1:408|w2:641|w3:877, w1:409|w2:642|w3:878, w1:411|w2:644|w3:880, w1:413|w2:646|w3:882, w1:414|w2:647|w3:883, w1:415|w2:648|w3:884, w1:416|w2:649|w3:885, w1:418|w2:651|w3:887, w1:419|w2:652|w3:888, w1:421|w2:654|w3:890, w1:423|w2:656|w3:892, w1:424|w2:657|w3:893, w1:425|w2:658|w3:894, w1:426|w2:659|w3:895, w1:427|w2:660|w3:896, w1:428|w2:661|w3:897, w1:431|w2:664|w3:900, w1:435|w2:668|w3:904, w1:436|w2:669|w3:905, w1:441|w2:674|w3:910, w1:442|w2:675|w3:911, w1:445|w2:678|w3:914, w1:446|w2:679|w3:915, w1:447|w2:680|w3:916, w1:448|w2:681|w3:917, w1:450|w2:683|w3:919, w1:451|w2:684|w3:920, w1:453|w2:686|w3:922, w1:456|w2:689|w3:925, w1:460|w2:693|w3:929, w1:462|w2:695|w3:931, w1:463|w2:696|w3:932, w1:465|w2:698|w3:934, w0:936|w1:936|w2:936|w3:936]

In [0]:
# for block in prioritized_blocks:
#     print(' '. join(token_array[block.all_start_positions[0]: block.all_start_positions[0] + block.token_count]))

# Import code to generate SVG of variant graph from render_variant_graph_as_svg.ipynb

Rendering instruction in imported module doesn’t work; we need to issue it here

In [0]:
# get ready to visualize the variant graph in SVG
import graphviz
from IPython.display import SVG
from collections import defaultdict

# node id values must be strings for graphviz
a = graphviz.Digraph(format="svg")
a.attr(rankdir = "LR") # lay the graph out left to right

for index, vg_node in enumerate(vg.items()): # every VG_node key in list order, start and end sentinels included
# create nodes
    node_id = str(index)
    token_offset = [vg_node[siglum] for siglum in vg_node.sigla()][0] # arbitrary offset in token_array
    if token_offset < 0: # the start sentinel was inserted with -1 for every witness
        label = 'START'
    elif token_offset == tail_value: # the end sentinel was inserted with tail_value
        label = 'END'
    else:
        label = token_array[token_offset]
    a.node(node_id, label=(node_id + ':' + label))

# create edges witness by witness (not for production; we merge edges below)
# for w in range(len(witnesses)): # w is offset of witness in list of witnesses
#     siglum = 'w' + str(w)
#     witness_nodes = []
#     for index, vg_node in enumerate(vg.items()):
#         if siglum in vg_node.sigla():
#             witness_nodes.append(index)
#     for source, target in zip(witness_nodes, witness_nodes[1:]):
#         a.edge(str(source), str(target), label=str(siglum))

# create dictionary of all edges; key is (source, target) node id, value is list of sigla
all_edges = defaultdict(list)
for w in range(len(witnesses)): # w is offset of witness in list of witnesses
    siglum = 'w' + str(w) # siglum as recorded in VG_node keys
    witness_nodes = [] # list positions of the nodes that carry this witness
    for index, vg_node in enumerate(vg.items()):
        if siglum in vg_node.sigla():
            witness_nodes.append(index)
    # consecutive nodes of the same witness are joined by an edge
    for source, target in zip(witness_nodes, witness_nodes[1:]):
        all_edges[(source, target)].append(siglum)

# add edges to graph
for edge_endpoints, sigla in all_edges.items():
    a.edge(str(edge_endpoints[0]), str(edge_endpoints[1]), label=",".join(sigla))

# NOTE(review): presumably a.view() renders the graph and returns the path of
# the rendered file, which SVG() then loads -- confirm against the graphviz docs
SVG(a.view())

In [0]:
# " : ".join([token_array[item[item.sigla()[0]]] for item in list(vg.items())[1:-1]])

w0: the(2), red(2), and(0), the(0, 1), black(1)
w1: the(1), black(1), and(0), the(0, 2), red(2)

Topological: the, red ~ black, and, the, red ~ black, cat