# Collation with decision tree

## Data

In [1]:
# Two sample witnesses: "the" is repeated within each witness, and
# "red" / "black" are transposed between the two
w1 = """the red and the black cat"""
w2 = """the black and the red cat"""

## Find all ngrams in each witness

In [2]:
def tokenize_witnesses(w1_string, w2_string):
    '''Return list of witnesses, each represented by a list of tokens

    Parameters:

    w1_string -- text of the first witness
    w2_string -- text of the second witness

    Each string is tokenized by splitting on runs of whitespace, so an empty
    or all-whitespace string yields an empty token list.
    '''
    # Split the arguments themselves, not the notebook-level w1 / w2 variables
    w1_tokens = w1_string.split()
    w2_tokens = w2_string.split()
    return [w1_tokens, w2_tokens]
# Tokenize both sample witnesses; the bare name on the last line displays the result
witnesses = tokenize_witnesses(w1, w2)
witnesses

[['the', 'red', 'and', 'the', 'black', 'cat'],
 ['the', 'black', 'and', 'the', 'red', 'cat']]

## Define function to find positions of ngrams in witnesses

In [3]:
from collections import defaultdict

def compute_ngrams_all(witness):
    '''Collect every ngram of a witness with its start offsets and its length

    Parameters:

    witness -- list of tokens

    Returns a pair of dictionaries, both keyed by ngram text (the tokens
    joined with a single space):
        0) defaultdict mapping each ngram to the list of offsets where it starts
        1) dict mapping each ngram to its length in tokens
    '''
    offsets_by_ngram = defaultdict(list)
    length_by_ngram = {}
    token_count = len(witness)
    for size in range(1, token_count + 1):
        # slide a window of `size` tokens across the witness
        last_start = token_count - size
        for start in range(last_start + 1):
            text = ' '.join(witness[start:start + size])
            offsets_by_ngram[text].append(start)
            length_by_ngram[text] = size
    return offsets_by_ngram, length_by_ngram

## Store ngrams lengths in ngram_length dictionary

Creates two dictionaries:

1. *ngram_length* (key is ngram text, value is ngram token count)
1. *ngram_offset_by_witness_dict* (key is witness, value is dictionary keyed by ngram, value is list of offsets of that ngram in that witness)

*ngram_length* is used frequently. *ngram_offset_by_witness_dict* is used once, immediately below, to create a combined representation of ngram text
plus location in both witnesses

In [4]:
# One offsets dictionary per witness (keyed 'w1', 'w2', ...) plus a single
# ngram -> token-count lookup that is merged across all witnesses
ngram_offset_by_witness_dict = {} # used to find shared ngrams (next cell)
ngram_length = {} # used to look up ngram length (frequently)
for witness_number, witness in enumerate(witnesses, start=1):
    offsets, lengths = compute_ngrams_all(witness)
    ngram_offset_by_witness_dict[f'w{witness_number}'] = offsets
    ngram_length.update(lengths)

# Show what was collected
for siglum, witness_offsets in ngram_offset_by_witness_dict.items():
    print(siglum, witness_offsets)

print(ngram_length)

w1 defaultdict(<class 'list'>, {'the': [0, 3], 'red': [1], 'and': [2], 'black': [4], 'cat': [5], 'the red': [0], 'red and': [1], 'and the': [2], 'the black': [3], 'black cat': [4], 'the red and': [0], 'red and the': [1], 'and the black': [2], 'the black cat': [3], 'the red and the': [0], 'red and the black': [1], 'and the black cat': [2], 'the red and the black': [0], 'red and the black cat': [1], 'the red and the black cat': [0]})
w2 defaultdict(<class 'list'>, {'the': [0, 3], 'black': [1], 'and': [2], 'red': [4], 'cat': [5], 'the black': [0], 'black and': [1], 'and the': [2], 'the red': [3], 'red cat': [4], 'the black and': [0], 'black and the': [1], 'and the red': [2], 'the red cat': [3], 'the black and the': [0], 'black and the red': [1], 'and the red cat': [2], 'the black and the red': [0], 'black and the red cat': [1], 'the black and the red cat': [0]})
{'the': 1, 'red': 1, 'and': 1, 'black': 1, 'cat': 1, 'the red': 2, 'red and': 2, 'and the': 2, 'the black': 2, 'black cat': 2, '

## Find keys shared by all (both) witnesses

In [5]:
# ngrams that occur in both witnesses
w1_ngrams = set(ngram_offset_by_witness_dict['w1'])
w2_ngrams = set(ngram_offset_by_witness_dict['w2'])
shared_ngrams = w1_ngrams & w2_ngrams
shared_ngrams # take a look

{'and', 'and the', 'black', 'cat', 'red', 'the', 'the black', 'the red'}

## Use shared keys to find potential alignments

In [6]:
# Output format: {ngram : [(0,1), (2,3)]}, where
#   the two entries in each tuple are the offsets in witnesses A and B
# Every occurrence in the first witness is paired with every occurrence in the second
# NOTE: works for only two witnesses
from collections import defaultdict

potential_alignments = defaultdict(list)
for ngram in shared_ngrams:
    w1_offsets = ngram_offset_by_witness_dict['w1'][ngram]
    w2_offsets = ngram_offset_by_witness_dict['w2'][ngram]
    potential_alignments[ngram].extend(
        (offset_in_w1, offset_in_w2)
        for offset_in_w1 in w1_offsets
        for offset_in_w2 in w2_offsets
    )

potential_alignments # take a look

defaultdict(list,
            {'the red': [(0, 3)],
             'the': [(0, 0), (0, 3), (3, 0), (3, 3)],
             'and': [(2, 2)],
             'cat': [(5, 5)],
             'the black': [(3, 0)],
             'black': [(4, 1)],
             'and the': [(2, 2)],
             'red': [(1, 4)]})

## Define function to compute weight (relative length and frequency) of ngram

In [7]:
# Formula to compute weight: depth / frequency * length
#
# depth is the number of witnesses a pattern appears in
# frequency is the overall times a pattern occurs in the witness set
# length is the number of tokens of a pattern

def compute_ngram_weight(ngram):
    '''Return the weight of an ngram, computed as depth / frequency * length

    depth is fixed at 2 (two witnesses), frequency is the number of potential
    alignments recorded for the ngram in potential_alignments, and length is
    the ngram's token count taken from ngram_length.
    '''
    witness_count = 2 # constant here; variable if we allow more than two witnesses
    alignment_count = len(potential_alignments[ngram])
    token_count = ngram_length[ngram]
    rarity = witness_count / alignment_count
    return rarity * token_count

## Create ordered dictionary, keyed by ngram text and sorted by ngram weight (high to low)

Dictionaries preserve insertion order since Python 3.7

In [8]:
# Weight of every ngram that has at least one potential alignment
ngram_weights = defaultdict(float)
for ngram in potential_alignments:
    ngram_weights[ngram] = compute_ngram_weight(ngram)
# Heaviest first; the sort is stable, so equal weights keep their existing relative order
weights_high_to_low = sorted(ngram_weights.items(), key=lambda pair: pair[1], reverse=True)
sorted_ngram_weights = dict(weights_high_to_low)
sorted_ngram_weights # take a look

{'the red': 4.0,
 'the black': 4.0,
 'and the': 4.0,
 'and': 2.0,
 'cat': 2.0,
 'black': 2.0,
 'red': 2.0,
 'the': 0.5}

## Use pandas dataframe instead of dictionary; columns are offsets, single ngrams, ngram weights
## Create initial dictionary of all potential ngram alignments, keyed by offsets

In [9]:
# Find actual alignments (ngrams with positions in witnesses)
#
# Invert potential_alignments into offset_tuple : list_of_ngrams
# Keys are (offset in A, offset in B) tuples; values are the ngrams that start at those offsets

alignments_unsorted = defaultdict(list)
for ngram, offset_pairs in potential_alignments.items():
    for offset_pair in offset_pairs:
        alignments_unsorted[offset_pair].append(ngram)

# Within each pair of offsets, order the ngrams from longest to shortest
alignments_sorted_values = {}
for offset_pair, ngrams_at_offsets in alignments_unsorted.items():
    alignments_sorted_values[offset_pair] = sorted(
        ngrams_at_offsets, key=lambda text: ngram_length[text], reverse=True
    )

# Order the offset pairs by the weight of their longest ngram, heaviest first
ranked_alignments = sorted(
    alignments_sorted_values.items(),
    key=lambda entry: ngram_weights[entry[1][0]],
    reverse=True,
)
alignments = dict(ranked_alignments)
alignments # take a look

{(0, 3): ['the red', 'the'],
 (3, 0): ['the black', 'the'],
 (2, 2): ['and the', 'and'],
 (5, 5): ['cat'],
 (4, 1): ['black'],
 (1, 4): ['red'],
 (0, 0): ['the'],
 (3, 3): ['the']}

## Initialize pandas

In [10]:
import pandas as pd
import numpy as np

## Create function to return token positions covered by ngram

In [11]:
def token_positions_involved_in_ngram(ngram: str, start_position: int) -> range:
    '''Return the range of token positions covered by ngram when it starts at start_position

    The ngram's token count is looked up in ngram_length.
    '''
    token_count = ngram_length[ngram]
    end_position = start_position + token_count # exclusive
    return range(start_position, end_position)

## Create dataframe for alignment data

The ngrams column holds a (mutable) list, which is risky

In [12]:
# One row per pair of offsets: start in A, start in B, ngrams at those offsets
# (longest first), and the weight of the longest ngram
alignment_rows = [
    [offset_in_A, offset_in_B, ngrams, sorted_ngram_weights[ngrams[0]]]
    for (offset_in_A, offset_in_B), ngrams in alignments.items()
]
df = pd.DataFrame(alignment_rows, columns=['A', 'B', 'ngrams', 'weight'])
df # take a look

Unnamed: 0,A,B,ngrams,weight
0,0,3,"[the red, the]",4.0
1,3,0,"[the black, the]",4.0
2,2,2,"[and the, and]",4.0
3,5,5,[cat],2.0
4,4,1,[black],2.0
5,1,4,[red],2.0
6,0,0,[the],0.5
7,3,3,[the],0.5


## Add bitarrays for the two witnesses for the longest ngram in each pattern instance

In [13]:
def find_committed_tokens(df_columns, token_offsets):
    '''Return boolean mask of tokens in witness committed by longest ngram in row

    Parameters:

    df_columns -- one row of a two-column df view (as passed by
        DataFrame.apply with axis=1), whose values are, in order:
        0) start position of ngram in witness
        1) list of ngrams in that row (extracts first, which is longest)
    token_offsets -- Series of all token positions in witness
        (sequential integers from 0, may be different for different witnesses)

    Returns a NumPy boolean array with one entry per element of token_offsets,
    True where that position is covered by the longest ngram.
    '''
    # Unpack the two values by iterating instead of df_columns[0] / df_columns[1]:
    # integer keys on a label-indexed Series are deprecated as positional lookups
    start_position, ngrams = tuple(df_columns)[:2]
    committed_tokens = np.arange(start_position, start_position + ngram_length[ngrams[0]], 1)
    # np.isin replaces np.in1d, which is deprecated since NumPy 2.0
    # Background: https://stackoverflow.com/questions/7088625/what-is-the-most-efficient-way-to-check-if-a-value-exists-in-a-numpy-array
    return np.isin(token_offsets, committed_tokens)

In [14]:
# Avoid itertuples; see https://realpython.com/numpy-array-programming/#what-is-vectorization
# Token offsets are built per witness because the witnesses may differ in length
token_offsets_A = np.arange(len(witnesses[0]))
token_offsets_B = np.arange(len(witnesses[1]))
# For each witness, add a column marking the tokens covered by the row's longest ngram
for siglum, token_offsets in (('A', token_offsets_A), ('B', token_offsets_B)):
    start_and_ngrams = df[[siglum, 'ngrams']]
    df[f'ba_{siglum}'] = start_and_ngrams.apply(find_committed_tokens, args=(token_offsets,), axis=1)
df

Unnamed: 0,A,B,ngrams,weight,ba_A,ba_B
0,0,3,"[the red, the]",4.0,"[True, True, False, False, False, False]","[False, False, False, True, True, False]"
1,3,0,"[the black, the]",4.0,"[False, False, False, True, True, False]","[True, True, False, False, False, False]"
2,2,2,"[and the, and]",4.0,"[False, False, True, True, False, False]","[False, False, True, True, False, False]"
3,5,5,[cat],2.0,"[False, False, False, False, False, True]","[False, False, False, False, False, True]"
4,4,1,[black],2.0,"[False, False, False, False, True, False]","[False, True, False, False, False, False]"
5,1,4,[red],2.0,"[False, True, False, False, False, False]","[False, False, False, False, True, False]"
6,0,0,[the],0.5,"[True, False, False, False, False, False]","[True, False, False, False, False, False]"
7,3,3,[the],0.5,"[False, False, False, True, False, False]","[False, False, False, True, False, False]"


## Create root node of decision tree

In [15]:
decision_tree = {}

# Add root node
def create_root_node():
    '''Reset the global decision_tree so that it holds only an empty root node under key 0'''
    global decision_tree

    # Properties of the root node; the lists start out empty
    root = {
        'id': 0, # what's my key?
        'type': None, # normally 'align' or 'transpose'
        'parent': None, # integer
        'children': [],
        'aligned-patterns': [],
        'aligned-patterns-text': [], # debug
        'transposed-patterns': [],
        'transposed-patterns-text': [], # debug
        'blocked-patterns': [],
        'blocked-patterns-text': [], # debug
    }

    # Rebind the outer dictionary of decision tree nodes, discarding any earlier tree
    decision_tree = {0: root}



In [16]:
# Calculate the highest weight in the table (at the moment without masking).
# Given the highest weight, the next cell selects the rows equal to that weight.
# NOTE(review): the name `max` shadows the builtin max() at notebook level
max = df['weight'].max()
max


4.0

In [17]:
# Keep only the rows whose weight equals the highest weight in the table
has_highest_weight = df['weight'] == max
rarest_patterns = df.loc[has_highest_weight]
rarest_patterns

Unnamed: 0,A,B,ngrams,weight,ba_A,ba_B
0,0,3,"[the red, the]",4.0,"[True, True, False, False, False, False]","[False, False, False, True, True, False]"
1,3,0,"[the black, the]",4.0,"[False, False, False, True, True, False]","[True, True, False, False, False, False]"
2,2,2,"[and the, and]",4.0,"[False, False, True, True, False, False]","[False, False, True, True, False, False]"


In [18]:
#         # child looks like: (1, 2, ('hi mon', 'hi'))
#         # start location in A, in B, ngrams at that location
#         child = (child[0], child[1], child[2][0]) # prune ngrams to keep only longest; NB: redefining "child" variable
#         current_ngram_length = ngram_length[child[2]]
#         id = len(decision_tree)
#         decision_tree[id] = {}
#         decision_tree[id]['id'] = id
#         if child[0] == decision_tree[parent_id]['potential-alignments-by-A'][0][0] and child[1] == decision_tree[parent_id]['potential-alignments-by-B'][0][1]:
#             decision_tree[id]['type'] = 'closest-in-both'
#         else:
#             decision_tree[id]['type'] = 'closest-in-one'
#         decision_tree[id]['current-location-in-A'] = child[0] + current_ngram_length - 1
#         decision_tree[id]['current-location-in-B'] = child[1] + current_ngram_length - 1
#         decision_tree[id]['parent'] = parent_id
#         decision_tree[id]['children'] = []
#         aligned_patterns = decision_tree[parent_id]['aligned-patterns'].copy()
#         aligned_patterns.append(child)
#         decision_tree[id]['aligned-patterns'] = aligned_patterns
#         transposed_patterns = decision_tree[parent_id]['transposed-patterns'].copy()
#         decision_tree[id]['transposed-patterns'] = transposed_patterns

#         # TODO: Because lists are sorted, once one is far enough to the right to avoid transposition,
#         #   all following ones are also okay (in this case, though, must process A and B separately)
#         # Similarly, once one ngram is safe, all shorter ones are also safe, and don't need to be checked
#         decision_tree[id]['potential-alignments-by-A'] = []

#         # while figuring the new potential alignments by a and b we will find transpositions that we need to store.
#         # We use bitarrays to track (avoid) overlap (subsequences) between detected transposed patterns.
#         # ba1 and ba2 record transposed patterns
#         from bitarray import bitarray
#         ba1 = bitarray(len(witnesses[0]))
#         ba1.setall(0)
#         ba2 = bitarray(len(witnesses[1]))
#         ba2.setall(0)

#         #TEMP: transposed-patterns': [(2, 2, 'and the'), (4, 1, 'black')],
#         for tp in decision_tree[id]['transposed-patterns']:
#             # now we need to fill the bitarray; We need the start position in each witness and the length of the pattern.
#             tp_ngram_length = ngram_length[tp[2]]
#             ba1[tp[0]:tp[0] + tp_ngram_length] = 1
#             ba2[tp[1]:tp[1] + tp_ngram_length] = 1

#         # ba3 and ba4 record aligned pattern being added
#         ba3 = bitarray(len(witnesses[0]))
#         ba3.setall(0)
#         ba4 = bitarray(len(witnesses[1]))
#         ba4.setall(0)
#         ba3[child[0]:child[0] + current_ngram_length] = 1
#         ba4[child[1]:child[1] + current_ngram_length] = 1

#         for p in decision_tree[parent_id]['potential-alignments-by-A']: # check for potentials and transpositions
#             if p[0] > decision_tree[id]['current-location-in-A'] and p[1] > decision_tree[id]['current-location-in-B']:
#                 decision_tree[id]['potential-alignments-by-A'].append(p) # both are to the right, so it's still potential
#             else: # check whether it's a transposition or an overlap
#                 for q in p[2]: # iterate over the different n-grams
#                     q_ngram_length = ngram_length[q] # length of current ngram inside current potential
#                     if ba3[p[0]:p[0] + q_ngram_length].any() or ba4[p[1]:p[1] + q_ngram_length].any(): # overlap; throw it away
#                         continue
#                     else: # transposition, but is it already in the transposed patterns property?
#                         if ba1[p[0]:p[0] + q_ngram_length].any() or ba2[p[1]:p[1] + q_ngram_length].any(): # already among transpositions
#                             continue
#                         decision_tree[id]['transposed-patterns'].append((p[0], p[1], q)) # update bitarrays with the new transposed pattern
#                         ba1[p[0]:p[0] + q_ngram_length] = 1
#                         ba2[p[1]:p[1] + q_ngram_length] = 1

#         # potentials by A and B are the same tuples, but sorted differently
#         decision_tree[id]['potential-alignments-by-B'] = sorted(decision_tree[id]['potential-alignments-by-A'], key=lambda x: (x[1], x[0]))

#         decision_tree[parent_id]['children'].append(id) # add new child to parent
#         if decision_tree[id]['potential-alignments-by-A']:
#             add_children(id) # recur to process children of new child

#     # add type #3 (skip) node
#     if len(decision_tree[parent_id]['children']) > 1:
#         skip_node_id = len(decision_tree)
#         decision_tree[skip_node_id] = {}
#         decision_tree[skip_node_id]['id'] = skip_node_id
#         decision_tree[skip_node_id]['type'] = 'skip'
#         decision_tree[skip_node_id]['current-location-in-A'] = nearest_A_matches[0][0] + ngram_length[nearest_A_matches[0][2][0]] - 1
#         decision_tree[skip_node_id]['current-location-in-B'] = nearest_B_matches[0][1] + ngram_length[nearest_B_matches[0][2][0]] - 1
#         decision_tree[skip_node_id]['parent'] = parent_id
#         decision_tree[skip_node_id]['children'] = []
#         aligned_patterns = decision_tree[parent_id]['aligned-patterns'].copy() # aligned patterns don't change
#         decision_tree[skip_node_id]['aligned-patterns'] = aligned_patterns
#         transposed_patterns = decision_tree[parent_id]['transposed-patterns'].copy()
#         decision_tree[skip_node_id]['transposed-patterns'] = transposed_patterns
#         potential_alignments_by_A = decision_tree[parent_id]['potential-alignments-by-A'].copy() # keep only potential greater than current position
#         decision_tree[skip_node_id]['potential-alignments-by-A'] = [t for t in potential_alignments_by_A if t[0] > decision_tree[skip_node_id]['current-location-in-A'] and t[1] > decision_tree[skip_node_id]['current-location-in-B']]
#         decision_tree[skip_node_id]['potential-alignments-by-B'] = sorted(decision_tree[skip_node_id]['potential-alignments-by-A'], key=lambda x: (x[1], x[0]))
#         decision_tree[parent_id]['children'].append(skip_node_id)

#         # while figuring the new potential alignments by a and b we will find transpositions that we need to store.
#         # We use bitarrays to track (avoid) overlap (subsequences) between detected transposed patterns.
#         # ba1 and ba2 record transposed patterns
#         from bitarray import bitarray
#         ba1 = bitarray(len(witnesses[0]))
#         ba1.setall(0)
#         ba2 = bitarray(len(witnesses[1]))
#         ba2.setall(0)

#         #INFO: 'transposed-patterns' looks like: [(2, 2, 'and the'), (4, 1, 'black')],
#         for tp in decision_tree[skip_node_id]['transposed-patterns']:
#             # now we need to fill the bitarray; We need the start position in each witness and the length of the pattern.
#             tp_ngram_length = ngram_length[tp[2]]
#             ba1[tp[0]:tp[0] + tp_ngram_length] = 1
#             ba2[tp[1]:tp[1] + tp_ngram_length] = 1

#         for t in potential_alignments_by_A:
#             if t[0] <= decision_tree[skip_node_id]['current-location-in-A'] or t[1] <= decision_tree[skip_node_id]['current-location-in-B']:
#                 # transposition, but is it already in the transposed-patterns property?
#                 t_ngram_length = ngram_length[t[2][0]]
#                 if ba1[t[0]:t[0] + t_ngram_length].any() or ba2[t[1]:t[1] + t_ngram_length].any(): # already among transpositions
#                     continue
#                 decision_tree[skip_node_id]['transposed-patterns'].append((t[0], t[1], t[2][0])) # update bitarrays with the new transposed pattern
#                 ba1[t[0]:t[0] + t_ngram_length] = 1
#                 ba2[t[1]:t[1] + t_ngram_length] = 1

#         if decision_tree[skip_node_id]['potential-alignments-by-A']:
#             add_children(skip_node_id)

In [19]:
def find_transpositions(aligned_pattern_as_tuple, id):
    '''Record the patterns that are transposed relative to the pattern just aligned

    A pattern is transposed when it starts after the aligned pattern in
    witness A and before it in witness B, or before it in witness A and after
    it in witness B. For each such row of df the ngrams are tried in the
    order stored in the row, and the first one that does not overlap a
    transposition already found during this call is recorded.

    Parameters:

    aligned_pattern_as_tuple -- the pattern that is just aligned in this
        decision tree node and that might cause transpositions as a result;
        its first three items are (row index in df, start in A, start in B)
    id -- id of the decision tree node that we are operating on

    Updates 'transposed-patterns', 'transposed-patterns-text' and
    'blocked-patterns' of decision_tree[id] in place; returns None.
    '''
    from bitarray import bitarray

    node = decision_tree[id]
    blocked = node['blocked-patterns']
    aligned_index, aligned_A, aligned_B = aligned_pattern_as_tuple[:3]

    # We use bitarrays to track (avoid) overlap (subsequences) between detected transposed patterns.
    # ba1 and ba2 record the tokens of witness A and B covered by transposed patterns
    ba1 = bitarray(len(witnesses[0]))
    ba1.setall(0)
    ba2 = bitarray(len(witnesses[1]))
    ba2.setall(0)

    # TODO: ba1 / ba2 start out empty; patterns already listed in the node's
    #   'transposed-patterns' are not loaded into them. Draft of that step:
    #         #INFO: 'transposed-patterns' looks like: [(2, 2, 'and the'), (4, 1, 'black')],
    #         for tp in decision_tree[skip_node_id]['transposed-patterns']:
    #             # now we need to fill the bitarray; We need the start position in each witness and the length of the pattern.
    #             tp_ngram_length = ngram_length[tp[2]]
    #             ba1[tp[0]:tp[0] + tp_ngram_length] = 1
    #             ba2[tp[1]:tp[1] + tp_ngram_length] = 1

    # go over the rows, looking only at the columns needed here
    for row in df[['A', 'B', 'ngrams']].itertuples():
        # the aligned pattern itself cannot be transposed relative to itself
        if row.Index == aligned_index:
            continue
        # rows already blocked in this node are not candidates
        if row.Index in blocked:
            continue

        after_in_A_before_in_B = row.A > aligned_A and row.B < aligned_B
        before_in_A_after_in_B = row.A < aligned_A and row.B > aligned_B
        if not (after_in_A_before_in_B or before_in_A_after_in_B):
            continue

        # Transposition; try the different lengths available at this position
        for q in row.ngrams:
            q_ngram_length = ngram_length[q] # length of current ngram inside current potential
            covered_in_A = ba1[row.A:row.A + q_ngram_length].any()
            covered_in_B = ba2[row.B:row.B + q_ngram_length].any()
            if covered_in_A or covered_in_B: # overlaps a transposition found earlier
                if row.Index not in blocked:
                    blocked.append(row.Index)
                continue

            node['transposed-patterns'].append(row.Index)
            node['transposed-patterns-text'].append((row.A, row.B, q))
            # the row may already have been blocked by a longer ngram above;
            # do not list it twice
            if row.Index not in blocked:
                blocked.append(row.Index)
            # update bitarrays with the new transposed pattern
            ba1[row.A:row.A + q_ngram_length] = 1
            ba2[row.B:row.B + q_ngram_length] = 1
            # any remaining ngram of this row starts at the same offsets and
            # would overlap the bits just set, so stop here
            break



In [20]:
# Function to add child nodes recursively
# Within children:
#   Add parent
#   Update aligned patterns
#   Update transposed patterns
#   Update potential alignments
#
# Add one type of node:
#
#   1. Align pattern
#
# Assign consecutive id values by counting size of dictionary = could break with multithreading

# we need to do two things. 1 calculate potential alignments. Remove patterns that overlap with the currently aligned patterns.
# Then calculate the transposed patterns.
# 2. We need to be able to mask the list of potential patterns from the main table with a list of integers, or later on a bit array to create a subset
# (or view) of the table.

# if we know which row is the pattern that is aligned (we need to store that) we can look at with which rows the selected pattern overlaps.
# These rows then need to be masked.
def add_children(parent_id: int):
    '''Add one 'align' child node to decision_tree for each highest-weight pattern

    For every row of df whose weight equals the maximum weight, a new node is
    created that copies the parent's pattern lists, records the row as aligned
    and blocked, blocks every other row whose longest ngram overlaps the
    aligned pattern in witness A or witness B, and finally calls
    find_transpositions for the new node.

    Parameters:

    parent_id -- key in decision_tree of the node the new nodes descend from

    Node ids are assigned by counting the entries of decision_tree.
    NOTE(review): the parent's 'children' list is not updated here, and the
    function does not recur into the new nodes -- confirm whether that is
    still to be added.
    '''
    # Get the rarest patterns from the table
    max_weight = df['weight'].max()
    rarest_patterns = df.loc[df['weight'] == max_weight]

    for columns_per_row in rarest_patterns.itertuples():
        parent = decision_tree[parent_id]
        aligned_pattern_as_tuple = (columns_per_row.Index, columns_per_row.A, columns_per_row.B, columns_per_row.ngrams)
        aligned_index, aligned_A, aligned_B, aligned_ngrams = aligned_pattern_as_tuple

        node_id = len(decision_tree)
        node = {
            'id': node_id,
            'type': 'align',
            'parent': parent_id,
            'children': [],
            'aligned-patterns': parent['aligned-patterns'].copy(),
            'aligned-patterns-text': parent['aligned-patterns-text'].copy(), # debug
            'transposed-patterns': parent['transposed-patterns'].copy(),
            # copy the parent's text list, not its list of row indexes
            'transposed-patterns-text': parent['transposed-patterns-text'].copy(), # debug
            'blocked-patterns': parent['blocked-patterns'].copy(),
            'blocked-patterns-text': parent['blocked-patterns-text'].copy(), # debug
        }
        decision_tree[node_id] = node

        # We need to add the selected pattern to the aligned patterns.
        node['aligned-patterns'].append(aligned_index)
        node['aligned-patterns-text'].append((aligned_A, aligned_B, aligned_ngrams[0]))
        node['blocked-patterns'].append(aligned_index)

        # We need to calculate the overlap with other patterns.
        # The values in A and B are start positions and we know the length of
        # the pattern, so each pattern covers one closed range of positions in
        # witness A and one in witness B. Only the longest ngram of each row is checked.
        aligned_length = ngram_length[aligned_ngrams[0]]
        aligned_end_A = aligned_A + aligned_length - 1
        aligned_end_B = aligned_B + aligned_length - 1
        for row in df.itertuples():
            # Skip the pattern that is just aligned (it was added to the
            # blocked patterns above) and any row already blocked in the parent
            if row.Index in node['blocked-patterns']:
                continue

            row_length = ngram_length[row.ngrams[0]]
            # Two closed ranges overlap when each starts no later than the
            # other ends; this also covers a row that fully contains the aligned pattern
            overlaps_in_A = row.A <= aligned_end_A and row.A + row_length - 1 >= aligned_A
            overlaps_in_B = row.B <= aligned_end_B and row.B + row_length - 1 >= aligned_B
            if overlaps_in_A or overlaps_in_B:
                node['blocked-patterns'].append(row.Index)
                node['blocked-patterns-text'].append((row.A, row.B, row.ngrams))

        find_transpositions(aligned_pattern_as_tuple, node_id)



# Rebuild the tree from scratch: reset to a lone root node, add the nodes
# below the root, then display the resulting dictionary
create_root_node()
add_children(0)
decision_tree

{0: {'id': 0,
  'type': None,
  'parent': None,
  'children': [],
  'aligned-patterns': [],
  'aligned-patterns-text': [],
  'transposed-patterns': [],
  'transposed-patterns-text': [],
  'blocked-patterns': [],
  'blocked-patterns-text': []},
 1: {'id': 1,
  'type': 'align',
  'parent': 0,
  'children': [],
  'aligned-patterns': [0],
  'aligned-patterns-text': [(0, 3, 'the red')],
  'transposed-patterns': [1],
  'transposed-patterns-text': [(3, 0, 'the black')],
  'blocked-patterns': [0, 2, 5, 6, 7, 1, 4],
  'blocked-patterns-text': [(2, 2, ['and the', 'and']),
   (1, 4, ['red']),
   (0, 0, ['the']),
   (3, 3, ['the'])]},
 2: {'id': 2,
  'type': 'align',
  'parent': 0,
  'children': [],
  'aligned-patterns': [1],
  'aligned-patterns-text': [(3, 0, 'the black')],
  'transposed-patterns': [0],
  'transposed-patterns-text': [(0, 3, 'the red')],
  'blocked-patterns': [1, 2, 4, 6, 7, 0, 5],
  'blocked-patterns-text': [(2, 2, ['and the', 'and']),
   (4, 1, ['black']),
   (0, 0, ['the']),
  

In [21]:
# Take a look at the three columns that find_transpositions iterates over
sbf = df[['A', 'B', 'ngrams']]
sbf

Unnamed: 0,A,B,ngrams
0,0,3,"[the red, the]"
1,3,0,"[the black, the]"
2,2,2,"[and the, and]"
3,5,5,[cat]
4,4,1,[black]
5,1,4,[red]
6,0,0,[the]
7,3,3,[the]
