# Multiple witness alignment with skipgrams, pandas version

In [1]:
%%html
<!-- tables should be left-aligned-->
<style>
table {margin-left:0 !important}
</style>

## Problem description

### Input

```json
{'wit1': ['a', 'b', 'c', 'a', 'd', 'e'],
 'wit2': ['a', 'e', 'c', 'd'],
 'wit3': ['a', 'd', 'b']
}
```

### Expected output

siglum | token | token | token | token | token | token | token | token
---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ----
wit1  | #start | a | b | c | a | d | e | #end
wit2  | #start | a | e | c | - | d | - | #end
wit3  | #start | a | - | - | - | d | b | #end

### Topological sort

(Partly ordered)

`a (b e) c a d (e b)`

## Load libraries and configure notebook display parameters

In [2]:
import pandas as pd
import collections # for defaultdict
from bitarray import bitarray
import pprint as pp
from prettytable import PrettyTable # not part of anaconda distribution; install with pip
import pandas_profiling # https://towardsdatascience.com/10-simple-hacks-to-speed-up-your-data-analysis-in-python-ec18c6396e6b
# widen the pandas display so that the long `locations` lists are not truncated when frames are rendered
for option, setting in {'display.max_columns': 100, 'display.max_colwidth': 150}.items():
    pd.set_option(option, setting)

## Sample data and housekeeping

In [3]:
# toy witnesses (siglum -> list of normalized tokens) that include some repeated tokens
witnessData = {
    'wit1': ['a', 'b', 'c', 'a', 'd', 'e'],
    'wit2': ['a', 'e', 'c', 'd'],
    'wit3': ['a', 'd', 'b'],
}

# pretend stopword set; lets us flag skipgrams made up only of stopwords so they can be handled last
stoplist = {'a', 'c'}

# bitArrays: siglum -> bitarray with one bit per token in that witness
#   a bit is set once the token at that offset has been placed, so it is never processed twice
bitArrays = {}
for siglum, tokens in witnessData.items():
    flags = bitarray(len(tokens))  # contents are uninitialized until explicitly set
    flags.setall(0)  # nothing has been processed yet
    bitArrays[siglum] = flags

## Create _common sequence table_ (`csTable`)

Record location of all skipgrams in all witnesses.

In [4]:
# csTable maps each skip bigram to every place it occurs:
#   key: (first normalized token, second normalized token)
#   value: list of (siglum, offset of first token, offset of second token) tuples
#     In Real Life:
#       values will also carry the t values that correspond to the normalized token values
#       use a named tuple or dataclass (https://realpython.com/python-data-classes/)
# This sample collects all skip bigrams; in Real Life the following would be parameters:
#   skipgram size (bi-, tri-, etc.; here bi-)
#   window size (maximum distance between first and last skipgram members; here the full witness length)
#   maximum skip between skipgram members (here limited only by the window size)
csTable = collections.defaultdict(list)
for siglum, tokens in witnessData.items():  # tokens is the list of normalized readings for this witness
    # in Real Life each token would also have a non-normalized t property
    for firstPos, firstToken in enumerate(tokens):  # every token can open a bigram ...
        for secondPos in range(firstPos + 1, len(tokens)):  # ... that is closed by any later token
            csTable[(firstToken, tokens[secondPos])].append((siglum, firstPos, secondPos))
csTable

defaultdict(list,
            {('a', 'b'): [('wit1', 0, 1), ('wit3', 0, 2)],
             ('a', 'c'): [('wit1', 0, 2), ('wit2', 0, 2)],
             ('a', 'a'): [('wit1', 0, 3)],
             ('a', 'd'): [('wit1', 0, 4),
              ('wit1', 3, 4),
              ('wit2', 0, 3),
              ('wit3', 0, 1)],
             ('a', 'e'): [('wit1', 0, 5), ('wit1', 3, 5), ('wit2', 0, 1)],
             ('b', 'c'): [('wit1', 1, 2)],
             ('b', 'a'): [('wit1', 1, 3)],
             ('b', 'd'): [('wit1', 1, 4)],
             ('b', 'e'): [('wit1', 1, 5)],
             ('c', 'a'): [('wit1', 2, 3)],
             ('c', 'd'): [('wit1', 2, 4), ('wit2', 2, 3)],
             ('c', 'e'): [('wit1', 2, 5)],
             ('d', 'e'): [('wit1', 4, 5)],
             ('e', 'c'): [('wit2', 1, 2)],
             ('e', 'd'): [('wit2', 1, 3)],
             ('d', 'b'): [('wit3', 1, 2)]})

## Construct df and add information needed for ordered traversal

In [5]:
# convert to a Series before building the df, because the location lists vary in length
csSeries = pd.Series(csTable)
csSeries # the tuple keys create a MultiIndex, which we will want to flatten

a  b                                [(wit1, 0, 1), (wit3, 0, 2)]
   c                                [(wit1, 0, 2), (wit2, 0, 2)]
   a                                              [(wit1, 0, 3)]
   d    [(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]
   e                  [(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]
b  c                                              [(wit1, 1, 2)]
   a                                              [(wit1, 1, 3)]
   d                                              [(wit1, 1, 4)]
   e                                              [(wit1, 1, 5)]
c  a                                              [(wit1, 2, 3)]
   d                                [(wit1, 2, 4), (wit2, 2, 3)]
   e                                              [(wit1, 2, 5)]
d  e                                              [(wit1, 4, 5)]
e  c                                              [(wit2, 1, 2)]
   d                                              [(wit2, 1, 3)]
d  b                     

In [6]:
# name the two MultiIndex levels, then move them into ordinary columns alongside the location lists
csDf = csSeries.rename_axis(["first", "second"]).reset_index(name="locations")
csDf

Unnamed: 0,first,second,locations
0,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]"
1,a,c,"[(wit1, 0, 2), (wit2, 0, 2)]"
2,a,a,"[(wit1, 0, 3)]"
3,a,d,"[(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]"
4,a,e,"[(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]"
5,b,c,"[(wit1, 1, 2)]"
6,b,a,"[(wit1, 1, 3)]"
7,b,d,"[(wit1, 1, 4)]"
8,b,e,"[(wit1, 1, 5)]"
9,c,a,"[(wit1, 2, 3)]"


In [7]:
# for each skipgram, record which witnesses contain it and whether any witness contains it more than once
#   local_witnesses: one siglum per occurrence (so a siglum repeats when the skipgram repeats in that witness)
#   unique_witnesses: the same sigla with duplicates removed
#   witness_uniqueness: True when the two counts agree, i.e., no witness has the skipgram more than once
csDf["local_witnesses"] = csDf["locations"].map(lambda occurrences: [occurrence[0] for occurrence in occurrences])
csDf["unique_witnesses"] = csDf["local_witnesses"].map(set)
csDf["local_witnessCount"] = csDf["local_witnesses"].map(len)
csDf["unique_witnessCount"] = csDf["unique_witnesses"].map(len)
csDf["witness_uniqueness"] = csDf["local_witnessCount"].eq(csDf["unique_witnessCount"])
csDf

Unnamed: 0,first,second,locations,local_witnesses,unique_witnesses,local_witnessCount,unique_witnessCount,witness_uniqueness
0,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]","[wit1, wit3]","{wit3, wit1}",2,2,True
1,a,c,"[(wit1, 0, 2), (wit2, 0, 2)]","[wit1, wit2]","{wit2, wit1}",2,2,True
2,a,a,"[(wit1, 0, 3)]",[wit1],{wit1},1,1,True
3,a,d,"[(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]","[wit1, wit1, wit2, wit3]","{wit3, wit2, wit1}",4,3,False
4,a,e,"[(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]","[wit1, wit1, wit2]","{wit2, wit1}",3,2,False
5,b,c,"[(wit1, 1, 2)]",[wit1],{wit1},1,1,True
6,b,a,"[(wit1, 1, 3)]",[wit1],{wit1},1,1,True
7,b,d,"[(wit1, 1, 4)]",[wit1],{wit1},1,1,True
8,b,e,"[(wit1, 1, 5)]",[wit1],{wit1},1,1,True
9,c,a,"[(wit1, 2, 3)]",[wit1],{wit1},1,1,True


In [8]:
# flag skipgrams in which both tokens are stopwords (if so, we’ll process them last)
csDf["stopwords"] = csDf["first"].isin(stoplist) & csDf["second"].isin(stoplist)
csDf

Unnamed: 0,first,second,locations,local_witnesses,unique_witnesses,local_witnessCount,unique_witnessCount,witness_uniqueness,stopwords
0,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]","[wit1, wit3]","{wit3, wit1}",2,2,True,False
1,a,c,"[(wit1, 0, 2), (wit2, 0, 2)]","[wit1, wit2]","{wit2, wit1}",2,2,True,True
2,a,a,"[(wit1, 0, 3)]",[wit1],{wit1},1,1,True,True
3,a,d,"[(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]","[wit1, wit1, wit2, wit3]","{wit3, wit2, wit1}",4,3,False,False
4,a,e,"[(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]","[wit1, wit1, wit2]","{wit2, wit1}",3,2,False,False
5,b,c,"[(wit1, 1, 2)]",[wit1],{wit1},1,1,True,False
6,b,a,"[(wit1, 1, 3)]",[wit1],{wit1},1,1,True,False
7,b,d,"[(wit1, 1, 4)]",[wit1],{wit1},1,1,True,False
8,b,e,"[(wit1, 1, 5)]",[wit1],{wit1},1,1,True,False
9,c,a,"[(wit1, 2, 3)]",[wit1],{wit1},1,1,True,True


In [9]:
# sort and renumber the rows so that the skipgrams can be traversed in this priority order
#   (the stopword flag is not currently used to filter)
#   1. skipgrams that do not repeat within any witness first
#   2. within that, deepest block (most witnesses) first
#   3. within that, rarest skipgrams first (less repetition is easier to place correctly)
csDf = (
    csDf
    .sort_values(
        by=["witness_uniqueness", "unique_witnessCount", "local_witnessCount"],
        ascending=[False, False, True],
    )
    .reset_index(drop=True)  # renumber rows to match the new order
)
csDf

Unnamed: 0,first,second,locations,local_witnesses,unique_witnesses,local_witnessCount,unique_witnessCount,witness_uniqueness,stopwords
0,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]","[wit1, wit3]","{wit3, wit1}",2,2,True,False
1,a,c,"[(wit1, 0, 2), (wit2, 0, 2)]","[wit1, wit2]","{wit2, wit1}",2,2,True,True
2,c,d,"[(wit1, 2, 4), (wit2, 2, 3)]","[wit1, wit2]","{wit2, wit1}",2,2,True,False
3,a,a,"[(wit1, 0, 3)]",[wit1],{wit1},1,1,True,True
4,b,c,"[(wit1, 1, 2)]",[wit1],{wit1},1,1,True,False
5,b,a,"[(wit1, 1, 3)]",[wit1],{wit1},1,1,True,False
6,b,d,"[(wit1, 1, 4)]",[wit1],{wit1},1,1,True,False
7,b,e,"[(wit1, 1, 5)]",[wit1],{wit1},1,1,True,False
8,c,a,"[(wit1, 2, 3)]",[wit1],{wit1},1,1,True,True
9,c,e,"[(wit1, 2, 5)]",[wit1],{wit1},1,1,True,False


## Construct topological order of nodes from df information

### `Node` class

In [10]:
class Node:
    """A node in the alignment: one normalized reading plus the witness tokens that carry it.

    Attributes:
        norm: string value of the node. Here all values are pre-normalized, so the n value on the
            node equals the (implicit) t value of the witness tokens; in Real Life the witness
            tokens will have t values that may differ from the shared n value on the node.
        tokendata: dict of siglum -> offset, one entry per witness token on this node
            (empty for the start and end nodes).
        rank: initialized to None; assigned elsewhere once the node order is known.
    """

    def __init__(self, norm):
        self.norm = norm
        self.tokendata = {}
        self.rank = None

    def __repr__(self):
        """Represent the node by its normalized string value."""
        return self.norm

    def __lt__(self, other):
        """Order nodes by their norm values, which makes lists of nodes sortable."""
        return self.norm < other.norm

    def add_location(self, siglum, offset):
        """Record that the token at `offset` in witness `siglum` is on this node (overwrites any earlier offset for that witness)."""
        self.tokendata[siglum] = offset

# TODO: do we need nodes to be sortable by n value?

### Debugging configuration

In [11]:
# development aid: clear every "already processed" bit so the construction cell below can be re-run
#   (comment out this reset for production)
for flags in bitArrays.values():
    flags.setall(0)
debug = True # when True, the processing cells print descriptive reports as they run

### Construct topological order (`toList`)

In [12]:
# Build toList, the ordered list of nodes, by traversing the skipgram rows of csDf in their sorted order.
#   Each row supplies two normalized tokens (first, second) and the witness offsets of every occurrence.
#   Every (siglum, offset) token that has not been placed yet is either added to an existing node with
#   the same norm value or inserted as a new node; bitArrays records which tokens have been placed.
toList = [Node('#start'), Node('#end')] # initialize with start and end nodes, which have no tokens
for row in csDf.itertuples(index=False): # process df rows in order
    if debug:
        print("\nProcessing:", row)
    for position, norm in enumerate([row.first, row.second]): # position (0, 1) is first or second skipgram token
        for location in row.locations:
            if debug: # print current value of toList
                print("\ntoList at start of current pass is")
                for node in toList:
                    print(node, node.tokendata)
                print("\nProcessing token", norm, "(",["first", "second"][position] , 
                  "value of skipgram", ''.join([row.first, row.second]), ") in", location)
            ###
            # what siglum and offset are we looking for?
            #   location is (siglum, offset of first token, offset of second token)
            ###
            siglum = location[0]
            offset = location[position + 1]
            if debug:
                print("Have we processed this? ", bitArrays[siglum][offset])
            ###
            # do we need to process it, or have we already taken care of it?
            ###
            if bitArrays[siglum][offset]: # already placed, so move on to the next location
                if debug:
                    print('skipping:', norm, 'from skipgram', ''.join([row.first, row.second]), 'at', location)
                continue
            ###
            # not yet placed, so find the range of positions (floor to ceiling) where it may go
            #   floor: one past the last scanned node that holds an earlier token of this witness
            #   ceiling: the first node that holds a later token of this witness (default: the #end node)
            ###
            floor = 0
            ceiling = len(toList) - 1
            for nodePos, scannedNode in enumerate(toList):
                currentDict = scannedNode.tokendata # siglum -> offset for the tokens on this node
                if debug:
                    print('Finding floor and ceiling; currentDict for', scannedNode.norm, '= ', currentDict, "at rank", nodePos)
                if siglum not in currentDict: # this node has no token from our witness; look at the next one
                    continue
                # equality cannot occur here, because already-placed tokens were skipped above
                if currentDict[siglum] < offset: # move up the floor if the new offset is greater than one already there
                    floor = nodePos + 1
                else: # we’ve hit the ceiling if the new offset is less than the old one
                    ceiling = nodePos
                    break
            if debug:
                print("Floor:", floor, "; ceiling:", ceiling)
            ###
            # scan from floor to ceiling, looking for an existing node with a matching norm value
            #
            # if there is one, remember it (and its position) but don't modify it yet
            # TODO: this gets the leftmost if there is more than one, which is not necessarily optimal
            ###
            modifyMe = None # existing node to modify; None means that a new node must be created
            modifyPos = None # position of modifyMe in toList (used only for the debug report)
            for pos in range(floor, ceiling):
                if toList[pos].norm == norm:
                    modifyMe = toList[pos]
                    modifyPos = pos
                    break
            ###
            # if there is a node to modify, do it; otherwise insert a new node at the ceiling
            # TODO: why at the ceiling?
            ###
            if modifyMe is None: # create and insert new node
                new_token = Node(norm)
                new_token.add_location(siglum, offset)
                toList.insert(ceiling, new_token)
                if debug: # report the position actually used for the insertion, not the last scanned position
                    print('inserting', siglum,':',offset,'as rank',ceiling)
            else: # modify existing node
                if debug: # report the position of the node being modified, not the last scanned position
                    print('adding', siglum,':',offset,'to',modifyMe,modifyMe.tokendata,'at rank',modifyPos)
                modifyMe.add_location(siglum, offset)

            bitArrays[siglum][offset] = 1  # record that we've processed this token
if debug:
    print(bitArrays)


Processing: Pandas(first='a', second='b', locations=[('wit1', 0, 1), ('wit3', 0, 2)], local_witnesses=['wit1', 'wit3'], unique_witnesses={'wit3', 'wit1'}, local_witnessCount=2, unique_witnessCount=2, witness_uniqueness=True, stopwords=False)

toList at start of current pass is
#start {}
#end {}

Processing token a ( first value of skipgram ab ) in ('wit1', 0, 1)
Have we processed this?  False
Finding floor and ceiling; currentDict for #start =  {} at rank 0
Finding floor and ceiling; currentDict for #end =  {} at rank 1
Floor: 0 ; ceiling: 1
inserting wit1 : 0 as rank 1

toList at start of current pass is
#start {}
a {'wit1': 0}
#end {}

Processing token a ( first value of skipgram ab ) in ('wit3', 0, 2)
Have we processed this?  False
Finding floor and ceiling; currentDict for #start =  {} at rank 0
Finding floor and ceiling; currentDict for a =  {'wit1': 0} at rank 1
Finding floor and ceiling; currentDict for #end =  {} at rank 2
Floor: 0 ; ceiling: 2
adding wit3 : 0 to a {'wit1': 0}

### Build list of edges for each witness

In [13]:
# Walk toList once and, for every witness, chain together the nodes that hold its tokens.
edgeSets = collections.defaultdict(list)  # siglum -> list of (source node, target node) tuples
edgeSourceByWitness = {}  # siglum -> most recent node on that witness's path (source of its next edge)
for node in toList:  # node.norm is str; node.tokendata is dict with siglum:offset items
    if node.norm == '#start':
        # the start node is never an edge target; it is the initial source for every witness
        edgeSourceByWitness.update(dict.fromkeys(witnessData, node))
    elif node.norm == '#end':
        # close every witness path with an edge into the end node
        for siglum in witnessData:
            edgeSets[siglum].append((edgeSourceByWitness[siglum], node))
    else:
        for siglum in node.tokendata:
            # extend this witness's path to the current node, which becomes the next source
            edgeSets[siglum].append((edgeSourceByWitness[siglum], node))
            edgeSourceByWitness[siglum] = node
edges = {edge for witnessEdges in edgeSets.values() for edge in witnessEdges}  # distinct (source, target) node tuples

### Index from edge target to source for calculating rank

In [14]:
# findMySources: target node -> list of the source nodes of all edges that point at it
findMySources = collections.defaultdict(list)
for source, target in edges:
    findMySources[target].append(source)

### Rank nodes in toList

In [15]:
# a node's rank is one more than the highest rank among its sources; a node without sources gets rank 0
#   nodes are visited in toList order, so the ranks of a node's sources are expected to be set already
for item in toList:
    sourceRanks = [source.rank for source in findMySources[item]]
    item.rank = max(sourceRanks, default=-1) + 1
node_table = pd.DataFrame(
    [(item, item.tokendata, item.rank) for item in toList],
    columns=["norm", "tokendata", "rank"],
)
node_table

Unnamed: 0,norm,tokendata,rank
0,#start,{},0
1,a,"{'wit1': 0, 'wit3': 0, 'wit2': 0}",1
2,d,{'wit3': 1},2
3,b,"{'wit1': 1, 'wit3': 2}",3
4,e,{'wit2': 1},2
5,c,"{'wit1': 2, 'wit2': 2}",4
6,a,{'wit1': 3},5
7,d,"{'wit1': 4, 'wit2': 3}",6
8,e,{'wit1': 5},7
9,#end,{},8


### Index from rank to nodes for retrieval when building table by columns/ranks

In [16]:
# nodesByRank: rank -> list of the nodes at that rank, in toList order
nodesByRank = collections.defaultdict(list)
for rankedNode in toList:
    nodesByRank[rankedNode.rank].append(rankedNode)

## Create alignment table

In [17]:
# Render the alignment as a table with one row per witness and one column per rank.
table = PrettyTable(header=False)
orderedSigla = sorted(witnessData.keys())
table.add_column(None, list(orderedSigla))  # first column holds the sigla
for rank in sorted(nodesByRank):  # add a column for each rank, explicitly in rank order
    columnTokens = {}  # siglum -> norm value shown for that witness in this column
    boundaryMarker = None  # '#start' or '#end' when this rank holds one of those token-less nodes
    for node in nodesByRank[rank]:  # merge the tokens of all nodes at this rank into a single dictionary
        if node.norm in ("#start", "#end"):
            boundaryMarker = node.norm
        for siglum in node.tokendata:
            columnTokens[siglum] = node.norm
    # witnesses without a token at this rank show the boundary marker (start/end columns) or an empty cell
    filler = '' if boundaryMarker is None else boundaryMarker
    columnData = [columnTokens.get(siglum, filler) for siglum in orderedSigla]
    table.add_column(None, columnData)
print(table)

+------+--------+---+---+---+---+---+---+---+------+
| wit1 | #start | a |   | b | c | a | d | e | #end |
| wit2 | #start | a | e |   | c |   | d |   | #end |
| wit3 | #start | a | d | b |   |   |   |   | #end |
+------+--------+---+---+---+---+---+---+---+------+


### Expected output

siglum | token | token | token | token | token | token | token | token
---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ----
wit1  | #start | a | b | c | a | d | e | #end
wit2  | #start | a | e | c | - | d | - | #end
wit3  | #start | a | - | - | - | d | b | #end

## Optional df profiling information 

In [18]:
if debug:
    # list the final node order together with the witness tokens on each node
    for listedNode in toList:
        print(listedNode, listedNode.tokendata)
    # build and show the profiling report for the skipgram df
    profile = pandas_profiling.ProfileReport(csDf)
    display(profile)

#start {}
a {'wit1': 0, 'wit3': 0, 'wit2': 0}
d {'wit3': 1}
b {'wit1': 1, 'wit3': 2}
e {'wit2': 1}
c {'wit1': 2, 'wit2': 2}
a {'wit1': 3}
d {'wit1': 4, 'wit2': 3}
e {'wit1': 5}
#end {}


0,1
Number of variables,9
Number of observations,16
Total Missing (%),0.0%
Total size in memory,1008.0 B
Average record size in memory,63.0 B

0,1
Numeric,1
Categorical,2
Boolean,2
Date,0
Text (Unique),0
Rejected,1
Unsupported,3

0,1
Distinct count,5
Unique (%),31.2%
Missing (%),0.0%
Missing (n),0

0,1
a,5
b,4
c,3
Other values (2),4

Value,Count,Frequency (%),Unnamed: 3
a,5,31.2%,
b,4,25.0%,
c,3,18.8%,
d,2,12.5%,
e,2,12.5%,

0,1
Distinct count,4
Unique (%),25.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.5
Minimum,1
Maximum,4
Zeros (%),0.0%

0,1
Minimum,1.0
5-th percentile,1.0
Q1,1.0
Median,1.0
Q3,2.0
95-th percentile,3.25
Maximum,4.0
Range,3.0
Interquartile range,1.0

0,1
Standard deviation,0.89443
Coef of variation,0.59628
Kurtosis,3.2967
Mean,1.5
MAD,0.6875
Skewness,1.9166
Sum,24
Variance,0.8
Memory size,208.0 B

Value,Count,Frequency (%),Unnamed: 3
1,11,68.8%,
2,3,18.8%,
4,1,6.2%,
3,1,6.2%,

Value,Count,Frequency (%),Unnamed: 3
1,11,68.8%,
2,3,18.8%,
3,1,6.2%,
4,1,6.2%,

Value,Count,Frequency (%),Unnamed: 3
1,11,68.8%,
2,3,18.8%,
3,1,6.2%,
4,1,6.2%,

Unsupported value

Unsupported value

0,1
Distinct count,5
Unique (%),31.2%
Missing (%),0.0%
Missing (n),0

0,1
d,4
e,4
c,3
Other values (2),5

Value,Count,Frequency (%),Unnamed: 3
d,4,25.0%,
e,4,25.0%,
c,3,18.8%,
a,3,18.8%,
b,2,12.5%,

0,1
Distinct count,2
Unique (%),12.5%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.1875

0,1
True,3
(Missing),13

Value,Count,Frequency (%),Unnamed: 3
True,3,18.8%,
(Missing),13,81.2%,

0,1
Correlation,0.96309

Unsupported value

0,1
Distinct count,2
Unique (%),12.5%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.875

0,1
True,14
(Missing),2

Value,Count,Frequency (%),Unnamed: 3
True,14,87.5%,
(Missing),2,12.5%,

Unnamed: 0,first,second,locations,local_witnesses,unique_witnesses,local_witnessCount,unique_witnessCount,witness_uniqueness,stopwords
0,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]","[wit1, wit3]","{wit3, wit1}",2,2,True,False
1,a,c,"[(wit1, 0, 2), (wit2, 0, 2)]","[wit1, wit2]","{wit2, wit1}",2,2,True,True
2,c,d,"[(wit1, 2, 4), (wit2, 2, 3)]","[wit1, wit2]","{wit2, wit1}",2,2,True,False
3,a,a,"[(wit1, 0, 3)]",[wit1],{wit1},1,1,True,True
4,b,c,"[(wit1, 1, 2)]",[wit1],{wit1},1,1,True,False


In [19]:
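# NOTE(review): the commented-out code below appears to be an earlier draft of the toList construction loop;
#   it refers to `csList`, which is not defined in the cells shown here. Kept for reference only.
# for skipgram in csList: # skipgram is a tuple of skipgram items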
#     locations = csTable[skipgram]  # list of three-item tuples of (siglum, location0, location1)
#     for skipgramPos in range(len(skipgram)):  # loop over head and tail by position ([0, 1])
#         norm = skipgram[skipgramPos]  # get normalized value of each token in skipgram by position
#         for location in locations:  # for each token, get witness and offset within witness
#             siglum = location[0]  # witness identifier
#             offset = location[skipgramPos + 1]  # offset of token within witness
#             if bitArrays[siglum][offset] == 1: # already set, so break for this location
#                 # print('skipping: ', norm, ' from ', skipgram, ' at ', location)
#                 break
#             floor = 0
#             ceiling = len(toList) - 1
#             modifyMe = None  # existing toList entry to be modified; if None, create a new one
#             for dictPos in range(len(toList)): # determine floor and ceiling
#                 currentDict = toList[dictPos].tokendata
#                 if siglum not in currentDict.keys():  # dictionary isn't relevant; check the next item in toList
#                     pass
#                 else:  # it can't be equal, since we used the bitarray to filter those out
#                     if currentDict[siglum] < offset:
#                         floor = dictPos
#                     else:
#                         ceiling = dictPos
#                         break
#             # scan from floor to ceiling, looking for matching 'norm' value
#             # if there is a dictionary to modify, save it as modifyMe (don't modify it yet)
#             # TODO: this gets the leftmost if there is more than one, which is not necessarily optimal
#             for pos in range(floor, ceiling):
#                 if toList[pos].norm == norm:
#                     modifyMe = toList[pos]
#                     break
#             # if there is a dictionary to modify, do it; otherwise insert a new dictionary at the ceiling
#             if modifyMe is None:
#                 new_token = Node(norm)
#                     new_token.add_location(siglum, offset)
#                     toList.insert(ceiling, new_token)
#                 else:
#                     # print('adding', siglum,':',offset,'to',modifyMe,modifyMe.tokendata)
#                     modifyMe.tokendata[siglum] = offset
#                 bitArrays[siglum][offset] = 1  # record that we've processed this token
#                 # print('added: ', norm, ' from ', skipgram, ' at ', location,
#                 #       ' with floor=', floor, ' and ceiling=', ceiling, sep='')
