# Multiple witness alignment with skipgrams, pandas version

## Load libraries and configure notebook display parameters

In [1]:
import pandas as pd
import collections # for defaultdict
from bitarray import bitarray
import pprint as pp
import pandas_profiling # https://towardsdatascience.com/10-simple-hacks-to-speed-up-your-data-analysis-in-python-ec18c6396e6b
# Widen the notebook display: show up to 100 columns and up to 150 characters per cell
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 150

## Sample data and housekeeping

In [2]:
# Sample data: three witnesses, each a list of (already normalized) tokens, with some repetition
witnessData = {
    'wit1': ['a', 'b', 'c', 'a', 'd', 'e'],
    'wit2': ['a', 'e', 'c', 'd'],
    'wit3': ['a', 'd', 'b'],
}

# Fake stoplist (a set), so that we can recognize stopwords and process them last
stoplist = {'a', 'c'}

# bitArrays tracks which witness tokens have already been processed:
#   one bitarray per witness, as long as that witness, with every bit cleared to 0
bitArrays = {}
for siglum, tokens in witnessData.items():
    ba = bitarray(len(tokens))
    ba.setall(0)
    bitArrays[siglum] = ba

## Create _common sequence table_ (`csTable`)

Record location of all skipgrams in all witnesses.

In [3]:
# csTable maps each skipgram to every place it occurs:
#   key: two-item tuple of the normalized token values of the skipgram (token[0], token[1])
#   value: list of three-item tuples, one per occurrence of the key: (siglum, offset[0], offset[1])
#     In Real Life:
#       values will include the t values corresponding to the normalized token values
#       use a named tuple or dataclass (https://realpython.com/python-data-classes/)
# This test sample collects all skip bigrams; in Real Life we would specify parameters for:
#   size of skipgram (bi, tri-, etc.; here bi-)
#   size of window (maximum distance between first and last members of skipgram; here the full witness length)
#   maximum size of skip between members of skipgram (here constrained only by size of window)
csTable = collections.defaultdict(list)
for siglum, tokens in witnessData.items():  # siglum identifies the witness; tokens are its normalized readings
    # in Real Life each token would also carry a non-normalized t property
    for headOffset, head in enumerate(tokens):  # every token serves as the first member of a bigram ...
        for tailOffset, tail in enumerate(tokens[headOffset + 1:], start=headOffset + 1):  # ... paired with each later token
            csTable[(head, tail)].append((siglum, headOffset, tailOffset))
csTable

defaultdict(list,
            {('a', 'b'): [('wit1', 0, 1), ('wit3', 0, 2)],
             ('a', 'c'): [('wit1', 0, 2), ('wit2', 0, 2)],
             ('a', 'a'): [('wit1', 0, 3)],
             ('a', 'd'): [('wit1', 0, 4),
              ('wit1', 3, 4),
              ('wit2', 0, 3),
              ('wit3', 0, 1)],
             ('a', 'e'): [('wit1', 0, 5), ('wit1', 3, 5), ('wit2', 0, 1)],
             ('b', 'c'): [('wit1', 1, 2)],
             ('b', 'a'): [('wit1', 1, 3)],
             ('b', 'd'): [('wit1', 1, 4)],
             ('b', 'e'): [('wit1', 1, 5)],
             ('c', 'a'): [('wit1', 2, 3)],
             ('c', 'd'): [('wit1', 2, 4), ('wit2', 2, 3)],
             ('c', 'e'): [('wit1', 2, 5)],
             ('d', 'e'): [('wit1', 4, 5)],
             ('e', 'c'): [('wit2', 1, 2)],
             ('e', 'd'): [('wit2', 1, 3)],
             ('d', 'b'): [('wit3', 1, 2)]})

## Construct df and add information needed for ordered traversal

In [4]:
# Go through a Series on the way to a DataFrame, because the location lists vary in length
csSeries = pd.Series(data=csTable)
csSeries  # the two-item tuple keys produce a MultiIndex, which we will want to flatten

a  b                                [(wit1, 0, 1), (wit3, 0, 2)]
   c                                [(wit1, 0, 2), (wit2, 0, 2)]
   a                                              [(wit1, 0, 3)]
   d    [(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]
   e                  [(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]
b  c                                              [(wit1, 1, 2)]
   a                                              [(wit1, 1, 3)]
   d                                              [(wit1, 1, 4)]
   e                                              [(wit1, 1, 5)]
c  a                                              [(wit1, 2, 3)]
   d                                [(wit1, 2, 4), (wit2, 2, 3)]
   e                                              [(wit1, 2, 5)]
d  e                                              [(wit1, 4, 5)]
e  c                                              [(wit2, 1, 2)]
   d                                              [(wit2, 1, 3)]
d  b                     

In [5]:
# Turn the series into a dataframe: resetting the index moves both MultiIndex levels
#   into ordinary columns, which are then given descriptive labels
csDf = csSeries.reset_index()
csDf.columns = ["first", "second", "locations"]
csDf

Unnamed: 0,first,second,locations
0,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]"
1,a,c,"[(wit1, 0, 2), (wit2, 0, 2)]"
2,a,a,"[(wit1, 0, 3)]"
3,a,d,"[(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]"
4,a,e,"[(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]"
5,b,c,"[(wit1, 1, 2)]"
6,b,a,"[(wit1, 1, 3)]"
7,b,d,"[(wit1, 1, 4)]"
8,b,e,"[(wit1, 1, 5)]"
9,c,a,"[(wit1, 2, 3)]"


In [6]:
# Depth of block: the number of distinct witnesses in which each skipgram occurs
#   the siglum is the first member of each location tuple; a set drops repeated sigla before counting
csDf["witnessCount"] = csDf["locations"].map(lambda locs: len(set(loc[0] for loc in locs)))
csDf

Unnamed: 0,first,second,locations,witnessCount
0,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]",2
1,a,c,"[(wit1, 0, 2), (wit2, 0, 2)]",2
2,a,a,"[(wit1, 0, 3)]",1
3,a,d,"[(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]",3
4,a,e,"[(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]",2
5,b,c,"[(wit1, 1, 2)]",1
6,b,a,"[(wit1, 1, 3)]",1
7,b,d,"[(wit1, 1, 4)]",1
8,b,e,"[(wit1, 1, 5)]",1
9,c,a,"[(wit1, 2, 3)]",1


In [7]:
# Uniqueness: total number of occurrences of each skipgram (length of the list in each cell)
csDf["locationCount"] = csDf["locations"].map(len)
csDf

Unnamed: 0,first,second,locations,witnessCount,locationCount
0,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]",2,2
1,a,c,"[(wit1, 0, 2), (wit2, 0, 2)]",2,2
2,a,a,"[(wit1, 0, 3)]",1,1
3,a,d,"[(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]",3,4
4,a,e,"[(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]",2,3
5,b,c,"[(wit1, 1, 2)]",1,1
6,b,a,"[(wit1, 1, 3)]",1,1
7,b,d,"[(wit1, 1, 4)]",1,1
8,b,e,"[(wit1, 1, 5)]",1,1
9,c,a,"[(wit1, 2, 3)]",1,1


In [8]:
# Are both tokens of the skipgram stopwords? (if so, we'll process them last)
#   test membership in the stoplist cell by cell, then require it of every cell in a row
csDf["stopwords"] = csDf[["first", "second"]].isin(stoplist).all(axis=1)
csDf

Unnamed: 0,first,second,locations,witnessCount,locationCount,stopwords
0,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]",2,2,False
1,a,c,"[(wit1, 0, 2), (wit2, 0, 2)]",2,2,True
2,a,a,"[(wit1, 0, 3)]",1,1,True
3,a,d,"[(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]",3,4,False
4,a,e,"[(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]",2,3,False
5,b,c,"[(wit1, 1, 2)]",1,1,False
6,b,a,"[(wit1, 1, 3)]",1,1,False
7,b,d,"[(wit1, 1, 4)]",1,1,False
8,b,e,"[(wit1, 1, 5)]",1,1,False
9,c,a,"[(wit1, 2, 3)]",1,1,True


In [9]:
# Sort and renumber the rows, so that we can traverse the skipgrams in this order:
#   1. stopword pairs last
#   2. within that, deepest block (most witnesses) first
#   3. within that, rarest skipgrams first (less repetition is easier to place correctly)
csDf = (
    csDf
    .sort_values(by=["stopwords", "witnessCount", "locationCount"], ascending=[True, False, True])
    .reset_index(drop=True)  # discard the old row numbers and renumber from 0
)
csDf

Unnamed: 0,first,second,locations,witnessCount,locationCount,stopwords
0,a,d,"[(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]",3,4,False
1,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]",2,2,False
2,c,d,"[(wit1, 2, 4), (wit2, 2, 3)]",2,2,False
3,a,e,"[(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]",2,3,False
4,b,c,"[(wit1, 1, 2)]",1,1,False
5,b,a,"[(wit1, 1, 3)]",1,1,False
6,b,d,"[(wit1, 1, 4)]",1,1,False
7,b,e,"[(wit1, 1, 5)]",1,1,False
8,c,e,"[(wit1, 2, 5)]",1,1,False
9,d,e,"[(wit1, 4, 5)]",1,1,False


## Construct topological order of nodes from df information

### `Node` class

In [10]:
class Node:
    """One entry in the topological order: a normalized value plus the witness tokens that carry it.

    Attributes:
        tokendata: dict mapping siglum to offset, one entry per witness token placed on this node;
            stays empty for the start and end nodes
        norm: string value of the node
            here all values are pre-normalized, so the n value on the node equals the (implicit)
            t value on the witness tokens; in Real Life, witness tokens will have t values that
            may differ from their shared n value that appears on the node
        rank: initialized to None
    """

    def __init__(self, norm):
        """Create a node for the normalized value `norm`, with no tokens and no rank."""
        self.tokendata = {}
        self.norm = norm
        self.rank = None

    def __repr__(self):
        """Represent the node by its normalized value."""
        return self.norm

    def __lt__(self, other):
        """Order nodes by normalized value, which makes them sortable."""
        return self.norm < other.norm

    def add_location(self, siglum, offset):
        """Record that the token at `offset` in witness `siglum` belongs to this node.

        A later call with the same siglum replaces the offset stored earlier.
        """
        self.tokendata.update({siglum: offset})

# TODO: do we need nodes to be sortable by n value?

### Construct topological order (`toList`)

In [21]:
# Development aid (comment out for production): allows the traversal to be rerun from a clean state
debug = False  # set to True to print descriptive reports during processing
for siglum in bitArrays:  # clear every bit, so that no token counts as already processed
    bitArrays[siglum].setall(0)

In [22]:
# Traverse the skipgram rows in their sorted order and, for each witness token that has not
#   been processed yet, determine the floor and ceiling positions in toList between which
#   the token must fall, then mark the token as processed in bitArrays
# NOTE(review): this cell computes floor, ceiling, and modifyMe but never consumes them, and
#   nothing is added to toList beyond the start and end nodes; the placement step appears
#   to be still pending here -- confirm before relying on toList
toList = [Node('#start'), Node('#end')]  # initialize with start and end nodes, which have no tokens
for row in csDf.itertuples(index=False):  # process df rows in order
    # each row provides first and second (normalized tokens) and offsets of instances of those tokens in witnesses
    # place by associating token, witness, and offset
    skipgram = [row.first, row.second]
    if debug:
        print("Processing:", row)
    for position, norm in enumerate(skipgram):  # position (0, 1) is first or second skipgram token
        if debug:
            print("Row", position, ", token", norm)
        for location in row.locations:  # location is (siglum, offset of first token, offset of second token)
            if debug:
                print("\nProcessing token", skipgram[position], "(", ["first", "second"][position],
                      "value of skipgram", ''.join(skipgram), ") in", location)
            siglum = location[0]
            offset = location[position + 1]  # skip past the siglum to the offset for this position
            if debug:
                print("Siglum: ", siglum, "; Location: ", offset)
                print("Have we processed this? ", bitArrays[siglum][offset])
            if bitArrays[siglum][offset]:  # already set, so break for this location
                if debug:
                    print('skipping:', norm, 'from skipgram', ''.join(skipgram), 'at', location, "\n")
                # NOTE(review): break also abandons the remaining locations of this token for the
                #   current row, not only this one -- confirm that continue is not what is wanted
                break
            floor = 0
            ceiling = len(toList) - 1
            modifyMe = None  # existing toList entry to be modified; if None, create a new one
            for nodePos, node in enumerate(toList):  # determine floor and ceiling
                currentDict = node.tokendata
                if siglum not in currentDict:  # node has no token from this witness; check the next item in toList
                    continue
                # it can't be equal, since we used the bitarray to filter those out
                if currentDict[siglum] < offset:
                    floor = nodePos  # was the undefined name dictPos, which raised NameError
                else:
                    ceiling = nodePos  # first node holding a later token of this witness bounds the search
                    break
            # print("Processed ", siglum, ': ', offset)
            bitArrays[siglum][offset] = 1  # record that we've processed this token
if debug:
    print(bitArrays)
    profile = pandas_profiling.ProfileReport(csDf)
    display(profile)


In [13]:
# for skipgram in csList: # skipgram is a tuple of skipgram items
#     locations = csTable[skipgram]  # list of three-item tuples of (siglum, location0, location1)
#     for skipgramPos in range(len(skipgram)):  # loop over head and tail by position ([0, 1])
#         norm = skipgram[skipgramPos]  # get normalized value of each token in skipgram by position
#         for location in locations:  # for each token, get witness and offset within witness
#             siglum = location[0]  # witness identifier
#             offset = location[skipgramPos + 1]  # offset of token within witness
#             if bitArrays[siglum][offset] == 1: # already set, so break for this location
#                 # print('skipping: ', norm, ' from ', skipgram, ' at ', location)
#                 break
#             floor = 0
#             ceiling = len(toList) - 1
#             modifyMe = None  # existing toList entry to be modified; if None, create a new one
#             for dictPos in range(len(toList)): # determine floor and ceiling
#                 currentDict = toList[dictPos].tokendata
#                 if siglum not in currentDict.keys():  # dictionary isn't relevant; check the next item in toList
#                     pass
#                 else:  # it can't be equal, since we used the bitarray to filter those out
#                     if currentDict[siglum] < offset:
#                         floor = dictPos
#                     else:
#                         ceiling = dictPos
#                         break
#             # scan from floor to ceiling, looking for matching 'norm' value
#             # if there is a dictionary to modify, save it as modifyMe (don't modify it yet)
#             # TODO: this gets the leftmost if there is more than one, which is not necessarily optimal
#             for pos in range(floor, ceiling):
#                 if toList[pos].norm == norm:
#                     modifyMe = toList[pos]
#                     break
#             # if there is a dictionary to modify, do it; otherwise insert a new dictionary at the ceiling
#             if modifyMe is None:
#                 new_token = Node(norm)
#                     new_token.add_location(siglum, offset)
#                     toList.insert(ceiling, new_token)
#                 else:
#                     # print('adding', siglum,':',offset,'to',modifyMe,modifyMe.tokendata)
#                     modifyMe.tokendata[siglum] = offset
#                 bitArrays[siglum][offset] = 1  # record that we've processed this token
#                 # print('added: ', norm, ' from ', skipgram, ' at ', location,
#                 #       ' with floor=', floor, ' and ceiling=', ceiling, sep='')
