In [29]:
import pandas as pd
import numpy as np
df = pd.DataFrame()
import collections
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 150)

In [30]:
# sample data and fake stoplist
witnessData = {'wit1': ['a', 'b', 'c', 'a', 'd', 'e'],
               'wit2': ['a', 'e', 'c', 'd'],
               'wit3': ['a', 'd', 'b']}
stoplist = {'a', 'c'} # set

In [31]:
# csTable: list of dictionaries, in each of which
#   key is a two-item tuples representing skip token bigrams: (token[0], token[1])
#   value is a list of three-item tuples where the skipgram key occurs: (siglum, offset[0], offset[1])
csTable = collections.defaultdict(list)
for key, value in witnessData.items():
    for first in range(len(value)):
        for second in range(first + 1, len(value)):
            csTable[(value[first], value[second])].append((key, first, second))
csTable

defaultdict(list,
            {('a', 'b'): [('wit1', 0, 1), ('wit3', 0, 2)],
             ('a', 'c'): [('wit1', 0, 2), ('wit2', 0, 2)],
             ('a', 'a'): [('wit1', 0, 3)],
             ('a', 'd'): [('wit1', 0, 4),
              ('wit1', 3, 4),
              ('wit2', 0, 3),
              ('wit3', 0, 1)],
             ('a', 'e'): [('wit1', 0, 5), ('wit1', 3, 5), ('wit2', 0, 1)],
             ('b', 'c'): [('wit1', 1, 2)],
             ('b', 'a'): [('wit1', 1, 3)],
             ('b', 'd'): [('wit1', 1, 4)],
             ('b', 'e'): [('wit1', 1, 5)],
             ('c', 'a'): [('wit1', 2, 3)],
             ('c', 'd'): [('wit1', 2, 4), ('wit2', 2, 3)],
             ('c', 'e'): [('wit1', 2, 5)],
             ('d', 'e'): [('wit1', 4, 5)],
             ('e', 'c'): [('wit2', 1, 2)],
             ('e', 'd'): [('wit2', 1, 3)],
             ('d', 'b'): [('wit3', 1, 2)]})

In [32]:
# convert to series before df since list lengths vary
csSeries = pd.Series(csTable)
csSeries

a  b                                [(wit1, 0, 1), (wit3, 0, 2)]
   c                                [(wit1, 0, 2), (wit2, 0, 2)]
   a                                              [(wit1, 0, 3)]
   d    [(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]
   e                  [(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]
b  c                                              [(wit1, 1, 2)]
   a                                              [(wit1, 1, 3)]
   d                                              [(wit1, 1, 4)]
   e                                              [(wit1, 1, 5)]
c  a                                              [(wit1, 2, 3)]
   d                                [(wit1, 2, 4), (wit2, 2, 3)]
   e                                              [(wit1, 2, 5)]
d  e                                              [(wit1, 4, 5)]
e  c                                              [(wit2, 1, 2)]
   d                                              [(wit2, 1, 3)]
d  b                     

In [33]:
# convert series to dataframe, flatten multi-index, label columns
csDf = pd.DataFrame(csSeries).reset_index()
csDf.columns = [["first", "second", "locations"]]
csDf

Unnamed: 0,first,second,locations
0,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]"
1,a,c,"[(wit1, 0, 2), (wit2, 0, 2)]"
2,a,a,"[(wit1, 0, 3)]"
3,a,d,"[(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]"
4,a,e,"[(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]"
5,b,c,"[(wit1, 1, 2)]"
6,b,a,"[(wit1, 1, 3)]"
7,b,d,"[(wit1, 1, 4)]"
8,b,e,"[(wit1, 1, 5)]"
9,c,a,"[(wit1, 2, 3)]"


In [34]:
# count witnesses for each skipgram (depth)
csDf["witnessCount"] = csDf["locations"].apply(lambda locationList: len({item[0] for tuples in locationList for item in tuples}), axis=1)
csDf

Unnamed: 0,first,second,locations,witnessCount
0,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]",2
1,a,c,"[(wit1, 0, 2), (wit2, 0, 2)]",2
2,a,a,"[(wit1, 0, 3)]",1
3,a,d,"[(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]",3
4,a,e,"[(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]",2
5,b,c,"[(wit1, 1, 2)]",1
6,b,a,"[(wit1, 1, 3)]",1
7,b,d,"[(wit1, 1, 4)]",1
8,b,e,"[(wit1, 1, 5)]",1
9,c,a,"[(wit1, 2, 3)]",1


In [35]:
# count total frequency of each skipgram
csDf["locationCount"] = csDf["locations"].T.apply(lambda x: len(x[0]))
csDf

Unnamed: 0,first,second,locations,witnessCount,locationCount
0,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]",2,2
1,a,c,"[(wit1, 0, 2), (wit2, 0, 2)]",2,2
2,a,a,"[(wit1, 0, 3)]",1,1
3,a,d,"[(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]",3,4
4,a,e,"[(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]",2,3
5,b,c,"[(wit1, 1, 2)]",1,1
6,b,a,"[(wit1, 1, 3)]",1,1
7,b,d,"[(wit1, 1, 4)]",1,1
8,b,e,"[(wit1, 1, 5)]",1,1
9,c,a,"[(wit1, 2, 3)]",1,1


In [36]:
# are both tokens are stopwords?
csDf["stopwords"] = csDf[["first","second"]].T.isin(stoplist).all()
csDf

Unnamed: 0,first,second,locations,witnessCount,locationCount,stopwords
0,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]",2,2,False
1,a,c,"[(wit1, 0, 2), (wit2, 0, 2)]",2,2,True
2,a,a,"[(wit1, 0, 3)]",1,1,True
3,a,d,"[(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]",3,4,False
4,a,e,"[(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]",2,3,False
5,b,c,"[(wit1, 1, 2)]",1,1,False
6,b,a,"[(wit1, 1, 3)]",1,1,False
7,b,d,"[(wit1, 1, 4)]",1,1,False
8,b,e,"[(wit1, 1, 5)]",1,1,False
9,c,a,"[(wit1, 2, 3)]",1,1,True


In [37]:
# reset multindex column heads to strings
csDf.columns = [item[0] for item in csDf.columns.to_flat_index()]

In [38]:
# sort in place and update row numbers for new order
csDf.sort_values(by=["stopwords", "witnessCount", "locationCount"], ascending=[True, False, True], inplace=True)
csDf.reset_index(inplace=True)
csDf

Unnamed: 0,index,first,second,locations,witnessCount,locationCount,stopwords
0,3,a,d,"[(wit1, 0, 4), (wit1, 3, 4), (wit2, 0, 3), (wit3, 0, 1)]",3,4,False
1,0,a,b,"[(wit1, 0, 1), (wit3, 0, 2)]",2,2,False
2,10,c,d,"[(wit1, 2, 4), (wit2, 2, 3)]",2,2,False
3,4,a,e,"[(wit1, 0, 5), (wit1, 3, 5), (wit2, 0, 1)]",2,3,False
4,5,b,c,"[(wit1, 1, 2)]",1,1,False
5,6,b,a,"[(wit1, 1, 3)]",1,1,False
6,7,b,d,"[(wit1, 1, 4)]",1,1,False
7,8,b,e,"[(wit1, 1, 5)]",1,1,False
8,11,c,e,"[(wit1, 2, 5)]",1,1,False
9,12,d,e,"[(wit1, 4, 5)]",1,1,False


In [39]:
# iterate over rows in order
for index, row in csDf.iterrows():
    print(row["locations"])

[('wit1', 0, 4), ('wit1', 3, 4), ('wit2', 0, 3), ('wit3', 0, 1)]
[('wit1', 0, 1), ('wit3', 0, 2)]
[('wit1', 2, 4), ('wit2', 2, 3)]
[('wit1', 0, 5), ('wit1', 3, 5), ('wit2', 0, 1)]
[('wit1', 1, 2)]
[('wit1', 1, 3)]
[('wit1', 1, 4)]
[('wit1', 1, 5)]
[('wit1', 2, 5)]
[('wit1', 4, 5)]
[('wit2', 1, 2)]
[('wit2', 1, 3)]
[('wit3', 1, 2)]
[('wit1', 0, 2), ('wit2', 0, 2)]
[('wit1', 0, 3)]
[('wit1', 2, 3)]
