In [65]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [66]:
from decision_tree import *
import networkx as nx
import matplotlib.pyplot as plt
import pygraphviz as pgv
from io import BytesIO
from IPython.display import Image, display

In [67]:
def show_tree(_dtNode: dtNode):
    """Print an empty directed pygraphviz graph (work-in-progress stub).

    The ``_dtNode`` argument is accepted but not read yet: no nodes or edges
    are added to the graph before it is printed.
    """
    print(pgv.AGraph(directed=True))

In [68]:
# Toy corpus keyed by siglum; each value is that witness's list of normalized
# tokens. Tokens deliberately repeat within and across witnesses.
witnessData = {
    "wit1": "a b c a d e".split(),
    "wit2": "a e c d".split(),
    "wit3": "a d b".split(),
}

In [69]:
# Made-up stopword set, used to check that stopwords can be recognised
# (so they can be processed last).
stoplist = {"a", "c"}

In [70]:
# bitArray_dict records which witness tokens have already been processed:
# one bitarray per siglum, with one bit per token of that witness.
bitArray_dict = {siglum: bitarray(len(tokens)) for siglum, tokens in witnessData.items()}
# The bitarrays are explicitly zeroed after construction: nothing has been processed yet.
for ba in bitArray_dict.values():
    ba.setall(0)

In [71]:
# csTable maps each skip bigram to every place it occurs.
#   key:   (token[0], token[1]) -- the two normalized token values of the skipgram
#   value: list of (siglum, offset[0], offset[1]) tuples, one per occurrence
#     In Real Life:
#       values will also carry the t values that correspond to the normalized tokens
#       use a named tuple or dataclass (https://realpython.com/python-data-classes/)
# This sample collects *all* skip bigrams; in Real Life these would be parameters:
#   skipgram size (bi-, tri-, etc.; here bi-)
#   window size (maximum distance from first to last member; here the full witness length)
#   maximum skip between members (here limited only by the window size)
csTable = collections.defaultdict(list)
for siglum, tokens in witnessData.items():  # tokens: normalized readings of one witness
    # in Real Life each token would also have a non-normalized t property
    for left_pos, left_token in enumerate(tokens):  # every possible first member
        for right_pos in range(left_pos + 1, len(tokens)):  # every later token as second member
            bigram = (left_token, tokens[right_pos])
            csTable[bigram].append((siglum, left_pos, right_pos))

In [72]:
# Build a Series first rather than a DataFrame directly, because the location
# lists have different lengths. The (first, second) tuple keys of csTable become
# the index (a MultiIndex, flattened in the next step); each value is the list
# of locations for that skipgram.
csSeries = pd.Series(csTable)

In [73]:
# Turn the series into a one-column frame, move the two MultiIndex levels into
# ordinary columns, then name the three resulting columns.
csDf = csSeries.to_frame().reset_index()
csDf.columns = ["first", "second", "locations"]

In [74]:
# Per-skipgram witness statistics, added as new columns of csDf:
#   local_witnesses      siglum of every occurrence (a witness appears once per occurrence)
#   unique_witnesses     the distinct sigla (duplicates removed by converting to a set)
#   local_witnessCount   total number of occurrences
#   unique_witnessCount  number of distinct witnesses (depth of block)
#   witness_uniqueness   True when no witness contains the skipgram more than once
#   priority             weighted sum of the three statistics, weights taken from `scale`
csDf["local_witnesses"] = csDf["locations"].map(lambda x: [location[0] for location in x])
csDf["unique_witnesses"] = csDf["local_witnesses"].map(set)
csDf["local_witnessCount"] = csDf["local_witnesses"].str.len()
csDf["unique_witnessCount"] = csDf["unique_witnesses"].str.len()
csDf["witness_uniqueness"] = csDf["local_witnessCount"] == csDf["unique_witnessCount"]
# Weights, in the same order as the columns selected below.
# NOTE(review): the signs of the last two weights do not match the direction in
# which the sorting cell orders those columns -- confirm which one is intended.
scale = pd.Series([100, -1, 10])
# `pd.np` was deprecated in pandas 1.0 and removed in 2.0, so use DataFrame.dot
# instead. The weights are passed as a plain array because DataFrame.dot aligns
# a Series operand on labels, and scale is labelled 0..2 rather than by column
# name. Casting to int turns the boolean column into 0/1 and keeps the result
# numeric instead of object dtype.
csDf["priority"] = (
    csDf[["unique_witnessCount", "witness_uniqueness", "local_witnessCount"]]
    .astype(int)
    .dot(scale.values)
)

In [75]:
# Flag skipgrams whose two tokens are both stopwords (if so, we’ll process them last).
csDf["stopwords"] = csDf[["first", "second"]].isin(stoplist).all(axis=1)

In [76]:
# Order the skipgrams for traversal and renumber the rows to match
#   (the stopword flag is not used for filtering or ordering here)
#   1. deepest block (most distinct witnesses) first
#   2. within that, skipgrams that do not repeat inside any witness first
#   3. within that, rarest skipgrams first (less repetition is easier to place correctly)
csDf = (
    csDf
    .sort_values(
        by=["unique_witnessCount", "witness_uniqueness", "local_witnessCount"],
        ascending=[False, False, True],
    )
    .reset_index(drop=True)  # row numbers now follow the sorted order
)

In [77]:
# Root of the decision tree. It is built from:
#   - a toList holding only the #start and #end nodes (no witness tokens placed yet)
#   - "[none]" in the second position (presumably the placed-skipgram label; confirm
#     against decision_tree.dtNode)
#   - bitArray_dict, still all zeros
#   - the complete, sorted csDf
dtRoot = dtNode([Node("#start"), Node("#end")], "[none]", bitArray_dict, csDf)

In [80]:
# Expand the decision tree three levels below the root and print a report for
# every node reached.
#
# Each level follows the same pattern: first expand a node completely (one child
# per candidate row), then visit its children. Visiting the children inside the
# expansion loop would report and re-expand earlier siblings again after every
# new expansion, adding duplicate children to the tree.
#
# NOTE(review): expansion mutates dtRoot in place, so re-running this cell
# without rebuilding dtRoot will presumably add the same children again.


def _expand_in_place(node: dtNode, df: pd.DataFrame, echo: bool = False) -> None:
    """Add one child to ``node`` for every candidate row that ``step(df)`` selects.

    Args:
        node: tree node to expand; it is modified in place by ``expand_dtNode``.
        df: skipgram table handed to ``step`` to obtain the candidate rows.
        echo: when True, print ``node`` after each expansion.
    """
    current, remainder = step(df)  # current is rows to add, remainder is what’s left
    for position in range(len(current)):
        # Leave the chosen row out by position. A label-based drop(position)
        # only hits the right row when the index labels happen to be 0..n-1.
        unplaced = pd.concat([current.iloc[:position], current.iloc[position + 1:], remainder])
        expand_dtNode(node, current.iloc[position, :], unplaced)  # expands in place, adds children
        if echo:
            print(node)


def _report(heading: str, node: dtNode) -> None:
    """Print the heading, alignment table, node, score and placement summary for ``node``."""
    print(heading)
    print_alignment_table(node, witnessData, True)
    print(node)
    print_score(node)
    print("Placed skipgram:", node.skipgram)
    # The reported value is a fraction (e.g. 0.46), not a number out of 100.
    print("Percentage of witness tokens placed:", print_placed_witness_tokens(node))


parent: dtNode = dtRoot  # node to expand
_expand_in_place(parent, csDf, echo=True)
for child in parent.children:
    _report("\nOne level down", child)  # reported before expanding
    _expand_in_place(child, child.df)
    for grandchild in child.children:
        _report("\nTwo levels down", grandchild)  # reported before expanding
        _expand_in_place(grandchild, grandchild.df)
        for greatgrandchild in grandchild.children:
            _report("\nThree levels down", greatgrandchild)  # leaves of this walk; not expanded


There are 2 choices at this level
[#start, #end]

One level down
+------+--------+------+------+------+
| wit1 | #start | a(0) | d(4) | #end |
| wit2 | #start | a(0) | d(3) | #end |
| wit3 | #start | a(0) | d(1) | #end |
+------+--------+------+------+------+
[#start, a, d, #end]
Score (witness tokens / toList length):  3.0
Placed skipgram: ad
Percentage of witness tokens placed: 0.46153846153846156

There are 2 choices at this level

Two levels down
+------+--------+------+------+------+------+------+
| wit1 | #start | a(0) |      | d(4) | e(5) | #end |
| wit2 | #start | a(0) | e(1) | d(3) |      | #end |
| wit3 | #start | a(0) |      | d(1) |      | #end |
+------+--------+------+------+------+------+------+
[#start, a, e, d, e, #end]
Score (witness tokens / toList length):  2.0
Placed skipgram: ae
Percentage of witness tokens placed: 0.6153846153846154

There are 1 choices at this level

Three levels down
+------+--------+------+------+------+------+------+
| wit1 | #start | a(0) |

In [79]:
# Display the node still bound to `child`, i.e. the last one visited by the
# `for child in parent.children` loop in the tree-processing cell.
child

[#start, a, d, #end]