In [8]:
import TreeBuild as tb
import time

In [31]:
import numpy as np
import math

'''
Helper Methods
'''

def _isLeaf_(tree, parentNode):
    return (len(tree[parentNode]['children']) == 0)

def _isPreterminal_(tree, parentNode):
    """Return True when every child of ``parentNode`` is a leaf.

    A node without children also yields True, because there is no child that
    could fail the leaf check.
    """
    return all(_isLeaf_(tree, child_id)
               for child_id in tree[parentNode]['children'])

'''
Implementation of the Collins-Duffy or Subset-Tree (SST) Kernel
'''

def _cdHelper_(tree1, tree2, node1, node2, store, lam, SST_ON):
    """Compute the kernel score for one node pair and memoise it in ``store``.

    The result is written to ``store[node1, node2]``; nothing is returned.

    Parameters:
        tree1, tree2: trees indexed by node id. Each node is a mapping with
            the keys 'children' (child node ids), 'posOrTok' (the node's own
            label) and 'childrenTok' (the labels of its children).
        node1, node2: ids of the nodes to compare, one from each tree.
        store: 2-D array indexed as ``store[node1, node2]``. A negative entry
            means "not computed yet"; any other value is a finished score.
        lam: multiplier applied to the score of every matching node pair.
        SST_ON: value added to each child-pair score before the scores are
            multiplied together (the notebook passes 1 or 0).
    """
    # Memoised: a non-negative entry has already been computed.
    if store[node1, node2] >= 0:
        return

    # A pair that involves a leaf scores zero by definition.
    if _isLeaf_(tree1, node1) or _isLeaf_(tree2, node2):
        store[node1, node2] = 0
        return

    first = tree1[node1]
    second = tree2[node2]

    # Both the node label and the labels of its children must agree,
    # otherwise the pair contributes nothing.
    same_production = (first['posOrTok'] == second['posOrTok']
                       and first['childrenTok'] == second['childrenTok'])
    if not same_production:
        store[node1, node2] = 0
        return

    # Matching nodes whose children are all leaves score exactly lam.
    if _isPreterminal_(tree1, node1) and _isPreterminal_(tree2, node2):
        store[node1, node2] = lam
        return

    # Otherwise combine the scores of the positionally aligned children.
    # Starting the product at 1 replaces the former `== None` sentinel check
    # without changing the result (1 * x == x).
    product = 1
    for idx, child1 in enumerate(first['children']):
        child2 = second['children'][idx]
        # Make sure the child pair has been scored before reading it.
        _cdHelper_(tree1, tree2, child1, child2, store, lam, SST_ON)
        product *= SST_ON + store[child1, child2]

    store[node1, node2] = lam * product
    return

def _cdKernel_(tree1, tree2, lam, SST_ON):
    """Return the sum of the pairwise node scores between two trees.

    A score table with one row per node of ``tree1`` and one column per node
    of ``tree2`` is filled by ``_cdHelper_`` and then summed. Node ids are
    taken to be ``0 .. len(tree) - 1``.
    """
    n_rows, n_cols = len(tree1), len(tree2)

    # -1 marks a cell as "not computed yet"; the helper overwrites every cell
    # it is asked about with a non-negative score.
    scores = np.full((n_rows, n_cols), -1.0)

    # Visit every (node of tree1, node of tree2) combination once.
    for row in range(n_rows):
        for col in range(n_cols):
            _cdHelper_(tree1, tree2, row, col, scores, lam, SST_ON)

    return scores.sum()

'''
Returns a tuple of the form (raw, normalized).
If NORMALIZE_FLAG is false, the normalized entry is -1.
'''
def _CollinsDuffy_(tree1, tree2, lam, NORMALIZE_FLAG, SST_ON):
    """Score the similarity of two trees with the tree kernel.

    Parameters:
        tree1, tree2: the trees to compare, as accepted by ``_cdKernel_``.
        lam: multiplier forwarded to the kernel.
        NORMALIZE_FLAG: when truthy, the normalized score is computed too.
        SST_ON: forwarded unchanged to the kernel.

    Returns:
        A tuple ``(raw, normalized)``. ``raw`` is the kernel value of the two
        trees. ``normalized`` is ``raw`` divided by the square root of the
        product of each tree's kernel value with itself. It is -1 when
        ``NORMALIZE_FLAG`` is falsy, and 0.0 when that divisor is zero.
    """
    raw_score = _cdKernel_(tree1, tree2, lam, SST_ON)
    if not NORMALIZE_FLAG:
        return (raw_score, -1)

    t1_score = _cdKernel_(tree1, tree1, lam, SST_ON)
    t2_score = _cdKernel_(tree2, tree2, lam, SST_ON)
    denominator = math.sqrt(t1_score * t2_score)

    # A tree whose self-kernel is zero (for example an empty tree) would make
    # the division undefined; report "no similarity" instead.
    if denominator == 0:
        return (raw_score, 0.0)

    return (raw_score, raw_score / denominator)


In [3]:
from pycorenlp import StanfordCoreNLP 
# Shared CoreNLP client used by the parsing helper in this notebook.
# NOTE(review): assumes a CoreNLP server is already running at this address.
nlp = StanfordCoreNLP('http://localhost:9000')

In [4]:
def _getNLPToks_(rawSentence):
    """Annotate ``rawSentence`` and return the parts needed to build a tree.

    The text is sent to the module-level ``nlp`` client. Only the first
    sentence of the response is used.

    Returns:
        A dict with three entries:
        'deps'  - the first sentence's 'basicDependencies',
        'toks'  - the first sentence's 'tokens',
        'parse' - the first sentence's 'parse' string split into lines.

    Raises:
        UnicodeDecodeError: re-raised after printing a message when the
            annotation call fails to decode the sentence. Previously the
            function carried on and crashed on the unassigned ``output``.
    """
    try:
        output = nlp.annotate(rawSentence, properties={
            'annotators': 'tokenize,ssplit,pos,parse,depparse',
            'outputFormat': 'json'
        })
    except UnicodeDecodeError:
        print("Error Decoding Sentence")
        # There is no annotation to work with, so surface the real error.
        raise

    first_sentence = output['sentences'][0]

    return {'deps': first_sentence['basicDependencies'],
            'toks': first_sentence['tokens'],
            'parse': first_sentence['parse'].split("\n")}


In [54]:
# Two paraphrased questions used as the comparison pair.
test_b1 = "Why are so many Quora users posting questions that are readily answered on Google?"
test_b2 = "Why do people ask Quora questions which can be answered easily by Google?"

# Shorter alternative inputs for quick experiments:
# test_b1 = "I tom?"
# test_b2 = "I, Claudius?"

# Every timing line below is cumulative, measured from this point.
start_time = time.time()

tree_b1, tree_b2 = tb.tree(), tb.tree()

# Step 1: obtain the raw parse for each sentence from Stanford CoreNLP.
toks_b1 = _getNLPToks_(test_b1)
toks_b2 = _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.time() - start_time))  # tokenize

# Step 2: build a tree structure from each parse.
tb._generateTree_(toks_b1['parse'], tree_b1)
tb._generateTree_(toks_b2['parse'], tree_b2)
print("--- %s seconds ---" % (time.time() - start_time))  # generate tree

# Step 3: flip the trees.
tb._flipTree_(tree_b1)
tb._flipTree_(tree_b2)
print("--- %s seconds ---" % (time.time() - start_time))  # flip tree

# Step 4: score the pair, first with lam=0.4 and SST_ON=1 ...
rawScore, normScore = _CollinsDuffy_(tree_b1, tree_b2,
                                     lam=0.4, NORMALIZE_FLAG=1, SST_ON=1)
print("--- %s seconds ---" % (time.time() - start_time))  # tree kernel
print("Raw Score: %s" % rawScore)
print("Norm Score: %s" % normScore)

# ... then with lam=1 and SST_ON=0.
rawScore, normScore = _CollinsDuffy_(tree_b1, tree_b2,
                                     lam=1, NORMALIZE_FLAG=1, SST_ON=0)
print("Raw Score: %s" % rawScore)
print("Norm Score: %s" % normScore)

--- 0.2218029499053955 seconds ---
--- 0.22223401069641113 seconds ---
--- 0.2223958969116211 seconds ---
--- 0.22823190689086914 seconds ---
Raw Score: 8.1152
Norm Score: 0.322101987619
Raw Score: 7.0
Norm Score: 0.222474604157
