In [1]:
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP 
from collections import defaultdict
from itertools import product
import re
import time

In [17]:
# Shared CoreNLP client used by the helpers below; points at a server on localhost:9000.
nlp = StanfordCoreNLP('http://localhost:9000')

In [29]:
def _getNLPToks_(rawSentence):
    output = nlp.annotate(rawSentence, properties={
      'annotators': 'tokenize,ssplit,pos,parse',
      'outputFormat': 'json'
      })
    return output['sentences'][0]['parse'].split("\n")

In [30]:
def tree():
    """Return an autovivifying mapping: reading a missing key creates a nested ``tree()``."""
    return defaultdict(tree)

In [31]:
def _leadingSpaces_(target):
    return len(target) - len(target.lstrip())

def _findParent_(curIndent, parid, treeRef):
    tmpid = parid
    while (curIndent <= treeRef[tmpid]['indent']):
        tmpid = treeRef[tmpid]['parid']
    return tmpid

In [32]:
def _generateTree_(rawTokens, treeRef):
    
    # (, ,) as stand-alone
    REGEX_PUNC = r"^\s*\([,!?.'\"]\s*[,!?.'\"]\)"
    # (token
    REGEX_OPEN = r"^\s*\(([a-zA-Z0-9_']*)$"
    # (token (tok1 tok2) (tok3 tok4) .... (tokx toky))
    REGEX_COMP = r"^\s*\(([a-zA-Z0-9_']*)\s*((?:[(]([a-zA-Z0-9_;.,?'!]*)\s*([a-zA-Z0-9_;\.,?!']*)[)]\s*)*)"
    # (tok1 tok2)
    REGEX_ISOL_IN_COMP = r"\(([a-zA-Z0-9_;.,?!']+)\s*([a-zA-Z0-9_;.,?!']+)\)"
    # (, ,)
    REGEX_PUNC_SOLO = r"\([,!?.'\"]\s*[,!?.'\"]\)"
    # (tok1 tok2) as stand-alone
    REGEX_SOLO_PAIR = r"^\s*\(([a-zA-Z0-9_']+)\s*([a-zA-Z0-9_']+)\)"
    
    # manually insert Root token 
    treeRef[len(treeRef)] = {'curid':0, 
                             'parid':-1, 
                             'posOrTok':'ROOT', 
                             'indent':0}
    ID_CTR = 1
    
    for tok in rawTokens[1:]:
        
        curIndent = _leadingSpaces_(tok) # the current indent level
        parid = _findParent_(curIndent, ID_CTR-1, treeRef) # determine parid
            
        checkChild = re.match(REGEX_COMP, tok)
        if (checkChild):
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':parid, 
                               'posOrTok':checkChild.group(1), 
                               'indent':curIndent}
            upCTR = ID_CTR
            ID_CTR += 1
            # Eliminate further punctuation

            subCheck = re.sub(REGEX_PUNC_SOLO,'',checkChild.group(2))
            subs = re.findall(REGEX_ISOL_IN_COMP, subCheck) 
            for ch in subs:
                # THE INDENTING IS WRONG HERE - THE HEIRARCHY IS MESSED UP - check test output
                treeRef[ID_CTR] = {'curid':ID_CTR, 
                                   'parid':upCTR, 
                                   'posOrTok':ch[0], 
                                   'indent':curIndent+2}
                ID_CTR += 1
                treeRef[ID_CTR] = {'curid':ID_CTR, 
                                   'parid':ID_CTR-1, 
                                   'posOrTok':ch[1], 
                                   'indent':curIndent+2}
                ID_CTR += 1
            continue
            
            
        checkSingle = re.match(REGEX_SOLO_PAIR, tok)
        if (checkSingle):
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':parid, 
                               'posOrTok':checkSingle.group(1), 
                               'indent':curIndent+2}
            ID_CTR += 1
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':ID_CTR-1, 
                               'posOrTok':checkSingle.group(2), 
                               'indent':curIndent+2}
            ID_CTR += 1
            continue


        checkMatch = re.match(REGEX_OPEN, tok)
        if (checkMatch):
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':parid, 
                               'posOrTok':checkMatch.group(1), 
                               'indent':curIndent}
            ID_CTR += 1
            continue

        checkPunc = re.match(REGEX_PUNC, tok)
        if (checkPunc): # ignore punctuation
            continue

    return
            

In [33]:
'''
_generateTree_() method only provides tree (dict representation) listing parents. 
This is a naive method to add a "children" field to the tree - necessary for optimal Tree Kernel methods.
Also create the "childrenTok" that stores the physical tokens of the children. 
'''
def _flipTree_(treeRef):
    for k,v in treeRef.items():
        chSet = {sk:sv for (sk,sv) in treeRef.items() if sv['parid'] == k}
        treeRef[k]['children'] = set(chSet.keys())
        treeRef[k]['childrenTok'] = {d['posOrTok'] for d in chSet.values()}

In [34]:
'''
Implementation of the Collins-Duffy or Subset-Tree (SST) Kernel
'''

def _isLeaf_(tree, parentNode):
    return (len(tree[parentNode]['children']) == 0)

def _isPreterminal_(tree, parentNode):
    for idx in tree[parentNode]['children']:
        if not _isLeaf_(tree, idx):
            return False
    return True

def _cdHelper_(tree1, tree2, node1, node2, store, lam):    
    # No duplicate computations
    if store[node1, node2] >= 0:
        return
    
    # Leaves yield similarity score by definition
    if (_isLeaf_(tree1, node1) or _isLeaf_(tree2, node2)):
        store[node1, node2] = 0
        return
    
    # same parent node
    if tree1[node1]['posOrTok'] == tree2[node2]['posOrTok']: 
        # same children tokens
        if len(tree1[node1]['childrenTok'].symmetric_difference(tree2[node2]['childrenTok'])) == 0:
            # Check if both nodes are pre-terminal
            if _isPreterminal_(tree1, node1) and _isPreterminal_(tree2, node2):
                store[node1, node2] = lam
                return
            # Not pre-terminal. Recurse among the children of both token trees. 
            else:              
                combs = product(tree1[node1]['children'], tree2[node2]['children'])
                runningTotal = 0
                for idx, c in enumerate(combs):
                    _cdHelper_(tree1, tree2, c[0], c[1], store, lam) # Recursively run helper
                    runningTotal += (1 + store[c[0],c[1]])
                    
                store[node1, node2] = lam * runningTotal
                return
        else:
            store[node1, node2] = 0
    else: # parent nodes are different 
        store[node1, node2] = 0
        return 

def _cdKernel_(tree1, tree2, lam):
    # Fill the initial state of the store 
    store = np.empty((len(tree1), len(tree2)))
    store.fill(-1)
    
    # O(N^2) to compute the tree dot product
    for i in range(len(tree1)):
        for j in range(len(tree2)):
            _cdHelper_(tree1, tree2, i, j, store, lam)
            
    return store.sum()
            
    

## Test: Tree Generation and Collins-Duffy Kernel

In [35]:
# Pair of example questions fed through the parse -> tree -> kernel steps below.
test_b1 = "How do I reduce my thighs?"
test_b2 = "What can I do to reduce my belly?"

# Alternative inputs, kept commented out:
# test_b1 = "I tom?"
# test_b2 = "I, Claudius?"

In [36]:
# Time each stage with perf_counter(): a monotonic, high-resolution clock meant
# for measuring short durations (time.time() can jump if the system clock is
# adjusted). Every printed figure is cumulative since start_time.
start_time = time.perf_counter()

tree_b1 = tree()
tree_b2 = tree()

# Generate raw tokens using Stanford Core NLP
toks_b1 = _getNLPToks_(test_b1)
toks_b2 = _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.perf_counter() - start_time)) # tokenize

# Generate a tree structure
_generateTree_(toks_b1, tree_b1)
_generateTree_(toks_b2, tree_b2)

print("--- %s seconds ---" % (time.perf_counter() - start_time)) # generate tree

# Flip the trees (add 'children' / 'childrenTok' to every node)
_flipTree_(tree_b1)
_flipTree_(tree_b2)

print("--- %s seconds ---" % (time.perf_counter() - start_time)) # flip tree

score2 = _cdKernel_(tree_b1, tree_b2, 0.8)
print("--- %s seconds ---" % (time.perf_counter() - start_time)) # tree kernel

print("Score: %s" % (score2))


--- 0.16412806510925293 seconds ---
--- 0.1654829978942871 seconds ---
--- 0.1657099723815918 seconds ---
--- 0.16646099090576172 seconds ---
Score: 7.68


In [None]:
# Show the raw parse lines obtained for the first test sentence.
toks_b1

In [57]:
# Time a single annotate() round trip that also requests the dependency parse.
# perf_counter() is used because it is monotonic and high-resolution.
start_time = time.perf_counter()
output = nlp.annotate("What can I do to reduce my belly?", properties={
    'annotators': 'tokenize,ssplit,pos,depparse,parse',
    'outputFormat': 'json'
})
print("--- %s seconds ---" % (time.perf_counter() - start_time)) # annotate (incl. depparse)


--- 0.04473996162414551 seconds ---
