In [22]:
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP 
from collections import defaultdict
from itertools import product
import re
import time

In [23]:
# Module-level CoreNLP client used by _getNLPToks_; it targets port 9000 on localhost.
# NOTE(review): presumably a CoreNLP server must already be listening there - confirm before running.
nlp = StanfordCoreNLP('http://localhost:9000')

In [66]:
def _getNLPToks_(rawSentence):
    output = nlp.annotate(rawSentence, properties={
      'annotators': 'parse',
      'outputFormat': 'json'
      })
    return output['sentences'][0]['parse'].split("\n")

In [26]:
def tree():
    """Create an autovivifying nested dictionary.

    Looking up a missing key stores a fresh ``tree()`` under that key, so
    arbitrarily deep paths can be assigned without creating the levels first.
    """
    return defaultdict(tree)

In [27]:
def _leadingSpaces_(target):
    return len(target) - len(target.lstrip())

def _findParent_(curIndent, parid, treeRef):
    tmpid = parid
    while (curIndent <= treeRef[tmpid]['indent']):
        tmpid = treeRef[tmpid]['parid']
    return tmpid

In [28]:
def _generateTree_(rawTokens, treeRef):
    
    # (, ,) as stand-alone
    REGEX_PUNC = r"^\s*\([,!?.'\"]\s*[,!?.'\"]\)"
    # (token
    REGEX_OPEN = r"^\s*\(([a-zA-Z0-9_']*)$"
    # (token (tok1 tok2) (tok3 tok4) .... (tokx toky))
    REGEX_COMP = r"^\s*\(([a-zA-Z0-9_']*)\s*((?:[(]([a-zA-Z0-9_;.,?'!]*)\s*([a-zA-Z0-9_;\.,?!']*)[)]\s*)*)"
    # (tok1 tok2)
    REGEX_ISOL_IN_COMP = r"\(([a-zA-Z0-9_;.,?!']+)\s*([a-zA-Z0-9_;.,?!']+)\)"
    # (, ,)
    REGEX_PUNC_SOLO = r"\([,!?.'\"]\s*[,!?.'\"]\)"
    # (tok1 tok2) as stand-alone
    REGEX_SOLO_PAIR = r"^\s*\(([a-zA-Z0-9_;.,?!']+)\s*([a-zA-Z0-9_;.,?!']+)\)"
    
    # manually insert Root token 
    treeRef[len(treeRef)] = {'curid':0, 
                             'parid':-1, 
                             'posOrTok':'ROOT', 
                             'indent':0}
    ID_CTR = 1
    
    for tok in rawTokens[1:]:
        
        curIndent = _leadingSpaces_(tok) # the current indent level
        parid = _findParent_(curIndent, ID_CTR-1, treeRef) # determine parid
            
        checkChild = re.match(REGEX_COMP, tok)
        if (checkChild):
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':parid, 
                               'posOrTok':checkChild.group(1), 
                               'indent':curIndent}
            upCTR = ID_CTR
            ID_CTR += 1
            # Eliminate further punctuation

            subCheck = re.sub(REGEX_PUNC_SOLO,'',checkChild.group(2))
            subs = re.findall(REGEX_ISOL_IN_COMP, subCheck) 
            for ch in subs:
                # THE INDENTING IS WRONG HERE - THE HEIRARCHY IS MESSED UP - check test output
                treeRef[ID_CTR] = {'curid':ID_CTR, 
                                   'parid':upCTR, 
                                   'posOrTok':ch[0], 
                                   'indent':curIndent+2}
                ID_CTR += 1
                treeRef[ID_CTR] = {'curid':ID_CTR, 
                                   'parid':ID_CTR-1, 
                                   'posOrTok':ch[1], 
                                   'indent':curIndent+2}
                ID_CTR += 1
            continue
            
            
        checkSingle = re.match(REGEX_SOLO_PAIR, tok)
        if (checkSingle):
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':parid, 
                               'posOrTok':checkSingle.group(1), 
                               'indent':curIndent+2}
            ID_CTR += 1
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':ID_CTR-1, 
                               'posOrTok':checkSingle.group(2), 
                               'indent':curIndent+2}
            ID_CTR += 1
            continue


        checkMatch = re.match(REGEX_OPEN, tok)
        if (checkMatch):
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':parid, 
                               'posOrTok':checkMatch.group(1), 
                               'indent':curIndent}
            ID_CTR += 1
            continue

        checkPunc = re.match(REGEX_PUNC, tok)
        if (checkPunc): # ignore punctuation
            continue

    return
            

In [29]:
'''
_generateTree_() method only provides tree (dict representation) listing parents. 
This is a naive method to add a "children" field to the tree - necessary for optimal Tree Kernel methods 
'''
def _flipTree_(treeRef):
    for k,v in treeRef.items():
        treeRef[k]['children'] = set({sk:sv for (sk,sv) in treeRef.items() if sv['parid'] == k}.keys())


In [88]:
'''
Implementation of the Collins-Duffy or Subset-Tree (SST) Kernel
'''

def _isLeaf_(tree, parentNode):
    return (len(tree[parentNode]['children']) == 0)

def _isPreterminal_(tree, parentNode):
    for idx in tree[parentNode]['children']:
        if not _isLeaf_(tree, idx):
            return False
    return True

def _cdHelper_(tree1, tree2, node1, node2, store, lam):
    # use helper store
    if store[node1, node2] > -1:
        return store[node1, node2]
    
    # Leaves cannot have 0 similarity
    if _isLeaf_(tree1, node1) or _isLeaf_(tree2, node2):
        store[node1, node2] = 0
        return 0
    
    # same productions
    if len(tree1[node1]['children'].symmetric_difference(tree2[node2]['children'])) == 0:
        if _isPreterminal_(tree1, node1) and _isPreterminal_(tree2, node2):
            store[node1, node2] = lam
            return lam
        else:
            combs = product(tree1[node1]['children'], tree2[node2]['children'])
            subv = np.zeros(len(tree1[node1]['children']) * len(tree2[node2]['children']))
            for idx, c in enumerate(combs):
                subv[idx] = _cdHelper_(tree1, tree2, c[0], c[1], store, lam)
            subv += 1
            return lam * subv.prod()
    else:
        return 0

def _cdKernel_(tree1, tree2, lam):
    store = np.empty((len(tree1), len(tree2)))
    store.fill(-1)
    for i in range(len(tree1)):
        for j in range(len(tree2)):
            store[i,j] = _cdHelper_(tree1, tree2, i, j, store, lam)
            
    return store.sum()
            
    

## Test: Tree Generation

In [80]:
# `raw_toks` is not assigned in any cell visible above, so this cell failed
# with a NameError on a fresh kernel. Parse a sample sentence here, outside
# the timed region, so the loop below still measures tree construction only.
raw_toks = _getNLPToks_("How do I reduce my thighs?")

# Time 1000 rounds of building the parent-linked tree and adding child links.
start_time = time.time()
for i in range(1000):
    treeTest = tree()
    _generateTree_(raw_toks, treeTest)
    _flipTree_(treeTest)
print("--- %s seconds ---" % (time.time() - start_time))


--- 0.10495209693908691 seconds ---


## Test: Tree Generation and Collins-Duffy Kernel

In [81]:
# Sentence pair A: two questions about reducing a body part.
test_a1 = "How do I reduce my thighs?"
test_a2 = "What can I do to reduce my belly?"

# Sentence pair B: two questions about dying from an overdose.
test_b1 = "What is it like to die from a tylenol overdose?"
test_b2 = "What would it feel like to die of a heroin overdose?"

In [89]:
# Time the whole pipeline for sentence pair A: parse, build, flip, score.
start_time = time.time()

# First sentence: parse lines -> parent-linked tree -> child links
toks_a1 = _getNLPToks_(test_a1)
tree_a1 = tree()
_generateTree_(toks_a1, tree_a1)
_flipTree_(tree_a1)

# Second sentence: same three steps
toks_a2 = _getNLPToks_(test_a2)
tree_a2 = tree()
_generateTree_(toks_a2, tree_a2)
_flipTree_(tree_a2)

# Kernel value for the pair, decay factor 0.8
score = _cdKernel_(tree_a1, tree_a2, 0.8)
print("--- %s seconds ---" % (time.time() - start_time))


--- 0.042402029037475586 seconds ---


In [90]:
# Display the kernel value stored in `score`.
score

16.893184000000005

In [91]:
# Display the contents of `test_a1`.
test_a1

'How do I reduce my thighs?'

In [92]:
# Time the whole pipeline for sentence pair B: parse, build, flip, score.
start_time = time.time()

# First sentence: parse lines -> parent-linked tree -> child links
toks_b1 = _getNLPToks_(test_b1)
tree_b1 = tree()
_generateTree_(toks_b1, tree_b1)
_flipTree_(tree_b1)

# Second sentence: same three steps
toks_b2 = _getNLPToks_(test_b2)
tree_b2 = tree()
_generateTree_(toks_b2, tree_b2)
_flipTree_(tree_b2)

# Kernel value for the pair, decay factor 0.8
score2 = _cdKernel_(tree_b1, tree_b2, 0.8)
print("--- %s seconds ---" % (time.time() - start_time))


--- 0.08159899711608887 seconds ---
