In [2]:
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP 
from collections import defaultdict
import re, time, bisect, math
import itertools as it


In [3]:
nlp = StanfordCoreNLP('http://localhost:9000')

In [4]:
def _getNLPToks_(rawSentence):
    output = nlp.annotate(rawSentence, properties={
      'annotators': 'tokenize,ssplit,pos,parse',
      'outputFormat': 'json'
      })
    return output['sentences'][0]['parse'].split("\n")

In [5]:
def tree(): return defaultdict(tree) # example of autovivification

## Parse StanfordNLP output to Generate Tree

In [6]:
def _leadingSpaces_(target):
    return len(target) - len(target.lstrip())

def _findParent_(curIndent, parid, treeRef):
    tmpid = parid
    while (curIndent <= treeRef[tmpid]['indent']):
        tmpid = treeRef[tmpid]['parid']
    return tmpid

In [7]:
def _generateTree_(rawTokens, treeRef):
    
    # (, ,) as stand-alone
    REGEX_PUNC = r"^\s*\([,!?.'\"]\s*[,!?.'\"]\)"
    # (token
    REGEX_OPEN = r"^\s*\(([a-zA-Z0-9_']*)$"
    # (token (tok1 tok2) (tok3 tok4) .... (tokx toky))
    REGEX_COMP = r"^\s*\(([a-zA-Z0-9_']*)\s*((?:[(]([a-zA-Z0-9_;.,?'!]*)\s*([a-zA-Z0-9_;\.,?!']*)[)]\s*)*)"
    # (tok1 tok2)
    REGEX_ISOL_IN_COMP = r"\(([a-zA-Z0-9_;.,?!']+)\s*([a-zA-Z0-9_;.,?!']+)\)"
    # (, ,)
    REGEX_PUNC_SOLO = r"\([,!?.'\"]\s*[,!?.'\"]\)"
    # (tok1 tok2) as stand-alone
    REGEX_SOLO_PAIR = r"^\s*\(([a-zA-Z0-9_']+)\s*([a-zA-Z0-9_']+)\)"
    
    # manually insert Root token 
    treeRef[len(treeRef)] = {'curid':0, 
                             'parid':-1, 
                             'posOrTok':'ROOT', 
                             'indent':0,
                            'children':[],
                            'childrenTok':[]}
    ID_CTR = 1
    
    for tok in rawTokens[1:]:
        
        curIndent = _leadingSpaces_(tok) # the current indent level
        parid = _findParent_(curIndent, ID_CTR-1, treeRef) # determine parid
            
        checkChild = re.match(REGEX_COMP, tok)
        if (checkChild):
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':parid, 
                               'posOrTok':checkChild.group(1), 
                               'indent':curIndent,
                              'children':[],
                              'childrenTok':[]}
            upCTR = ID_CTR
            ID_CTR += 1
            # Eliminate further punctuation

            subCheck = re.sub(REGEX_PUNC_SOLO,'',checkChild.group(2))
            subs = re.findall(REGEX_ISOL_IN_COMP, subCheck) 
            for ch in subs:
                # THE INDENTING IS WRONG HERE - THE HEIRARCHY IS MESSED UP - check test output
                treeRef[ID_CTR] = {'curid':ID_CTR, 
                                   'parid':upCTR, 
                                   'posOrTok':ch[0], 
                                   'indent':curIndent+2,
                                  'children':[],
                                  'childrenTok':[]}
                ID_CTR += 1
                treeRef[ID_CTR] = {'curid':ID_CTR, 
                                   'parid':ID_CTR-1, 
                                   'posOrTok':ch[1], 
                                   'indent':curIndent+2,
                                  'children':[],
                                  'childrenTok':[]}
                ID_CTR += 1
            continue
            
            
        checkSingle = re.match(REGEX_SOLO_PAIR, tok)
        if (checkSingle):
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':parid, 
                               'posOrTok':checkSingle.group(1), 
                               'indent':curIndent+2,
                              'children':[],
                              'childrenTok':[]}
            ID_CTR += 1
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':ID_CTR-1, 
                               'posOrTok':checkSingle.group(2), 
                               'indent':curIndent+2,
                              'children':[],
                              'childrenTok':[]}
            ID_CTR += 1
            continue


        checkMatch = re.match(REGEX_OPEN, tok)
        if (checkMatch):
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':parid, 
                               'posOrTok':checkMatch.group(1), 
                               'indent':curIndent,
                              'children':[],
                              'childrenTok':[]}
            ID_CTR += 1
            continue

        checkPunc = re.match(REGEX_PUNC, tok)
        if (checkPunc): # ignore punctuation
            continue

    return
            

In [8]:
'''
_generateTree_() method only provides tree (dict representation) listing parents. 
This is a naive method to add a "children" field to the tree - necessary for optimal Tree Kernel methods.
'''

# Switching to 2-pass O(N)
def _flipTree_(treeRef):
    # Pass 1 fill in children
    for k,v in treeRef.items():
        if (k > 0):
            bisect.insort(treeRef[v['parid']]['children'], k)
    # Pass 2 map children to tokens
    for k,v in treeRef.items():
        if (k > 0):
            treeRef[k]['childrenTok'] = [treeRef[ch]['posOrTok'] for ch in treeRef[k]['children']]

## Tree Kernel Helper Functions

In [9]:
def _isLeaf_(tree, parentNode):
    return (len(tree[parentNode]['children']) == 0)

def _isPreterminal_(tree, parentNode):
    for idx in tree[parentNode]['children']:
        if not _isLeaf_(tree, idx):
            return False
    return True


## Implementation of the Collins-Duffy Kernel

In [10]:
'''
Implementation of the Colins-Duffy or Subset-Tree (SST) Kernel
'''

def _cdHelper_(tree1, tree2, node1, node2, store, lam):    
    # No duplicate computations
    if store[node1, node2] >= 0:
        return
    
    # Leaves yield similarity score by definition
    if (_isLeaf_(tree1, node1) or _isLeaf_(tree2, node2)):
        store[node1, node2] = 0
        return
    
    # same parent node
    if tree1[node1]['posOrTok'] == tree2[node2]['posOrTok']: 
        # same children tokens
        if tree1[node1]['childrenTok'] == tree2[node2]['childrenTok']:
            # Check if both nodes are pre-terminal
            if _isPreterminal_(tree1, node1) and _isPreterminal_(tree2, node2):
                store[node1, node2] = lam
                return
            # Not pre-terminal. Recurse among the children of both token trees. 
            else:              
                nChildren = len(tree1[node1]['children'])
                runningTotal = 0
                for idx in range(nChildren):
                     # index ->  node_id
                    tmp_n1 = tree1[node1]['children'][idx]
                    tmp_n2 = tree2[node2]['children'][idx]
                    # Recursively run helper
                    _cdHelper_(tree1, tree2, tmp_n1, tmp_n2, store, lam) 
                    runningTotal += (1 + store[tmp_n1, tmp_n2])
                    
                store[node1, node2] = lam * runningTotal
                return
        else:
            store[node1, node2] = 0
    else: # parent nodes are different 
        store[node1, node2] = 0
        return 

def _cdKernel_(tree1, tree2, lam):
    # Fill the initial state of the store 
    store = np.empty((len(tree1), len(tree2)))
    store.fill(-1)
    
    # O(N^2) to compute the tree dot product
    for i in range(len(tree1)):
        for j in range(len(tree2)):
            _cdHelper_(tree1, tree2, i, j, store, lam)
            
    return store.sum()
            
'''
Returns a tuple w/ format: (raw, normalized)
If NORMALIZE_FLAG set to False, tuple[1] = -1
'''    
def _CollinsDuffy_(tree1, tree2, lam, NORMALIZE_FLAG):
    raw_score = _cdKernel_(tree1, tree2, lam)
    if (NORMALIZE_FLAG):
        t1_score = _cdKernel_(tree1, tree1, lam)
        t2_score = _cdKernel_(tree2, tree2, lam)
        return (raw_score,(raw_score / math.sqrt(t1_score * t2_score)))
    else:
        return (raw_score,-1)
    

## Implementation of the PT Kernel (Moschitti '06)

In [19]:
'''
Implementation of the Partial Tree (PT) Kernel from:
"Efficient Convolution Kernels for Dependency and Constituent Syntactic Trees"
by Alessandro Moschitti
'''

'''
The delta function is stolen from the Collins-Duffy kernel
''' 
    
def _deltaP_(tree1, tree2, seq1, seq2, store, lam, mu, p):
                
#     # Enumerate subsequences of length p+1 for each child set
#     leftSeqs = list(it.combinations(tree1[node1]['children'], seqLen))
#     rightSeqs = list(it.combinations(tree2[node2]['children'], seqLen))

    # generate delta(a,b)
    print(seq1)
    print(seq2)
    
    _delta_(tree1, tree2, seq1[-1], seq2[-1], store, lam, mu)
    if store[seq1[-1], seq2[-1]] == 0:
        return 0
    else:
        runningTot = 0
        for i in range(p-1, len(seq1)-1):
            for r in range(p-1, len(seq2)-1):
                scaleFactor = pow(lam, len(seq1[:-1])-i+len(seq2[:-1])-r)
                dp = _deltaP_(tree1, tree2, seq1[:i], seq2[:r], store, lam, mu, p-1)
                runningTot += (scaleFactor * dp)
                
        return runningTot
                
def _delta_(tree1, tree2, node1, node2, store, lam, mu):    
    # No duplicate computations
    if store[node1, node2] >= 0:
        return
    
    # Leaves yield similarity score by definition
    if (_isLeaf_(tree1, node1) or _isLeaf_(tree2, node2)):
        store[node1, node2] = 0
        return
    
    # same parent node
    if tree1[node1]['posOrTok'] == tree2[node2]['posOrTok']: 
        # establishes p_max
        childmin = min(len(tree1[node1]['children']), len(tree2[node2]['children']))
        
        deltaTot = 0
        for p in range(1,childmin+1):
            # compute delta_p
            print("Evaluating deltaP for p: %d" % p)
            deltaTot += _deltaP_(tree1, tree2,
                                 tree1[node1]['children'], 
                                 tree2[node2]['children'], store, lam, mu, p) 
        
        store[node1, node2] = mu * (pow(lam,2) + deltaTot)
        return

    else: 
        # parent nodes are different 
        store[node1, node2] = 0
        return 

def _ptKernel_(tree1, tree2, lam, mu):
    # Fill the initial state of the store 
    store = np.empty((len(tree1), len(tree2)))
    store.fill(-1)
    
    # O(N^2) to compute the tree dot product
    for i in range(len(tree1)):
        for j in range(len(tree2)):
            print("Evaluating Delta(%d,%d)" % (i,j))
            _delta_(tree1, tree2, i, j, store, lam, mu)
            
    return store.sum()
             
'''
Returns a tuple w/ format: (raw, normalized)
If NORMALIZE_FLAG set to False, tuple[1] = -1
'''    
def _MoschittiPT_(tree1, tree2, lam, mu, NORMALIZE_FLAG):
    raw_score = _ptKernel_(tree1, tree2, lam, mu)
    if (NORMALIZE_FLAG):
        t1_score = _ptKernel_(tree1, tree1, lam, mu)
        t2_score = _ptKernel_(tree2, tree2, lam, mu)
        return (raw_score,(raw_score / math.sqrt(t1_score * t2_score)))
    else:
        return (raw_score,-1)
    

## Test: Tree Generation and Collins-Duffy Kernel

In [48]:
test_b1 = "How do I reduce my thighs?"
test_b2 = "What can I do to reduce my belly?"

# test_b1 = "I tom?"
# test_b2 = "I, Claudius?"
start_time = time.time()

tree_b1 = tree()
tree_b2 = tree()

# Generate raw tokens using Stanford Core NLP

toks_b1 = _getNLPToks_(test_b1)
toks_b2 = _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # tokenize

# Generate a tree structure
_generateTree_(toks_b1, tree_b1)
_generateTree_(toks_b2, tree_b2)

print("--- %s seconds ---" % (time.time() - start_time)) # generate tree

# Flip the trees
_flipTree_(tree_b1)
_flipTree_(tree_b2)

print("--- %s seconds ---" % (time.time() - start_time)) # flip tree

(rawScore, normScore) = _CollinsDuffy_(tree_b1, tree_b2, 0.8, 1)
print("--- %s seconds ---" % (time.time() - start_time)) # tree kernel

print("Raw Score: %s" % (rawScore))
print("Norm Score: %s" % (normScore))

--- 0.02618718147277832 seconds ---
--- 0.026487112045288086 seconds ---
--- 0.02661919593811035 seconds ---
--- 0.028065204620361328 seconds ---
Raw Score: 6.08
Norm Score: 0.154358850021


## Test: Tree Generation and Moschitti PT Kernel

In [20]:
test_b1 = "Why do we enjoy parties in the scope of the universe"
test_b2 = "What are college parties like?"

# test_b1 = "I tom?"
# test_b2 = "I, Claudius?"
start_time = time.time()

tree_b1 = tree()
tree_b2 = tree()

# Generate raw tokens using Stanford Core NLP

toks_b1 = _getNLPToks_(test_b1)
toks_b2 = _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # tokenize

# Generate a tree structure
_generateTree_(toks_b1, tree_b1)
_generateTree_(toks_b2, tree_b2)

print("--- %s seconds ---" % (time.time() - start_time)) # generate tree

# Flip the trees
_flipTree_(tree_b1)
_flipTree_(tree_b2)

print("--- %s seconds ---" % (time.time() - start_time)) # flip tree

(rawScore, normScore) = _MoschittiPT_(tree_b1, tree_b2, 0.8, 0.4, 1)
print("--- %s seconds ---" % (time.time() - start_time)) # tree kernel

print("Raw Score: %s" % (rawScore))
print("Norm Score: %s" % (normScore))

--- 0.061335086822509766 seconds ---
--- 0.06187295913696289 seconds ---
--- 0.06212902069091797 seconds ---
Evaluating Delta(0,0)
Evaluating deltaP for p: 1
[1]
[1]
Evaluating deltaP for p: 1
[2, 5]
[2, 5, 16]
Evaluating deltaP for p: 2
[2, 5]
[2, 5, 16]
Evaluating Delta(0,1)
Evaluating Delta(0,2)
Evaluating Delta(0,3)
Evaluating Delta(0,4)
Evaluating Delta(0,5)
Evaluating Delta(0,6)
Evaluating Delta(0,7)
Evaluating Delta(0,8)
Evaluating Delta(0,9)
Evaluating Delta(0,10)
Evaluating Delta(0,11)
Evaluating Delta(0,12)
Evaluating Delta(0,13)
Evaluating Delta(0,14)
Evaluating Delta(0,15)
Evaluating Delta(0,16)
Evaluating Delta(1,0)
Evaluating Delta(1,1)
Evaluating Delta(1,2)
Evaluating Delta(1,3)
Evaluating Delta(1,4)
Evaluating Delta(1,5)
Evaluating Delta(1,6)
Evaluating Delta(1,7)
Evaluating Delta(1,8)
Evaluating Delta(1,9)
Evaluating Delta(1,10)
Evaluating Delta(1,11)
Evaluating Delta(1,12)
Evaluating Delta(1,13)
Evaluating Delta(1,14)
Evaluating Delta(1,15)
Evaluating Delta(1,16)
Eval

IndexError: list index out of range

## Test: Tree Kernels on Training Data

In [39]:
from random import randint

def read_data(path_to_file):
    df = pd.read_csv(path_to_file)
    print ("Shape of base training File = ", df.shape)
    # Remove missing values and duplicates from training data
    df.drop_duplicates(inplace=True)
    df.dropna(inplace=True)
    print("Shape of base training data after cleaning = ", df.shape)
    return df

df_train = read_data("input/train.csv")

Shape of base training File =  (404290, 6)
Shape of base training data after cleaning =  (404288, 6)


In [50]:
## SET UP TEST INDICES TO SELECT RANDOM ROWS

TEST_SAMPLES = 100
indices = np.arange(TEST_SAMPLES)
for i in range(len(indices)):
    indices[i] = randint(0,df_train.shape[0])

In [51]:
for ind in indices:
    
    tree_1 = tree()
    tree_2 = tree()

    toks_1 = _getNLPToks_(df_train.get_value(ind,'question1'))
    toks_2 = _getNLPToks_(df_train.get_value(ind,'question2'))

    # Generate a tree structure
    _generateTree_(toks_1, tree_1)
    _generateTree_(toks_2, tree_2)

    # Flip the trees
    _flipTree_(tree_1)
    _flipTree_(tree_2)

    (scoreRaw, scoreNorm) = _CollinsDuffy_(tree_1, tree_2, 0.9, 1)
    print("%s \n %s \n Norm: %f | Raw: %f" % (df_train.get_value(ind,'question1'), df_train.get_value(ind,'question2'), scoreNorm, scoreRaw))

In general, which is worse for you: one can of beer or one can of soda? 
 Which is worse for your body: a diet soda or a regular soda? 
 Norm: 0.235097 | Raw: 38.588580
Can you check who views your VSCO? 
 Can you tell if someone screenshot a Vsco picture? 
 Norm: 0.301396 | Raw: 20.591100
What is an app that can protect the eyes from strain while looking at a mobile? 
 Do you strain your eyes while working at an IT job? 
 Norm: 0.155952 | Raw: 24.473790
How can I learn English in 6 months? 
 How can I learn English very well within 6 months? 
 Norm: 0.451824 | Raw: 39.754890
What are some shows like "Bates Motel"? 
 What are some shows like Bates Motel? 
 Norm: 0.654884 | Raw: 47.652642
What are the biggest culture shocks people face when coming to Kazakhstan? 
 What are the biggest culture shocks people face when coming to Boston? 
 Norm: 0.960704 | Raw: 164.102196
Why do I have to lie so much? 
 Why does Arvind Kejriwal lie so much? 
 Norm: 0.178813 | Raw: 13.364100
How do I pray as