In [7]:
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP 
from collections import defaultdict
import re, time, bisect, math
import itertools as it


In [8]:
nlp = StanfordCoreNLP('http://localhost:9000')

In [9]:
def _getNLPToks_(rawSentence):
    output = nlp.annotate(rawSentence, properties={
      'annotators': 'tokenize,ssplit,pos,parse',
      'outputFormat': 'json'
      })
    return output['sentences'][0]['parse'].split("\n")

In [10]:
def tree(): return defaultdict(tree) # example of autovivification

## Parse StanfordNLP output to Generate Tree

In [11]:
def _leadingSpaces_(target):
    return len(target) - len(target.lstrip())

def _findParent_(curIndent, parid, treeRef):
    tmpid = parid
    while (curIndent <= treeRef[tmpid]['indent']):
        tmpid = treeRef[tmpid]['parid']
    return tmpid

In [12]:
def _generateTree_(rawTokens, treeRef):
    
    # ----------------------------- CLEARED ---------------------------------# 
    
    # (token
    REGEX_OPEN = r"^\s*\(([a-zA-Z0-9_']*)\s*$"
    
    # (token (tok1 tok2) (tok3 tok4) .... (tokx toky))
    REGEX_COMP = r"^\s*\(([a-zA-Z0-9_']+)\s*((?:[(]([a-zA-Z0-9_;.,?'!]+)\s*([a-zA-Z0-9_;\.,?!']+)[)]\s*)+)"
    
    # (, ,) as stand-alone. Used for match() not search()
    REGEX_PUNC = r"^\s*\([,!?.'\"]\s*[,!?.'\"]\)"

    # (tok1 tok2) as stand-alone
    REGEX_SOLO_PAIR = r"^\s*\(([a-zA-Z0-9_']+)\s*([a-zA-Z0-9_']+)\)"
        
    # (tok1 tok2) used in search()
    REGEX_ISOL_IN_COMP = r"\(([a-zA-Z0-9_;.,?!']+)\s*([a-zA-Z0-9_;.,?!']+)\)"
    # (punc punc) used in search()
    REGEX_PUNC_SOLO = r"\([,!?.'\"]\s*[,!?.'\"]\)"

    
    # manually insert Root token 
    treeRef[len(treeRef)] = {'curid':0, 
                             'parid':-1, 
                             'posOrTok':'ROOT', 
                             'indent':0,
                            'children':[],
                            'childrenTok':[]}
    ID_CTR = 1
    
    for tok in rawTokens[1:]:
        
        curIndent = _leadingSpaces_(tok) # the current indent level
        parid = _findParent_(curIndent, ID_CTR-1, treeRef) # determine parid
        
        # CHECK FOR COMPOSITE TOKENS
        checkChild = re.match(REGEX_COMP, tok)
        if (checkChild):
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':parid, 
                               'posOrTok':checkChild.group(1), 
                               'indent':curIndent,
                              'children':[],
                              'childrenTok':[]}
            upCTR = ID_CTR
            ID_CTR += 1
            # Eliminate further punctuation

            subCheck = re.sub(REGEX_PUNC_SOLO,'',checkChild.group(2))
            subs = re.findall(REGEX_ISOL_IN_COMP, subCheck) 
            for ch in subs:
                # THE INDENTING IS WRONG HERE - THE HEIRARCHY IS MESSED UP - check test output
                treeRef[ID_CTR] = {'curid':ID_CTR, 
                                   'parid':upCTR, 
                                   'posOrTok':ch[0], 
                                   'indent':curIndent+2,
                                  'children':[],
                                  'childrenTok':[]}
                ID_CTR += 1
                treeRef[ID_CTR] = {'curid':ID_CTR, 
                                   'parid':ID_CTR-1, 
                                   'posOrTok':ch[1], 
                                   'indent':curIndent+2,
                                  'children':[],
                                  'childrenTok':[]}
                ID_CTR += 1
            continue
           

            
        checkSingle = re.match(REGEX_SOLO_PAIR, tok)
        if (checkSingle):
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':parid, 
                               'posOrTok':checkSingle.group(1), 
                               'indent':curIndent+2,
                              'children':[],
                              'childrenTok':[]}
            ID_CTR += 1
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':ID_CTR-1, 
                               'posOrTok':checkSingle.group(2), 
                               'indent':curIndent+2,
                              'children':[],
                              'childrenTok':[]}
            ID_CTR += 1
            continue
        
        
        checkPunc = re.match(REGEX_PUNC, tok)
        if (checkPunc): # ignore punctuation
            continue

        checkMatch = re.match(REGEX_OPEN, tok)
        if (checkMatch):
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':parid, 
                               'posOrTok':checkMatch.group(1), 
                               'indent':curIndent,
                              'children':[],
                              'childrenTok':[]}
            ID_CTR += 1
            continue

    return
            

In [13]:
'''
_generateTree_() method only provides tree (dict representation) listing parents. 
This is a naive method to add a "children" field to the tree - necessary for optimal Tree Kernel methods.
'''

# Switching to 2-pass O(N)
def _flipTree_(treeRef):
    # Pass 1 fill in children
    for k,v in treeRef.items():
        if (k > 0):
            bisect.insort(treeRef[v['parid']]['children'], k)
    # Pass 2 map children to tokens
    for k,v in treeRef.items():
        if (k > 0):
            treeRef[k]['childrenTok'] = [treeRef[ch]['posOrTok'] for ch in treeRef[k]['children']]
    treeRef[0]['childrenTok'] = treeRef[1]['posOrTok']

## Tree Kernel Helper Functions

In [14]:
def _isLeaf_(tree, parentNode):
    return (len(tree[parentNode]['children']) == 0)

def _isPreterminal_(tree, parentNode):
    for idx in tree[parentNode]['children']:
        if not _isLeaf_(tree, idx):
            return False
    return True


## Implementation of the Collins-Duffy Kernel

In [15]:
'''
Implementation of the Colins-Duffy or Subset-Tree (SST) Kernel
'''

def _cdHelper_(tree1, tree2, node1, node2, store, lam):    
    # No duplicate computations
    if store[node1, node2] >= 0:
        return
    
    # Leaves yield similarity score by definition
    if (_isLeaf_(tree1, node1) or _isLeaf_(tree2, node2)):
        store[node1, node2] = 0
        return
    
    # same parent node
    if tree1[node1]['posOrTok'] == tree2[node2]['posOrTok']: 
        # same children tokens
        if tree1[node1]['childrenTok'] == tree2[node2]['childrenTok']:
            # Check if both nodes are pre-terminal
            if _isPreterminal_(tree1, node1) and _isPreterminal_(tree2, node2):
                store[node1, node2] = lam
                return
            # Not pre-terminal. Recurse among the children of both token trees. 
            else:              
                nChildren = len(tree1[node1]['children'])
                runningTotal = 0
                for idx in range(nChildren):
                     # index ->  node_id
                    tmp_n1 = tree1[node1]['children'][idx]
                    tmp_n2 = tree2[node2]['children'][idx]
                    # Recursively run helper
                    _cdHelper_(tree1, tree2, tmp_n1, tmp_n2, store, lam) 
                    runningTotal += (1 + store[tmp_n1, tmp_n2])
                    
                store[node1, node2] = lam * runningTotal
                return
        else:
            store[node1, node2] = 0
    else: # parent nodes are different 
        store[node1, node2] = 0
        return 

def _cdKernel_(tree1, tree2, lam):
    # Fill the initial state of the store 
    store = np.empty((len(tree1), len(tree2)))
    store.fill(-1)
    
    # O(N^2) to compute the tree dot product
    for i in range(len(tree1)):
        for j in range(len(tree2)):
            _cdHelper_(tree1, tree2, i, j, store, lam)
            
    return store.sum()
            
'''
Returns a tuple w/ format: (raw, normalized)
If NORMALIZE_FLAG set to False, tuple[1] = -1
'''    
def _CollinsDuffy_(tree1, tree2, lam, NORMALIZE_FLAG):
    raw_score = _cdKernel_(tree1, tree2, lam)
    if (NORMALIZE_FLAG):
        t1_score = _cdKernel_(tree1, tree1, lam)
        t2_score = _cdKernel_(tree2, tree2, lam)
        return (raw_score,(raw_score / math.sqrt(t1_score * t2_score)))
    else:
        return (raw_score,-1)
    

## Implementation of the PT Kernel (Moschitti '06)

In [26]:
'''
Implementation of the Partial Tree (PT) Kernel from:
"Efficient Convolution Kernels for Dependency and Constituent Syntactic Trees"
by Alessandro Moschitti
'''

'''
The delta function is stolen from the Collins-Duffy kernel
''' 
    
def _deltaP_(tree1, tree2, seq1, seq2, store, lam, mu, p):
                
#     # Enumerate subsequences of length p+1 for each child set
#     leftSeqs = list(it.combinations(tree1[node1]['children'], seqLen))
#     rightSeqs = list(it.combinations(tree2[node2]['children'], seqLen))
    if p == 0:
        return 0

    # generate delta(a,b)
    _delta_(tree1, tree2, seq1[-1], seq2[-1], store, lam, mu)
    
    if store[seq1[-1], seq2[-1]] == 0:
        return 0
    else:
        runningTot = 0
        for i in range(p-1, len(seq1)-1):
            for r in range(p-1, len(seq2)-1):
                scaleFactor = pow(lam, len(seq1[:-1])-i+len(seq2[:-1])-r)
                dp = _deltaP_(tree1, tree2, seq1[:i], seq2[:r], store, lam, mu, p-1)
                runningTot += (scaleFactor * dp)
                
        return runningTot
                
def _delta_(tree1, tree2, node1, node2, store, lam, mu):    
#     print("Evaluating Delta: (%d,%d)" % (node1, node2))
    
    # No duplicate computations
    if store[node1, node2] >= 0:
        return
    
    # Leaves yield similarity score by definition
    if (_isLeaf_(tree1, node1) or _isLeaf_(tree2, node2)):
        store[node1, node2] = 0
        return
    
    # same parent node
    if tree1[node1]['posOrTok'] == tree2[node2]['posOrTok']: 
        # establishes p_max
        childmin = min(len(tree1[node1]['children']), len(tree2[node2]['children']))
        deltaTot = 0
        for p in range(1,childmin+1):
            # compute delta_p
            deltaTot += _deltaP_(tree1, tree2,
                                 tree1[node1]['children'], 
                                 tree2[node2]['children'], store, lam, mu, p) 
        
        store[node1, node2] = mu * (pow(lam,2) + deltaTot)
        return

    else: 
        # parent nodes are different 
        store[node1, node2] = 0
        return 

def _ptKernel_(tree1, tree2, lam, mu):
    # Fill the initial state of the store 
    store = np.empty((len(tree1), len(tree2)))
    store.fill(-1)
    
    # O(N^2) to compute the tree dot product
    for i in range(len(tree1)):
        for j in range(len(tree2)):
            _delta_(tree1, tree2, i, j, store, lam, mu)
            
    return store.sum()
             
'''
Returns a tuple w/ format: (raw, normalized)
If NORMALIZE_FLAG set to False, tuple[1] = -1
'''    
def _MoschittiPT_(tree1, tree2, lam, mu, NORMALIZE_FLAG):
    raw_score = _ptKernel_(tree1, tree2, lam, mu)
    if (NORMALIZE_FLAG):
        t1_score = _ptKernel_(tree1, tree1, lam, mu)
        t2_score = _ptKernel_(tree2, tree2, lam, mu)
        return (raw_score,(raw_score / math.sqrt(t1_score * t2_score)))
    else:
        return (raw_score,-1)
    

## Test: Tree Generation and Collins-Duffy Kernel

In [29]:
test_b1 = "Why do we enjoy parties in the scope of the universe"
test_b2 = "What are college parties like?"

# test_b1 = "I tom?"
# test_b2 = "I, Claudius?"
start_time = time.time()

tree_b1 = tree()
tree_b2 = tree()

# Generate raw tokens using Stanford Core NLP

toks_b1 = _getNLPToks_(test_b1)
toks_b2 = _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # tokenize

# Generate a tree structure
_generateTree_(toks_b1, tree_b1)
_generateTree_(toks_b2, tree_b2)

print("--- %s seconds ---" % (time.time() - start_time)) # generate tree

# Flip the trees
_flipTree_(tree_b1)
_flipTree_(tree_b2)

print("--- %s seconds ---" % (time.time() - start_time)) # flip tree

(rawScore, normScore) = _CollinsDuffy_(tree_b1, tree_b2, 0.8, 1)
print("--- %s seconds ---" % (time.time() - start_time)) # tree kernel

print("Raw Score: %s" % (rawScore))
print("Norm Score: %s" % (normScore))

--- 0.04027605056762695 seconds ---
--- 0.04057908058166504 seconds ---
--- 0.04071807861328125 seconds ---
--- 0.042855024337768555 seconds ---
Raw Score: 1.6
Norm Score: 0.0290974838407


## Test: Tree Generation and Moschitti PT Kernel

In [30]:
test_b1 = "Why do we enjoy parties in the scope of the universe"
test_b2 = "What are college parties like?"

# test_b1 = "I tom?"
# test_b2 = "I, Claudius?"
start_time = time.time()

tree_b1 = tree()
tree_b2 = tree()

# Generate raw tokens using Stanford Core NLP

toks_b1 = _getNLPToks_(test_b1)
toks_b2 = _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # tokenize

# Generate a tree structure
_generateTree_(toks_b1, tree_b1)
_generateTree_(toks_b2, tree_b2)

print("--- %s seconds ---" % (time.time() - start_time)) # generate tree

# Flip the trees
_flipTree_(tree_b1)
_flipTree_(tree_b2)

# for i in tree_b1.items():
#     print(i)
    
# for i in tree_b2.items():
#     print(i)

print("--- %s seconds ---" % (time.time() - start_time)) # flip tree

(rawScore, normScore) = _MoschittiPT_(tree_b1, tree_b2, 0.8, 0.4, 1)
print("--- %s seconds ---" % (time.time() - start_time)) # tree kernel

print("Raw Score: %s" % (rawScore))
print("Norm Score: %s" % (normScore))

--- 0.039852142333984375 seconds ---
--- 0.04013514518737793 seconds ---
--- 0.040258169174194336 seconds ---
--- 0.04257512092590332 seconds ---
Raw Score: 4.096
Norm Score: 0.675520529473


## Test: Tree Kernels on Training Data

In [32]:
from random import randint

def read_data(path_to_file):
    df = pd.read_csv(path_to_file)
    print ("Shape of base training File = ", df.shape)
    # Remove missing values and duplicates from training data
    df.drop_duplicates(inplace=True)
    df.dropna(inplace=True)
    print("Shape of base training data after cleaning = ", df.shape)
    return df

df_train = read_data("input/train.csv")

Shape of base training File =  (404290, 6)
Shape of base training data after cleaning =  (404288, 6)


In [33]:
## SET UP TEST INDICES TO SELECT RANDOM ROWS

TEST_SAMPLES = 100
indices = np.arange(TEST_SAMPLES)
for i in range(len(indices)):
    indices[i] = randint(0,df_train.shape[0])

In [40]:
for ind in indices:
    
    tree_1 = tree()
    tree_2 = tree()

    toks_1 = _getNLPToks_(df_train.get_value(ind,'question1'))
    toks_2 = _getNLPToks_(df_train.get_value(ind,'question2'))

    # Generate a tree structure
    _generateTree_(toks_1, tree_1)
    _generateTree_(toks_2, tree_2)

    # Flip the trees
    _flipTree_(tree_1)
    _flipTree_(tree_2)

    (scoreRaw_cd, scoreNorm_cd) = _CollinsDuffy_(tree_1, tree_2, 0.8, 1)
    (scoreRaw_pt, scoreNorm_pt) = _MoschittiPT_(tree_1, tree_2, 0.8, 0.4, 1)
    print("%s\n%s" % (df_train.get_value(ind,'question1'), df_train.get_value(ind,'question2')))
    print("Collins-Duffy | Norm: %f | Raw: %f" % (scoreNorm_cd, scoreRaw_cd))
    print("Moschitti | Norm: %f | Raw: %f\n" % (scoreNorm_pt, scoreRaw_pt))

How do I overcome smartphone addiction?
How do I fight my phone addiction?
Collins-Duffy | Norm: 0.655738 | Raw: 21.262080
Moschitti | Norm: 0.843661 | Raw: 2.816000

What is the strangest thing that ever happened to you?
What is the strangest thing someone has ever said to you?
Collins-Duffy | Norm: 0.323162 | Raw: 29.254400
Moschitti | Norm: 0.868092 | Raw: 7.424000

How do the Chinese in China view Americans?
How do the Chinese people view Americans?
Collins-Duffy | Norm: 0.096894 | Raw: 5.440000
Moschitti | Norm: 0.738549 | Raw: 4.608000

What are some major social faux pas to avoid when visiting Malta?
What are some major social faux pas to avoid when visiting Norway?
Collins-Duffy | Norm: 0.962300 | Raw: 93.331666
Moschitti | Norm: 1.000000 | Raw: 10.496000

What do you think of Jyoti Basu?
Who is Jyoti Basu?
Collins-Duffy | Norm: 0.291996 | Raw: 10.112000
Moschitti | Norm: 0.760886 | Raw: 2.816000

Which phone should I buy under INR 15K?
Which mobile I should buy under 15k?
Coll