In [1]:
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP 
from collections import defaultdict
import re

In [2]:
# Client bound to a CoreNLP HTTP endpoint at localhost:9000.
# NOTE(review): presumably that server must already be running before annotate() is called — confirm.
nlp = StanfordCoreNLP('http://localhost:9000')

In [3]:
# Sample sentence to parse; uncomment the alternative below to try a different one.
text = "This time around, they're moving even faster"
# text = "The real question is, who's afraid of Virginia Woolf?"

# Ask the server behind `nlp` for the 'parse' annotator, returned as JSON.
output = nlp.annotate(
    text,
    properties={'annotators': 'parse', 'outputFormat': 'json'},
)

In [4]:
# Display the full response returned by annotate(): a dict with a 'sentences' list.
output

{'sentences': [{'basicDependencies': [{'dep': 'ROOT',
     'dependent': 7,
     'dependentGloss': 'moving',
     'governor': 0,
     'governorGloss': 'ROOT'},
    {'dep': 'det',
     'dependent': 1,
     'dependentGloss': 'This',
     'governor': 2,
     'governorGloss': 'time'},
    {'dep': 'nmod:npmod',
     'dependent': 2,
     'dependentGloss': 'time',
     'governor': 3,
     'governorGloss': 'around'},
    {'dep': 'advmod',
     'dependent': 3,
     'dependentGloss': 'around',
     'governor': 7,
     'governorGloss': 'moving'},
    {'dep': 'punct',
     'dependent': 4,
     'dependentGloss': ',',
     'governor': 7,
     'governorGloss': 'moving'},
    {'dep': 'nsubj',
     'dependent': 5,
     'dependentGloss': 'they',
     'governor': 7,
     'governorGloss': 'moving'},
    {'dep': 'aux',
     'dependent': 6,
     'dependentGloss': "'re",
     'governor': 7,
     'governorGloss': 'moving'},
    {'dep': 'advmod',
     'dependent': 8,
     'dependentGloss': 'even',
     'governo

In [5]:
# Print the first sentence's bracketed parse so its embedded newlines render as an indented tree.
print(output['sentences'][0]['parse'])

(ROOT
  (S
    (ADVP
      (NP (DT This) (NN time))
      (RB around))
    (, ,)
    (NP (PRP they))
    (VP (VBP 're)
      (VP (VBG moving)
        (ADJP (RB even) (JJR faster))))))


In [30]:
# Bracketed parse string of the first sentence.
raw = output['sentences'][0]['parse']
# One list entry per line of the parse; each entry keeps its leading spaces.
raw_toks = raw.split("\n")

In [31]:
def tree():
    """Return a defaultdict whose missing keys are filled by calling `tree()` again.

    This is the autovivification idiom: ``t = tree(); t['a']['b'] = 1`` works
    without creating ``t['a']`` first.
    """
    nested = defaultdict(tree)
    return nested

In [32]:
print(repr(output['sentences'][0]['parse']))
# indent is two spaces per nesting level; repr() makes the "\n" separators and leading spaces visible

"(ROOT\n  (S\n    (ADVP\n      (NP (DT This) (NN time))\n      (RB around))\n    (, ,)\n    (NP (PRP they))\n    (VP (VBP 're)\n      (VP (VBG moving)\n        (ADJP (RB even) (JJR faster))))))"


In [33]:
# Inspect the parse split into one string per line.
raw_toks

['(ROOT',
 '  (S',
 '    (ADVP',
 '      (NP (DT This) (NN time))',
 '      (RB around))',
 '    (, ,)',
 '    (NP (PRP they))',
 "    (VP (VBP 're)",
 '      (VP (VBG moving)',
 '        (ADJP (RB even) (JJR faster))))))']

In [34]:
# The first line holds only the opening bracket and label of the root.
raw_toks[0]

'(ROOT'

In [35]:
# Pattern for a line that only opens a constituent: optional indent, "(", then a label of
# letters/digits/underscores running to the end of the line. The label is captured as group 1.
tr = re.match(r"^\s*\(([a-zA-Z0-9_]*)$",raw_toks[0])
print(tr.group(1)) # group(1) because group(0) is the complete match

ROOT


In [36]:
def _leadingSpaces_(target):
    return len(target) - len(target.lstrip())

def _findParent_(curIndent, parid, treeRef):
    tmpid = parid
    while (curIndent <= treeRef[tmpid]['indent']):
        tmpid = treeRef[tmpid]['parid']
    return tmpid

In [40]:
def _generateTree_(rawTokens, treeRef):
    
    REGEX_PUNC = r"^\s*\([,!?.'\"]\s*[,!?.'\"]\)"
    REGEX_OPEN = r"^\s*\(([a-zA-Z0-9_']*)$"
    REGEX_COMP = r"^\s*\(([a-zA-Z0-9_']*)\s*((?:[(]([a-zA-Z0-9_;.,?'!]*)\s*([a-zA-Z0-9_;\.,?!']*)[)]\s*)*)"
    REGEX_ISOL_IN_COMP = r"\(([a-zA-Z0-9_;.,?!']+)\s*([a-zA-Z0-9_;.,?!']+)\)"
    REGEX_PUNC_SOLO = r"\([,!?.'\"]\s*[,!?.'\"]\)"
        
    # manually insert Root token 
    treeRef[len(treeRef)] = {'curid':0, 
                             'parid':-1, 
                             'posOrTok':'ROOT', 
                             'indent':0}
    ID_CTR = 1
    
    for tok in rawTokens[1:]:
        
        curIndent = _leadingSpaces_(tok) # the current indent level
        parid = _findParent_(curIndent, ID_CTR-1, treeRef) # determine parid
        
        checkMatch = re.match(REGEX_OPEN, tok)
        if (checkMatch):
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':parid, 
                               'posOrTok':checkMatch.group(1), 
                               'indent':curIndent}
            ID_CTR += 1
            continue

        checkPunc = re.match(REGEX_PUNC, tok)
        if (checkPunc): # ignore punctuation
            continue

        checkChild = re.match(REGEX_COMP, tok)
        if (checkChild):
            treeRef[ID_CTR] = {'curid':ID_CTR, 
                               'parid':parid, 
                               'posOrTok':checkChild.group(1), 
                               'indent':curIndent}
            upCTR = ID_CTR
            ID_CTR += 1
            # Eliminate further punctuation
            print(checkChild.group(2))

            subCheck = re.sub(REGEX_PUNC_SOLO,'',checkChild.group(2))
            subs = re.findall(REGEX_ISOL_IN_COMP, subCheck) 
            for ch in subs:
                # THE INDENTING IS WRONG HERE - THE HEIRARCHY IS MESSED UP - check test output
                treeRef[ID_CTR] = {'curid':ID_CTR, 
                                   'parid':upCTR, 
                                   'posOrTok':ch[0], 
                                   'indent':curIndent+2}
                ID_CTR += 1
                treeRef[ID_CTR] = {'curid':ID_CTR, 
                                   'parid':ID_CTR-1, 
                                   'posOrTok':ch[1], 
                                   'indent':curIndent+2}
                ID_CTR += 1

    return
            

In [51]:
'''
_generateTree_() method only provides tree (dict representation) listing parents. 
This is a naive method to add a "children" field to the tree - necessary for optimal Tree Kernel methods 
'''
def _flipTree_(treeRef):
    for k,v in treeRef.items():
        treeRef[k]['children'] = list({sk:sv for (sk,sv) in treeRef.items() if sv['parid'] == k}.keys())


## Test: Tree Generation

In [41]:
# Build the parent-linked tree for the sample parse into a fresh autovivifying dict.
treeTest = tree()
_generateTree_(raw_toks, treeTest)

(DT This) (NN time)

(PRP they)
(VBP 're)
(VBG moving)
(RB even) (JJR faster)


In [42]:
# Inspect the generated nodes: id -> {'curid', 'indent', 'parid', 'posOrTok'}.
treeTest

defaultdict(<function __main__.tree>,
            {0: {'curid': 0, 'indent': 0, 'parid': -1, 'posOrTok': 'ROOT'},
             1: {'curid': 1, 'indent': 2, 'parid': 0, 'posOrTok': 'S'},
             2: {'curid': 2, 'indent': 4, 'parid': 1, 'posOrTok': 'ADVP'},
             3: {'curid': 3, 'indent': 6, 'parid': 2, 'posOrTok': 'NP'},
             4: {'curid': 4, 'indent': 8, 'parid': 3, 'posOrTok': 'DT'},
             5: {'curid': 5, 'indent': 8, 'parid': 4, 'posOrTok': 'This'},
             6: {'curid': 6, 'indent': 8, 'parid': 3, 'posOrTok': 'NN'},
             7: {'curid': 7, 'indent': 8, 'parid': 6, 'posOrTok': 'time'},
             8: {'curid': 8, 'indent': 6, 'parid': 2, 'posOrTok': 'RB'},
             9: {'curid': 9, 'indent': 4, 'parid': 1, 'posOrTok': 'NP'},
             10: {'curid': 10, 'indent': 6, 'parid': 9, 'posOrTok': 'PRP'},
             11: {'curid': 11, 'indent': 6, 'parid': 10, 'posOrTok': 'they'},
             12: {'curid': 12, 'indent': 4, 'parid': 1, 'posOrTok': 'V