In [1]:
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP 
from collections import defaultdict
import re

In [2]:
nlp = StanfordCoreNLP('http://localhost:9000')

In [3]:
text = "This time around, they're moving even faster"
text = "The real question is, who's afraid of Virgina Woolf?"


output = nlp.annotate(text, properties={
  'annotators': 'parse',
  'outputFormat': 'json'
  })

In [4]:
output

{'sentences': [{'basicDependencies': [{'dep': 'ROOT',
     'dependent': 4,
     'dependentGloss': 'is',
     'governor': 0,
     'governorGloss': 'ROOT'},
    {'dep': 'det',
     'dependent': 1,
     'dependentGloss': 'The',
     'governor': 3,
     'governorGloss': 'question'},
    {'dep': 'amod',
     'dependent': 2,
     'dependentGloss': 'real',
     'governor': 3,
     'governorGloss': 'question'},
    {'dep': 'nsubj',
     'dependent': 3,
     'dependentGloss': 'question',
     'governor': 4,
     'governorGloss': 'is'},
    {'dep': 'punct',
     'dependent': 5,
     'dependentGloss': ',',
     'governor': 4,
     'governorGloss': 'is'},
    {'dep': 'nsubj',
     'dependent': 6,
     'dependentGloss': 'who',
     'governor': 8,
     'governorGloss': 'afraid'},
    {'dep': 'cop',
     'dependent': 7,
     'dependentGloss': "'s",
     'governor': 8,
     'governorGloss': 'afraid'},
    {'dep': 'ccomp',
     'dependent': 8,
     'dependentGloss': 'afraid',
     'governor': 4,
     '

In [5]:
print(output['sentences'][0]['parse'])

(ROOT
  (S
    (NP (DT The) (JJ real) (NN question))
    (VP (VBZ is) (, ,)
      (SBARQ
        (WHNP (WP who))
        (SQ (VBZ 's)
          (ADJP (JJ afraid))
          (PP (IN of)
            (NP (NNP Virgina) (NNP Woolf))))
        (. ?)))))


In [6]:
raw = output['sentences'][0]['parse']
raw_toks = raw.split("\n")

In [7]:
def tree(): return defaultdict(tree) # example of autovivification

In [8]:
print(repr(output['sentences'][0]['parse']))
# indent two spaces per 

"(ROOT\n  (S\n    (NP (DT The) (JJ real) (NN question))\n    (VP (VBZ is) (, ,)\n      (SBARQ\n        (WHNP (WP who))\n        (SQ (VBZ 's)\n          (ADJP (JJ afraid))\n          (PP (IN of)\n            (NP (NNP Virgina) (NNP Woolf))))\n        (. ?)))))"


In [9]:
raw_toks

['(ROOT',
 '  (S',
 '    (NP (DT The) (JJ real) (NN question))',
 '    (VP (VBZ is) (, ,)',
 '      (SBARQ',
 '        (WHNP (WP who))',
 "        (SQ (VBZ 's)",
 '          (ADJP (JJ afraid))',
 '          (PP (IN of)',
 '            (NP (NNP Virgina) (NNP Woolf))))',
 '        (. ?)))))']

In [11]:
raw_toks[0]

'(ROOT'

In [19]:
tr = re.match(r"^\s*\(([a-zA-Z0-9_]*)$",raw_toks[0])
print(tr.group(1)) # group(1) b/c group(0) is the complete match

ROOT


In [55]:
def _leadingSpaces_(target):
    return len(target) - len(target.lstrip())

def _findParent_(curIndent, parid, treeRef):
    tmpid = parid
    while (curIndent <= treeRef[tmpid]['indent']):
        tmpid = treeRef[tmpid]['parid']
    return tmpid

In [102]:
def _generateTree_(rawTokens, treeRef):
    
    REGEX_PUNC = r"^\s*\([,!?.'\"]\s*[,!?.'\"]\)"
    REGEX_OPEN = r"^\s*\(([a-zA-Z0-9_']*)$"
    REGEX_COMP = r"^\s*\(([a-zA-Z0-9_']*)\s*((?:[(]([a-zA-Z0-9_;.,?'!]*)\s*([a-zA-Z0-9_;\.,?!']*)[)]\s*)*)"
    REGEX_ISOL_IN_COMP = r"\(([a-zA-Z0-9_;.,?!']+)\s*([a-zA-Z0-9_;.,?!']+)\)"
    REGEX_PUNC_SOLO = r"\([,!?.'\"]\s*[,!?.'\"]\)"
    
    # manually insert Root token 
    treeRef[0] = {'curid':0, 'parid':0, 'posOrTok':'ROOT', 'indent':0}
    ID_CTR = 1
    
    for tok in rawTokens[1:]:
        
        curIndent = _leadingSpaces_(tok) # the current indent level
        parid = _findParent_(curIndent, ID_CTR-1, treeRef) # determine parid
        
        checkMatch = re.match(REGEX_OPEN, tok)
        if (checkMatch):
            treeRef[ID_CTR] = {'curid':ID_CTR, 'parid':parid, 'posOrTok':checkMatch.group(1), 'indent':curIndent}
            ID_CTR += 1
            continue

        checkPunc = re.match(REGEX_PUNC, tok)
        if (checkPunc): # ignore punctuation
            continue

        checkChild = re.match(REGEX_COMP, tok)
        if (checkChild):
            treeRef[ID_CTR] = {'curid':ID_CTR, 'parid':parid, 'posOrTok':checkChild.group(1), 'indent':curIndent}
            upCTR = ID_CTR
            ID_CTR += 1
            # Eliminate further punctuation
            print(checkChild.group(2))

            subCheck = re.sub(REGEX_PUNC_SOLO,'',checkChild.group(2))
            subs = re.findall(REGEX_ISOL_IN_COMP, subCheck) 
            for ch in subs:
                # THE INDENTING IS WRONG HERE - THE HEIRARCHY IS MESSED UP - check test output
                treeRef[ID_CTR] = {'curid':ID_CTR, 'parid':upCTR, 'posOrTok':ch[0], 'indent':curIndent}
                ID_CTR += 1
                treeRef[ID_CTR] = {'curid':ID_CTR, 'parid':ID_CTR-1, 'posOrTok':ch[1], 'indent':curIndent}
                ID_CTR += 1

    return
            

## Test: Tree Generation

In [103]:
treeTest = tree()
_generateTree_(raw_toks, treeTest)

(DT The) (JJ real) (NN question)
(VBZ is) (, ,)
(WP who)
(VBZ 's)
(JJ afraid)
(IN of)
(NNP Virgina) (NNP Woolf)


In [104]:
treeTest

defaultdict(<function __main__.tree>,
            {0: {'curid': 0, 'indent': 0, 'parid': 0, 'posOrTok': 'ROOT'},
             1: {'curid': 1, 'indent': 2, 'parid': 0, 'posOrTok': 'S'},
             2: {'curid': 2, 'indent': 4, 'parid': 1, 'posOrTok': 'NP'},
             3: {'curid': 3, 'indent': 4, 'parid': 2, 'posOrTok': 'DT'},
             4: {'curid': 4, 'indent': 4, 'parid': 3, 'posOrTok': 'The'},
             5: {'curid': 5, 'indent': 4, 'parid': 2, 'posOrTok': 'JJ'},
             6: {'curid': 6, 'indent': 4, 'parid': 5, 'posOrTok': 'real'},
             7: {'curid': 7, 'indent': 4, 'parid': 2, 'posOrTok': 'NN'},
             8: {'curid': 8, 'indent': 4, 'parid': 7, 'posOrTok': 'question'},
             9: {'curid': 9, 'indent': 4, 'parid': 1, 'posOrTok': 'VP'},
             10: {'curid': 10, 'indent': 4, 'parid': 9, 'posOrTok': 'VBZ'},
             11: {'curid': 11, 'indent': 4, 'parid': 10, 'posOrTok': 'is'},
             12: {'curid': 12, 'indent': 6, 'parid': 11, 'posOrTok': '