In [53]:
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP 
from collections import defaultdict
import re
import time

In [3]:
# Client handle for the Stanford CoreNLP server used by the annotate() call below.
# NOTE(review): assumes a CoreNLP server is already listening on localhost:9000 — confirm before running.
nlp = StanfordCoreNLP('http://localhost:9000')

In [4]:
# text = "This time around, they're moving even faster"
text = "After 12th years old boy and I had sex with a 12 years old girl, with her consent. Is there anything wrong?"


# Run the 'parse' annotator on `text` and ask for a JSON response.
# Later cells read output['sentences'][0]['parse'], i.e. the bracketed parse
# of the first sentence only.
output = nlp.annotate(text, properties={
  'annotators': 'parse',
  'outputFormat': 'json'
  })

In [5]:
output

{'sentences': [{'basicDependencies': [{'dep': 'ROOT',
     'dependent': 8,
     'dependentGloss': 'had',
     'governor': 0,
     'governorGloss': 'ROOT'},
    {'dep': 'case',
     'dependent': 1,
     'dependentGloss': 'After',
     'governor': 3,
     'governorGloss': 'years'},
    {'dep': 'amod',
     'dependent': 2,
     'dependentGloss': '12th',
     'governor': 3,
     'governorGloss': 'years'},
    {'dep': 'nmod',
     'dependent': 3,
     'dependentGloss': 'years',
     'governor': 8,
     'governorGloss': 'had'},
    {'dep': 'amod',
     'dependent': 4,
     'dependentGloss': 'old',
     'governor': 5,
     'governorGloss': 'boy'},
    {'dep': 'nsubj',
     'dependent': 5,
     'dependentGloss': 'boy',
     'governor': 8,
     'governorGloss': 'had'},
    {'dep': 'cc',
     'dependent': 6,
     'dependentGloss': 'and',
     'governor': 5,
     'governorGloss': 'boy'},
    {'dep': 'conj',
     'dependent': 7,
     'dependentGloss': 'I',
     'governor': 5,
     'governorGloss':

In [6]:
print(output['sentences'][0]['parse'])

(ROOT
  (S
    (PP (IN After)
      (NP (JJ 12th) (NNS years)))
    (NP
      (NP (JJ old) (NN boy))
      (CC and)
      (NP (PRP I)))
    (VP (VBD had)
      (NP
        (NP (NN sex))
        (PP (IN with)
          (NP (DT a)
            (ADJP
              (NP (CD 12) (NNS years))
              (JJ old))
            (NN girl))))
      (, ,)
      (PP (IN with)
        (NP (PRP$ her) (NN consent))))
    (. .)))


In [7]:
# Bracketed parse of the first sentence, then one list entry per printed line.
raw = output['sentences'][0]['parse']
# splitlines() handles "\n" and "\r\n" alike and does not leave a trailing
# empty entry when the text ends with a line break; either would otherwise
# reach the line regexes / parent lookup in _generateTree_.
raw_toks = raw.splitlines()

In [8]:
def tree():
    """Return an autovivifying dictionary.

    Looking up a missing key creates (and stores) another dictionary of the
    same kind, so nested keys can be assigned without creating each level
    first.
    """
    nested = defaultdict(tree)
    return nested

In [10]:
raw_toks

['(ROOT',
 '  (S',
 '    (PP (IN After)',
 '      (NP (JJ 12th) (NNS years)))',
 '    (NP',
 '      (NP (JJ old) (NN boy))',
 '      (CC and)',
 '      (NP (PRP I)))',
 '    (VP (VBD had)',
 '      (NP',
 '        (NP (NN sex))',
 '        (PP (IN with)',
 '          (NP (DT a)',
 '            (ADJP',
 '              (NP (CD 12) (NNS years))',
 '              (JJ old))',
 '            (NN girl))))',
 '      (, ,)',
 '      (PP (IN with)',
 '        (NP (PRP$ her) (NN consent))))',
 '    (. .)))']

In [13]:
def _leadingSpaces_(target):
    return len(target) - len(target.lstrip())

def _findParent_(curIndent, parid, treeRef):
    tmpid = parid
    while (curIndent <= treeRef[tmpid]['indent']):
        tmpid = treeRef[tmpid]['parid']
    return tmpid

In [64]:
def _generateTree_(rawTokens, treeRef):
    """Fill ``treeRef`` with the nodes of a pretty-printed bracketed parse.

    ``rawTokens`` holds the lines of the parse, e.g.::

        (ROOT
          (S
            (NP (DT The) (NN cat))
            (VP (VBZ sits))
            (. .)))

    The first line is taken to be the root and is inserted by hand. Every
    following line is one of:

    * a stand-alone punctuation pair such as ``(, ,)`` - ignored;
    * a stand-alone ``(tag word)`` pair - adds a tag node and, under it, a
      word node;
    * ``(tag`` optionally followed by inline ``(tag word)`` pairs - adds the
      tag node, then a tag node and a word node per inline pair (inline
      punctuation pairs are dropped).

    Lines matching none of these (for instance blank lines) are skipped.

    Each node is stored as ``treeRef[id] = {'curid', 'parid', 'posOrTok',
    'indent'}``. 'indent' is only a nesting marker for _findParent_: a word
    node shares the indent of its tag node, and inline pairs are recorded two
    columns deeper than the line they appear on.

    Parameters:
        rawTokens: list of lines of the bracketed parse.
        treeRef: mapping to fill; expected to be empty on entry (the root is
            stored under ``len(treeRef)`` while later ids start at 1).

    Returns:
        None. ``treeRef`` is modified in place.
    """
    # A tag or a word: any run of characters that is neither whitespace nor a
    # parenthesis. This accepts tags such as PRP$ and hyphenated words.
    ATOM = r"[^\s()]+"
    # (, ,) as stand-alone
    REGEX_PUNC = r"^\s*\([,!?.'\"]\s*[,!?.'\"]\)"
    # (, ,)
    REGEX_PUNC_SOLO = r"\([,!?.'\"]\s*[,!?.'\"]\)"
    # (tok1 tok2)
    REGEX_ISOL_IN_COMP = r"\((" + ATOM + r")\s+(" + ATOM + r")\)"
    # (tok1 tok2) as stand-alone
    REGEX_SOLO_PAIR = r"^\s*" + REGEX_ISOL_IN_COMP
    # (token   or   (token (tok1 tok2) (tok3 tok4) .... (tokx toky)
    REGEX_COMP = (r"^\s*\((" + ATOM + r")\s*"
                  r"((?:\(" + ATOM + r"\s+" + ATOM + r"\)\s*)*)")

    def addNode(nodeId, parentId, label, indent):
        # Store one node and hand back the next free id.
        treeRef[nodeId] = {'curid': nodeId,
                           'parid': parentId,
                           'posOrTok': label,
                           'indent': indent}
        return nodeId + 1

    # manually insert Root token
    treeRef[len(treeRef)] = {'curid': 0,
                             'parid': -1,
                             'posOrTok': 'ROOT',
                             'indent': 0}
    ID_CTR = 1

    for tok in rawTokens[1:]:

        # Most specific patterns first: REGEX_COMP also matches the start of
        # a stand-alone pair, so testing it first would drop the pair's word.
        if re.match(REGEX_PUNC, tok):  # ignore punctuation
            continue

        checkSingle = re.match(REGEX_SOLO_PAIR, tok)
        checkChild = None if checkSingle else re.match(REGEX_COMP, tok)
        if checkSingle is None and checkChild is None:
            # Blank or unrecognised line: skip it before looking for a
            # parent, since its indent says nothing about the hierarchy.
            continue

        curIndent = _leadingSpaces_(tok)  # the current indent level
        parid = _findParent_(curIndent, ID_CTR-1, treeRef)  # determine parid

        if checkSingle:
            # The tag is a sibling of the nodes opened at this indent, so it
            # is recorded at the line's own indent.
            tagId = ID_CTR
            ID_CTR = addNode(ID_CTR, parid, checkSingle.group(1), curIndent)
            ID_CTR = addNode(ID_CTR, tagId, checkSingle.group(2), curIndent)
            continue

        upCTR = ID_CTR
        ID_CTR = addNode(ID_CTR, parid, checkChild.group(1), curIndent)

        # Eliminate further punctuation
        subCheck = re.sub(REGEX_PUNC_SOLO, '', checkChild.group(2))
        for tag, word in re.findall(REGEX_ISOL_IN_COMP, subCheck):
            # Inline pairs are children of the node opened on this line.
            tagId = ID_CTR
            ID_CTR = addNode(ID_CTR, upCTR, tag, curIndent+2)
            ID_CTR = addNode(ID_CTR, tagId, word, curIndent+2)

    return
            

In [59]:
'''
_generateTree_() method only provides tree (dict representation) listing parents. 
This is a naive method to add a "children" field to the tree - necessary for optimal Tree Kernel methods 
'''
def _flipTree_(treeRef):
    for k,v in treeRef.items():
        treeRef[k]['children'] = list({sk:sv for (sk,sv) in treeRef.items() if sv['parid'] == k}.keys())


## Test: Tree Generation

In [65]:
# Build the tree 1000 times to time _generateTree_; the tree from the last
# pass stays in `treeTest` and is displayed in the next cell.
# perf_counter() is the clock intended for measuring short durations, unlike
# the wall clock returned by time.time().
start_time = time.perf_counter()
for i in range(1000):
    treeTest = tree()
    _generateTree_(raw_toks, treeTest)
print("--- %s seconds ---" % (time.perf_counter() - start_time))


--- 0.12793493270874023 seconds ---


In [52]:
treeTest

defaultdict(<function __main__.tree>,
            {0: {'curid': 0, 'indent': 0, 'parid': -1, 'posOrTok': 'ROOT'},
             1: {'curid': 1, 'indent': 2, 'parid': 0, 'posOrTok': 'S'},
             2: {'curid': 2, 'indent': 4, 'parid': 1, 'posOrTok': 'PP'},
             3: {'curid': 3, 'indent': 6, 'parid': 2, 'posOrTok': 'IN'},
             4: {'curid': 4, 'indent': 6, 'parid': 3, 'posOrTok': 'After'},
             5: {'curid': 5, 'indent': 6, 'parid': 2, 'posOrTok': 'NP'},
             6: {'curid': 6, 'indent': 8, 'parid': 5, 'posOrTok': 'JJ'},
             7: {'curid': 7, 'indent': 8, 'parid': 6, 'posOrTok': '12th'},
             8: {'curid': 8, 'indent': 8, 'parid': 5, 'posOrTok': 'NNS'},
             9: {'curid': 9, 'indent': 8, 'parid': 8, 'posOrTok': 'years'},
             10: {'curid': 10, 'indent': 4, 'parid': 1, 'posOrTok': 'NP'},
             11: {'curid': 11, 'indent': 6, 'parid': 10, 'posOrTok': 'NP'},
             12: {'curid': 12, 'indent': 8, 'parid': 11, 'posOrTok': '