In [1]:
import sys
sys.path.append('..')
import doctable as dt
from spacy import displacy
import spacy
from spacy.matcher import Matcher
from pprint import pprint
nlp = spacy.load('en')

In [2]:
exstr = 'Hat is red. Barak Obama is tall.'
doc = nlp(exstr)
doc

Hat is red. Barak Obama is tall.

## Making Parsetrees

By default, `parse_tok_func=None` means it will use the internal `.parse_tok()` method passing additional arguments through the `parse_tok_args` parameter. You may add additional token info by setting dictionary values attrname->func in the `parsetree_tok_info` parameter.

`def get_parsetree(cls, doc, tok_info_map=None, children_attrname='childs', merge_ents=False, spacy_ngram_matcher=None, merge_noun_chunks=False)`

In [3]:
# can modify the attribute of the children
dt.DocParser.get_parsetree(doc, children_attrname='children')

[{'tok': 'is',
  'dep': 'ROOT',
  'children': [{'tok': 'hat', 'dep': 'nsubj', 'children': []},
   {'tok': 'red', 'dep': 'acomp', 'children': []},
   {'tok': '.', 'dep': 'punct', 'children': []}]},
 {'tok': 'is',
  'dep': 'ROOT',
  'children': [{'tok': 'Obama',
    'dep': 'nsubj',
    'children': [{'tok': 'Barak', 'dep': 'compound', 'children': []}]},
   {'tok': 'tall', 'dep': 'acomp', 'children': []},
   {'tok': '.', 'dep': 'punct', 'children': []}]}]

In [4]:
# can modify any applied attributes
tok_info_map = {
    'tok': lambda tok: dt.DocParser.parse_tok(tok, lemmatize=True),
    'pos': lambda tok: tok.pos_,
}
dt.DocParser.get_parsetree(doc, tok_info_map=tok_info_map)

[{'tok': 'be',
  'pos': 'AUX',
  'childs': [{'tok': 'hat', 'pos': 'PROPN', 'childs': []},
   {'tok': 'red', 'pos': 'ADJ', 'childs': []},
   {'tok': '.', 'pos': 'PUNCT', 'childs': []}]},
 {'tok': 'be',
  'pos': 'AUX',
  'childs': [{'tok': 'Obama',
    'pos': 'PROPN',
    'childs': [{'tok': 'Barak', 'pos': 'PROPN', 'childs': []}]},
   {'tok': 'tall', 'pos': 'ADJ', 'childs': []},
   {'tok': '.', 'pos': 'PUNCT', 'childs': []}]}]

### Genere Parsetrees While Applying Token Merges

In [5]:
# full version includes tag, pos, dep, and ent_type in addition to 'tok' and 'children'
dt.DocParser.get_parsetree(doc, merge_ents=True)

[{'tok': 'is',
  'dep': 'ROOT',
  'childs': [{'tok': 'hat', 'dep': 'nsubj', 'childs': []},
   {'tok': 'red', 'dep': 'acomp', 'childs': []},
   {'tok': '.', 'dep': 'punct', 'childs': []}]},
 {'tok': 'is',
  'dep': 'ROOT',
  'childs': [{'tok': 'Barak Obama', 'dep': 'nsubj', 'childs': []},
   {'tok': 'tall', 'dep': 'acomp', 'childs': []},
   {'tok': '.', 'dep': 'punct', 'childs': []}]}]

In [6]:
# create spacy matcher object to pass to .get_parsetree()
matcher = Matcher(nlp.vocab)
matcher.add('currency', None, [{'LOWER':'tall'}, {'IS_PUNCT':True}])

# you can see that "tall." at the end of the sentence has been merged into a single token, 
# but still works with the parsetree
pprint(dt.DocParser.get_parsetree(doc, spacy_ngram_matcher=matcher))
doc = nlp(exstr) # reverts doc back to original because adding the match (called in .tokenize_doc()) modified it

[{'childs': [{'childs': [], 'dep': 'nsubj', 'tok': 'hat'},
             {'childs': [], 'dep': 'acomp', 'tok': 'red'},
             {'childs': [], 'dep': 'punct', 'tok': '.'}],
  'dep': 'ROOT',
  'tok': 'is'},
 {'childs': [{'childs': [], 'dep': 'nsubj', 'tok': 'Barak Obama'},
             {'childs': [], 'dep': 'acomp', 'tok': 'tall.'}],
  'dep': 'ROOT',
  'tok': 'is'}]


## GrammarTree Objects
These objects allow you to work with ParseTree objects. They are produced by DocParser objects to extract aspects of parsetrees.

In [8]:
gt = dt.DocParser.get_grammartree(doc, spacy_ngram_matcher=matcher)
gt

NameError: name 'get_parsetree' is not defined