In [1]:
# Standard library
import sys

# Third-party
import spacy
from spacy import displacy

# Add the parent directory to the import search path before importing doctable.
# NOTE(review): presumably the notebook lives one level below the repository
# root, so this makes the in-repo doctable package importable -- confirm.
sys.path.append('..')
import doctable as dt

# Load the English pipeline by its full package name. The 'en' shortcut link
# only exists in spaCy 2.x and was removed in spaCy 3, where it raises an error.
# Install the model with: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

In [2]:
# Example text with two short sentences; the parse-tree outputs in this
# notebook show one root node per sentence.
exstr = 'Hat is red. They are tall.'
doc = nlp(exstr)  # run the spaCy pipeline loaded above on the example string
doc  # bare last expression so the notebook displays the parsed document

Hat is red. They are tall.

## Parsetrees

By default (`parse_tok_func=None`), `get_parsetree()` converts each token with the internal `.parse_tok()` method, forwarding any additional arguments given in the `parse_tok_args` parameter. You may attach additional per-token information by passing a dictionary that maps attribute name -> function in the `parsetree_tok_info` parameter.

Def: `get_parsetree(cls, doc, parse_tok_func=None, parse_tok_args=dict(), parsetree_tok_info=None, tok_attrname='tok', children_attrname='children')`

In [11]:
# The simplest version: supply parse_tok_func manually and pass an empty
# parsetree_tok_info, so each node carries only 'tok' and 'children'.
dt.DocParser.get_parsetree(
    doc,
    parse_tok_func=lambda token: token.text.upper(),
    parsetree_tok_info={},
)

[{'tok': 'IS',
  'children': [{'tok': 'HAT', 'children': []},
   {'tok': 'RED', 'children': []},
   {'tok': '.', 'children': []}]},
 {'tok': 'ARE',
  'children': [{'tok': 'THEY', 'children': []},
   {'tok': 'TALL', 'children': []},
   {'tok': '.', 'children': []}]}]

In [3]:
# Full version: with default arguments each node includes 'pos', 'tag', 'dep',
# and 'ent_type' in addition to 'tok' and 'children'.
# NOTE(review): the output below shows 'hat' for the input 'Hat', so the default
# token parser appears to lowercase token text -- confirm against .parse_tok().
dt.DocParser.get_parsetree(doc)

[{'pos': 'AUX',
  'tag': 'VBZ',
  'dep': 'ROOT',
  'ent_type': None,
  'tok': 'is',
  'children': [{'pos': 'PROPN',
    'tag': 'NNP',
    'dep': 'nsubj',
    'ent_type': None,
    'tok': 'hat',
    'children': []},
   {'pos': 'ADJ',
    'tag': 'JJ',
    'dep': 'acomp',
    'ent_type': None,
    'tok': 'red',
    'children': []},
   {'pos': 'PUNCT',
    'tag': '.',
    'dep': 'punct',
    'ent_type': None,
    'tok': '.',
    'children': []}]},
 {'pos': 'AUX',
  'tag': 'VBP',
  'dep': 'ROOT',
  'ent_type': None,
  'tok': 'are',
  'children': [{'pos': 'PRON',
    'tag': 'PRP',
    'dep': 'nsubj',
    'ent_type': None,
    'tok': 'they',
    'children': []},
   {'pos': 'ADJ',
    'tag': 'JJ',
    'dep': 'acomp',
    'ent_type': None,
    'tok': 'tall',
    'children': []},
   {'pos': 'PUNCT',
    'tag': '.',
    'dep': 'punct',
    'ent_type': None,
    'tok': '.',
    'children': []}]}]

In [4]:
# An empty parsetree_tok_info keeps only the token string and the list of
# children on each node; children_attrname renames the 'children' key to 'child'.
dt.DocParser.get_parsetree(
    doc,
    parsetree_tok_info={},
    children_attrname='child',
)

[{'tok': 'is',
  'child': [{'tok': 'hat', 'child': []},
   {'tok': 'red', 'child': []},
   {'tok': '.', 'child': []}]},
 {'tok': 'are',
  'child': [{'tok': 'they', 'child': []},
   {'tok': 'tall', 'child': []},
   {'tok': '.', 'child': []}]}]