# DocParser Tokenization


In [38]:
import sys
sys.path.append('..')
import doctable as dt
from spacy import displacy
import spacy
nlp = spacy.load('en')
from spacy.matcher import Matcher

DocParser is built specifically to convert spacy doc objects to token lists or simple parsetree objects which are convenient to store in a doctable. As such, we begin by creating a spacy doc object.

In [2]:
exstr = 'James will paint the house for $20 (twenty dollars). He is a rule-breaker'
doc = nlp(exstr)
doc

James will paint the house for $20 (twenty dollars). He is a rule-breaker

## Document Tokenizing

Typically you will want to parse at the Spacy doc aobject. The `.tokenize_doc()` method includes common functionality for tokenizing your documents. Arguments to this function present a series of decisions that need to be made for every tokenization process. There are two 

In [4]:
# the most basic version performs tokenizing with all default settings
print(dt.DocParser.tokenize_doc(doc))

['James', 'will', 'paint', 'the', 'house', 'for', '$', '20', '(', 'twenty', 'dollars', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


In [5]:
# split into sentences (list of lists)
print(dt.DocParser.tokenize_doc(doc, split_sents=True))

[['James', 'will', 'paint', 'the', 'house', 'for', '$', '20', '(', 'twenty', 'dollars', ')', '.'], ['he', 'is', 'a', 'rule', '-', 'breaker']]


In [6]:
print(dt.DocParser.tokenize_doc(doc, merge_ents=True))
doc = nlp(exstr) # reverts doc back to original because adding the match (called in .tokenize_doc()) modified it

['James', 'will', 'paint', 'the', 'house', 'for', '$', '20', '(', 'twenty dollars', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


In [7]:
print(dt.DocParser.tokenize_doc(doc, merge_noun_chunks=True))
doc = nlp(exstr) # reverts doc back to original because adding the match (called in .tokenize_doc()) modified it

['James', 'will', 'paint', 'the house', 'for', '$', '20', '(', 'twenty dollars', ')', '.', 'he', 'is', 'a rule-breaker']


### Choose to include tokens
You may not want to include all tokens, depending on spacy token information. For this case, we use the `.use_tok()` method which includes some built-in arguments to do some boilerplate steps. Again see the [full documentaiton](https://devincornell.github.io/doctable/ref/doctable.DocParser.html) to see all arguments and defaults.
The function simply returns a boolean True/False value given a spacy token, but can be passed to `.tokenize_doc()` for added flexibility.

It is most easily used by overriding parameters through a lambda function.

In [8]:
# first try a custom function keeps only non-numbers
use_tok_nobreaker = lambda tok: not tok.like_num
print(dt.DocParser.tokenize_doc(doc, use_tok_func=use_tok_nobreaker))

['James', 'will', 'paint', 'the', 'house', 'for', '$', '(', 'dollars', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


In [9]:
# now, you can override the .use_tok() to take care of some simple stuff
use_tok_nostop = lambda tok: dt.DocParser.use_tok(tok, filter_stop=True)
print(dt.DocParser.tokenize_doc(doc, use_tok_func=use_tok_nostop))

['James', 'paint', 'house', '$', '20', '(', 'dollars', ')', '.', 'rule', '-', 'breaker']


In [40]:
# remove digits
use_tok_nodigit = lambda tok: dt.DocParser.use_tok(tok, filter_digit=True)
print(dt.DocParser.tokenize_doc(doc, use_tok_func=use_tok_nodigit))

['James', 'will', 'paint', 'the', 'house', 'for', '$', '(', 'twenty', 'dollars', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


In [11]:
# remove numbers (see it removed both "20" and "Twenty")
use_tok_nonum = lambda tok: dt.DocParser.use_tok(tok, filter_num=True)
print(dt.DocParser.tokenize_doc(doc, use_tok_func=use_tok_nonum))

['James', 'will', 'paint', 'the', 'house', 'for', '$', '(', 'dollars', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


In [12]:
# here it thought 'James' was an organization. Use the filter_ent_types arg to remove specific ent types
use_tok_nonames = lambda tok: dt.DocParser.use_tok(tok, filter_ent_types=['ORG'])
print(dt.DocParser.tokenize_doc(doc, use_tok_func=use_tok_nonames))

['will', 'paint', 'the', 'house', 'for', '$', '20', '(', 'twenty', 'dollars', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


In [13]:
# remove all entities using the filter_all_ents argument
use_tok_nonents = lambda tok: dt.DocParser.use_tok(tok, filter_all_ents=True)
print(dt.DocParser.tokenize_doc(doc, use_tok_func=use_tok_nonents))

['will', 'paint', 'the', 'house', 'for', '$', '(', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


In [14]:
# you can also add to the use_tok method using a custom function
def custom_use_tok(tok):
    use = dt.DocParser.use_tok(tok, filter_num=True)
    return use and tok.pos_ != 'VERB' # here removes all verbs (including "paint")
print(dt.DocParser.tokenize_doc(doc, use_tok_func=custom_use_tok))

['James', 'will', 'the', 'house', 'for', '$', '(', 'dollars', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


### Choose how to parse tokens
Conversion from a spacy token to a string can happen a number of different ways. The `.parse_tok()` method provides a number of features for this task, or a custom function can be provided.

In [15]:
#parse_tok(tok, replace_num=None, replace_digit=None, lemmatize=False, normal_convert=None, format_ents=True, ent_convert=None)

In [16]:
# a custom function will simply return the original text using the tok.text property
parse_tok = lambda tok: tok.text
print(dt.DocParser.tokenize_doc(doc, parse_tok_func=parse_tok))

['James', 'will', 'paint', 'the', 'house', 'for', '$', '20', '(', 'twenty', 'dollars', ')', '.', 'He', 'is', 'a', 'rule', '-', 'breaker']


In [17]:
# using .parse_tok(), first try replacing numbers with "__NUM__"
parse_tok = lambda tok: dt.DocParser.parse_tok(tok, replace_num='__NUM__')
print(dt.DocParser.tokenize_doc(doc, parse_tok_func=parse_tok))

['James', 'will', 'paint', 'the', 'house', 'for', '$', '__NUM__', '(', '__NUM__', 'dollars', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


In [18]:
# now lemmatize
parse_tok = lambda tok: dt.DocParser.parse_tok(tok, lemmatize=True)
print(dt.DocParser.tokenize_doc(doc, parse_tok_func=parse_tok))

['James', 'will', 'paint', 'the', 'house', 'for', '$', '20', '(', 'twenty', 'dollars', ')', '.', '-pron-', 'be', 'a', 'rule', '-', 'breaker']


In [41]:
# format_ents is one of the most useful features. 
# It will standardize ents by converting all consecutive whitespace to 
# spaces and then capitalize the first letter. This is the default setting, but it can be turned off.
parse_tok = lambda tok: dt.DocParser.parse_tok(tok, format_ents=True)
print(dt.DocParser.tokenize_doc(doc, parse_tok_func=parse_tok))

['James', 'will', 'paint', 'the', 'house', 'for', '$', '20', '(', 'Twenty', 'Dollars', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


### Merging N-Grams
DocParser offers two convenient ways to work with n-grams: (1) using the spacy matcher and (2) using the post-processed multi-token matcher. The first method is applied after normal spacy processing is finished. It involves passing a tuple of ngrams as tuples to apply after all parsing has completed. The good thing about this approach is that it doesn't require much code. The unfortunate thing is that it can only access the tokens after normal parsing. If you would like to merge tokens with hyphens between them or currency symbols to their numbers, you should use the pre-processing method.

The pre-processing spacy.Matcher functionality is used to create ngrams which access certain underlying spacy components like IS_DIGIT etc. See [Spacy Matcher documention for more details](https://spacy.io/usage/rule-based-matching). The basic workflow is to create a matcher object, add patterns, and then pass matcher to .tokenize_doc(). Note that since the doc object itself is modified, python must be restarted to revert back to other tokenization method.

In [20]:
# post-parsing ngram merging
ngrams = (
    ('the', 'house'),
    ('rule', '-', 'breaker'),
    ('he', 'is', 'a'),
)
# by default 
print(dt.DocParser.tokenize_doc(doc, ngrams=ngrams))
print()
print(dt.DocParser.tokenize_doc(doc, ngrams=ngrams, ngram_sep='_')) # specify ngram_sep

['James', 'will', 'paint', 'the house', 'for', '$', '20', '(', 'twenty', 'dollars', ')', '.', 'he is a', 'rule - breaker']

['James', 'will', 'paint', 'the_house', 'for', '$', '20', '(', 'twenty', 'dollars', ')', '.', 'he_is_a', 'rule_-_breaker']


In [21]:
# spacy matcher object (will be passed to docparser)
matcher = Matcher(nlp.vocab)

# matches currency numbers
pattern = [{'TEXT':'$'},{'IS_DIGIT':True}]
matcher.add('currency', None, pattern)

# matches the phrase "he will" or "He Will" or "HE WILL"
pattern2 = [{'LOWER':'he'},{'LOWER':'will'}]
matcher.add('he_will', None, pattern2)

# matches hyphens
pattern3 = [{'IS_SPACE':False},{'TEXT':'-'},{'IS_SPACE':False}]
matcher.add('he_will', None, pattern3)

print([tok for tok in dt.DocParser.tokenize_doc(doc, spacy_ngram_matcher=matcher)])
doc = nlp(exstr) # reverts doc back to original because adding the match (called in .tokenize_doc()) modified it

['James', 'will', 'paint', 'the', 'house', 'for', '$20', '(', 'twenty', 'dollars', ')', '.', 'he', 'is', 'a', 'rule-breaker']


## Token Parsing
An essential element of any spacy tokenization process is to convert Spacy token objects into strings.

Full function header: `def parse_tok(tok, replace_num=None, replace_digit=None, lemmatize=False, normal_convert=None, format_ents=True, ent_convert=None)`

In [22]:
# by default this method includes everything
print([dt.DocParser.parse_tok(tok) for tok in doc])

['James', 'will', 'paint', 'the', 'house', 'for', '$', '20', '(', 'twenty', 'dollars', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


In [23]:
# numbers include anything that looks like a number
print([dt.DocParser.parse_tok(tok, replace_num='__NUM__') for tok in doc])

['James', 'will', 'paint', 'the', 'house', 'for', '$', '__NUM__', '(', '__NUM__', 'dollars', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


In [24]:
# digits include 5 but not five
print([dt.DocParser.parse_tok(tok, replace_digit='__DIGIT__') for tok in doc])

['James', 'will', 'paint', 'the', 'house', 'for', '$', '__DIGIT__', '(', 'twenty', 'dollars', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


In [25]:
# returns spacy lemmas
print([dt.DocParser.parse_tok(tok, lemmatize=True) for tok in doc])

['James', 'will', 'paint', 'the', 'house', 'for', '$', '20', '(', 'twenty', 'dollars', ')', '.', '-pron-', 'be', 'a', 'rule', '-', 'breaker']


In [26]:
# additionally you can choose to format ents. This is often used so that named entities 
# with slightly different whitespaces (newlines or just spaces) can be considered as the same.
# note that both "Twenty" and "Dollars" start with capital letters even though original sentence
# does not have them capitalized. Essentially this process involves splitting and joining back with spaces,
# then converting the strings to lower case and calling .capitalize() to convert the first letter of each 
# word to upper-case. Note that you can merge entities before parsing as well.
print([dt.DocParser.parse_tok(tok, format_ents=True) for tok in doc])

['James', 'will', 'paint', 'the', 'house', 'for', '$', '20', '(', 'Twenty', 'Dollars', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


In [27]:
# can provide additional post-processing that applies to only entities or normal tokens
# (often they should be treated separately)
print([dt.DocParser.parse_tok(tok, normal_convert=lambda t: '_normtok_') for tok in doc])
print()
print([dt.DocParser.parse_tok(tok, normal_convert=lambda t: t.text_with_ws) for tok in doc]) # includes whitespace
print()
print([dt.DocParser.parse_tok(tok, normal_convert=lambda t: (t.lower_.strip(), t.pos_)) for tok in doc]) # includes part of speech

['James', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '20', '_normtok_', 'twenty', 'dollars', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_']

['James', 'will ', 'paint ', 'the ', 'house ', 'for ', '$', '20', '(', 'twenty', 'dollars', ')', '. ', 'He ', 'is ', 'a ', 'rule', '-', 'breaker']

['James', ('will', 'AUX'), ('paint', 'VERB'), ('the', 'DET'), ('house', 'NOUN'), ('for', 'ADP'), ('$', 'SYM'), '20', ('(', 'PUNCT'), 'twenty', 'dollars', (')', 'PUNCT'), ('.', 'PUNCT'), ('he', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('rule', 'NOUN'), ('-', 'PUNCT'), ('breaker', 'NOUN')]


In [28]:
# and can provide basically the same thing with named entities (while keeping normal token processing the same)
print([dt.DocParser.parse_tok(tok, ent_convert=lambda t: t.ent_type_) for tok in doc])
print()
print([dt.DocParser.parse_tok(tok, ent_convert=lambda t: (t.lower_.strip(), t.ent_type_, t.pos_)) for tok in doc])

['ORG', 'will', 'paint', 'the', 'house', 'for', '$', 'MONEY', '(', 'MONEY', 'MONEY', ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']

[('james', 'ORG', 'PROPN'), 'will', 'paint', 'the', 'house', 'for', '$', ('20', 'MONEY', 'NUM'), '(', ('twenty', 'MONEY', 'NUM'), ('dollars', 'MONEY', 'NOUN'), ')', '.', 'he', 'is', 'a', 'rule', '-', 'breaker']


In [29]:
# can use both to similar effect
print([dt.DocParser.parse_tok(tok, normal_convert=lambda t: t.pos_, ent_convert=lambda t: t.pos_) for tok in doc])

['PROPN', 'AUX', 'VERB', 'DET', 'NOUN', 'ADP', 'SYM', 'NUM', 'PUNCT', 'NUM', 'NOUN', 'PUNCT', 'PUNCT', 'PRON', 'AUX', 'DET', 'NOUN', 'PUNCT', 'NOUN']


## Token Filtering
As part of the DocParser pipeline, it is often useful to filter out tokens that are not useful. There are several useful options in `.use_tok()` for this. Note that this method can be overridden for use in other DocParser methods.

In [30]:
# by default, all tokens except whitespace are included
print([dt.DocParser.use_tok(tok) for tok in doc])
# to turn whitespace filtering off, use filter_whitespace=False (no whitespace is in this example so can't show example)

[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]


In [31]:
# filter 
print([tok for tok in doc if dt.DocParser.use_tok(tok, filter_punct=True)])

[James, will, paint, the, house, for, $, 20, twenty, dollars, He, is, a, rule, breaker]


In [32]:
# filter stopwords (according to spacy). To modify stopwords: https://stackoverflow.com/questions/41170726/add-remove-stop-words-with-spacy 
print([tok for tok in doc if dt.DocParser.use_tok(tok, filter_stop=True)])

[James, paint, house, $, 20, (, dollars, ), ., rule, -, breaker]


In [33]:
# filter both
print([tok for tok in doc if dt.DocParser.use_tok(tok, filter_stop=True, filter_punct=True)])

[James, paint, house, $, 20, dollars, rule, breaker]


In [34]:
# filter_number would filter "twenty" wheras filter_digit would not
print([tok for tok in doc if dt.DocParser.use_tok(tok, filter_digit=True)])
print([tok for tok in doc if dt.DocParser.use_tok(tok, filter_num=True)])

[James, will, paint, the, house, for, $, (, twenty, dollars, ), ., He, is, a, rule, -, breaker]
[James, will, paint, the, house, for, $, (, dollars, ), ., He, is, a, rule, -, breaker]


In [35]:
# filter named entities
print([tok for tok in doc if dt.DocParser.use_tok(tok, filter_all_ents=True)])

[will, paint, the, house, for, $, (, ), ., He, is, a, rule, -, breaker]


In [36]:
# filter only numbers. See this page for details on named entity types: https://spacy.io/api/annotation#named-entities
print([tok for tok in doc if dt.DocParser.use_tok(tok, filter_ent_types=['MONEY',])])
# filters "twenty dollars"

[James, will, paint, the, house, for, $, (, ), ., He, is, a, rule, -, breaker]


## Document Preprocessing
A common step taken prior to using the spacy parser is to remove objects you don't want to be tokenized. So far this function is quite simple but more features may be added later.

In [37]:
exstr = 'To search the web, see http://google.com'
dt.DocParser.preprocess(exstr, remove_url=True)

TypeError: preprocess() got an unexpected keyword argument 'remove_url'