In [1]:
import sys
# add the parent directory to the import path (presumably so the local doctable checkout is importable)
sys.path.append('..')
import doctable as dt
from spacy import displacy
import spacy
# load the spacy pipeline registered under the name 'en'; every example below is parsed with it
nlp = spacy.load('en')
from spacy.matcher import Matcher

In [2]:
# example text used throughout the notebook
exstr = 'James said he will paint the house red for $20 (twenty dollars). He is such a rule-breaker.'
# parse the example text into a spacy doc object
doc = nlp(exstr)
# display the parsed doc as the cell output
doc

James said he will paint the house red for $20 (twenty dollars). He is such a rule-breaker.

## Document Parsing

Header: `def tokenize_doc(cls, doc, split_sents=False, merge_ents=False, merge_noun_chunks=False, ngrams=list(), spacy_ngram_matcher=None, ngram_sep=' ', use_tok_args=dict(), parse_tok_args=dict())`


In [3]:
# simplest call: every tokenize_doc option is left at its default value
print(list(dt.DocParser.tokenize_doc(doc)))

['James', 'said', 'he', 'will', 'paint', 'the', 'house', 'red', 'for', '$', '20', '(', 'Twenty', 'Dollars', ')', '.', 'he', 'is', 'such', 'a', 'rule', '-', 'breaker', '.']


In [4]:
# split_sents=True yields one token list per sentence (a list of lists)
print(list(dt.DocParser.tokenize_doc(doc, split_sents=True)))

[['James', 'said', 'he', 'will', 'paint', 'the', 'house', 'red', 'for', '$', '20', '(', 'Twenty', 'Dollars', ')', '.'], ['he', 'is', 'such', 'a', 'rule', '-', 'breaker', '.']]


In [5]:
# merge_ents=True merges multi-token named entities into single tokens
print(list(dt.DocParser.tokenize_doc(doc, merge_ents=True)))
# re-parse to get the original doc back: the merge done inside .tokenize_doc() modified it
doc = nlp(exstr)

['James', 'said', 'he', 'will', 'paint', 'the', 'house', 'red', 'for', '$', '20', '(', 'Twenty Dollars', ')', '.', 'he', 'is', 'such', 'a', 'rule', '-', 'breaker', '.']


In [6]:
# merge_noun_chunks=True merges each noun chunk into a single token
print(list(dt.DocParser.tokenize_doc(doc, merge_noun_chunks=True)))
# re-parse to get the original doc back: the merge done inside .tokenize_doc() modified it
doc = nlp(exstr)

['James', 'said', 'he', 'will', 'paint', 'the house', 'red', 'for', '$', '20', '(', 'Twenty Dollars', ')', '.', 'he', 'is', 'such a rule-breaker', '.']


In [7]:
# the freshly re-parsed doc tokenizes without any merged tokens again
print(list(dt.DocParser.tokenize_doc(doc)))

['James', 'said', 'he', 'will', 'paint', 'the', 'house', 'red', 'for', '$', '20', '(', 'Twenty', 'Dollars', ')', '.', 'he', 'is', 'such', 'a', 'rule', '-', 'breaker', '.']


### Merging N-Grams
DocParser offers two convenient ways to work with n-grams: (1) using the spacy matcher (pre-processing) and (2) using the post-processed multi-token matcher. The post-processing method is applied after normal spacy processing is finished. It involves passing a tuple of n-grams, each itself a tuple of token strings, to apply after all parsing has completed. The good thing about this approach is that it doesn't require much code. The unfortunate thing is that it can only access the tokens after normal parsing. If you would like to merge tokens with hyphens between them, or currency symbols with their numbers, you should use the pre-processing method.

The pre-processing spacy.Matcher functionality is used to create n-grams based on underlying spacy token attributes such as IS_DIGIT. See the [spaCy Matcher documentation](https://spacy.io/usage/rule-based-matching) for more details. The basic workflow is to create a matcher object, add patterns, and then pass the matcher to .tokenize_doc(). Note that since the doc object itself is modified, the text must be re-parsed (e.g. `doc = nlp(exstr)`, as done in the cells of this notebook) or python must be restarted to revert back to the other tokenization method.

In [8]:
# post-parsing ngram merging: each n-gram is a tuple of consecutive token strings
ngrams = (('the', 'house'), ('rule', '-', 'breaker'), ('he', 'is', 'such'))

# merged n-grams are joined with the default separator (ngram_sep=' ')
print(list(dt.DocParser.tokenize_doc(doc, ngrams=ngrams)))
print()
# same merge, but the parts are joined with an underscore instead
print(list(dt.DocParser.tokenize_doc(doc, ngrams=ngrams, ngram_sep='_')))

['James', 'said', 'he', 'will', 'paint', 'the house', 'red', 'for', '$', '20', '(', 'Twenty', 'Dollars', ')', '.', 'he is such', 'a', 'rule - breaker', '.']

['James', 'said', 'he', 'will', 'paint', 'the_house', 'red', 'for', '$', '20', '(', 'Twenty', 'Dollars', ')', '.', 'he_is_such', 'a', 'rule_-_breaker', '.']


In [9]:
# spacy matcher object (will be passed to docparser)
matcher = Matcher(nlp.vocab)

# matches currency amounts: a "$" token immediately followed by an all-digit token
pattern = [{'TEXT':'$'},{'IS_DIGIT':True}]
matcher.add('currency', None, pattern)

# matches the phrase "he will" or "He Will" or "HE WILL" (comparison is on lower-cased text)
pattern2 = [{'LOWER':'he'},{'LOWER':'will'}]
matcher.add('he_will', None, pattern2)

# matches hyphenated words: two non-whitespace tokens joined by a "-" token.
# Registered under its own key so that this rule does not share a match ID
# with the unrelated "he will" rule above.
pattern3 = [{'IS_SPACE':False},{'TEXT':'-'},{'IS_SPACE':False}]
matcher.add('hyphen', None, pattern3)

# every span found by the matcher comes back as a single merged token
print([tok for tok in dt.DocParser.tokenize_doc(doc, spacy_ngram_matcher=matcher)])
# re-parse to get the original doc back: merging the matches inside .tokenize_doc() modified it
doc = nlp(exstr)

['James', 'said', 'he will', 'paint', 'the', 'house', 'red', 'for', '$20', '(', 'Twenty', 'Dollars', ')', '.', 'he', 'is', 'such', 'a', 'rule-breaker', '.']


## Token Parsing
An essential element of any spacy tokenization process is to convert Spacy token objects into strings.

Full function header: `def parse_tok(tok, replace_num=None, replace_digit=None, lemmatize=False, normal_convert=None, format_ents=True, ent_convert=None)`

In [10]:
# with default arguments every token is converted to a string and kept
print(list(map(dt.DocParser.parse_tok, doc)))

['James', 'said', 'he', 'will', 'paint', 'the', 'house', 'red', 'for', '$', '20', '(', 'Twenty', 'Dollars', ')', '.', 'he', 'is', 'such', 'a', 'rule', '-', 'breaker', '.']


In [11]:
# replace_num substitutes anything that looks like a number ("20" as well as "twenty")
print(list(dt.DocParser.parse_tok(tok, replace_num='__NUM__') for tok in doc))

['James', 'said', 'he', 'will', 'paint', 'the', 'house', 'red', 'for', '$', '__NUM__', '(', '__NUM__', 'Dollars', ')', '.', 'he', 'is', 'such', 'a', 'rule', '-', 'breaker', '.']


In [12]:
# replace_digit only substitutes digit tokens: 5 would be replaced, but not "five"
print(list(dt.DocParser.parse_tok(tok, replace_digit='__DIGIT__') for tok in doc))

['James', 'said', 'he', 'will', 'paint', 'the', 'house', 'red', 'for', '$', '__DIGIT__', '(', 'Twenty', 'Dollars', ')', '.', 'he', 'is', 'such', 'a', 'rule', '-', 'breaker', '.']


In [13]:
# lemmatize=True returns the spacy lemma of each token instead of its text
print(list(dt.DocParser.parse_tok(tok, lemmatize=True) for tok in doc))

['James', 'say', '-pron-', 'will', 'paint', 'the', 'house', 'red', 'for', '$', '20', '(', 'Twenty', 'Dollars', ')', '.', '-pron-', 'be', 'such', 'a', 'rule', '-', 'breaker', '.']


In [14]:
# format_ents (True by default in the function header) normalizes named-entity text so that
# entities differing only in whitespace (newlines vs. spaces) are treated as the same string.
# The text is split and re-joined with single spaces, lower-cased, and then .capitalize()
# is called so each word starts with an upper-case letter. This is why "Twenty" and "Dollars"
# are capitalized in the output even though the original sentence has them in lower case.
# Entities can also be merged before parsing.
print(list(dt.DocParser.parse_tok(tok, format_ents=True) for tok in doc))

['James', 'said', 'he', 'will', 'paint', 'the', 'house', 'red', 'for', '$', '20', '(', 'Twenty', 'Dollars', ')', '.', 'he', 'is', 'such', 'a', 'rule', '-', 'breaker', '.']


In [15]:
# normal_convert is a custom conversion applied only to normal (non-entity) tokens;
# entity tokens are often better handled separately.

# replace every normal token with a constant placeholder
print(list(
    dt.DocParser.parse_tok(tok, normal_convert=lambda t: '_normtok_')
    for tok in doc
))
print()
# keep the trailing whitespace of each normal token
print(list(
    dt.DocParser.parse_tok(tok, normal_convert=lambda t: t.text_with_ws)
    for tok in doc
))
print()
# return (lower-cased text, part of speech) pairs for normal tokens
print(list(
    dt.DocParser.parse_tok(tok, normal_convert=lambda t: (t.lower_.strip(), t.pos_))
    for tok in doc
))

['James', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '20', '_normtok_', 'Twenty', 'Dollars', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_', '_normtok_']

['James', 'said ', 'he ', 'will ', 'paint ', 'the ', 'house ', 'red ', 'for ', '$', '20', '(', 'Twenty', 'Dollars', ')', '. ', 'He ', 'is ', 'such ', 'a ', 'rule', '-', 'breaker', '.']

['James', ('said', 'VERB'), ('he', 'PRON'), ('will', 'AUX'), ('paint', 'VERB'), ('the', 'DET'), ('house', 'NOUN'), ('red', 'NOUN'), ('for', 'ADP'), ('$', 'SYM'), '20', ('(', 'PUNCT'), 'Twenty', 'Dollars', (')', 'PUNCT'), ('.', 'PUNCT'), ('he', 'PRON'), ('is', 'AUX'), ('such', 'DET'), ('a', 'DET'), ('rule', 'NOUN'), ('-', 'PUNCT'), ('breaker', 'NOUN'), ('.', 'PUNCT')]


In [16]:
# ent_convert is the counterpart for named-entity tokens; normal tokens keep default processing

# replace each entity token with its entity type
print(list(
    dt.DocParser.parse_tok(tok, ent_convert=lambda t: t.ent_type_)
    for tok in doc
))
print()
# return (lower-cased text, entity type, part of speech) triples for entity tokens
print(list(
    dt.DocParser.parse_tok(tok, ent_convert=lambda t: (t.lower_.strip(), t.ent_type_, t.pos_))
    for tok in doc
))

['ORG', 'said', 'he', 'will', 'paint', 'the', 'house', 'red', 'for', '$', 'MONEY', '(', 'MONEY', 'MONEY', ')', '.', 'he', 'is', 'such', 'a', 'rule', '-', 'breaker', '.']

[('james', 'ORG', 'PROPN'), 'said', 'he', 'will', 'paint', 'the', 'house', 'red', 'for', '$', ('20', 'MONEY', 'NUM'), '(', ('twenty', 'MONEY', 'NUM'), ('dollars', 'MONEY', 'NOUN'), ')', '.', 'he', 'is', 'such', 'a', 'rule', '-', 'breaker', '.']


In [17]:
# supplying the same converter for both kinds of token treats every token alike (here: part of speech)
print(list(
    dt.DocParser.parse_tok(tok, normal_convert=lambda t: t.pos_, ent_convert=lambda t: t.pos_)
    for tok in doc
))

['PROPN', 'VERB', 'PRON', 'AUX', 'VERB', 'DET', 'NOUN', 'NOUN', 'ADP', 'SYM', 'NUM', 'PUNCT', 'NUM', 'NOUN', 'PUNCT', 'PUNCT', 'PRON', 'AUX', 'DET', 'DET', 'NOUN', 'PUNCT', 'NOUN', 'PUNCT']


## Token Filtering
As part of the DocParser pipeline, it is often useful to filter out tokens that are not useful. There are several useful options in `.use_tok()` for this. Note that this method can be overridden for use in other DocParser methods.

In [18]:
# with default arguments only whitespace tokens are rejected, so every token here is kept
print(list(map(dt.DocParser.use_tok, doc)))
# whitespace filtering can be switched off with filter_whitespace=False
# (this example contains no whitespace tokens, so it cannot be demonstrated here)

[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]


In [19]:
# filter out punctuation tokens
print(list(filter(lambda tok: dt.DocParser.use_tok(tok, filter_punct=True), doc)))

[James, said, he, will, paint, the, house, red, for, $, 20, twenty, dollars, He, is, such, a, rule, breaker]


In [20]:
# filter out stopwords as defined by spacy.
# To modify the stopword list: https://stackoverflow.com/questions/41170726/add-remove-stop-words-with-spacy
print(list(filter(lambda tok: dt.DocParser.use_tok(tok, filter_stop=True), doc)))

[James, said, paint, house, red, $, 20, (, dollars, ), ., rule, -, breaker, .]


In [21]:
# filter out both stopwords and punctuation in a single pass
print(list(filter(
    lambda tok: dt.DocParser.use_tok(tok, filter_stop=True, filter_punct=True),
    doc,
)))

[James, said, paint, house, red, $, 20, dollars, rule, breaker]


In [22]:
# filter_digit removes only digit tokens ("20"), so "twenty" is kept ...
print(list(filter(lambda tok: dt.DocParser.use_tok(tok, filter_digit=True), doc)))
# ... whereas filter_num removes anything number-like, including "twenty"
print(list(filter(lambda tok: dt.DocParser.use_tok(tok, filter_num=True), doc)))

[James, said, he, will, paint, the, house, red, for, $, (, twenty, dollars, ), ., He, is, such, a, rule, -, breaker, .]
[James, said, he, will, paint, the, house, red, for, $, (, dollars, ), ., He, is, such, a, rule, -, breaker, .]


In [23]:
# filter out every token that belongs to a named entity
print(list(filter(lambda tok: dt.DocParser.use_tok(tok, filter_all_ents=True), doc)))

[said, he, will, paint, the, house, red, for, $, (, ), ., He, is, such, a, rule, -, breaker, .]


In [24]:
# filter out only tokens whose named-entity type is in the given list (here: MONEY).
# See this page for details on named entity types: https://spacy.io/api/annotation#named-entities
print(list(filter(lambda tok: dt.DocParser.use_tok(tok, filter_ent_types=['MONEY']), doc)))
# removes "20" and "twenty dollars"; "James" is an entity of another type and is kept

[James, said, he, will, paint, the, house, red, for, $, (, ), ., He, is, such, a, rule, -, breaker, .]


## Document Preprocessing
A common step taken prior to using the spacy parser is to remove objects you don't want to be tokenized. So far this function is quite simple but more features may be added later.

In [25]:
# example text containing a URL (note: this rebinds exstr, replacing the earlier example sentence)
exstr = 'To search the web, see http://google.com'
# remove_url=True strips the URL from the raw string before it would be parsed
dt.DocParser.preprocess(exstr, remove_url=True)

'To search the web, see '