
# Basic NLP operations

1. Lemmatization
1. Part-of-speech (POS) tagging
1. Dependency parsing
1. Rule-based pattern matching
1. Named entity recognition (NER)
1. Word embeddings

## Tools
- Python 3
- spaCy

In [None]:
# spaCy
import spacy
# Load the 'en_core_web_sm' pipeline once; the cells below reuse it as `nlp`.
# NOTE: the model package has to be installed separately,
# e.g. `python -m spacy download en_core_web_sm`.
nlp = spacy.load('en_core_web_sm')

In [None]:
#
# Lemmatization
#

# Lemma to look for in the sample text below.
word_to_find = 'go'

# Four sample sentences; the line breaks and the indentation of the
# continuation lines are part of the string value.
text = '''She goes for a walk every day.
           He was going to the cinema in the evening.
           They have already gone to work.
           I took my coffee to go and went home.'''

In [None]:
doc = nlp(text)

# Print every token whose lemma equals `word_to_find`:
# the token text left-aligned in 8 columns, followed by the lemma.
for tok in doc:
    if tok.lemma_ == word_to_find:
        print(f'{tok.text:<8} {tok.lemma_}')

In [None]:
#
# Part-of-speech (POS) tagging
#

# Sample sentence for the tagging cell that follows.
text = 'The sky above the port was the color of television, tuned to a dead channel.'

In [None]:
doc = nlp(text)

# One line per token: the token text padded to 12 columns, then its `pos_` tag.
for tok in doc:
    print(tok.text.ljust(12), tok.pos_)

In [None]:
#
# Dependency parsing
#

# Sample sentence, parsed here so the next cell can visualize `doc`.
text = 'Bob never took Spanish at school.'
doc = nlp(text)

In [None]:
from spacy import displacy

# Draw the dependency parse of `doc` (style="dep"); jupyter=True requests
# notebook rendering explicitly.
displacy.render(doc, style="dep", jupyter=True)

In [None]:
#
# Rule-based pattern matching
#

# Two-sentence sample; the line break and the indentation of the second
# line are part of the string value.
text = '''Corrective actions to previous audit findings are not implemented in a timely manner or
           are not always documented. There are even some finding that were never discussed within the team.'''
doc = nlp(text)

In [None]:
from spacy.matcher import Matcher

# Two adjacent tokens: one whose dependency label is 'neg' (negation
# modifier), immediately followed by one tagged as a verb.
negated_verb_pattern = [{'DEP': 'neg'}, {'POS': 'VERB'}]

matcher = Matcher(nlp.vocab)
# Patterns are passed as a list of patterns. This is the Matcher.add()
# signature in spaCy v3 (also accepted since v2.2.2); the older positional
# form add(key, on_match, *patterns) is rejected by spaCy v3.
matcher.add('NEGATED_VERB', [negated_verb_pattern])
matches = matcher(doc)

# Each match is a (match_id, start, end) triple; doc[start:end] is the
# matched span of tokens.
for rule_id, start_token, end_token in matches:
    print(doc[start_token:end_token])

In [None]:
# Token attributes - https://spacy.io/usage/linguistic-features#adding-patterns-attributes
#    DEP - syntactic dependency
#    OP  - quantifier (`?` means optional)

# Dependency labels - https://stackoverflow.com/a/40288324/95
#    neg    - negation modifier
#    advmod - adverbial modifier

# {'DEP': 'advmod', 'OP': '?'}, 

In [None]:
#
# Named entity recognition (NER)
#

# Sample sentence, parsed here so the next cell can visualize its entities.
text = 'Marek Grzenkowicz came to the Devoxx conference from Poland yesterday around 11 AM.'
doc = nlp(text)

In [None]:
from spacy import displacy

# Highlight the named entities of `doc` (style="ent"); jupyter=True requests
# notebook rendering explicitly.
displacy.render(doc, style="ent", jupyter=True)

# Supported entity types: https://spacy.io/api/annotation#named-entities

In [None]:
#
# Word embeddings
#