
# Basic NLP operations

1. Lemmatization
1. Part-of-speech (POS) tagging
1. Dependency parsing
1. Rule-based pattern matching
1. Named entity recognition (NER)
1. Word embeddings

## Tools
- Python 3
- spaCy

## Lemmatization

In [None]:
import spacy

# Load the small English pipeline. `nlp` is reused by the examples that follow
# until the word-embeddings section replaces it with a larger model.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Sample sentences using several inflected forms of the verb "to go".
# The leading spaces on the continuation lines are part of the string.
text = """She goes for a walk every day.
           He was going to the cinema in the evening.
           They have already gone to work.
           I took my coffee to go and went home."""

# Lemma to search for in the sample text.
word_to_find = "go"

In [None]:
# Run the pipeline over the sample text.
doc = nlp(text)

# Print every token whose lemma equals `word_to_find`, with the surface form
# left-aligned in an 8-character column followed by the lemma.
for token in doc:
    if token.lemma_ == word_to_find:
        print(f'{token.text:<8} {token.lemma_}')

## Part-of-speech (POS) tagging

In [None]:
# Sentence used for the part-of-speech example.
text = "The sky above the port was the color of television, tuned to a dead channel."

In [None]:
# Run the pipeline over the sentence.
doc = nlp(text)

# One line per token: the text left-aligned in a 12-character column,
# followed by the token's `pos_` tag.
for token in doc:
    print(f'{token.text:<12} {token.pos_}')

## Dependency parsing

In [None]:
# Short sentence whose dependency tree is drawn in the next cell.
text = "Bob never took Spanish at school."
doc = nlp(text)

In [None]:
from spacy import displacy

# Draw the dependency parse of `doc` inline in the notebook.
displacy.render(doc, style="dep", jupyter=True)

## Rule-based pattern matching

In [None]:
# Sample audit-report text containing several negated verb phrases
# ("not implemented", "not always documented", "never discussed").
# The leading spaces on the second line are part of the string.
text = u'''Corrective actions to previous audit findings are not implemented in a timely manner or
           are not always documented. There are even some finding that were never discussed within the team.'''
doc = nlp(text)

In [None]:
from spacy.matcher import Matcher

# Two consecutive tokens: a negation modifier directly followed by a verb.
negated_verb_pattern = [{'DEP': 'neg'}, {'POS': 'VERB'}]

matcher = Matcher(nlp.vocab)
# Pass the patterns as a list in the second argument. This signature is
# accepted by spaCy 2.2.2+ and is the only one supported by spaCy 3.x; the
# legacy `add(key, on_match, *patterns)` form fails on spaCy 3.
matcher.add('NEGATED_VERB', [negated_verb_pattern])
matches = matcher(doc)

# Each match is a (match_id, start, end) triple; print the matched token span.
for rule_id, start_token, end_token in matches:
    print(doc[start_token:end_token])

In [None]:
# Token attributes - https://spacy.io/usage/linguistic-features#adding-patterns-attributes
#    DEP - syntactic dependency
#    OP  - quantifier (`?` means optional)

# Dependency tokens - https://stackoverflow.com/a/40288324/95
#    neg    - negation modifier
#    advmod - adverbial modifier

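# Optional extra pattern token: allows an adverbial modifier in the match
# (e.g. between the negation and the verb, as in "not always documented"):
# {'DEP': 'advmod', 'OP': '?'},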

## Named entity recognition (NER)

In [None]:
# Sentence with several kinds of entities for the NER example.
text = "Marek Grzenkowicz came to the Devoxx conference from Poland yesterday around 11 AM."
doc = nlp(text)

In [None]:
from spacy import displacy

# Highlight the named entities of `doc` inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

# Supported entity types: https://spacy.io/api/annotation#named-entities

## Word embeddings

In [None]:
# Replaces the pipeline loaded earlier; every cell below uses this `nlp`.
nlp = spacy.load('en_core_web_md')  # larger model, with word vectors

In [None]:
# Look up the vocabulary entry for "dog"; it is reused by the cells below.
word_dog = nlp.vocab[u'dog']

# Show only the first 42 components of its vector.
print(word_dog.vector[:42])

In [None]:
# Number of components in the vector and the smallest and largest values.
print(f'length = {len(word_dog.vector)}')
print(f'min    = {min(word_dog.vector)}')
print(f'max    = {max(word_dog.vector)}')

## Cosine similarity

TODO: add image; use Wolfram to draw it

In [None]:
# TODO: use `cosine_similarity` instead

# Compare "dog" with two other vocabulary entries using the `similarity`
# method of the entry itself.
print(word_dog.similarity(nlp.vocab[u'cat']))
print(word_dog.similarity(nlp.vocab[u'hammer']))

## Arithmetic of word vectors

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_vectors(word_vector, vocabulary, skip_words=(), num=3):
    """Return the vocabulary words whose vectors are closest to *word_vector*.

    Closeness is measured by cosine similarity. Only entries that pass all of
    the following filters are considered:

    * ``prob >= -15``,
    * ``is_lower`` is true,
    * the entry's vector is not identical to *word_vector*,
    * the entry's text (``orth_``) is not in *skip_words*.

    Args:
        word_vector: 1-D array-like query vector.
        vocabulary: Iterable of entries exposing ``prob``, ``is_lower``,
            ``vector`` and ``orth_`` (e.g. ``nlp.vocab``).
        skip_words: Words to leave out of the result.
        num: Maximum number of words to return.

    Returns:
        A list of at most *num* words, most similar first. Entries with equal
        similarity keep their vocabulary order.
    """
    # Local import keeps the helper self-contained within its notebook cell.
    import numpy as np

    candidates = [
        w
        for w in vocabulary
        if w.prob >= -15  # NOTE(review): presumably a log-probability cut-off for rare words
        and w.is_lower
        and w.orth_ not in skip_words
        and not np.array_equal(w.vector, word_vector)
    ]
    if not candidates:
        return []

    # Compute every cosine similarity in one vectorised pass instead of one
    # library call (and one 1x1 array used as a sort key) per vocabulary entry.
    query = np.asarray(word_vector, dtype=float)
    matrix = np.stack([np.asarray(w.vector, dtype=float) for w in candidates])
    dots = matrix @ query
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    # A zero-length vector has no direction; score it 0 rather than dividing by 0.
    similarities = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    # A stable sort on the negated scores gives descending order while keeping
    # ties in vocabulary order, like sorted(..., reverse=True).
    ranking = np.argsort(-similarities, kind='stable')[:num]
    return [candidates[i].orth_ for i in ranking]

In [None]:
# Ten nearest neighbours of "dog"; "dog" itself is left out because the helper
# drops entries whose vector is identical to the query.
print(find_similar_vectors(word_dog.vector, nlp.vocab, num=10))

## Arithmetic of word vectors
$$ \mathit{parent} + \mathit{woman} = \mathit{x} $$

In [None]:
# Add the two word vectors, then look for the words nearest to the sum.
# The operands are passed as skip_words so they cannot appear in the result.
x = nlp.vocab[u'parent'].vector + nlp.vocab[u'woman'].vector
print(find_similar_vectors(x, nlp.vocab, skip_words=[u'parent', u'woman']))

$$ \mathit{seawater} - \mathit{salt} = \mathit{x} $$

In [None]:
# Subtract one word vector from the other, then look for the words nearest to
# the difference. The operands are excluded from the result via skip_words.
x = nlp.vocab[u'seawater'].vector - nlp.vocab[u'salt'].vector
print(find_similar_vectors(x, nlp.vocab, skip_words=[u'seawater', u'salt']))

## Arithmetic of word vectors
$$ \mathit{Germany} - \mathit{Berlin} = \mathit{x} - \mathit{Warsaw} $$

In [None]:
# Solve Germany - Berlin = x - Warsaw for x: x = Germany - Berlin + Warsaw.
# NOTE(review): the names are looked up in lower case, presumably because
# find_similar_vectors only considers lower-case entries; confirm.
x = nlp.vocab[u'germany'].vector - nlp.vocab[u'berlin'].vector + nlp.vocab[u'warsaw'].vector
print(find_similar_vectors(x, nlp.vocab, skip_words=[u'germany', u'berlin', u'warsaw']))