
# Basic NLP operations

1. Lemmatization
1. Part-of-speech (POS) tagging
1. Dependency parsing
1. Rule-based pattern matching
1. Named entity recognition (NER)
1. Word embeddings

## Lemmatization

In [None]:
import spacy

# Load the small English pipeline; later cells call `nlp(...)` on their sample text.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Sample sentences containing several inflected forms of "go".
# Continuation lines keep their leading indentation as part of the string.
text = '''She goes for a walk every day.
           He was going to the cinema in the evening.
           They have already gone to work.
           I took my coffee to go and went home.'''

# Lemma to search for among the tokens of `text`.
word_to_find = 'go'

In [None]:
# Run the pipeline over the sample text.
doc = nlp(text)

# Print each token whose lemma equals `word_to_find`, next to that lemma.
for token in doc:
    if token.lemma_ != word_to_find:
        continue
    print(f'{token.text:<8} {token.lemma_}')

## Part-of-speech (POS) tagging

In [None]:
# Sentence used for the part-of-speech tagging demo.
text = 'The sky above the port was the color of television, tuned to a dead channel.'

In [None]:
doc = nlp(text)

# One output line per token: the token text left-aligned in 12 columns,
# followed by its part-of-speech tag (`pos_`).
print(*(f'{token.text:<12} {token.pos_}' for token in doc), sep='\n')

## Dependency parsing

In [None]:
# Short sentence for the dependency-parsing demo; `doc` holds the processed result.
text = 'Bob never took Spanish at school.'
doc = nlp(text)

In [None]:
from spacy import displacy

# Render the dependency visualisation ("dep" style) of `doc` inline in the notebook.
displacy.render(doc, jupyter=True, style="dep")

## Rule-based pattern matching

In [None]:
# Three sentences with negated verbs for the rule-based matcher demo.
# Continuation lines keep their leading indentation as part of the string.
text = '''Corrective actions to the previous audit findings are not implemented in a timely manner.
           Most are not fully documented.
           There are even some findings that the team never discussed.'''

doc = nlp(text)

In [None]:
from spacy.matcher import Matcher

# Token-level pattern: a token attached as a negation modifier (`neg`),
# immediately followed by a token tagged as a verb.
negated_verb_pattern = [{'DEP': 'neg'}, {'POS': 'VERB'}]

matcher = Matcher(nlp.vocab)
# Patterns are passed as a list of patterns (each pattern is itself a list of
# token specs). This signature is accepted by spaCy 2.2.2+ and is the only one
# supported by spaCy 3; the legacy `add(key, None, pattern)` form fails there.
matcher.add('NEGATED_VERB', [negated_verb_pattern])
matches = matcher(doc)

# Each match is a (match_id, start, end) triple; start/end are token offsets
# into `doc`, so slicing yields the matched span.
for rule_id, start_token, end_token in matches:
    print(doc[start_token:end_token])

In [None]:
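# Optional token spec to insert between the negation and the verb in
# `negated_verb_pattern`, so that "not fully documented" is matched as well:
#     {'DEP': 'advmod', 'OP': '?'},
# advmod - adverbial modifier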

In [None]:
# Token attributes - https://spacy.io/usage/linguistic-features#adding-patterns-attributes
#    DEP - syntactic dependency
#    OP  - quantifier (`?` means optional)

# Dependency labels - https://stackoverflow.com/a/40288324/95
#    neg    - negation modifier
#    advmod - adverbial modifier

## Named entity recognition (NER)

In [None]:
# Example sentence for the named-entity demo; `doc` holds the processed result.
text = 'Marek Grzenkowicz came to the Devoxx conference from Poland yesterday around 11 AM.'
doc = nlp(text)

In [None]:
from spacy import displacy

# Render the entity visualisation ("ent" style) of `doc` inline in the notebook.
displacy.render(doc, jupyter=True, style="ent")

In [None]:
# Default spaCy entity types - https://spacy.io/api/annotation#named-entities

## Word embeddings

1. Context-free language models
   - word2vec, GloVe, fastText
1. Contextual models
   - ELMo, BERT

In [None]:
# Replace the pipeline with the larger `md` model, which comes with word vectors.
nlp = spacy.load("en_core_web_md")

### Example

> He was sitting by the **river bank**.
>
> A **bank account** was opened for them in the morning.

1. Context-free language models - one fixed vector per word, learned from co-occurrence statistics of a text corpus
1. Contextual models - word representations that are a function of the entire sentence or paragraph in which the word appears

## Word embeddings

In [None]:
# Fetch the vector stored for the vocabulary entry "dog" and preview
# its first 42 components.
vector_dog = nlp.vocab['dog'].vector
print(vector_dog[:42])

In [None]:
# Summarise the vector: number of components and its smallest / largest value.
print(
    f'length  = {len(vector_dog)}',
    f'min_val = {min(vector_dog)}',
    f'max_val = {max(vector_dog)}',
    sep='\n',
)

## Comparing word vectors

**Cosine similarity**
![cosine similarity in 2D](images/vectors_cos_sim_600.png)

**Note:** For illustrative purposes only; the values above are random.

## Comparing word vectors

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Words whose vectors are compared against the vector of "dog".
words = ['dog', 'husky', 'cat', 'horse', 'tree', 'stone', 'bitcoin']

for word in words:
    vector_word = nlp.vocab[word].vector

    # For one pair of vectors cosine_similarity returns a 1x1 matrix;
    # index [0][0] unwraps the single similarity value.
    cos_sim = cosine_similarity([vector_dog], [vector_word])[0][0]
    print(f'cos(dog, {word + ")":<8} = {cos_sim:>6.3f}')

## Arithmetic of word vectors

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_vectors(word_vector, vocabulary, skip_words=(), num=3):
    """
    Finds vectors close to `word_vector` in terms of cosine similarity, inside given `vocabulary`.

    Args:
        word_vector: Query vector to compare against.
        vocabulary: Iterable of lexeme-like objects exposing `prob`, `is_lower`,
            `vector` and `orth_` (for example `nlp.vocab`).
        skip_words: Container of word strings that must not appear in the result.
        num: Maximum number of words to return.

    Returns:
        A list of at most `num` word strings (`orth_`), most similar first.
        Empty when no vocabulary entry passes the filters.
    """
    candidates = [
        w
        for w in vocabulary
        if w.prob >= -15                   # keep only entries at or above the `prob` cutoff
        and w.is_lower
        and any(w.vector != word_vector)   # skip entries whose vector equals the query
        and w.orth_ not in skip_words
    ]
    if not candidates:
        # Nothing to rank; also avoids handing an empty matrix to cosine_similarity.
        return []

    # Score all candidates with one vectorised call instead of one call per
    # lexeme inside the sort key. The result has shape (len(candidates), 1),
    # so column 0 holds one scalar score per candidate.
    scores = cosine_similarity([w.vector for w in candidates], [word_vector])[:, 0]

    # sorted() is stable (also with reverse=True), so equally scored words keep
    # their vocabulary order.
    ranked = sorted(zip(scores, candidates), key=lambda pair: pair[0], reverse=True)
    return [w.orth_ for _, w in ranked[:num]]

In [None]:
# Up to six vocabulary words whose vectors are closest to the vector of "dog".
find_similar_vectors(word_vector=vector_dog, vocabulary=nlp.vocab, num=6)

## Arithmetic of word vectors
\begin{equation}
\large{\mathit{  V_{parent} + V_{woman} = x  }}
\end{equation}

In [None]:
# x = V(parent) + V(woman); the two input words are excluded from the results.
x = (
    nlp.vocab['parent'].vector
    + nlp.vocab['woman'].vector
)
find_similar_vectors(x, nlp.vocab, skip_words=['parent', 'woman'])

\begin{equation}
\large{\mathit{  V_{seawater} - V_{salt} = x  }}
\end{equation}

In [None]:
# x = V(seawater) - V(salt); the two input words are excluded from the results.
x = (
    nlp.vocab['seawater'].vector
    - nlp.vocab['salt'].vector
)
find_similar_vectors(x, nlp.vocab, skip_words=['seawater', 'salt'])

## Arithmetic of word vectors
\begin{equation}
\large{\mathit{  V_{Germany} - V_{Berlin} = x - V_{Warsaw}  }}
\end{equation}

In [None]:
# Solve the analogy for x: x = V(germany) - V(berlin) + V(warsaw).
# Lower-case spellings are used because find_similar_vectors only returns
# lower-case entries; the three input words are excluded from the results.
x = (
    nlp.vocab['germany'].vector
    - nlp.vocab['berlin'].vector
    + nlp.vocab['warsaw'].vector
)
find_similar_vectors(x, nlp.vocab, skip_words=['germany', 'berlin', 'warsaw'])