# SpaCy Embeddings Study
---
Glenn Abastillas

In [4]:
import spacy as sp
import numpy as np
import altair as alt
from nltk.corpus import brown

### Load Corpus

In [10]:
raw_text = '. '.join([' '.join(sent) for sent in brown.sents('cp01')])

In [11]:
nlp = sp.load('en_core_web_md')

text = nlp(raw_text)

### Inspect Corpus

In [12]:
for i, token in enumerate(text):
    print(f'Token: {str(token):>10}\t\tLemma: {token.lemma_:>10}')
    if i > 11:
        break

Token:       They		Lemma:     -PRON-
Token:    neither		Lemma:    neither
Token:      liked		Lemma:       like
Token:        nor		Lemma:        nor
Token:   disliked		Lemma:    dislike
Token:        the		Lemma:        the
Token:        Old		Lemma:        Old
Token:        Man		Lemma:        Man
Token:         ..		Lemma:         ..
Token:         To		Lemma:         to
Token:       them		Lemma:     -PRON-
Token:         he		Lemma:     -PRON-
Token:      could		Lemma:      could


Define similarity function

In [18]:
def similarity(a, b):
    return (a @ b) / (np.sqrt(a @ a) * np.sqrt(b @ b))

## Demonstrate Algebra with Vectors

In [13]:
vocab = ['man', 'woman', 'he', 'she', 'him', 'her', 'brother', 'sister']
man, woman, he, she, him, her, brother, sister = [text.vocab[_] for _ in vocab]

Syntactic relationships

In [20]:
he.similarity(him)

0.82752

In [21]:
him_ = he.vector + (her.vector - she.vector)

similarity(him_, him.vector)

0.88178796

Semantic relationships

In [28]:
sister.similarity(brother)

0.7468089

In [26]:
sibling_ = brother.vector - he.vector + she.vector

similarity(sibling_, sister.vector)

0.87216073