In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_lg')

In [4]:
# So what does a word vector look like? Since spaCy employs 300 dimensions, word vectors are stored as 300-item arrays

nlp(u'lion').vector.shape

(300,)

In [5]:
nlp(u'The quick brown fox jumped').vector.shape

(300,)

In [6]:
tokens = nlp(u'lion cat pet')

In [7]:
# Check the cosine similarity between words 1 and 2

for token1 in tokens:
    for token2 in tokens:
        print(token1.text,token2.text, token1.similarity(token2))

lion lion 1.0
lion cat 0.52654374
lion pet 0.39923766
cat lion 0.52654374
cat cat 1.0
cat pet 0.7505456
pet lion 0.39923766
pet cat 0.7505456
pet pet 1.0


In [32]:
len(nlp.vocab.vectors)

684830

In [9]:
tokens = nlp(u'dog cat nargle')

In [10]:
for token in tokens:
    print(token.text, token.has_vector,token.vector_norm,token.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
nargle False 0.0 True


In [11]:
from scipy import spatial

cosine_similarity = lambda vec1,vec2: 1 - spatial.distance.cosine(vec1,vec2)

In [12]:
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

In [13]:
# king - man + woman => new vector similar queen, princess, highness

new_vector = king-man+woman

In [29]:
computed_similarities = []

In [31]:
computed_similarities = []

for word in nlp.vocab:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(new_vector,word.vector)
                computed_similarities.append((word,similarity))

In [33]:
computed_similarities = sorted(computed_similarities,key=lambda item:-item[1])

In [34]:
print([t[0].text for t in computed_similarities[:10]])

['king', 'woman', 'she', 'lion', 'who', 'fox', 'brown', 'when', 'dare', 'cat']


In [36]:
lamp = nlp.vocab['lamp'].vector
desk = nlp.vocab['desk'].vector
door = nlp.vocab['door'].vector

In [37]:
new_vector = lamp-desk+door

In [38]:
computed_similarities = []

for word in nlp.vocab:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(new_vector,word.vector)
                computed_similarities.append((word,similarity))

In [39]:
computed_similarities = sorted(computed_similarities,key=lambda item:-item[1])

In [40]:
print([t[0].text for t in computed_similarities[:10]])

['lamp', 'door', 'when', 'man', 'cat', 'it', 'where', 'that', 'we', 'is']
