In [2]:
import spacy

# Word vectors occupy a lot of space, so the small en_core_web_sm model does not include them.
# To get word vectors you need to install the medium or large English model;
# this notebook uses the large one.
# Make sure you have run "python -m spacy download en_core_web_lg" beforehand to install it.
nlp = spacy.load("en_core_web_lg")

In [4]:
# Check which tokens come with a word vector and which are out-of-vocabulary (OOV);
# the last string is gibberish and ends up with no vector.
doc = nlp("dog cat banana kem dvsohnvown")

for token in doc:
    print(token.text, "Vector:", token.has_vector, "OOV:", token.is_oov)

dog Vector: True OOV: False
cat Vector: True OOV: False
banana Vector: True OOV: False
kem Vector: True OOV: False
dvsohnvown Vector: False OOV: True


In [5]:
# Dimensionality of a single token's word vector.
doc[0].vector.shape

(300,)

In [6]:
# NOTE: nlp(...) returns a Doc, so despite its name `base_token` is a one-word Doc, not a Token.
base_token = nlp("bread")
base_token.vector.shape

(300,)

In [7]:
doc = nlp("bread sandwich burger car tiger human wheat")

# Score every word against `base_token` ("bread"); values close to 1 mean very
# similar vectors (bread vs. bread itself prints ~1.0).
for token in doc:
    print(f"{token.text} <-> {base_token.text}:", token.similarity(base_token))

bread <-> bread: 0.9999999766167111
sandwich <-> bread: 0.6874560014053445
burger <-> bread: 0.5440373883702087
car <-> bread: 0.1644114584391833
tiger <-> bread: 0.1449235625942581
human <-> bread: 0.21103660928832707
wheat <-> bread: 0.6572456428272563


In [11]:
def print_similarity(base_word, words_to_compare):
    """Print how similar each token of ``words_to_compare`` is to ``base_word``.

    Both strings are processed with the module-level ``nlp`` pipeline. For every
    token of ``words_to_compare`` one line is printed in the form
    ``<token text> <-> <base text>:  <similarity score>``.

    Args:
        base_word: Text that every token is compared against.
        words_to_compare: Text whose tokens are compared one by one.

    Returns:
        None. The scores are only printed.
    """
    base_token = nlp(base_word)
    for token in nlp(words_to_compare):
        score = token.similarity(base_token)
        print(f"{token.text} <-> {base_token.text}: ", score)

In [12]:
# Compare "iphone" against a mix of tech-related words and unrelated animal words.
print_similarity("iphone", "apple samsung iphone dog kitten")

apple <-> iphone:  0.6339781147910419
samsung <-> iphone:  0.6678678014329177
iphone <-> iphone:  1.0000000285783557
dog <-> iphone:  0.17431037640553934
kitten <-> iphone:  0.14685812907484028


In [13]:
# Fetch the vocabulary vectors for the four words of the king/queen analogy.
king, man, woman, queen = (
    nlp.vocab[word].vector for word in ("king", "man", "woman", "queen")
)

# Vector arithmetic: king - man + woman (compared against `queen` afterwards).
result = king - man + woman

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity works on 2D inputs (one row per sample), hence each vector
# is wrapped in a list; the result is a 1x1 matrix holding the single score.
cosine_similarity([result], [queen])

array([[0.78808445]], dtype=float32)