In [1]:
"""
Need to download the larger Spacy models in order to use its vector embeddings
python3 -m spacy download en_core_web_lg
Word2vec is a two-layer neural net for text processing
its input is a corpus and its output is a set of feature vectors for words in that corpus
-purpose-built to group the vectors of similar words together in vectorspace
-mathematically detects cosine similarities between words represented by a 300-dimension vector
-once a w2v model is trained, it can make accurate guesses about a word's in-context meaning
1. use context to predict a target word (CBOW continuous bag of words)
2. use a word to predict a target context (skip-gram)
use vector arithmetic such as `new_v = king-man+woman` or `new_v closest to vector for queen`
"""
import spacy
nlp = spacy.load('en_core_web_lg')

In [7]:
#Same number of vector dimensions for documents and words =300
print (nlp(u'lion').vector.shape)
print(nlp.vocab.vectors.shape) #hundreds of thousands of words, each represented by 300D vector

(300,)
(514157, 300)


In [33]:
#related words' score is calculated as the square root of the sum of squared vectors
print("First group:")
tokens = nlp(u'lion cat pet')
for token1 in tokens:
    for token2 in tokens:
        print(token1.text,token2.text,token1.similarity(token2))


print("\n Second group:")
tokens2 = nlp(u"like love hate blarg")
for token1 in tokens2:
    print(token1.text,
        token1.has_vector, #is it in the vocabulary?
        token1.vector_norm, #What's the normalized vector
        token1.is_oov), #out of vocabulary
    for token2 in tokens2:
        print(token2.text,
              token1.similarity(token2))

        

First group:
lion lion 1.0
lion cat 0.3854507803916931
lion pet 0.20031584799289703
cat lion 0.3854507803916931
cat cat 1.0
cat pet 0.732966423034668
pet lion 0.20031584799289703
pet cat 0.732966423034668
pet pet 1.0

 Second group:
like True 50.609623 False
like 1.0
love 0.5212638974189758
hate 0.5065141320228577
blarg 0.0
love True 58.563564 False
like 0.5212638974189758
love 1.0
hate 0.5708349943161011
blarg 0.0
hate True 46.569798 False
like 0.5065141320228577
love 0.5708349943161011
hate 1.0
blarg 0.0
blarg False 0.0 True
like 0.0
love 0.0
hate 0.0
blarg 1.0




In [46]:
from scipy import spatial
cosine_similarity = lambda vec1,vec2: 1-spatial.distance.cosine(vec1,vec2)
king = nlp.vocab['government'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
new_v = king
computed_similarities = []
for word in nlp.vocab:    #for 500K+ words in vocab
    w_v = word.vector
    if word.has_vector and word.is_lower and word.is_alpha:
        similarity = cosine_similarity(new_v,w_v)
        computed_similarities.append((word,similarity))
computed_similarities = sorted(computed_similarities,key=lambda item:-item[1])
#without the minus in the line above, they would be sorted in descending order
#i.e. starting with the LEAST similar words. Here we sort for the MOST similar.
print([t[0].text for t in computed_similarities[:10]])

['government', 'that', 'and', 'those', 'should', 'king', 'they', 'would', 'cause', 'these']
