### Sample program to use word2vec  

#### Import libraries  

In [1]:
from gensim.models import word2vec

#### Parameters  

In [2]:
model_file = 'word2vec_text8.model'

#### Load model file 

In [3]:
model = word2vec.Word2Vec.load(model_file)

#### Check model  

In [4]:
print(model.wv.vector_size)  # dimension of embedding
print(len(model.wv.vocab.keys()))  # number of words
print(list(model.wv.vocab.keys())[:10])  # show first 10 words

300
25097
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


#### Use model  

In [5]:
print(model['woman'])  # embedded vector (1-d ndarray)

[-3.49999294e-02  6.24854304e-02  2.30930243e-02  7.00774789e-02
  2.05733832e-02 -7.31931050e-06 -2.48185708e-03 -1.90785043e-02
 -6.83930367e-02  5.08368500e-02  2.14990787e-02 -1.16221063e-01
 -3.60147692e-02  6.24374151e-02 -3.37759070e-02  8.34584087e-02
  9.50198174e-02  5.31408824e-02 -3.37081142e-02 -2.23763324e-02
 -2.43504550e-02  1.00581519e-01  7.86220096e-03  1.43915974e-03
 -5.40241003e-02 -5.46032749e-02  6.65581599e-02 -8.07848796e-02
 -1.54060242e-03 -1.02234229e-01  8.31187218e-02 -1.59485396e-02
 -1.61619969e-02 -5.05060144e-02  7.16775730e-02  4.36517075e-02
 -8.80400017e-02  7.97648821e-03 -8.84747133e-03 -6.41736249e-03
  4.70976233e-02 -6.87187389e-02 -7.19427317e-02  5.32208383e-02
  3.48791145e-02 -9.67119820e-03  2.02849992e-02  4.42537181e-02
  3.36094461e-02  3.68396752e-02  7.01953620e-02  5.04930839e-02
 -3.92268822e-02 -1.43000633e-01 -2.54257955e-02 -5.10448962e-02
  1.52412569e-02  2.68369238e-03  6.15779981e-02  1.09479688e-01
 -1.97701715e-02  3.63650

  print(model['woman'])  # embedded vector (1-d ndarray)


In [6]:
print(model['man'])  # embedded vector (1-d ndarray)

[-6.04694709e-02  7.92911798e-02  8.74510929e-02  8.94903094e-02
  2.57714819e-02  4.77714092e-03 -3.27486098e-02 -1.76389907e-02
 -1.17973723e-02  7.56385475e-02  5.85177317e-02 -1.12875395e-01
 -2.70414390e-02  6.60581589e-02 -8.60184059e-02  1.54842203e-02
 -5.54477889e-03  3.54370824e-03  5.83944609e-03  3.62386554e-02
 -3.52189224e-03  6.87501058e-02  5.80834523e-02 -2.44240668e-02
 -4.43813540e-02 -5.35375960e-02  5.39615415e-02 -3.48533578e-02
  2.78553721e-02 -7.91059956e-02 -7.71595864e-03  1.05949575e-02
  2.83694919e-02 -4.61927205e-02  8.51447210e-02  4.27688845e-02
 -4.90028262e-02 -5.63603565e-02 -6.14818418e-03  6.62312731e-02
  2.18672045e-02 -5.44028506e-02 -1.29982084e-01  7.37903044e-02
  5.94367720e-02 -5.30885831e-02 -2.73186136e-02 -6.10895492e-02
  4.38734554e-02  1.21160354e-02 -4.26043011e-02 -5.84747531e-02
 -6.38770387e-02 -7.75160491e-02  5.84527440e-02  1.42753532e-04
 -2.46809181e-02  3.46447751e-02  1.25399800e-02  8.87962952e-02
 -1.42371524e-02  1.00334

  print(model['man'])  # embedded vector (1-d ndarray)


In [7]:
print("Words similar to 'woman'")
display(model.wv.most_similar("woman"))

Words similar to 'woman'


[('child', 0.7187955379486084),
 ('girl', 0.6928633451461792),
 ('man', 0.6841185092926025),
 ('lover', 0.6255887150764465),
 ('prostitute', 0.6253218650817871),
 ('person', 0.6128579378128052),
 ('herself', 0.6080033779144287),
 ('lady', 0.6049087643623352),
 ('husband', 0.6021214127540588),
 ('stranger', 0.5987484455108643)]

In [8]:
print("Similarity between 'girl' and 'woman'")
print(model.wv.similarity("girl", "woman"))
print("Similarity between 'girl' and 'man'")
print(model.wv.similarity("girl", "man"))
print("Similarity between 'girl' and 'car'")
print(model.wv.similarity("girl", "car"))
print("Similarity between 'bus' and 'car'")
print(model.wv.similarity("bus", "car"))

Similarity between 'girl' and 'woman'
0.6928633
Similarity between 'girl' and 'man'
0.5887219
Similarity between 'girl' and 'car'
0.2824464
Similarity between 'bus' and 'car'
0.44663033


In [9]:
print("'woman' - 'man' + 'king' (top 10)")
display(model.wv.most_similar(positive=["woman", "king"],
                              negative=["man"], topn=10))

'woman' - 'man' + 'king' (top 10)


[('queen', 0.6150717735290527),
 ('isabella', 0.5218780040740967),
 ('princess', 0.5207738280296326),
 ('prince', 0.5201140642166138),
 ('empress', 0.5156155824661255),
 ('elizabeth', 0.5133547782897949),
 ('matilda', 0.5132491588592529),
 ('throne', 0.5130334496498108),
 ('husband', 0.5107308626174927),
 ('daughter', 0.5060718059539795)]

In [18]:
print("'pig' - 'cow' + 'pork' (top 10)")
display(model.wv.most_similar(positive=["pig", "pork"],
                              negative=["cow"], topn=10))

'pig' - 'cow' + 'pork' (top 10)


[('potatoes', 0.8328580260276794),
 ('butter', 0.8022991418838501),
 ('sauce', 0.7979693412780762),
 ('beef', 0.7979012727737427),
 ('soy', 0.7911310195922852),
 ('spices', 0.7899657487869263),
 ('tomato', 0.7853360176086426),
 ('barley', 0.7788819074630737),
 ('desserts', 0.7782789468765259),
 ('noodles', 0.7753096222877502)]