In [2]:
from gensim.models import Word2Vec
import numpy as np
import pandas as pd

In [3]:
# define training data
sentences = [['bears', 'live', 'in', 'the', 'zoo'],
             ['bees', 'produce', 'honey', 'which', 'is','eaten','by','bears'],
             ['bears', 'climb', 'in','trees'],
             ['bees', 'make', 'a','nest','in','the','trees'],
             ['honey', 'is', 'in', 'the','tree']]

In [4]:
# train model
model = Word2Vec(sentences, min_count=1,size=5)

In [5]:
# summarize the loaded model
print(model)

Word2Vec(vocab=18, size=5, alpha=0.025)


In [6]:
# summarize vocabulary
words = list(model.wv.vocab)
print(words)

['bears', 'live', 'in', 'the', 'zoo', 'bees', 'produce', 'honey', 'which', 'is', 'eaten', 'by', 'climb', 'trees', 'make', 'a', 'nest', 'tree']


In [11]:
#Get the word vector for each item
model.wv.__getitem__('bears')

array([-0.0717726 , -0.0459957 , -0.07283463,  0.06320243,  0.08454664],
      dtype=float32)

In [7]:
# access vector for one word
basic_word = 'bears'
df = pd.DataFrame()
for w in words:
    basic_wv = model.wv.__getitem__(basic_word)
    if w != basic_word:
        other_word = model.wv.__getitem__(w)
        similarity = np.dot(basic_wv,other_word.T)/(np.linalg.norm(basic_wv)*np.linalg.norm(other_word))
        d = {}
        d['word']=w
        d['similarity']=similarity
        df = pd.concat([df,pd.DataFrame.from_dict(d,orient='index').T],axis=0)
df.sort_values(by='similarity',ascending=False).head()

Unnamed: 0,word,similarity
0,nest,0.796168
0,is,0.655385
0,live,0.526429
0,the,0.467772
0,climb,0.465385


In [8]:
# save model
model.save('model.bin')

In [9]:
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

Word2Vec(vocab=18, size=5, alpha=0.025)


In [10]:
df.head()

Unnamed: 0,word,similarity
0,live,0.526429
0,in,-0.938223
0,the,0.467772
0,zoo,-0.780614
0,bees,-0.245093
