# Word2vec model using Gensim

1. pip install gensim tsne bokeh
2. Download a sample text corpus from http://wortschatz.uni-leipzig.de/en/download/
3. Text preprocessing iterator
4. Train the word2vec model
5. Sanity check
6. t-SNE
7. Plot with bokeh

In [1]:
import logging
import os.path
import sys
from gensim.models import Word2Vec

In [2]:
class sentences_iter(object):
    """Re-iterable stream of whitespace-tokenised lines from a directory.

    Every pass walks each entry returned by ``os.listdir(dirname)``, reads it
    as Latin-1 text and yields one list of tokens per line. ``__iter__``
    starts from scratch on each call, so the same object can be consumed
    more than once.
    """

    def __init__(self, dirname):
        """Store the directory to read from; no I/O happens here."""
        self.dirname = dirname

    def __iter__(self):
        """Yield ``line.split()`` for every line of every file in the directory."""
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # The context manager closes each file as soon as it is exhausted
            # (or when the generator is closed early) instead of leaving the
            # handle open until garbage collection.
            with open(path, encoding="latin-1") as handle:
                for line in handle:
                    yield line.split()

In [3]:
# Stream the corpus from disk and train the word2vec model on it.
sentences = sentences_iter('eng_news_2005_100K/')
model = Word2Vec(
    sentences,
    size=100,      # dimensionality of the word vectors
    window=20,     # context window passed to gensim
    min_count=5,   # minimum word frequency passed to gensim
    hs=1,          # hierarchical-softmax switch passed to gensim
)

In [4]:
# Sanity check: nearest neighbours of "world". Query through the keyed
# vectors (model.wv); calling most_similar on the model itself is deprecated.
model.wv.most_similar('world')

  """Entry point for launching an IPython kernel.


[('world,', 0.6282347440719604),
 ('Europe', 0.6268082857131958),
 ('nation', 0.612112283706665),
 ('strength', 0.5519618391990662),
 ('history', 0.5486292839050293),
 ('II', 0.5298193097114563),
 ('clearly', 0.5283876657485962),
 ('Europe,', 0.5249205827713013),
 ('history,', 0.5237999558448792),
 ('country', 0.5188781023025513)]

In [5]:
# Shape of the embedding matrix; `vectors` replaces the deprecated `syn0` alias.
model.wv.vectors.shape

  """Entry point for launching an IPython kernel.


(27956, 100)

In [6]:
import numpy as np
import datetime
from tsne import bh_sne
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool, BoxSelectTool

In [7]:
# Copy the embedding matrix as float64 for the t-SNE step
# (`vectors` replaces the deprecated `syn0` alias).
vectors = np.array(model.wv.vectors, dtype=np.float64)

  """Entry point for launching an IPython kernel.


In [9]:
# Run t-SNE (bh_sne) over the whole `vectors` matrix -- no subsetting is done
# here -- and time the call with wall-clock timestamps.
start = datetime.datetime.now()
vis_data = bh_sne(vectors)
end = datetime.datetime.now()
print(f"tsne in {end - start}")
# Takes a few minutes; the recorded run below reported roughly 2.5 minutes.

tsne in 0:02:22.825835


In [10]:
# Vocabulary words, used as hover labels in the plot below.
# NOTE(review): presumably ordered like the rows of the vector matrix -- confirm.
words = model.wv.index2word

In [11]:
# plot the result
vis_x = vis_data[:, 0]
vis_y = vis_data[:, 1]

output_file("word2vec_tsne.html")
source = ColumnDataSource(data=dict(
    x= vis_data[:, 0],
    y= vis_data[:, 1],
    desc= words ,
))

hover = HoverTool(tooltips=[
    ("word", "@desc"),
])

p = figure(plot_width=500, plot_height=500, tools=[hover,"wheel_zoom"],
           title="Glove Tsne")

p.circle('x', 'y', size=20, source=source)
show(p)