# TSNE of pretrained Glove model on wiki

1. pip install gensim, tsne, bokehe
3. Download pretrained glove model from https://nlp.stanford.edu/projects/glove/
4. load model in gensim (faster loading)
4. tsne 
5. plot with bokeh

In [3]:
# imports
from tsne import bh_sne
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool, BoxSelectTool
import numpy as np
from gensim.models import KeyedVectors
import datetime

In [4]:
# to load a pre-trained model from .txt, make sure you add 400000 300 as first line
# you can also set a limit of the vocabulary in case you don't need all
st = datetime.datetime.now()
word_vectors = KeyedVectors.load_word2vec_format('glove/glove.6B.300d.txt',binary=False)
print(f"{datetime.datetime.now()-st}")

0:01:44.104396


In [5]:
# # quicker loading
# force the unit-normalization, destructively in-place 
word_vectors.init_sims(replace=True) 
# save in binary format
word_vectors.save('glove/glove.6B.300d-gensim.bin')

In [6]:
# # load from binary
st = datetime.datetime.now()
glove_model = KeyedVectors.load('glove/glove.6B.300d-gensim.bin', mmap='r') # memory-mapped from disk
glove_model.syn0norm = glove_model.syn0  # prevent recalc of normed vectors
print("{datetime.datetime.now()-st}")
# # for more details check this post https://stackoverflow.com/questions/42986405/how-to-speed-up-gensim-word2vec-model-load-time

{datetime.datetime.now()-st}


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [7]:
# sanity check
glove_model.most_similar('queen')

[('elizabeth', 0.6771447658538818),
 ('princess', 0.635676383972168),
 ('king', 0.6336469650268555),
 ('monarch', 0.5814188122749329),
 ('royal', 0.543052613735199),
 ('majesty', 0.5350357294082642),
 ('victoria', 0.5239557027816772),
 ('throne', 0.5097099542617798),
 ('lady', 0.5045416355133057),
 ('crown', 0.49980056285858154)]

In [8]:
glove_model.most_similar('php')

[('javascript', 0.6667327880859375),
 ('scripting', 0.5983599424362183),
 ('perl', 0.5901949405670166),
 ('mysql', 0.5828666090965271),
 ('server-side', 0.5826088190078735),
 ('c++', 0.5619826316833496),
 ('runtime', 0.5475314855575562),
 ('open-source', 0.5299729704856873),
 ('sql', 0.521233320236206),
 ('backend', 0.5180889368057251)]

In [9]:
vectors = np.array(glove_model.syn0norm,dtype='float')
vectors.shape

  """Entry point for launching an IPython kernel.


(400000, 300)

In [10]:
# perform t-SNE embedding on subset of the large vocabulary
start, end = 1500, 2000
vis_data = bh_sne(vectors[start:end])
words = glove_model.index2word[start:end]

In [11]:
# plot the result
vis_x = vis_data[:, 0]
vis_y = vis_data[:, 1]

output_file("glove_tsne.html")
source = ColumnDataSource(data=dict(
    x= vis_data[:, 0],
    y= vis_data[:, 1],
    desc= words ,
))

hover = HoverTool(tooltips=[
    ("word", "@desc"),
])

p = figure(plot_width=800, plot_height=800, tools=[hover,"wheel_zoom"],
           title="Glove Tsne")

p.circle('x', 'y', size=20, source=source)
show(p)