In [1]:
# Enable IPython's autoreload extension so edits to imported modules are picked up
# without restarting the kernel. Mode 2 reloads all modules before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
from gensim.models import Word2Vec

In [3]:
from embedding.word2vec_model import Word2VecModel

In [4]:
# Load the saved gensim Word2Vec model. The path is relative, so the notebook must be
# run from the directory that contains `embedding/`.
total_model = Word2Vec.load('./embedding/output/total.w2v')

# t-SNE

In [5]:
from sklearn.manifold import TSNE

In [6]:
import pandas as pd
import pickle

In [7]:
# Sanity check of the embedding: list the nearest neighbours of '우정/Noun' ("friendship").
# Vocabulary keys have the form '<token>/<POS tag>'.
total_model.wv.most_similar('우정/Noun')

[('우애/Noun', 0.4294128715991974),
 ('사랑/Noun', 0.42187103629112244),
 ('혈맹/Noun', 0.41183096170425415),
 ('벗/Noun', 0.37951797246932983),
 ('연인/Noun', 0.3687002956867218),
 ('인연/Noun', 0.36418092250823975),
 ('관계/Noun', 0.3563617467880249),
 ('애국심/Noun', 0.35634028911590576),
 ('충정/Noun', 0.3552236258983612),
 ('객관성/Noun', 0.3541759252548218)]

In [8]:
# Top 2000 words: keep the first `num_words` entries of the vocabulary, in the model's
# own index order (presumably most frequent first -- confirm against the training setup).
num_words = 2000

# `syn0norm` is None on a freshly loaded model until the normalized vectors have been
# computed. This cell used to work only because the earlier `most_similar` call computed
# them as a side effect. Calling init_sims() here removes that dependency on execution
# order; it is a no-op when the normalized vectors already exist.
total_model.wv.init_sims()

# One row per word (indexed by the vocabulary key), one column per embedding dimension.
word_embeddings = pd.DataFrame(
    total_model.wv.syn0norm[:num_words, :],
    index=total_model.wv.index2word[:num_words],
)
word_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
<Pad>/Pad,0.012968,-0.026009,0.040839,0.089757,0.020421,0.041947,0.035136,0.044575,-0.017558,-0.069458,...,-0.099105,-0.11375,-0.003138,-0.04307,0.089293,-0.081942,-0.092553,0.085203,-0.02984,0.065207
./Punctuation,0.003854,-0.023621,0.047863,0.095151,0.027836,0.040989,0.04161,0.045424,-0.007725,-0.054123,...,-0.098826,-0.115645,0.00674,-0.04893,0.094439,-0.082157,-0.102242,0.074221,-0.024942,0.064997
다/Eomi,-0.026355,-0.004177,0.092559,0.104662,-0.041381,0.034248,-0.020521,0.04726,0.023471,-0.064311,...,-0.101598,-0.152212,-0.020208,-0.009619,0.141439,-0.027813,-0.106114,0.147667,-0.043632,0.027728
을/Josa,-0.003576,-0.047254,0.043073,0.071429,-0.010837,0.012623,-0.052747,0.076342,0.026389,0.017566,...,-0.074355,-0.053415,0.008716,-0.100021,0.078621,0.084877,-0.015338,0.00284,-0.054647,0.035356
이/Josa,0.013653,-0.059938,-0.059719,0.058683,0.037522,-0.128005,0.023919,0.062025,-0.014803,-0.004364,...,-0.015631,-0.006488,-0.034205,-0.095185,0.049525,-0.011988,-0.057874,0.028749,-0.066505,0.046578


In [9]:
%%time
USE_PREMADE_TSNE = False

tsne_filepath = 'tsne.pkl'

if not USE_PREMADE_TSNE:
    
    tsne = TSNE(random_state=0)
    tsne_points = tsne.fit_transform(word_embeddings.values)
    with open(tsne_filepath, 'wb') as f:
        pickle.dump(tsne_points, f)
else:
    with open(tsne_filepath, 'rb') as f:
        tsne_points = pickle.load(f)

tsne_df = pd.DataFrame(tsne_points, index=word_embeddings.index, columns=['x_coord', 'y_coord'])
tsne_df['word'] = tsne_df.index

CPU times: user 28.4 s, sys: 2.41 s, total: 30.8 s
Wall time: 30.9 s


In [10]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

# Configure bokeh to render figures inline in the notebook's cell output.
output_notebook()

In [11]:
# Wrap the t-SNE frame in a bokeh data source so glyphs and tooltips can refer to
# its columns by name.
plot_data = ColumnDataSource(tsne_df)

# 800x800 canvas with wheel-zoom as the active scroll tool.
figure_options = dict(
    title='t-SNE Word Embeddings',
    plot_width=800,
    plot_height=800,
    active_scroll='wheel_zoom',
)
tsne_plot = figure(**figure_options)

# Rolling over a point shows the value of its 'word' column.
word_hover = HoverTool(tooltips='@word')
tsne_plot.add_tools(word_hover)

# One translucent red circle per word; the outline turns orange while hovered.
marker_style = dict(
    color='red',
    line_alpha=0.2,
    fill_alpha=0.1,
    size=10,
    hover_line_color='orange',
)
tsne_plot.circle('x_coord', 'y_coord', source=plot_data, **marker_style)

# Cosmetic clean-up: larger title, no axes, no grid lines, no outline.
tsne_plot.title.text_font_size = value('16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# Render the figure; the trailing semicolon suppresses the cell's return-value echo.
show(tsne_plot);