## 말뭉치 읽기

In [1]:
from gensim.models import Word2Vec

In [2]:
# Load the tokenized corpus: each line of the file becomes one list of
# whitespace-separated tokens.
# The context manager guarantees the file handle is closed, and iterating the
# file object directly avoids building the intermediate list that readlines()
# would create.
with open("data/phone_review_mecab.txt", "r", encoding="utf-8") as corpus_file:
    corpus = [line.strip().split() for line in corpus_file]

In [4]:
len(corpus) # number of token lists (one per input line) in the corpus

293189

## Skip-Gram 임베딩
negative sampling

In [5]:
# Passing the corpus to the constructor trains the model right away;
# backpropagation is handled by gensim, not by this notebook.
embedding_model = Word2Vec(
    corpus,      # input: list of token lists, [[token1, token2, ...], ...]
    sg=1,        # 1 selects skip-gram (target word -> context words), 0 selects CBOW
    size=100,    # dimensionality of each word vector
    window=2,    # how many words on each side of the target are considered
    workers=4,   # number of worker threads
)

In [6]:
# Persist the trained model under embedding/word2vec.
embedding_model.save("embedding/word2vec")

## Skip-Gram 임베딩 읽어들이기

In [7]:
from gensim.models import Word2Vec
# Load the model stored at embedding/word2vec.
embedding_model = Word2Vec.load("embedding/word2vec")

In [9]:
# wv: the model's word vectors (a keyed-vectors object mapping words to vectors)
embedding_model.wv

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x2ae3c65e8d0>

In [8]:
# Look up the embedding vector stored for the word '나'
embedding_model.wv["나"]

array([-6.92137539e-01,  5.00585020e-01,  1.56238645e-01, -3.84098202e-01,
        2.74021357e-01,  1.03710391e-01, -5.78586221e-01,  1.05814360e-01,
        4.16480899e-02,  1.07101411e-01, -1.78247005e-01, -4.18365568e-01,
        1.16108306e-01, -2.78781354e-01,  4.42766733e-02, -5.91917709e-02,
        4.14409995e-01, -4.59095806e-01, -6.06498003e-01, -1.49665505e-01,
        3.29480976e-01, -1.09163843e-01,  9.39316750e-02,  1.91694289e-01,
        1.83611512e-01,  4.01866436e-01,  2.15231404e-02,  1.32973716e-01,
        6.04255676e-01,  7.93497682e-01,  3.62070322e-01,  4.56160814e-01,
        3.55643630e-01, -1.52785927e-01, -3.25357243e-02, -1.95726752e-01,
       -1.00610986e-01,  7.39861280e-03, -2.82465458e-01, -2.97254883e-02,
        1.54788643e-01,  2.27146745e-01, -1.90797016e-01, -6.05341271e-02,
       -1.33778781e-01, -1.72604784e-01, -3.80382776e-01, -1.27952680e-01,
        1.62757620e-01,  5.42557538e-01,  1.00075006e-01, -7.26817583e-04,
        1.34640738e-01,  

In [31]:
# Query through the keyed vectors (`wv`): calling most_similar directly on the
# model object is deprecated in gensim 3.x and no longer available in gensim 4.
# Returns the 5 words closest to "디자인" together with their similarity scores.
embedding_model.wv.most_similar("디자인", topn=5)

  """Entry point for launching an IPython kernel.


[('외형', 0.7763488292694092),
 ('UI', 0.7097629308700562),
 ('컨셉', 0.7055120468139648),
 ('시야각', 0.6925698518753052),
 ('촉감', 0.6924220323562622)]

In [27]:
# Vocabulary as a list of words.
# NOTE(review): later cells slice vocab[:3000] as "the most frequent words",
# i.e. they assume this list is ordered by descending frequency — confirm.
vocab = embedding_model.wv.index2word

In [30]:
# Number of distinct words in the vocabulary
len(vocab)

23600

## 임베딩 시각화

In [28]:
from bokeh.io import output_notebook, show

# Set up bokeh so that show() renders plots inside the notebook.
output_notebook()

In [32]:
import pandas as pd
from sklearn.manifold import TSNE # 100 차원 임베딩을 사람이 인지할 수 있도록 차원축소가 필요, TSNE: 단어 임베딩과 잘 맞는 차원축소 기법
from bokeh.plotting import figure
from bokeh.models import LinearColorMapper, ColumnDataSource, LabelSet

def visualize_words(words, vecs, palette="Viridis256"):
    """Project word vectors to 2-D with t-SNE and show a labelled scatter plot.

    Args:
        words: Iterable of labels, one per vector.
        vecs: Vectors to project, in the same order as ``words``.
        palette: Bokeh palette name used to color points by their y coordinate.

    No ``random_state`` is passed to TSNE, so the layout may differ between
    calls on the same input.
    """
    projected = TSNE(n_components=2).fit_transform(vecs)
    xs = projected[:, 0]
    ys = projected[:, 1]

    frame = pd.DataFrame({'x': xs, 'y': ys, 'word': list(words)})
    source = ColumnDataSource(ColumnDataSource.from_df(frame))

    # Points are colored by their vertical position, so the mapper spans
    # exactly the range of projected y values.
    color_mapper = LinearColorMapper(palette=palette, low=min(ys), high=max(ys))

    plot = figure(plot_width=800, plot_height=1000)
    plot.scatter(
        "x", "y",
        size=12,
        source=source,
        color={'field': 'y', 'transform': color_mapper},
        line_color=None,
        fill_alpha=0.8,
    )

    # Each word is drawn centered, slightly above its point.
    labels = LabelSet(
        x="x", y="y", text="word", y_offset=8,
        text_font_size="15pt", text_color="#555555",
        source=source, text_align='center',
    )
    plot.add_layout(labels)
    show(plot)

In [35]:
import random
words = random.sample(vocab[:3000], 100) # draw 100 random words from the first 3000 vocabulary entries (the most frequent ones, per the original note)
# Fetch the embedding of every sampled word, keeping the same order as `words`.
vecs = [embedding_model.wv[word] for word in words]
visualize_words(words, vecs)

In [38]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from bokeh.models import ColorBar, BasicTicker

def visualize_between_words(words, vecs, palette="Viridis256"):
    """Show a heatmap of pairwise cosine similarities between words.

    Args:
        words: Labels used for both axes of the heatmap.
        vecs: Vectors aligned with ``words`` (``vecs[i]`` belongs to
            ``words[i]``).
        palette: Bokeh palette name used for the similarity colors.

    Pairs in which either vector is entirely zero are left out of the plot.
    """
    labels = list(words)
    matrix = np.asarray(vecs)

    # One vectorized call produces the full similarity matrix, instead of
    # invoking cosine_similarity separately for every pair of words.
    similarities = cosine_similarity(matrix)
    # An all-zero row has no direction; pairs involving it are skipped below.
    is_nonzero = matrix.any(axis=1)

    df_list = []
    for word1_idx, word1 in enumerate(labels):
        if not is_nonzero[word1_idx]:
            continue
        for word2_idx, word2 in enumerate(labels):
            if not is_nonzero[word2_idx]:
                continue
            df_list.append({'x': word1, 'y': word2,
                            'similarity': similarities[word1_idx, word2_idx]})
    df = pd.DataFrame(df_list)

    # NOTE(review): low=1 / high=0 is kept from the original; presumably it
    # reverses the palette direction relative to the similarity — confirm.
    color_mapper = LinearColorMapper(palette=palette, low=1, high=0)
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    # The y axis lists the words in reverse so the matrix diagonal runs from
    # the top-left to the bottom-right corner.
    p = figure(x_range=labels, y_range=list(reversed(labels)),
               x_axis_location="above", plot_width=900, plot_height=900,
               toolbar_location='below', tools=TOOLS,
               tooltips=[('words', '@x @y'), ('similarity', '@similarity')])

    # Strip grid and axis decorations so only the colored cells remain.
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 3.14 / 3  # about 60 degrees, in radians

    # One unit square per (word, word) pair, colored by similarity.
    p.rect(x="x", y="y", width=1, height=1,
           source=df,
           fill_color={'field': 'similarity', 'transform': color_mapper},
           line_color=None)
    color_bar = ColorBar(ticker=BasicTicker(desired_num_ticks=5),
                         color_mapper=color_mapper, major_label_text_font_size="7pt",
                         label_standoff=6, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')
    show(p)

In [39]:
# Draw 30 random words from the first 3000 vocabulary entries.
words = random.sample(vocab[:3000], 30)
# Fetch the embedding of every sampled word, keeping the same order as `words`.
vecs = [embedding_model.wv[word] for word in words]
visualize_between_words(words, vecs)

## 임베딩을 텍스트 형태로 저장하기

In [40]:
# Export the embeddings as plain text: one line per vocabulary word, holding
# the token followed by its vector components, separated by single spaces.
with open("embedding/word2vec.txt", "w", encoding="utf-8") as f:
    for token in vocab:
        vec = [str(el) for el in embedding_model.wv[token]]
        line = token + " " + " ".join(vec) + "\n"
        # write() emits the whole line in one call; writelines() given a str
        # would iterate over it and write it one character at a time.
        f.write(line)