# Skip-gram 実装

In [2]:
# NOTE(review): this cell carries execution count 2 while the import cell below is 1,
# i.e. it was run out of order, and its captured output ends in ^C, so the install
# looks to have been interrupted - confirm the environment before relying on it.
!pip install tensorflow-gpu==1.2

Collecting tensorflow-gpu==1.2
[?25l  Downloading https://files.pythonhosted.org/packages/f2/5e/a51a5df287753c69459ca4572ecd9db78a259007734c4e19af7c5d68080c/tensorflow_gpu-1.2.0-cp35-cp35m-manylinux1_x86_64.whl (89.2MB)
[K    99% |████████████████████████████████| 89.1MB 11.8MB/s eta 0:00:01    19% |██████▏                         | 17.2MB 11.8MB/s eta 0:00:07    58% |██████████████████▋             | 51.9MB 12.1MB/s eta 0:00:04^C


In [1]:
from __future__ import print_function

import os
import random
import zipfile
import math
import shutil
import glob
from collections import Counter
from six.moves.urllib.request import urlretrieve

import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
from test_word2vec import ans_make_pair

# Restrict CUDA to device index 0.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# Corpus files (train / valid / test splits) read as one combined text collection.
file_list = ['data/ptb.train.txt', 'data/ptb.valid.txt', 'data/ptb.test.txt']
all_lines = []
for path in file_list:
    # `with` guarantees the handle is released even if reading raises.
    with open(path) as f:
        all_lines.extend(f.readlines())

# One entry per input line: lower-cased and split on whitespace into tokens.
data = [line.strip('\n').lower().split() for line in all_lines]
print('lines : ', len(data))

# Words seen fewer than `min_freq` times are left out of the vocabulary.
min_freq = 5

# Flatten the tokenised corpus and count every token.
words = [token for sentence in data for token in sentence]
word_cnt = Counter(words)

# Id 0 is reserved for the unknown-word token.
word2id = {'<unk>': 0}
id2word = {0: '<unk>'}

# most_common() yields counts in descending order, so the first word that is
# too rare ends the scan.
for word, cnt in word_cnt.most_common():
    if cnt < min_freq:
        break
    if word == '<unk>':
        continue
    new_id = len(word2id)
    word2id[word] = new_id
    id2word[new_id] = word

print('vocabulary number is : ', len(word2id))

# Replace every token with its vocabulary id; tokens missing from word2id
# are mapped to the id of '<unk>'.
unk_id = word2id['<unk>']
converted_data = [
    [word2id.get(word, unk_id) for word in sentence]
    for sentence in data
]

def make_pair(sentence, window_size):
    """Build skip-gram (target, context) pairs for one sentence.

    For each position in `sentence`, every other position at most
    `window_size` steps to the left or right (clipped to the sentence
    bounds) contributes one pair.

    Args:
        sentence: indexable sequence of items (word ids in this notebook).
        window_size: maximum distance between a target and its context.

    Returns:
        A tuple `(targets, contexts)` of two equally long lists, where
        `contexts[k]` is a neighbour of `targets[k]`.
    """
    targets = []
    contexts = []
    n_tokens = len(sentence)
    for center, center_item in enumerate(sentence):
        left = max(center - window_size, 0)
        right = min(center + window_size + 1, n_tokens)
        for neighbor in range(left, right):
            # A word is never paired with itself.
            if neighbor == center:
                continue
            targets.append(center_item)
            contexts.append(sentence[neighbor])
    return targets, contexts

skip_window = 2  # how many neighbours on each side are predicted

# Collect the (target, context) ids of every sentence into two flat lists.
target_ids = []
context_ids = []
for sentence in converted_data:
    sentence_targets, sentence_contexts = make_pair(sentence, skip_window)
    target_ids += sentence_targets
    context_ids += sentence_contexts

# int32 arrays: X_train holds the target ids, y_train the paired context ids.
X_train = np.array(target_ids, dtype=np.int32)
y_train = np.array(context_ids, dtype=np.int32)
print('train data : ', len(X_train))

# Directory that receives the model checkpoint and the log files.
log_path = "./log/"
model_path = os.path.join(log_path, 'model.ckpt')

# Start from an empty directory: drop whatever an earlier run left behind.
if os.path.exists(log_path):
    shutil.rmtree(log_path)
os.mkdir(log_path)

# Build the skip-gram graph from scratch (any previously defined default graph is discarded).
tf.reset_default_graph()

# Number of (target, context) training pairs.
N_train = len(X_train)
# Number of distinct words in the vocabulary
vocab_size = len(word2id)

# Hyper-parameters
# Learning rate
lr = 1.0
# Number of training epochs
n_epoch = 8
# Mini-batch size
batch_size = 128
# Dimensionality of the word2vec vectors
embed_dim = 100
# Number of negative examples to sample
num_sampled = 32

# Inputs: x is a 1-D batch of int32 ids, y a [batch, 1] column of int32 ids.
x = tf.placeholder(tf.int32, shape=[None])
y = tf.placeholder(tf.int32, shape=[None, 1])

# Matrix holding one embedding vector per word, randomly initialised in [-1, 1).
embed_W = tf.Variable(
    tf.random_uniform([vocab_size, embed_dim], -1.0, 1.0), name='word_embedding')

# Fetch the embedding of each word id in the batch.
# (exercise section start)
hidden = tf.nn.embedding_lookup(embed_W, x)
# (exercise section end)

# Parameters of the output layer that sits on top of the embeddings.
# NOTE(review): these two variables are unnamed, so their checkpoint names depend on creation order.
nce_weights = tf.Variable(
    tf.truncated_normal([vocab_size, embed_dim],
                        stddev=1.0 / math.sqrt(embed_dim)))
nce_biases = tf.Variable(tf.zeros([vocab_size]))

# Loss function: Noise Contrastive Estimation loss, averaged over the batch.
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights,
                   biases=nce_biases,
                   inputs=hidden,
                   labels=y,
                   num_sampled=num_sampled,
                   num_classes=vocab_size))

# Minimise the objective with SGD (stochastic gradient descent).
optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)

# Train the model, then save the checkpoint and the files the TensorBoard
# embedding projector reads (projector config + one vocabulary word per line).
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for epoch in range(n_epoch):
        print ('epoch %d | ' % epoch, end="")

        sum_loss = 0
        # Shuffle the training data
        perm = np.random.permutation(N_train)

        for i in range(0, N_train, batch_size):
            # Take one mini-batch; labels are reshaped to the [batch, 1] layout of `y`.
            X_batch = X_train[perm[i:i+batch_size]]
            y_batch = y_train[perm[i:i+batch_size]].reshape(-1, 1)

            feed_dict = {x:X_batch, y:y_batch}
            _, loss_val = sess.run([optimizer, loss], feed_dict=feed_dict)
            # Weight by the actual batch size so a short final batch is averaged correctly.
            sum_loss += loss_val * X_batch.shape[0]

        print('Train loss %.5f' %(sum_loss/ N_train))

    # Fetch the values of the learned vectors
    final_embed = embed_W.eval()

    # Save the model
    saver.save(sess, model_path)

    # For visualising the embedding space
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embed_W.name
    embedding.metadata_path = 'metadata.tsv'
    summary_writer = tf.summary.FileWriter(log_path)
    projector.visualize_embeddings(summary_writer, config)
    # Release the writer's event file; it was previously left open.
    summary_writer.close()

    # One word per line, ordered by id so that line k labels row k of embed_W.
    # Loop names avoid `x`, which is the input placeholder at module level.
    sorted_dict = sorted(word2id.items(), key=lambda item: item[1])
    words = ["{}\n".format(word) for word, _ in sorted_dict]
    # Derive the path from log_path instead of repeating the directory name.
    with open(os.path.join(log_path, "metadata.tsv"), "w") as f:
        f.writelines(words)

lines :  49199
vocabulary number is :  9999
train data :  3851436
epoch 0 | Train loss 7.39313
epoch 1 | Train loss 4.18448
epoch 2 | Train loss 4.11805
epoch 3 | Train loss 4.07445
epoch 4 | Train loss 4.04044
epoch 5 | Train loss 4.01317
epoch 6 | Train loss 3.98961
epoch 7 | Train loss 3.97006


# 近傍単語抽出

In [3]:
# Normalise every embedding row to unit L2 length.
row_norms = np.linalg.norm(final_embed, axis=1, keepdims=True)
norm_embed = final_embed / row_norms


def get_sim_word(query, top_n=50):
    """Return the `top_n` vocabulary entries closest to `query` in cosine distance.

    Args:
        query: 1-D vector with the same dimensionality as the rows of `norm_embed`.
        top_n: number of nearest entries to return.

    Returns:
        A tuple `(distances, ids)`: cosine distances (1 - cosine similarity) in
        ascending order, and the matching row indices of `norm_embed`.
    """
    unit_query = query / np.linalg.norm(query)
    distances = 1 - np.dot(unit_query[np.newaxis, :], norm_embed.T)[0]
    nearest = np.argsort(distances)[:top_n]
    return distances[nearest], nearest

# Look up the neighbours of one query word and list them, nearest first.
input_word = 'tokyo'
result = final_embed[word2id[input_word]]
scores, indices = get_sim_word(result)
for rank, (score, index) in enumerate(zip(scores, indices)):
    neighbor = id2word[index]
    # The query word is its own nearest neighbour; leave it out of the listing.
    if neighbor == input_word:
        continue
    print(rank, ' : ', neighbor, '(distance : {0:.2})'.format(score))

1  :  kick (distance : 0.54)
2  :  beginning (distance : 0.54)
3  :  offices (distance : 0.55)
4  :  similarity (distance : 0.56)
5  :  london (distance : 0.56)
6  :  13th (distance : 0.56)
7  :  listed (distance : 0.57)
8  :  shouted (distance : 0.57)
9  :  addition (distance : 0.57)
10  :  afternoon (distance : 0.57)
11  :  shaking (distance : 0.57)
12  :  cheered (distance : 0.58)
13  :  rally (distance : 0.58)
14  :  resignation (distance : 0.58)
15  :  yesterday (distance : 0.58)
16  :  nasdaq (distance : 0.58)
17  :  rebound (distance : 0.59)
18  :  libor (distance : 0.59)
19  :  waves (distance : 0.59)
20  :  rebounded (distance : 0.59)
21  :  weakness (distance : 0.59)
22  :  aug. (distance : 0.59)
23  :  year-ago (distance : 0.59)
24  :  nippon (distance : 0.59)
25  :  gatt (distance : 0.59)
26  :  near (distance : 0.59)
27  :  winner (distance : 0.59)
28  :  los (distance : 0.59)
29  :  declines (distance : 0.6)
30  :  defeated (distance : 0.6)
31  :  applying (distance : 0.6