# kor2vec train.py

## settings

In [40]:
import numpy as np
import tensorflow as tf
import collections
from konlpy.tag import Twitter
import re
import math
import random
import pandas as pd

# Directory prefix of the input CSV file that is read below.
DATA_PATH = "./data/"

## 1. Pre-process Data

In [41]:
def build_dataset(train_text, min_count, sampling_rate):
    """Tokenize raw text lines and build the word and morpheme vocabularies.

    Args:
        train_text: iterable of raw text lines (one sentence per item).
        min_count: words occurring fewer than this many times are mapped
            to the 'UNK' token.
        sampling_rate: only consumed by the (currently disabled)
            sub-sampling step; otherwise unused.

    Returns:
        A 7-tuple ``(data, word_dict, word_reverse_dict, pos_dict,
        pos_reverse_dict, word_to_pos_dict, word_list)`` where
        * ``data`` is a list of sentences, each a list of word ids,
        * ``word_dict`` / ``word_reverse_dict`` map word -> id / id -> word,
        * ``pos_dict`` / ``pos_reverse_dict`` map the items returned by the
          tagger -> id / id -> item,
        * ``word_to_pos_dict`` maps a word id to the list of morpheme ids
          the word is decomposed into,
        * ``word_list`` holds the vocabulary words in vocabulary order.
    """
    # Anything that is not Hangul, a Latin letter or a digit separates tokens.
    separator = re.compile(r"[^ㄱ-힣a-zA-Z0-9]+")

    # Read from the `train_text` argument (the previous code ignored it and
    # read a module-level variable instead).
    words = list()
    for line in train_text:
        sentence = separator.sub(' ', line).strip().split()
        if sentence:
            words.append(sentence)

    # Frequency table; 'UNK' is always kept and always gets id 0.
    word_counter = [['UNK', -1]]
    word_counter.extend(
        collections.Counter(word for sentence in words for word in sentence).most_common()
    )
    word_counter = [item for item in word_counter if item[1] >= min_count or item[0] == 'UNK']

    word_list = list()
    word_dict = dict()
    for word, _ in word_counter:
        word_list.append(word)  # remember the words used for training (for visualization)
        word_dict[word] = len(word_dict)
    word_reverse_dict = dict(zip(word_dict.values(), word_dict.keys()))

    # Decompose every vocabulary word with the tagger.
    word_to_pos_li = dict()
    pos_list = list()
    twitter = Twitter()
    for w, w_id in word_dict.items():
        w_pos_li = list(twitter.pos(w, norm=True))
        word_to_pos_li[w_id] = w_pos_li
        pos_list += w_pos_li

    # Morpheme ids are assigned in order of decreasing frequency.
    pos_dict = dict()
    for pos, _ in collections.Counter(pos_list).most_common():
        pos_dict[pos] = len(pos_dict)
    pos_reverse_dict = dict(zip(pos_dict.values(), pos_dict.keys()))

    word_to_pos_dict = {
        word_id: [pos_dict[pos] for pos in pos_li]
        for word_id, pos_li in word_to_pos_li.items()
    }

    # Encode the corpus; out-of-vocabulary words become 'UNK'.
    unk_index = word_dict['UNK']
    data = list()
    unk_count = 0
    for sentence in words:
        s = list()
        for word in sentence:
            if word in word_dict:
                index = word_dict[word]
            else:
                index = unk_index
                unk_count += 1
            s.append(index)
        data.append(s)
    # Only matters if the sub-sampling step below is re-enabled.
    word_counter[0][1] = max(1, unk_count)

    # data = sub_sampling(data, word_counter, word_dict, sampling_rate)

    return data, word_dict, word_reverse_dict, pos_dict, pos_reverse_dict, word_to_pos_dict, word_list

def sub_sampling(data, word_counter, word_dict, sampling_rate):
    """Randomly drop frequent words from every sentence.

    Args:
        data: list of sentences, each a list of word ids.
        word_counter: iterable of ``(word, count)`` pairs.
        word_dict: mapping word -> word id.
        sampling_rate: threshold used in the discard probability
            ``max(0, 1 - sqrt(sampling_rate / f))`` with
            ``f = count / total number of words in data``.

    Returns:
        A new list of sentences with the surviving word ids; the number of
        sentences is unchanged (a sentence may become empty).
    """
    total_words = sum(len(sentence) for sentence in data)

    # Discard probability per word id: the more frequent the word, the
    # closer its relative frequency is to 1 and the larger the probability.
    discard_prob = {}
    for token, freq in word_counter:
        ratio = freq / total_words
        discard_prob[word_dict[token]] = max(0, 1 - math.sqrt(sampling_rate / ratio))

    # A word is kept when the random draw exceeds its discard probability,
    # so words with a small probability are kept more easily.
    return [
        [word_id for word_id in sentence if discard_prob[word_id] < random.random()]
        for sentence in data
    ]

In [42]:
# Load the crawled data and split every description into sentences on '.'.
pk_data = pd.read_csv(DATA_PATH + 'pk_data_g1.csv')
desc_list = []
for row in range(len(pk_data)):
    desc_list.extend(pk_data['desc'][row].split('.'))

sampling_rate = 0.0001
min_count = 5

(data, word_dict, word_reverse_dict, pos_dict, pos_reverse_dict,
 word_to_pos_dict, word_list) = build_dataset(desc_list, min_count, sampling_rate)

### Save word list

In [43]:
# Save the list of words used for training, separated by single spaces.
# The context manager closes the file even when a write fails, and the
# encoding is explicit so the Korean text does not depend on the locale.
with open("word_list.txt", 'w', encoding="utf-8") as f:
    f.write("".join("{} ".format(word) for word in word_list))

In [5]:
# Sizes of the two vocabularies and of the corpus.
vocabulary_size = len(word_dict)
pos_size = len(pos_dict)
num_sentences = len(data)

print("number of sentences :", num_sentences)
print("vocabulary size :", vocabulary_size)
print("pos size :", pos_size)

# Morphemes listed in ascending id order, i.e. pos_li[i] is the morpheme with id i.
pos_li = [pos_reverse_dict[pos_id] for pos_id in sorted(pos_reverse_dict)]

number of sentences : 4799
vocabulary size : 1660
pos size : 1335


## 2. Function to generate a training batch

In [6]:
window_size = 5  # number of context positions considered on each side of a word
batch_size = 150  # number of (input, output) pairs per batch

# Build the input index list and the output index list for kor2vec.
# The number of (input, output) pairs grows with the window size
# (the same input is repeated once per context word).
def generate_input_output_list(data, window_size, unk_index=None):
    """Create skip-gram style (input, output) word-id pairs.

    Args:
        data: list of sentences, each a list of word ids.
        window_size: number of positions looked at on each side of a word.
        unk_index: id of the unknown-word token; pairs containing it are
            skipped. Defaults to the module-level ``word_dict['UNK']``.

    Returns:
        ``(input_li, output_li)``: two equally long lists where
        ``input_li[k]`` is a center word id and ``output_li[k]`` one of its
        context word ids.
    """
    if unk_index is None:
        unk_index = word_dict['UNK']

    input_li = list()
    output_li = list()
    for sentence in data:
        length = len(sentence)
        for i, center in enumerate(sentence):
            # An unknown center word can never produce a pair.
            if center == unk_index:
                continue
            for j in range(max(0, i - window_size), min(length, i + window_size + 1)):
                context = sentence[j]
                if j != i and context != unk_index:
                    input_li.append(center)
                    output_li.append(context)
    return input_li, output_li

# Build all (input, output) pairs once; batches are sliced from these lists.
input_li, output_li = generate_input_output_list(data, window_size)
input_li_size = len(input_li)  # total number of (input, output) pairs

# Check: print the morpheme ids and morphemes of every input/output pair.
# for i in range(input_li_size):
#     print("-{}-".format(i)) 
#     in_index = word_to_pos_dict[input_li[i]]
#     out_index = word_to_pos_dict[output_li[i]]
#     print(in_index)
#     for ind in in_index:
#         print(pos_reverse_dict[ind])
#     print(out_index)
#     for o in out_index:
#         print(pos_reverse_dict[o])

### generate batch test

In [7]:
# Show the batch size and the total number of (input, output) pairs.
print(batch_size)
print(input_li_size)
def generate_batch(iter, batch_size, input_li, output_li):
    """Return the ``iter``-th batch of (input, label) arrays.

    Batches are consecutive, non-overlapping slices of the pair lists; the
    batch number wraps around after the last complete batch, so a trailing
    incomplete batch is never returned.

    Args:
        iter: step counter (any non-negative integer).
        batch_size: number of pairs per batch.
        input_li: list of input word ids.
        output_li: list of output word ids, aligned with ``input_li``.

    Returns:
        ``(inputs, labels)`` as numpy arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)``.

    Raises:
        ValueError: if ``batch_size`` is not positive or ``input_li`` holds
            fewer than ``batch_size`` items.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    # Use the length of the list that was passed in, not a module-level size.
    num_batches = len(input_li) // batch_size
    if num_batches == 0:
        raise ValueError("input_li holds fewer items than batch_size")

    start = (iter % num_batches) * batch_size
    batch_input = input_li[start:start + batch_size]
    # Labels are wrapped as one-element rows, giving shape (batch_size, 1).
    batch_output = [[label] for label in output_li[start:start + batch_size]]

    return np.array(batch_input), np.array(batch_output)

batch_inputs, batch_labels = generate_batch(0, batch_size, input_li, output_li)
print(np.shape(batch_inputs))
print(batch_inputs)
print(np.shape(batch_labels))
print(batch_labels)
# Morpheme-id lists of the batch's input words. They get their own name so
# the vocabulary stored in `word_list` is not overwritten by this check.
batch_pos_ids = [word_to_pos_dict[word] for word in batch_inputs]
print(batch_pos_ids)
#     for pos in word_to_pos_dict[word]:
#         print(pos)
#         print(pos_reverse_dict[pos])

150
81434
(150,)
[ 426  426  426  426  426  282  282  282  282  282  282   59   59   59
   59   59   59   59  558  558  558  558  558  558  558  558  874  874
  874  874  874  874  874  874   60   60   60   60   60   60   60   60
  875  875  875  875  875  875  875   55   55   55   55   55   55  184
  184  184  184  184  634  634  634  634  282  282  282  282  282   59
   59   59   59   59   59  104  104  104  104  104  104  104  874  874
  874  874  874  874  874  427  427  427  427  427  427  427  876  876
  876  876  876  876  428  428  428  428  428    3    3    3   91   91
   91 1036 1036 1036  726  726  726  727  727  727   58   58   58   58
 1299 1299 1299 1299   62   62   62   62 1300 1300 1300 1300  383  383
  383  383   58   58 1301 1301 1300 1300  876  876]
(150, 1)
[[ 282]
 [  59]
 [ 558]
 [ 874]
 [  60]
 [ 426]
 [  59]
 [ 558]
 [ 874]
 [  60]
 [ 875]
 [ 426]
 [ 282]
 [ 558]
 [ 874]
 [  60]
 [ 875]
 [  55]
 [ 426]
 [ 282]
 [  59]
 [ 874]
 [  60]
 [ 875]
 [  55]
 [ 184]
 [ 4

## 3. Build model

In [26]:
embedding_size = 150  # dimension of every morpheme embedding vector
num_sampled = 50  # value passed as num_sampled to the NCE loss
learning_rate = 1.0  # step size of the gradient-descent optimizer

valid_size = 20     # Random set of words to evaluate similarity on.
valid_window = 200  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False) # draw 20 distinct ids at random from 0..199

# Build the TensorFlow graph of the model.
graph = tf.Graph()
with graph.as_default():
    # Input data
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    # One placeholder per batch slot; each is fed the morpheme ids of the word in that slot.
    words_matrix = [tf.placeholder(tf.int32, shape=None) for _ in range(batch_size)]
    # One placeholder per vocabulary word; not used anywhere in this section.
    vocabulary_matrix = [tf.placeholder(tf.int32, shape=None) for _ in range(vocabulary_size)]
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # "/device:GPU:0"
    with tf.device('/cpu:0'):
        # Morpheme embedding matrix -> the final output we are after.
        pos_embeddings = tf.Variable(tf.random_uniform([pos_size, embedding_size], -1.0, 1.0), name='pos_embeddings')

        # A word vector is the sum of the embeddings of the word's morphemes.
        word_vec_list = []
        for i in range(batch_size):
            word_vec = tf.reduce_sum(tf.nn.embedding_lookup(pos_embeddings, words_matrix[i]), 0)
            word_vec_list.append(word_vec)
        word_embeddings = tf.stack(word_vec_list)  # one summed vector per batch slot

        # Noise-Contrastive Estimation
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)), name='nce_weights'
        )
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]), name='nce_biases')

    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_labels,
                       inputs=word_embeddings,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))

    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
    init = tf.global_variables_initializer()

    # Compute the cosine similarity between the validation examples and all embeddings.
    # Used to check similarity for randomly chosen morphemes.
    # `keepdims` replaces the deprecated `keep_dims` argument.
    norm = tf.sqrt(tf.reduce_sum(tf.square(pos_embeddings), 1, keepdims=True))
    normalized_embeddings = pos_embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


## 4. Train model

In [38]:
num_iterations = input_li_size // batch_size
print("number of iterations for each epoch :", num_iterations)
epochs = 10
num_steps = num_iterations * epochs + 1

# Reporting intervals, clamped to 1 so that short runs never take a modulo by zero.
loss_report_every = max(1, num_steps // 10)
similarity_report_every = max(1, num_steps // 4)

with tf.Session(graph=graph) as session:
    init.run()
    print("Initialized - Tensorflow")

    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(step, batch_size, input_li, output_li)

        # Feed every batch slot the morpheme ids of its input word.
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        for placeholder, word in zip(words_matrix, batch_inputs):
            feed_dict[placeholder] = word_to_pos_dict[word]

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % loss_report_every == 0:
            if step > 0:
                # Average over the steps accumulated since the last report
                # (previously divided by a fixed 2000 regardless of the interval).
                average_loss /= loss_report_every
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        if step % similarity_report_every == 0:
            # Snapshot of the embeddings at this step.
            pos_embed = pos_embeddings.eval()

            # Print nearest words
            sim = similarity.eval()
            for i in range(valid_size):
                valid_pos = pos_reverse_dict[valid_examples[i]]
                top_k = 8
                # Position 0 of the sorted order is skipped (highest similarity,
                # normally the example itself).
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % str(valid_pos)
                for k in range(top_k):
                    close_word = pos_reverse_dict[nearest[k]]
                    log_str = '%s %s,' % (log_str, str(close_word))
                print(log_str)

    # Embeddings after the last training step.
    pos_embed = pos_embeddings.eval()

number of iterations for each epoch : 542
Initialized - Tensorflow
Average loss at step  0 :  145.46214294433594
Nearest to ('독침', 'Noun'): ('팬텀', 'Noun'), ('날아다니', 'Verb'), ('이끼', 'Noun'), ('인', 'Josa'), ('보인', 'Verb'), ('온몸', 'Noun'), ('지금', 'Noun'), ('아라리', 'Noun'),
Nearest to ('지느러미', 'Noun'): ('향해', 'Verb'), ('소유자', 'Noun'), ('구멍', 'Noun'), ('낼', 'Noun'), ('일격', 'Noun'), ('인해', 'Verb'), ('보호하고', 'Verb'), ('울', 'PreEomi'),
Nearest to ('전', 'Noun'): ('둥지', 'Noun'), ('강력하다', 'Adjective'), ('치는', 'Verb'), ('정밀', 'Noun'), ('안전한', 'Adjective'), ('가진', 'Verb'), ('추위', 'Noun'), ('이지만', 'Josa'),
Nearest to ('진화', 'Noun'): ('사', 'Verb'), ('깊이', 'Noun'), ('힘', 'Noun'), ('발견되', 'Verb'), ('눈', 'Noun'), ('양은', 'Noun'), ('꾸고', 'Verb'), ('까지', 'Noun'),
Nearest to ('빠르', 'Adjective'): ('은', 'Eomi'), ('많이', 'Adverb'), ('것', 'Noun'), ('쉴', 'Verb'), ('잡는', 'Verb'), ('지어', 'Verb'), ('뇌', 'Noun'), ('시킨', 'Verb'),
Nearest to ('뿔', 'Noun'): ('후', 'Noun'), ('가끔', 'Noun'), ('뿌린', 'Verb'), ('어미', 'Noun'), (

## 5. Save embedding vector

In [39]:
# Function to save vectors.
def save_model(pos_list, embeddings, file_name):
    """Write morpheme embeddings to a text file.

    The first line is ``"<number of entries> <vector dimension>"``; every
    following line is the entry's string form (with the space between
    tuple elements removed, so the token contains no blank) followed by
    its vector components, all separated by single spaces.

    Args:
        pos_list: sequence of entries; ``pos_list[i]`` labels ``embeddings[i]``.
        embeddings: 2-D array-like with at least ``len(pos_list)`` rows.
        file_name: path of the output file (overwritten if it exists).
    """
    # Take the dimension from the data actually written rather than from a
    # module-level setting, so header and rows always agree.
    shape = np.shape(embeddings)
    dimension = shape[1] if len(shape) > 1 else 0

    # Explicit UTF-8 keeps the Korean tokens independent of the locale.
    with open(file_name, 'w', encoding="utf-8") as f:
        f.write("{} {}\n".format(len(pos_list), dimension))
        for i, pos in enumerate(pos_list):
            token = str(pos).replace("', '", "','")
            f.write(token + " " + ' '.join(map(str, embeddings[i])) + "\n")

# Save the morpheme embeddings evaluated at the end of training to "pos.vec".
save_model(pos_li, pos_embed, "pos.vec")