# 포켓몬 위키 데이터 RNN

In [None]:
import tensorflow as tf
import numpy as np
import csv
import re
import collections
import pandas as pd
from konlpy.tag import Twitter
import math
import random

# Fix TensorFlow's graph-level random seed so repeated runs draw the same random values.
tf.set_random_seed(777)

In [None]:
def build_dataset(train_text, min_count, sampling_rate):
    """Build word- and morpheme-level vocabularies and an id-encoded corpus.

    Args:
        train_text: iterable of raw text lines (one sentence per item).
        min_count: minimum frequency a word needs to get its own id; rarer
            words are encoded with the 'UNK' id.
        sampling_rate: sub-sampling threshold. Currently unused because the
            sub_sampling() call at the end of this function is disabled.

    Returns:
        Tuple ``(data, word_dict, word_reverse_dict, pos_dict,
        pos_reverse_dict, word_to_pos_dict, word_list)``:
          * data: list of sentences, each a list of word ids.
          * word_dict / word_reverse_dict: word -> id and id -> word.
          * pos_dict / pos_reverse_dict: morpheme -> id and id -> morpheme,
            where a morpheme is whatever ``Twitter.pos`` yields per token.
          * word_to_pos_dict: word id -> list of morpheme ids of that word.
          * word_list: vocabulary words in id order (kept for visualisation).
    """
    # Keep only Hangul, Latin letters and digits, then split on whitespace.
    # Fix: read the `train_text` argument; the previous code ignored it and
    # silently used the module-level `desc_list` instead.
    words = list()
    for line in train_text:
        sentence = re.sub(r"[^ㄱ-힣a-zA-Z0-9]+", ' ', line).strip().split()
        if sentence:
            words.append(sentence)

    print("words: ", words)

    # 'UNK' always occupies the first slot (id 0); its count is filled in later.
    word_counter = [['UNK', -1]]
    word_counter.extend(collections.Counter([word for sentence in words for word in sentence]).most_common())
    word_counter = [item for item in word_counter if item[1] >= min_count or item[0] == 'UNK']

    word_list = list()
    word_dict = dict()
    for word, count in word_counter:
        word_list.append(word)  # remember the words used for training (for visualisation)
        word_dict[word] = len(word_dict)
    word_reverse_dict = dict(zip(word_dict.values(), word_dict.keys()))

    # Analyse every vocabulary word into its morphemes.
    word_to_pos_li = dict()  # word id -> list of morphemes
    pos_list = list()        # every morpheme occurrence, used for frequency ranking
    twitter = Twitter()
    for w, w_id in word_dict.items():
        w_pos_li = list(twitter.pos(w, norm=True))
        word_to_pos_li[w_id] = w_pos_li
        pos_list += w_pos_li

    # Morpheme ids follow descending frequency order.
    pos_counter = collections.Counter(pos_list).most_common()

    pos_dict = dict()
    for pos, _ in pos_counter:
        pos_dict[pos] = len(pos_dict)

    pos_reverse_dict = dict(zip(pos_dict.values(), pos_dict.keys()))

    # word id -> list of morpheme ids
    word_to_pos_dict = dict()
    for word_id, pos_li in word_to_pos_li.items():
        word_to_pos_dict[word_id] = [pos_dict[pos] for pos in pos_li]

    # Encode the corpus; words dropped by min_count become 'UNK'.
    data = list()
    unk_count = 0
    for sentence in words:
        s = list()
        for word in sentence:
            if word in word_dict:
                index = word_dict[word]
            else:
                index = word_dict['UNK']
                unk_count += 1
            s.append(index)
        data.append(s)
    # Keep the UNK count >= 1 so a later frequency computation never divides by zero.
    word_counter[0][1] = max(1, unk_count)

    # Sub-sampling is intentionally disabled:
    # data = sub_sampling(data, word_counter, word_dict, sampling_rate)

    print("new: ", data)

    return data, word_dict, word_reverse_dict, pos_dict, pos_reverse_dict, word_to_pos_dict, word_list

def sub_sampling(data, word_counter, word_dict, sampling_rate):
    """Randomly drop frequent words from an id-encoded corpus.

    Each word with relative frequency f gets the discard probability
    max(0, 1 - sqrt(sampling_rate / f)); an occurrence is kept only when a
    fresh random draw is greater than that probability.

    Args:
        data: list of sentences, each a list of word ids.
        word_counter: iterable of (word, count) pairs.
        word_dict: word -> id mapping covering every word in word_counter.
        sampling_rate: threshold used in the discard-probability formula.

    Returns:
        A new list of sentences with the surviving word ids.
    """
    token_total = 0
    for sentence in data:
        token_total += len(sentence)

    # word id -> probability of discarding one occurrence of that word
    drop_prob = {}
    for token, freq in word_counter:
        ratio = freq / token_total  # approaches 1 for very frequent words
        drop_prob[word_dict[token]] = max(0, 1 - math.sqrt(sampling_rate / ratio))

    # The lower the discard probability, the more likely the id survives.
    new_data = [
        [token_id for token_id in sentence if drop_prob[token_id] < random.random()]
        for sentence in data
    ]

    print("new: ", new_data)

    return new_data

In [None]:
# Load the crawled data.
pk_data = pd.read_csv('pk_data_g1.csv')
desc_list = []
desc_list_all=[]

# Debug setup: only row 1 of the 'desc' column is used here, split on '.'.
# (A loop over every row, `for i in range(len(pk_data))`, is disabled.)
for desc in pk_data['desc'][1].split('.'):
    desc_list_all.append(desc)

# Keep just the first sentence of that description.
desc_list.append(desc_list_all[0])

sampling_rate = 0.0001
min_count = 0

print("dec_list: ", desc_list)

data, word_dict, word_reverse_dict, pos_dict, pos_reverse_dict, word_to_pos_dict, word_list \
        = build_dataset(desc_list, min_count, sampling_rate)

# Dump every lookup table for inspection.
print("word_dict: ", word_dict)
print("word_reverse_dict: ", word_reverse_dict)
print("pos_dict: ", pos_dict)
print("pos_reverse_dict: ", pos_reverse_dict)
print("word_to_pos_dict: ", word_to_pos_dict)
print("word_list: ", word_list)

In [None]:
# Sizes of the word vocabulary, the morpheme vocabulary and the corpus.
vocabulary_size = len(word_dict)
pos_size = len(pos_dict)
num_sentences = len(data)

print("number of sentences :", num_sentences)
print("vocabulary size :", vocabulary_size)
print("pos size :", pos_size)

# Morphemes ordered by id, so pos_li[k] is the morpheme whose id is k.
pos_li = []
for key in sorted(pos_reverse_dict):
    pos_li.append(pos_reverse_dict[key])

In [None]:
window_size = 3
batch_size = 20


def generate_input_output_list(data, window_size):
    """Build the kor2vec input and output index lists.

    Every word is paired with each neighbour at most `window_size` positions
    away in the same sentence, so a larger window yields more pairs (with the
    input id repeated). Pairs where either side is the 'UNK' id from the
    module-level `word_dict` are skipped.

    Args:
        data: list of sentences, each a list of word ids.
        window_size: number of neighbours considered on each side.

    Returns:
        (input_li, output_li): two parallel lists of word ids.
    """
    input_li = []
    output_li = []
    for sentence in data:
        length = len(sentence)
        for center, center_id in enumerate(sentence):
            first = max(0, center - window_size)
            last = min(length, center + window_size + 1)
            for neighbour in range(first, last):
                if neighbour == center:
                    continue
                unk_id = word_dict['UNK']
                if center_id == unk_id or sentence[neighbour] == unk_id:
                    continue
                input_li.append(center_id)
                output_li.append(sentence[neighbour])

    print("input: ", input_li)
    print("output: ", output_li)

    return input_li, output_li

# Materialise all (input, output) training pairs and remember how many there are.
input_li, output_li = generate_input_output_list(data, window_size)
input_li_size = len(input_li)

In [None]:
def generate_batch(iter, batch_size, input_li, output_li):
    """Return the mini-batch for training step `iter`.

    The pair lists are cut into ``len(input_li) // batch_size`` full batches
    and `iter` wraps around them, so any non-negative step number is valid.

    Args:
        iter: step counter (name kept for backward compatibility although it
            shadows the builtin).
        batch_size: number of pairs per batch; must be positive.
        input_li: list of input word ids.
        output_li: list of label word ids, parallel to `input_li`.

    Returns:
        (batch_input, batch_output): numpy arrays of shape (batch_size,) and
        (batch_size, 1).

    Raises:
        ValueError: if `batch_size` is not positive or there are fewer pairs
            than one full batch.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got %r" % (batch_size,))

    # Fix: size the batches from the list that was passed in, not from the
    # module-level `input_li_size`, which may describe a different list.
    total_pairs = len(input_li)
    num_batches = total_pairs // batch_size
    if num_batches == 0:
        # Previously this surfaced as an opaque ZeroDivisionError in the modulo below.
        raise ValueError(
            "need at least batch_size=%d pairs to form a batch, got %d"
            % (batch_size, total_pairs))

    print("input_li_size: " , total_pairs)
    print("batch_size: " , batch_size)
    print("iter: " ,iter)
    print('cal: ', num_batches)
    print("input: " ,input_li)
    print("output: " ,output_li)

    index = (iter % num_batches) * batch_size
    batch_input = input_li[index:index+batch_size]
    # Labels become a column vector, one single-element row per pair.
    batch_output = [[label] for label in output_li[index:index+batch_size]]

    print("--------end of the function generate_batch----------")

    return np.array(batch_input), np.array(batch_output)

# Smoke test: fetch the first batch and print its shapes and contents.
batch_inputs, batch_labels = generate_batch(0, batch_size, input_li, output_li)
print("np batch inputs: " , np.shape(batch_inputs))
print("batch inputs: ", batch_inputs)
print("np batch labels: ", np.shape(batch_labels))
print("batch labels: ", batch_labels)
# NOTE(review): this rebinds the global `word_list` (previously the value
# returned by build_dataset) to a list of morpheme-id lists, one per batch input.
word_list = []
for word in batch_inputs:
    word_list.append(word_to_pos_dict[word])
print("word list: ", word_list)
# Disabled debug output of each morpheme id and its morpheme:
#     for pos in word_to_pos_dict[word]:
#         print(pos)
#         print(pos_reverse_dict[pos])

In [None]:
embedding_size = 50  # length of each morpheme vector
num_sampled = 20     # num_sampled argument of the NCE loss
learning_rate = 1.0

# Morphemes used to eyeball nearest neighbours during training. Ids are drawn
# from the most frequent morphemes (low ids), since pos_dict assigns ids in
# descending frequency order.
# Fix: clamp both values to the real morpheme count. The former hard-coded
# 200/20 produced ids outside pos_embeddings / pos_reverse_dict whenever the
# corpus had fewer than 200 distinct morphemes.
valid_window = min(200, pos_size)
valid_size = min(20, valid_window)
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

# Build the TensorFlow model graph.
graph = tf.Graph()
with graph.as_default():
    # Input data
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    # One placeholder per batch slot, fed with the morpheme ids of that word.
    words_matrix = [tf.placeholder(tf.int32, shape=None) for _ in range(batch_size)]
    # One placeholder per vocabulary word; not used anywhere in this cell.
    vocabulary_matrix = [tf.placeholder(tf.int32, shape=None) for _ in range(vocabulary_size)]
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # "/device:GPU:0"
    with tf.device('/cpu:0'):
        # Morpheme embedding table: the final output we are after.
        pos_embeddings = tf.Variable(tf.random_uniform([pos_size, embedding_size], -1.0, 1.0), name='pos_embeddings')

        # A word vector is the sum of the embeddings of its morphemes.
        word_vec_list = []
        for i in range(batch_size):
            word_vec = tf.reduce_sum(tf.nn.embedding_lookup(pos_embeddings, words_matrix[i]), 0)
            word_vec_list.append(word_vec)
        word_embeddings = tf.stack(word_vec_list)

        # Noise-Contrastive Estimation parameters
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)), name='nce_weights'
        )
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]), name='nce_biases')

    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_labels,
                       inputs=word_embeddings,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))

    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
    init = tf.global_variables_initializer()

    # Cosine similarity between the validation morphemes and all morpheme
    # embeddings, used to sanity-check the embedding space.
    norm = tf.sqrt(tf.reduce_sum(tf.square(pos_embeddings), 1, keep_dims=True))
    normalized_embeddings = pos_embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [None]:
num_iterations = input_li_size // batch_size
print("number of iterations for each epoch :", num_iterations)
epochs = 5
num_steps = num_iterations * epochs + 1

# Reporting intervals. Fix: clamp to 1 so `step % interval` cannot divide by
# zero when the corpus is small (num_steps < 10 or < 4).
loss_report_every = max(1, num_steps // 10)
similarity_report_every = max(1, num_steps // 4)

with tf.Session(graph=graph) as session:
    init.run()
    print("Initialized - Tensorflow")

    average_loss = 0
    steps_since_report = 0  # number of losses accumulated in average_loss
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(step, batch_size, input_li, output_li)

        # Morpheme-id list for every word in the batch.
        word_list = []
        for word in batch_inputs:
            word_list.append(word_to_pos_dict[word])

        feed_dict = {}
        for i in range(batch_size):
            feed_dict[words_matrix[i]] = word_list[i]
        feed_dict[train_inputs] = batch_inputs
        feed_dict[train_labels] = batch_labels

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        steps_since_report += 1

        if step % loss_report_every == 0:
            # Fix: divide by the number of steps actually accumulated instead
            # of a hard-coded 200, which under-reported the loss.
            average_loss /= steps_since_report
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0
            steps_since_report = 0

        if step % similarity_report_every == 0:
            pos_embed = pos_embeddings.eval()

            # Print the nearest morphemes of every validation morpheme.
            sim = similarity.eval()
            for i in range(len(valid_examples)):
                valid_pos = pos_reverse_dict[valid_examples[i]]
                top_k = 8
                # Position 0 of the ranking is skipped (slice starts at 1).
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % str(valid_pos)
                # Fix: iterate over what the slice returned; indexing
                # range(top_k) overran when fewer than top_k + 1 morphemes exist.
                for neighbour_id in nearest:
                    close_word = pos_reverse_dict[neighbour_id]
                    log_str = '%s %s,' % (log_str, str(close_word))
                print(log_str)

    # Final embedding matrix, read while the session is still open.
    pos_embed = pos_embeddings.eval()

In [None]:
# Function to save vectors.
def save_model(pos_list, embeddings, file_name):
    """Write morpheme vectors to a text file.

    The first line is ``"<number of morphemes> <vector length>"``. Each
    following line holds ``str(pos)`` with the space after the tuple comma
    removed (so the token contains no space from the tuple formatting),
    followed by the space-separated vector components.

    Args:
        pos_list: morphemes, where pos_list[i] belongs to embeddings[i].
        embeddings: 2-D array-like of shape (len(pos_list), vector length).
        file_name: path of the file to (over)write.
    """
    # Fix: take the vector length from the data itself rather than from the
    # module-level `embedding_size`, which may not match `embeddings`.
    shape = np.shape(embeddings)
    vector_size = shape[1] if len(shape) > 1 else 0

    # Fix: explicit UTF-8 so Korean morphemes are written identically on
    # every platform instead of depending on the locale's default encoding.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(str(len(pos_list)))
        f.write(" ")
        f.write(str(vector_size))
        f.write("\n")
        for i, pos in enumerate(pos_list):
            f.write(str(pos).replace("', '", "','") + " ")
            f.write(' '.join(map(str, embeddings[i])))
            f.write("\n")

# Save the trained morpheme vectors (pos_li[k] pairs with row k of pos_embed).
save_model(pos_li, pos_embed, "pos.vec")

In [None]:
pos_li

In [None]:
pos_embed

# ------------------------------------------------------------

In [1]:
# module import
import pandas as pd
import numpy as np
import tensorflow as tf
import collections
from konlpy.tag import Twitter
import re
import math
import random
import pandas as pd

In [2]:
# Load the crawled CSV into a DataFrame.
df=pd.read_csv("pk_data_g1.csv")

In [3]:
# desc
df['desc']

0      태어날 때부터 등에 식물의 씨앗이 있어 조금씩 크게 자란다.태어났을 때부터 등에 이...
1      꽃봉오리가 등에 붙어 있으며 양분을 흡수해가면 커다란 꽃이 핀다고 한다.등의 꽃봉오...
2      꽃에서 황홀한 향기가 퍼져나가 싸우는 자의 기분을 달래게 한다.태양 에너지를 영양으...
3      태어날 때부터 꼬리의 불꽃이 타오르고 있다. 불꽃이 꺼지면 그 생명이 다하고 만다....
4      꼬리를 휘둘러 상대를 쓰러트리고 날카로운 발톱으로 갈기갈기 찢어버린다.불타는 꼬리를...
5      지상 1400미터까지 날개를 사용해 나는 것이 가능하다. 고열의 불꽃을 내뿜는다.암...
6      긴 목을 등껍질 안에 움추릴 때 강한 물대포를 발사한다.태어난 후 등이 부풀어 단단...
7      애완동물으로서 인기가 높다. 또 털로 감싸진 꼬리는 장수의 상징이다.자주 수중에 숨...
8      무거운 몸으로 상대를 덮쳐서 기절시킨다. 위기에 처하면 등껍질에 숨는다.등껍질에 분...
9      푸른 피부로 감싸져 있다. 탈피하여 성장하면 실을 늘어뜨려 번데기로 바꾼다.발은 짧...
10     단단한 껍질에 둘러싸여 있으나 안은 부드러워서 강한 공격에는 견디지 못한다.껍질이 ...
11     물을 튕겨내는 가루가 날개를 보호하고 있다. 비가 오는 날에도 하늘을 날 수 있다....
12     숲이나 풀밭에 많이 서식한다. 머리끝에 5cm 정도의 작고 날카로운 독침을 지니고 ...
13     성체의 몸을 만들기 위해서 일시적인 상태. 스스로로도 거의 움직이지 않는다.스스로는...
14     집단으로 나타나기도 한다. 맹렬한 스피드로 날아 엉덩이의 독침으로 마구 찌른다.고속...
15     싸움을 좋아하지 않는다. 풀 숲 안에 숨어 작은 벌레 등을 잡는다.숲이나 수풀에 많...
16     발톱이 발달해 있다. 먹이인 아라리를 잡아 100km 떨어져 있는 둥지까지 나른다....
17     아름다운 날개를 펼쳐 상대를 위협한다. 마하

In [4]:
# Clean the data and split it on whitespace.
# desc_list collects the raw '.'-separated pieces; words collects the
# non-empty token lists.
desc_list = []
words = []
for desc in df['desc'][0].split('.'): # only row 0 (a single Pokémon) is processed here
    desc_list.append(desc)
    # Replace every run of characters outside Hangul/Latin/digits with a space, then tokenise.
    sentence = re.sub(r"[^ㄱ-힣a-zA-Z0-9]+", ' ', desc).strip().split()
    if sentence:
        words.append(sentence)
        print(sentence)

['태어날', '때부터', '등에', '식물의', '씨앗이', '있어', '조금씩', '크게', '자란다']
['태어났을', '때부터', '등에', '이상한', '씨앗이', '심어져', '있으며', '몸과', '함께', '자란다고', '한다']
['며칠', '동안', '아무것도', '먹지', '않아도', '건강']
['등의', '씨앗에', '많은', '영양이', '있기에', '괜찮다']
['등의', '씨앗', '안에는', '영양이', '듬뿍']
['씨앗은', '몸과', '같이', '커지고', '있다']
['태어날', '때부터', '등에', '씨앗을', '짊어지고', '있다']
['몸이', '커지면', '자란만큼', '씨앗도', '커지게', '된다']
['태어나서', '잠깐', '동안', '등의', '씨앗에', '모인', '영양을', '받고', '자란다']
['양지에서', '낮잠', '자는', '모습을', '볼', '수', '있다']
['태양의', '빛을', '많이', '받으면', '등의', '씨앗이', '크게', '자란다']
['태어날', '때부터', '등에', '식물의', '씨앗이', '있어', '조금씩', '크게', '자란다']
['태어났을', '때부터', '등에', '이상한', '씨앗이', '심어져', '있으며', '몸과', '함께', '자란다고', '한다']
['태어나면서', '잠깐', '동안', '등의', '씨앗에서', '영양을', '받으며', '크게', '자란다']
['태어나서부터', '얼마', '동안은', '등의', '씨앗으로부터', '영양을', '공급받아', '크게', '성장한다']
['등의', '씨앗', '안에는', '영양이', '가득하다']
['씨앗은', '몸과', '함께', '커진다']
['태어날', '때부터', '등에', '씨앗을', '짊어지고', '있다']
['몸이', '크게', '성장함에', '따라', '씨앗도', '커진다']
['태어나서부터', '얼마', '동안은', '등의', '씨앗으로부터', '영양을', '공급받아', '크게', 

In [5]:
# 'UNK' (unknown) stands in for words missing from word_dict because of the frequency filter.
word_counter = [['UNK', -1]]
word_counter.extend(collections.Counter([word for sentence in words for word in sentence]).most_common())
print(word_counter)
# Keep words with count >= 0 (every counted word qualifies) plus the 'UNK' entry.
word_counter = [item for item in word_counter if item[1] >= 0 or item[0] == 'UNK']
print(word_counter)

# Assign each kept word its index; 'UNK' is first, then words in descending frequency order.
word_dict = dict()
for word, count in word_counter:
    word_dict[word] = len(word_dict)
# Inverse mapping: index -> word.
word_reverse_dict = dict(zip(word_dict.values(), word_dict.keys()))
print(len(word_dict))
print(word_dict)
print(word_reverse_dict)

[['UNK', -1], ('등의', 10), ('크게', 9), ('때부터', 7), ('등에', 7), ('씨앗이', 7), ('자란다', 6), ('몸과', 5), ('있다', 5), ('영양을', 5), ('태어날', 4), ('함께', 4), ('태어났을', 3), ('이상한', 3), ('심어져', 3), ('있으며', 3), ('자란다고', 3), ('한다', 3), ('동안', 3), ('영양이', 3), ('태어나서부터', 3), ('얼마', 3), ('동안은', 3), ('씨앗으로부터', 3), ('공급받아', 3), ('성장한다', 3), ('식물의', 2), ('있어', 2), ('조금씩', 2), ('씨앗에', 2), ('씨앗', 2), ('안에는', 2), ('씨앗은', 2), ('씨앗을', 2), ('짊어지고', 2), ('몸이', 2), ('씨앗도', 2), ('잠깐', 2), ('양지에서', 2), ('낮잠', 2), ('자는', 2), ('모습을', 2), ('볼', 2), ('수', 2), ('태양의', 2), ('빛을', 2), ('많이', 2), ('받으면', 2), ('커진다', 2), ('며칠', 1), ('아무것도', 1), ('먹지', 1), ('않아도', 1), ('건강', 1), ('많은', 1), ('있기에', 1), ('괜찮다', 1), ('듬뿍', 1), ('같이', 1), ('커지고', 1), ('커지면', 1), ('자란만큼', 1), ('커지게', 1), ('된다', 1), ('태어나서', 1), ('모인', 1), ('받고', 1), ('태어나면서', 1), ('씨앗에서', 1), ('받으며', 1), ('가득하다', 1), ('성장함에', 1), ('따라', 1)]
[['UNK', -1], ('등의', 10), ('크게', 9), ('때부터', 7), ('등에', 7), ('씨앗이', 7), ('자란다', 6), ('몸과', 5), ('있다', 5), ('영양을', 5), ('태어날', 4), ('

In [6]:
word_dict

{'UNK': 0,
 '등의': 1,
 '크게': 2,
 '때부터': 3,
 '등에': 4,
 '씨앗이': 5,
 '자란다': 6,
 '몸과': 7,
 '있다': 8,
 '영양을': 9,
 '태어날': 10,
 '함께': 11,
 '태어났을': 12,
 '이상한': 13,
 '심어져': 14,
 '있으며': 15,
 '자란다고': 16,
 '한다': 17,
 '동안': 18,
 '영양이': 19,
 '태어나서부터': 20,
 '얼마': 21,
 '동안은': 22,
 '씨앗으로부터': 23,
 '공급받아': 24,
 '성장한다': 25,
 '식물의': 26,
 '있어': 27,
 '조금씩': 28,
 '씨앗에': 29,
 '씨앗': 30,
 '안에는': 31,
 '씨앗은': 32,
 '씨앗을': 33,
 '짊어지고': 34,
 '몸이': 35,
 '씨앗도': 36,
 '잠깐': 37,
 '양지에서': 38,
 '낮잠': 39,
 '자는': 40,
 '모습을': 41,
 '볼': 42,
 '수': 43,
 '태양의': 44,
 '빛을': 45,
 '많이': 46,
 '받으면': 47,
 '커진다': 48,
 '며칠': 49,
 '아무것도': 50,
 '먹지': 51,
 '않아도': 52,
 '건강': 53,
 '많은': 54,
 '있기에': 55,
 '괜찮다': 56,
 '듬뿍': 57,
 '같이': 58,
 '커지고': 59,
 '커지면': 60,
 '자란만큼': 61,
 '커지게': 62,
 '된다': 63,
 '태어나서': 64,
 '모인': 65,
 '받고': 66,
 '태어나면서': 67,
 '씨앗에서': 68,
 '받으며': 69,
 '가득하다': 70,
 '성장함에': 71,
 '따라': 72}

In [7]:
type(word_dict)

dict

In [8]:
# 전체 데이터에 대해서 
def build_dataset(train_text, min_count, sampling_rate):
    """Build word- and morpheme-level vocabularies and an id-encoded corpus.

    Args:
        train_text: iterable of raw text lines (one sentence per item).
        min_count: minimum frequency a word needs to get its own id; rarer
            words are encoded with the 'UNK' id.
        sampling_rate: sub-sampling threshold. Currently unused because the
            sub_sampling() call at the end of this function is disabled.

    Returns:
        Tuple ``(data, word_dict, word_reverse_dict, pos_dict,
        pos_reverse_dict, word_to_pos_dict)``:
          * data: list of sentences, each a list of word ids.
          * word_dict / word_reverse_dict: word -> id and id -> word.
          * pos_dict / pos_reverse_dict: morpheme -> id and id -> morpheme,
            where a morpheme is whatever ``Twitter.pos`` yields per token.
          * word_to_pos_dict: word id -> list of morpheme ids of that word.
    """
    # Keep only Hangul, Latin letters and digits, then split on whitespace.
    # Fix: read the `train_text` argument; the previous code ignored it and
    # silently used the module-level `desc_list` instead.
    words = list()
    for line in train_text:
        sentence = re.sub(r"[^ㄱ-힣a-zA-Z0-9]+", ' ', line).strip().split()
        if sentence:
            words.append(sentence)

    # 'UNK' always occupies the first slot (id 0); its count is filled in later.
    word_counter = [['UNK', -1]]
    word_counter.extend(collections.Counter([word for sentence in words for word in sentence]).most_common())
    word_counter = [item for item in word_counter if item[1] >= min_count or item[0] == 'UNK']

    word_dict = dict()
    for word, count in word_counter:
        word_dict[word] = len(word_dict)
    word_reverse_dict = dict(zip(word_dict.values(), word_dict.keys()))

    # Analyse every vocabulary word into its morphemes.
    word_to_pos_li = dict()  # word id -> list of morphemes
    pos_list = list()        # every morpheme occurrence, used for frequency ranking
    twitter = Twitter()
    for w, w_id in word_dict.items():
        w_pos_li = list(twitter.pos(w, norm=True))
        word_to_pos_li[w_id] = w_pos_li
        pos_list += w_pos_li

    # Morpheme ids follow descending frequency order.
    pos_counter = collections.Counter(pos_list).most_common()

    pos_dict = dict()
    for pos, _ in pos_counter:
        pos_dict[pos] = len(pos_dict)

    pos_reverse_dict = dict(zip(pos_dict.values(), pos_dict.keys()))

    # word id -> list of morpheme ids
    word_to_pos_dict = dict()
    for word_id, pos_li in word_to_pos_li.items():
        word_to_pos_dict[word_id] = [pos_dict[pos] for pos in pos_li]

    # Encode the corpus; words dropped by min_count become 'UNK'.
    data = list()
    unk_count = 0
    for sentence in words:
        s = list()
        for word in sentence:
            if word in word_dict:
                index = word_dict[word]
            else:
                index = word_dict['UNK']
                unk_count += 1
            s.append(index)
        data.append(s)
    # Keep the UNK count >= 1 so a later frequency computation never divides by zero.
    word_counter[0][1] = max(1, unk_count)

    # Sub-sampling is intentionally disabled:
    # data = sub_sampling(data, word_counter, word_dict, sampling_rate)

    return data, word_dict, word_reverse_dict, pos_dict, pos_reverse_dict, word_to_pos_dict

def sub_sampling(data, word_counter, word_dict, sampling_rate):
    """Randomly thin out frequent words in an id-encoded corpus.

    A word with relative frequency f is discarded with probability
    max(0, 1 - sqrt(sampling_rate / f)); each occurrence is kept only when a
    fresh random draw is greater than that probability.

    Args:
        data: list of sentences, each a list of word ids.
        word_counter: iterable of (word, count) pairs.
        word_dict: word -> id mapping covering every word in word_counter.
        sampling_rate: threshold used in the discard-probability formula.

    Returns:
        A new list of sentences containing only the surviving word ids.
    """
    corpus_size = sum(map(len, data))

    # word id -> discard probability (grows with the word's frequency)
    discard = dict()
    for entry, occurrences in word_counter:
        share = occurrences / corpus_size
        keep_factor = math.sqrt(sampling_rate / share)
        discard[word_dict[entry]] = max(0, 1 - keep_factor)

    thinned = list()
    for sentence in data:
        survivors = [wid for wid in sentence if discard[wid] < random.random()]
        thinned.append(survivors)

    return thinned

In [9]:
# Load the crawled data.
pk_data = pd.read_csv('pk_data_g1.csv')
# Collect the '.'-separated pieces of every row's 'desc' text.
desc_list = []
for i in range(len(pk_data)):
    for desc in pk_data['desc'][i].split('.'):
        desc_list.append(desc)

sampling_rate = 0.0001
min_count = 2

# Build the dataset.
data, word_dict, word_reverse_dict, pos_dict, pos_reverse_dict, word_to_pos_dict \
        = build_dataset(desc_list, min_count, sampling_rate)
    

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


In [10]:
data

[[426, 282, 59, 558, 874, 60, 875, 55, 184],
 [634, 282, 59, 104, 874, 2204, 427, 876, 428, 2205, 3],
 [0, 91, 1036, 726, 727, 0],
 [58, 1299, 62, 1300, 0, 383],
 [58, 3278, 1301, 1300, 1660],
 [3279, 876, 1302, 0, 1],
 [426, 282, 59, 1661, 3280, 1],
 [43, 2206, 0, 3281, 0, 12],
 [1303, 3282, 91, 58, 1299, 635, 345, 1037, 184],
 [1662, 2207, 145, 21, 96, 2, 1],
 [429, 233, 53, 219, 58, 874, 55, 184],
 [426, 282, 59, 558, 874, 60, 875, 55, 184],
 [634, 282, 59, 104, 874, 2204, 427, 876, 428, 2205, 3],
 [0, 3282, 91, 58, 0, 345, 2208, 55, 184],
 [1663, 1664, 2209, 58, 2210, 345, 2211, 55, 728],
 [58, 3278, 1301, 1300, 1304],
 [3279, 876, 428, 384],
 [426, 282, 59, 1661, 3280, 1],
 [43, 55, 3283, 234, 3281, 384],
 [1663, 1664, 2209, 58, 2210, 345, 2211, 55, 728],
 [634, 282, 59, 104, 874, 2204, 427, 876, 428, 2205, 3],
 [1663, 1664, 2209, 58, 2210, 345, 2211, 55, 728],
 [1662, 2207, 145, 21, 96, 2, 1],
 [429, 233, 53, 219, 58, 874, 55, 184],
 [559, 59, 346, 427, 877, 2212, 44, 283, 2213, 

In [11]:
# Sizes of the word vocabulary, the morpheme vocabulary and the corpus.
vocabulary_size = len(word_dict)
pos_size = len(pos_dict)
num_sentences = len(data)

print("number of sentences :", num_sentences)
print("vocabulary size :", vocabulary_size)
print("pos size :", pos_size)

# Morphemes ordered by id, so pos_li[k] is the morpheme whose id is k.
pos_li = []
for key in sorted(pos_reverse_dict):
    pos_li.append(pos_reverse_dict[key])

number of sentences : 4799
vocabulary size : 5703
pos size : 4087


In [12]:
pos_li # 형태소

[('을', 'Josa'),
 ('의', 'Josa'),
 ('이', 'Josa'),
 ('에', 'Josa'),
 ('를', 'Josa'),
 ('가', 'Josa'),
 ('로', 'Josa'),
 ('으로', 'Josa'),
 ('은', 'Josa'),
 ('에서', 'Josa'),
 ('는', 'Josa'),
 ('도', 'Josa'),
 ('한다', 'Verb'),
 ('한', 'Josa'),
 ('이다', 'Josa'),
 ('하는', 'Verb'),
 ('과', 'Josa'),
 ('할', 'Verb'),
 ('에는', 'Josa'),
 ('고', 'Josa'),
 ('하고', 'Josa'),
 ('것', 'Noun'),
 ('곳', 'Noun'),
 ('하여', 'Verb'),
 ('공격', 'Noun'),
 ('다', 'Josa'),
 ('처럼', 'Josa'),
 ('버린다', 'Verb'),
 ('면', 'Josa'),
 ('인', 'Josa'),
 ('와', 'Josa'),
 ('다', 'Adverb'),
 ('까지', 'Josa'),
 ('해서', 'Verb'),
 ('에게', 'Josa'),
 ('때', 'Noun'),
 ('개', 'Noun'),
 ('포켓몬', 'Noun'),
 ('등', 'Noun'),
 ('발견', 'Noun'),
 ('몸', 'Noun'),
 ('껍질', 'Noun'),
 ('해', 'Verb'),
 ('하기', 'Verb'),
 ('하면', 'Verb'),
 ('사람', 'Noun'),
 ('들', 'Suffix'),
 ('시작', 'Noun'),
 ('땅', 'Noun'),
 ('뒤', 'Noun'),
 ('km', 'Alpha'),
 ('서', 'Josa'),
 ('정도', 'Noun'),
 ('되어', 'Verb'),
 ('독', 'Noun'),
 ('속', 'Noun'),
 ('적', 'Suffix'),
 ('헤엄', 'Noun'),
 ('바다', 'Noun'),
 ('날', 'Noun'),
 ('아'

In [13]:
# Build [word, neighbor] id pairs from the id-encoded sentences.
# NOTE: `data` is rebound here, from the list of sentences to the list of pairs.
sentences=data
WINDOW_SIZE=3

data=[]
for sentence in sentences:
    for idx,word in enumerate(sentence):
        # Window covers positions idx-WINDOW_SIZE .. idx+WINDOW_SIZE, clipped to the sentence.
        for neighbor in sentence[max(idx-WINDOW_SIZE,0) : min(idx+WINDOW_SIZE, len(sentence))+1]:
            # Compares ids, not positions: another occurrence of the same word
            # inside the window is skipped as well. Ids equal to
            # word_dict['UNK'] are not filtered out here.
            if neighbor !=word:
                data.append([word,neighbor])

In [14]:
data

[[426, 282],
 [426, 59],
 [426, 558],
 [282, 426],
 [282, 59],
 [282, 558],
 [282, 874],
 [59, 426],
 [59, 282],
 [59, 558],
 [59, 874],
 [59, 60],
 [558, 426],
 [558, 282],
 [558, 59],
 [558, 874],
 [558, 60],
 [558, 875],
 [874, 282],
 [874, 59],
 [874, 558],
 [874, 60],
 [874, 875],
 [874, 55],
 [60, 59],
 [60, 558],
 [60, 874],
 [60, 875],
 [60, 55],
 [60, 184],
 [875, 558],
 [875, 874],
 [875, 60],
 [875, 55],
 [875, 184],
 [55, 874],
 [55, 60],
 [55, 875],
 [55, 184],
 [184, 60],
 [184, 875],
 [184, 55],
 [634, 282],
 [634, 59],
 [634, 104],
 [282, 634],
 [282, 59],
 [282, 104],
 [282, 874],
 [59, 634],
 [59, 282],
 [59, 104],
 [59, 874],
 [59, 2204],
 [104, 634],
 [104, 282],
 [104, 59],
 [104, 874],
 [104, 2204],
 [104, 427],
 [874, 282],
 [874, 59],
 [874, 104],
 [874, 2204],
 [874, 427],
 [874, 876],
 [2204, 59],
 [2204, 104],
 [2204, 874],
 [2204, 427],
 [2204, 876],
 [2204, 428],
 [427, 104],
 [427, 874],
 [427, 2204],
 [427, 876],
 [427, 428],
 [427, 2205],
 [876, 874],
 [

In [15]:
# Tabulate the pairs. NOTE: this rebinds `df`, previously the DataFrame loaded from the CSV.
df=pd.DataFrame(data,columns=['input','label'])

In [16]:
df.head()

Unnamed: 0,input,label
0,426,282
1,426,59
2,426,558
3,282,426
4,282,59


In [17]:
df

Unnamed: 0,input,label
0,426,282
1,426,59
2,426,558
3,282,426
4,282,59
5,282,558
6,282,874
7,59,426
8,59,282
9,59,558


In [18]:
len(word_dict)

5703

In [19]:
import tensorflow as tf
import numpy as np

ONE_HOT_DIM = len(word_dict)  # one-hot vector length = vocabulary size


def to_one_hot_encoding(data_point_index):
    """Return a length-ONE_HOT_DIM vector of zeros with a 1 at `data_point_index`."""
    one_hot_encoding=np.zeros(ONE_HOT_DIM)
    one_hot_encoding[data_point_index]=1
    return one_hot_encoding


# One-hot encode every (input, label) pair.
X=[]
Y=[]

for x,y in zip(df['input'],df['label']):
    X.append(to_one_hot_encoding(x))
    Y.append(to_one_hot_encoding(y))


X_train=np.asarray(X)
Y_train=np.asarray(Y)


# Placeholders for one-hot inputs and one-hot labels.
# NOTE: `x` is rebound here from the loop variable above to a placeholder.
x=tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))
y_label=tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))


EMBEDDING_DIM=30

# Hidden layer: projects the one-hot input to EMBEDDING_DIM values.
W1=tf.Variable(tf.random_normal([ONE_HOT_DIM, EMBEDDING_DIM]))
b1=tf.Variable(tf.random_normal([1]))
hidden_layer=tf.add(tf.matmul(x,W1),b1)


# Output layer: scores over the whole vocabulary.
W2=tf.Variable(tf.random_normal([EMBEDDING_DIM, ONE_HOT_DIM]))
b2=tf.Variable(tf.random_normal([1]))
logits = tf.add(tf.matmul(hidden_layer, W2), b2)
prediction = tf.nn.softmax(logits)

# Fix: compute cross entropy from the logits with the fused op. The previous
# -sum(y * log(softmax(logits))) yields inf/NaN as soon as a softmax output
# underflows to 0; the fused op computes the same quantity stably.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_label, logits=logits))

train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)


In [21]:
# Open a session and initialise all variables.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init) 

# Full-batch training: every iteration feeds the complete X_train / Y_train.
iteration = 5
for i in range(iteration):
    # input is X_train which is one hot encoded word
    # label is Y_train which is one hot encoded neighbor word
    sess.run(train_op, feed_dict={x: X_train, y_label: Y_train})
    # `i % 1 == 0` is always true, so the loss is printed after every iteration.
    if i % 1 == 0:
        print('iteration '+str(i)+' loss is : ', sess.run(loss, feed_dict={x: X_train, y_label: Y_train}))

iteration 0 loss is :  21.105164
iteration 1 loss is :  20.984026
iteration 2 loss is :  20.978737
iteration 3 loss is :  20.973574
iteration 4 loss is :  20.96844


In [22]:
# The hidden layer (W1 + b1) serves as the word lookup table:
# one row per vocabulary id, EMBEDDING_DIM columns.
vectors = sess.run(W1 + b1)
print(vectors)

[[-0.44619775  2.125433    1.3021779  ... -0.22737275  0.05627072
  -0.18830541]
 [-0.6725791   0.34827143  0.24037406 ... -0.15821266  0.03145853
   1.323789  ]
 [-0.15434411  0.66663456 -0.32548842 ...  0.38285506  2.396309
   2.158944  ]
 ...
 [-0.5872908   0.51662666 -0.3861329  ... -0.9472835   0.78712475
  -0.09222429]
 [ 1.1423874   1.3012801  -1.4372578  ...  0.12348446 -0.81779623
   0.07488071]
 [ 1.0779465   0.25878724  1.4170743  ... -1.668224   -0.5494686
  -1.0749173 ]]


In [23]:
print(type(vectors))

<class 'numpy.ndarray'>


## ////////////////////////////////////////////////////////////////////////////////

In [None]:
# Sample dataset for the character-level "hihell -> ihello" toy task.
# NOTE(review): none of these names are referenced by the other cells visible
# in this notebook; presumably leftover tutorial data.
idx2char = ['h', 'i', 'e', 'l', 'o']
# Teach hello: hihell -> ihello
x_data = [[0, 1, 0, 2, 3, 3]]   # hihell
# One-hot rows for the index values in x_data.
x_one_hot = [[[1, 0, 0, 0, 0],   # h 0
              [0, 1, 0, 0, 0],   # i 1
              [1, 0, 0, 0, 0],   # h 0
              [0, 0, 1, 0, 0],   # e 2
              [0, 0, 0, 1, 0],   # l 3
              [0, 0, 0, 1, 0]]]  # l 3

y_data = [[1, 0, 2, 3, 3, 4]]    # ihello

In [24]:
# Hyperparameters for the LSTM cells below. The literals equal the counts
# printed earlier in this notebook (pos size 4087, 4799 sentences, vocabulary 5703).
# NOTE(review): the batch size is set to the sentence count and the sequence
# length to the vocabulary size — confirm this mapping is intended.
num_classes = 4087 # number of morphemes (pos)
input_dim = 30  # embedding size
hidden_size = 30  # output from the LSTM (NOTE(review): the old "5 to directly predict one-hot" remark looks stale)
batch_size = 4799   # number of sentences
sequence_length = 5703  # number of words
learning_rate = 0.1

In [25]:
# Inputs: float tensor of shape (batch, sequence_length, input_dim).
# NOTE: rebinds the earlier Python lists X and Y to placeholders.
X = tf.placeholder(
    tf.float32, [None, sequence_length, input_dim])  # X embedding vector
# Targets: integer ids of shape (batch, sequence_length).
Y = tf.placeholder(tf.int32, [None, sequence_length])  # Y dictionary

In [26]:
# Single LSTM layer unrolled over X with dynamic_rnn.
cell = tf.contrib.rnn.BasicLSTMCell(num_units=hidden_size, state_is_tuple=True)
# The zero state is created for exactly `batch_size` rows.
# NOTE(review): the trailing shape remark does not match the current constants — confirm or remove.
initial_state = cell.zero_state(batch_size, tf.float32) #(30, 1660, 1)
outputs, _states = tf.nn.dynamic_rnn(
    cell, X, initial_state=initial_state, dtype=tf.float32)

Instructions for updating:
Use the retry module or similar alternatives.


In [27]:
# Fully connected layer
# Structure in which the information of the previous stage feeds the next stage.

# Flatten the LSTM outputs to rows of hidden_size so one dense layer handles every time step.
X_for_fc = tf.reshape(outputs, [-1, hidden_size])
# Manual equivalent, kept for reference:
# fc_w = tf.get_variable("fc_w", [hidden_size, num_classes])
# fc_b = tf.get_variable("fc_b", [num_classes])
# outputs = tf.matmul(X_for_fc, fc_w) + fc_b
# No activation: the layer emits raw scores over num_classes. `outputs` is rebound here.
outputs = tf.contrib.layers.fully_connected(
    inputs=X_for_fc, num_outputs=num_classes, activation_fn=None)

In [28]:
# Reshape the scores back to (batch_size, sequence_length, num_classes) for sequence_loss.
outputs = tf.reshape(outputs, [batch_size, sequence_length, num_classes])

# Uniform weights: every position contributes equally to the loss.
weights = tf.ones([batch_size, sequence_length])
sequence_loss = tf.contrib.seq2seq.sequence_loss(
    logits=outputs, targets=Y, weights=weights)
# NOTE: rebinds `loss`, previously the loss tensor of the one-hot model.
loss = tf.reduce_mean(sequence_loss)
train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# Predicted class id per position (argmax over the class axis).
prediction = tf.argmax(outputs, axis=2)

In [29]:
# Train the LSTM for 20 steps and print the decoded prediction after each one.
# NOTE(review): per the recorded output this cell fails with
# "ValueError: Cannot feed value of shape (5703, 30)" because `vectors` is 2-D
# while placeholder X is (?, 5703, 30). `data` is presumably the list of
# [word, neighbor] pairs at this point, not an array matching Y — confirm.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(20):
        l, _ = sess.run([loss, train], feed_dict={X: vectors, Y: data})
        result = sess.run(prediction, feed_dict={X: vectors})
        print(i, "loss:", l, "prediction: ", result, "true Y: ", data)
        
        # Decode each predicted id by scanning word_dict for the key with that id.
        result_str=[]
        flag = 0
        for c in np.squeeze(result):
            for key in word_dict:
                if word_dict[key]==c:
                    flag =1
                else:
                    continue
                    
                # Reached only on a match: record the word and stop scanning.
                if flag==1:
                    result_str.append(key)
                    flag=0
                    break
        
        print("\tPrediction str: ", ''.join(result_str))

ValueError: Cannot feed value of shape (5703, 30) for Tensor 'Placeholder_2:0', which has shape '(?, 5703, 30)'