In [2]:
data_path = '../data/books_text_full/test/'

# I. Define vocabulary

In [3]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

# 텍스트 파일의 내용을 변수 text로 리턴하는 함수
def load_doc(filename):
    file = open(filename, 'r', errors='ignore')
    text = file.read()
    file.close()
    return text

def clean_doc(doc):
    tokens = doc.split()
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

def add_doc_to_vocab(filename, vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    vocab.update(tokens)

def process_docs(directory, vocab, is_train):
    for filename in listdir(directory):
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        path = directory + '/' + filename
        add_doc_to_vocab(path, vocab)

def save_list(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

    
vocab = Counter()
process_docs(data_path, vocab, True)

# token을 min_occurence 기준으로 유지합니다.
min_occurence = 1
tokens = [k for k,c in vocab.items() if c >= min_occurence]
print(len(tokens))
# token을 vocab 파일로 저장합니다.
save_list(tokens, 'corpusToLines_vocab.txt')
print("\n# 단어 {}개의 [corpusToLines_vocab.txt]로 저장했습니다.".format(len(tokens)))

# 보카를 불러옵니다.
vocab_filename = 'corpusToLines_vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
print("# {} words is loaded as [{}] with [vocab].".format(len(vocab), vocab_filename))

139867

# 단어 139867개의 [corpusToLines_vocab.txt]로 저장했습니다.
# 139867 words is loaded as [corpusToLines_vocab.txt] with [vocab].


# II. corpusToLines

In [5]:
from string import punctuation
from os import listdir
from gensim.models import Word2Vec

def load_doc(filename):
    file = open(filename, 'r', errors='ignore')
    text = file.read()
    file.close()
    return text

def doc_to_lines(doc):
    total_lines = []
    lines = [i.lower() for i in doc.splitlines() if i]  # 공백 문장 제거 및 모든 문장 소문자 변경
    
    return lines

def process_directory(data_path):
    result_lines = []
    for filename in listdir(data_path):
        filepath = data_path + '/' + filename
        doc = load_doc(filepath)
        lines = doc_to_lines(doc)
#         print(filename, ":", len(lines))
        result_lines += lines
    return result_lines


total_lines = process_directory(data_path)
save_list(total_lines, 'total_lines.txt')
print("# {} sentences are stored as [total_lines.txt].".format(len(total_lines)))
filename = 'total_lines.txt'
total_lines = load_doc(filename)
total_lines = [i for i in total_lines.splitlines()]
total_vocab = set()
for i in total_lines:
    total_vocab.update(i)
print("# unique words in [total_lines.txt]: [{}]".format(len(total_vocab)))

# 780955 sentences are stored as [total_lines.txt].
# unique words in [total_lines.txt]: [285]


In [6]:
def doc_to_clean_lines(filename):
    total_lines = load_doc(filename)
    # 5개 단어 이상으로 이루어지고 마침표가 있는 문장만 포함
    clean_lines = [i.lower() for i in total_lines.splitlines() if len(i) > 5 if "." in i] 
    
    return clean_lines

filename = "total_lines.txt"
clean_lines = doc_to_clean_lines(filename)
save_list(clean_lines, 'clean_lines.txt')
print("# {} sentences are stored as [clean_lines.txt].".format(len(clean_lines)))
filename = 'clean_lines.txt'
clean_lines = load_doc(filename)
clean_lines = [i for i in clean_lines.splitlines()]
clean_vocab = set()
for i in clean_lines:
    clean_vocab.update(i)
print("# unique words in [clean_lines.txt]: [{}]".format(len(clean_vocab)))

# 680708 sentences are stored as [clean_lines.txt].
# unique words in [clean_lines.txt]: [245]


In [7]:
def doc_to_vocab_lines(filename):
    clean_lines = load_doc(filename)
    vocab_lines = []
    for i in clean_lines.splitlines():
        words = i.split()
        words = [word for word in words if word in vocab]
        words = [word for word in words if len(words) >= 5]
        vocab_line = " ".join(words)
        if len(vocab_line):
            vocab_line += "."
            vocab_line = [vocab_line]
            vocab_lines += vocab_line
    
    return vocab_lines

filename = "clean_lines.txt"
vocab_lines = doc_to_vocab_lines(filename)
save_list(vocab_lines, 'vocab_lines.txt')
print("# {} sentences are stored as [vocab_lines.txt].".format(len(vocab_lines)))
filename = 'vocab_lines.txt'
vocab_lines = load_doc(filename)
vocab_lines = [i for i in vocab_lines.splitlines()]
vocab_vocab = set()
for i in vocab_lines:
    vocab_vocab.update(i)
print("# unique words in [vocab_lines.txt]: [{}]".format(len(vocab_vocab)))

# 464461 sentences are stored as [vocab_lines.txt].
# unique words in [vocab_lines.txt]: [116]


In [8]:
import tensorflow as tf
import numpy as np

from tensorflow.contrib import learn

max_length = max([len(s.split()) for s in vocab_lines])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_length)
encoded_lines = np.array(list(vocab_processor.fit_transform(vocab_lines)))
print("-"*80,"# [vocab_lines]is transformed into [encoded_lines]. (max_length:{})".format(max_length), "-"*80, sep='\n')
print("BEFORE: \n{}".format(vocab_lines[0]))
print("\nAFTER: \n{}".format(encoded_lines[0]))

x_data = np.array(list(encoded_lines))
print("\n", "-"*80,"# final [x_data] (max_length: {})".format(max_length), "-"*80, sep='\n')
print("EXAMPLE: \n{}".format(x_data[0]))

vocab_dict = vocab_processor.vocabulary_._mapping
vocab_size = len(vocab_dict.keys())
print("\n", "-"*80,"# final [vocab_dict] (vocab_size: {})".format(vocab_size), "-"*80, sep='\n')
print("EXAMPLE: \n[{}] is mapped to [{}].".format(vocab_lines[0].split()[0], vocab_dict[vocab_lines[0].split()[0]]))

--------------------------------------------------------------------------------
# [vocab_lines]is transformed into [encoded_lines]. (max_length:742)
--------------------------------------------------------------------------------
BEFORE: 
rights part publication may reproduced transmitted form electronic including information storage retrieval without permission writing.

AFTER: 
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  

In [9]:
vocab_processor.save("vocab_processor")

# III. word2vec

In [10]:
filename = "vocab_lines.txt"

file = open(filename, 'r', errors='replace')
text = file.read()
file.close()

vocab_lines = [i for i in text.splitlines()]

list_lines = []
for i in vocab_lines:
    i = i.split()
    list_lines.append(i)

print(list_lines[0])

['rights', 'part', 'publication', 'may', 'reproduced', 'transmitted', 'form', 'electronic', 'including', 'information', 'storage', 'retrieval', 'without', 'permission', 'writing.']


In [11]:
sentences = list_lines
print("Total training sentences:{}".format(len(sentences)))

wv_sz = 100
# word2vec 모델을 훈련시킵니다.
model = Word2Vec(sentences, size=wv_sz, window=5, workers=8, min_count=1)
# 모델의 vocabulary size를 요약합니다.
words = list(model.wv.vocab)
print("Vocabulary size: %d" % len(words))
print("Wordvector size: %d" % (wv_sz))
print("Embedding size: {}x{}".format(len(words), wv_sz))

# 모델을 ASCII 포맷으로 저장합니다.
filename = 'fantasy_embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)
print("\n# word2vec 파일 [{}]이 저장되었습니다.".format(filename))

Total training sentences:464461
Vocabulary size: 78465
Wordvector size: 100
Embedding size: 78465x100

# word2vec 파일 [fantasy_embedding_word2vec.txt]이 저장되었습니다.


# IV. Use pre-trained word vector

In [12]:
def load_word2vec(filename):
    vocab = []
    embd = []
    file = open(filename,'r')
    lines = file.readlines()[1:]
    for line in lines:
        row = line.strip().split(' ')
        vocab.append(row[0])
        embd.append(row[1:])
    print('# [{}] is successfully loaded!'.format(filename))
    file.close()
    return vocab,embd

filename = './fantasy_embedding_word2vec.txt'
vocab,embd = load_word2vec(filename)
vocab_size = len(vocab)
embedding_dim = len(embd[0])
embedding = np.asarray(embd)
print("# embedding vocabulary size: {}".format(len(embedding)))

# [./fantasy_embedding_word2vec.txt] is successfully loaded!
# embedding vocabulary size: 78465


In [13]:
print(embedding.shape)
print(type(embedding))
print(x_data.shape)
print(type(x_data))

(78465, 100)
<class 'numpy.ndarray'>
(464461, 742)
<class 'numpy.ndarray'>


# V. tensorflow로 모델 구성하기

## 01 Embedding Layer

**hyperparameters**

+ `wv_sz` = 100 (III. word2vec)

In [14]:
sequence_length = max_length

In [15]:
sequence_length

742

In [16]:
batch_size = 32
num_sentence = batch_size

**tensor graph**

In [17]:
def cossim(a, b):
    dot=tf.cast(tf.tensordot(a, b, axes=1), tf.float32)

    norm1=tf.sqrt(tf.cast(tf.tensordot(a, a, axes=1), tf.float32))
    norm2=tf.sqrt(tf.cast(tf.tensordot(b, b, axes=1), tf.float32))

    mycossi=tf.div(dot, tf.multiply(norm1, norm2))
    
    return mycossi

In [18]:
global_step = tf.Variable(0, trainable=False, name='global_step')

In [19]:
input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
tf_embedding = tf.constant(embedding, dtype=tf.float32)

embedded_chars = tf.nn.embedding_lookup(tf_embedding, input_x) 
embedded_chars_expanded = tf.expand_dims(embedded_chars, -1) 

filter_sizes = [3, 4, 5]
num_filters = 128

pooled_outputs = []
for i, filter_size in enumerate(filter_sizes):
    with tf.name_scope("conv-maxpool-%s" % filter_size):
        # Convolution Layer
        filter_shape = [filter_size, wv_sz, 1, num_filters]
        W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="conv_W")
        b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="conv_b")
        conv = tf.nn.conv2d(
            embedded_chars_expanded,
            W,
            strides=[1, 1, 1, 1],
            padding="VALID",
            name="conv")
        # Apply nonlinearity
        h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
        # Max-pooling over the outputs
        pooled = tf.nn.max_pool(
            h,
            ksize=[1, sequence_length - filter_size + 1, 1, 1],
            strides=[1, 1, 1, 1],
            padding='VALID',
            name="pool")
        pooled_outputs.append(pooled)

num_filters_total = num_filters * len(filter_sizes)
h_pool = tf.concat( pooled_outputs, 3)
h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

cnn_output = h_pool_flat

loss = tf.zeros(1)
for si in range(num_sentence - 3):
    for ssi in range(si, num_sentence-2):
        if not si == ssi:

            cossi1 = cossim(cnn_output[si], cnn_output[ssi])
            cossi2 = cossim(cnn_output[si+1], cnn_output[ssi+2])
            cossi3 = cossim(cnn_output[si+2], cnn_output[ssi+2])

            cossi = tf.abs(tf.subtract(tf.div(tf.add(cossi1, cossi3),2), cossi2))

            loss = tf.add(loss,cossi)

loss = tf.reshape(loss, [])

In [20]:
int(h_pool_flat.shape[1])

384

In [21]:
tf.summary.scalar("loss", loss)

<tf.Tensor 'loss:0' shape=() dtype=string>

In [22]:
W

<tf.Variable 'conv-maxpool-5/conv_W:0' shape=(5, 100, 1, 128) dtype=float32_ref>

---

In [23]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss, global_step=global_step)

In [24]:
merge_op = tf.summary.merge_all()

In [25]:
saver = tf.train.Saver()

In [33]:
import time
start = time.time()

sess = tf.Session()
summary_writer = tf.summary.FileWriter("./logs", sess.graph)
sess.run(tf.global_variables_initializer())

batch_size = 32
total_batch = int(len(x_data) / batch_size)

for epoch in range(5):
    total_loss = 0
    k = 0
    
    for i in range(0, len(x_data), batch_size):    
        x_batch = x_data[i:i+batch_size]

        _, loss_val = sess.run([optimizer,loss], 
                                feed_dict={input_x: x_batch})

        total_loss += loss_val
    
        summary = sess.run(merge_op, feed_dict={input_x: x_batch})
        summary_writer.add_summary(summary, global_step=sess.run(global_step))

    print("Epoch: %04d" % (epoch + 1))
    print("Avg. cost: {}".format(total_loss / total_batch))

print("최적화 완료!")

import time
current_time = time.strftime("_%H%M%S")
current_date = time.strftime("%Y%m%d")

model_path = "./logs/model" + current_date + current_time + ".ckpt"

save_path = saver.save(sess, model_path)
print("모델 저장 완료!")

end = time.time()
print ("Total running time: {} seconds".format(end-start))

InvalidArgumentError: slice index 20 of dimension 0 out of bounds.
	 [[Node: strided_slice_107 = StridedSlice[Index=DT_INT32, T=DT_FLOAT, begin_mask=0, ellipsis_mask=0, end_mask=0, new_axis_mask=0, shrink_axis_mask=1, _device="/job:localhost/replica:0/task:0/device:CPU:0"](Reshape, strided_slice_101/stack_1, strided_slice_107/stack_1, gradients/Add_1_grad/Shape)]]

Caused by op 'strided_slice_107', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.5/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python3.5/dist-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-4991a4a1b2f2>", line 47, in <module>
    cossi3 = cossim(cnn_output[si+2], cnn_output[ssi+2])
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/array_ops.py", line 538, in _SliceHelper
    name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/array_ops.py", line 706, in strided_slice
    shrink_axis_mask=shrink_axis_mask)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 5430, in strided_slice
    name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): slice index 20 of dimension 0 out of bounds.
	 [[Node: strided_slice_107 = StridedSlice[Index=DT_INT32, T=DT_FLOAT, begin_mask=0, ellipsis_mask=0, end_mask=0, new_axis_mask=0, shrink_axis_mask=1, _device="/job:localhost/replica:0/task:0/device:CPU:0"](Reshape, strided_slice_101/stack_1, strided_slice_107/stack_1, gradients/Add_1_grad/Shape)]]


---

In [31]:
import time
current_time = time.strftime("_%H%M%S")
current_date = time.strftime("%Y%m%d")

logs = "./logs/model" + current_date + current_time + ".ckpt"

In [32]:
logs

'./logs/model20180214_205631.ckpt'