In [18]:
data_path = '../data/books_text_full/test/'

# I. Define vocabulary

In [19]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

# 텍스트 파일의 내용을 변수 text로 리턴하는 함수
def load_doc(filename):
    # read only로 파일을 엽니다.
    file = open(filename, 'r', errors='replace')
    # 모든 텍스트를 읽습니다.
    text = file.read()
    # 파일을 닫습니다.
    file.close()
    return text

def clean_doc(doc):
    # white space 기준으로 tokenize 합니다.
    tokens = doc.split()
    # 각 token에서 모든 구두점을 삭제합니다.
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # 각 token에서 alaphabet으로만 이루어지지 않은 모든 단어를 삭제합니다.
    tokens = [word for word in tokens if word.isalpha()]
    # 각 token에서 stopwrods를 삭제합니다.
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # 각 token에서 1글자 이하인 모든 단어를 삭제합니다.
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

# 텍스트 파일을 불러와서 vocab에 추가하는 함수
def add_doc_to_vocab(filename, vocab):
    # 텍스트 파일을 불러옵니다.
    doc = load_doc(filename)
    # 텍스트 파일을 clean toekn으로 리턴합니다.
    tokens = clean_doc(doc)
    # clean token을 vocab에 추가합니다.
    vocab.update(tokens)

# 폴더에 있는 모든 문서를 vocab에 추가하는 함수
def process_docs(directory, vocab, is_train):
    # 폴더에 있는 모든 파일을 순회합니다.
    for filename in listdir(directory):
        # 인덱스가 새겨진 파일 이름과 is_train 인자를 기준으로 test set으로 분류할 모든 파일을 건너뜁니다.
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        # 폴더에 있는 파일의 절대 경로를 구합니다.
        path = directory + '/' + filename
        # 텍스트 파일을 불러와서 vocab에 추가하는 함수를 실행합니다.
        add_doc_to_vocab(path, vocab)

def save_list(lines, filename):
    # 각 문장을 하나의 텍스트 일부로 바꿉니다.
    data = '\n'.join(lines)
    # 파일을 쓰기 모드로 엽니다.
    file = open(filename, 'w')
    # 변환한 텍스트를 파일에 씁니다.
    file.write(data)
    # 파일을 닫습니다.
    file.close()

# vocab을 Counter() 객체로 할당합니다.
vocab = Counter()
# 폴더를 지정하고 폴더 내 모든 문서를 vocab에 추가합니다.
process_docs(data_path, vocab, True)
# vocab의 크기를 출력합니다.
print(len(vocab))
# vocab에서 가장 많이 등장한 50개 단어를 출력합니다.
print(vocab.most_common(50))

# token을 min_occurence 기준으로 유지합니다.
min_occurence = 1
tokens = [k for k,c in vocab.items() if c >= min_occurence]
print(len(tokens))
# token을 vocab 파일로 저장합니다.
save_list(tokens, 'corpusToLines_vocab.txt')
print("\n# 단어 {}개의 [corpusToLines_vocab.txt]로 저장했습니다.".format(len(tokens)))

# 보카를 불러옵니다.
vocab_filename = 'corpusToLines_vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
print("# 단어 {}개의 [{}]을 [vocab]으로 불러왔습니다.".format(len(vocab), vocab_filename))

13538
[('Tick', 2045), ('He', 1384), ('The', 1296), ('said', 1071), ('like', 1003), ('Paul', 881), ('back', 777), ('Sofia', 730), ('one', 687), ('George', 670), ('Master', 573), ('could', 562), ('Jane', 527), ('But', 512), ('Sato', 495), ('looked', 483), ('know', 480), ('It', 475), ('Chu', 461), ('time', 455), ('didnt', 449), ('And', 445), ('eyes', 441), ('right', 429), ('She', 426), ('something', 423), ('hed', 403), ('man', 393), ('felt', 380), ('What', 376), ('around', 374), ('asked', 364), ('see', 360), ('Im', 353), ('away', 346), ('Mothball', 339), ('face', 334), ('get', 331), ('air', 327), ('Rutger', 324), ('would', 312), ('thought', 309), ('made', 308), ('head', 307), ('us', 307), ('You', 302), ('way', 296), ('thing', 277), ('things', 276), ('think', 273)]
13538

# 단어 13538개의 [corpusToLines_vocab.txt]로 저장했습니다.
# 단어 13538개의 [corpusToLines_vocab.txt]을 [vocab]으로 불러왔습니다.


# II. corpusToLines

In [20]:
from string import punctuation
from os import listdir
from gensim.models import Word2Vec

def load_doc(filename):
    file = open(filename, 'r', errors='replace')
    text = file.read()
    file.close()
    return text

def doc_to_lines(doc):
    total_lines = []
    lines = [i.lower() for i in doc.splitlines() if i]  # 공백 문장 제거 및 모든 문장 소문자 변경
    
    return lines

def process_directory(data_path):
    total_lines = []
    for filename in listdir(data_path):
        filepath = data_path + '/' + filename
        doc = load_doc(filepath)
        lines = doc_to_lines(doc)
        print(filename, ":", len(lines))
        total_lines += lines
    return total_lines


sentences = process_directory(data_path)
save_list(sentences, 'total_lines.txt')
print("\n# 문장 {}개의 [total_lines.txt]로 저장했습니다.".format(len(sentences)))
filename = 'total_lines.txt'
total_lines = load_doc(filename)
total_lines = [i for i in total_lines.splitlines()]
total_vocab = set()
for i in total_lines:
    total_vocab.update(i)
print("# unique words in [total_lines.txt]: [{}]".format(len(total_vocab)))

13th_Reality-4.txt : 2388
13th_Reality-2.txt : 3494
13th_Reality-1.txt : 1738

# 문장 7620개의 [total_lines.txt]로 저장했습니다.
# unique words in [total_lines.txt]: [54]


In [21]:
def doc_to_clean_lines(filename):
    total_lines = load_doc(filename)
    clean_lines = [i.lower() for i in total_lines.splitlines() if len(i) > 5 if "." in i] # 5개 단어 이상으로 이루어지고 마침표가 있는 문장만 포함
    
    return clean_lines

filename = "total_lines.txt"
clean_lines = doc_to_clean_lines(filename)
save_list(clean_lines, 'clean_lines.txt')
print("# 문장 {}개가 [clean_lines.txt]로 저장되었습니다.".format(len(clean_lines)))
filename = 'clean_lines.txt'
clean_lines = load_doc(filename)
clean_lines = [i for i in clean_lines.splitlines()]
clean_vocab = set()
for i in clean_lines:
    clean_vocab.update(i)
print("# unique words in [clean_lines.txt]: [{}]".format(len(clean_vocab)))

# 문장 6921개가 [clean_lines.txt]로 저장되었습니다.
# unique words in [clean_lines.txt]: [51]


In [22]:
def doc_to_vocab_lines(filename):
    clean_lines = load_doc(filename)
    vocab_lines = []
    for i in clean_lines.splitlines():
        words = i.split()
        words = [word for word in words if word in vocab]
        words = [word for word in words if len(words) >= 5]
        vocab_line = " ".join(words)
        if len(vocab_line):
            vocab_line += "."
            vocab_line = [vocab_line]
            vocab_lines += vocab_line
    
    return vocab_lines

filename = "clean_lines.txt"
vocab_lines = doc_to_vocab_lines(filename)
save_list(vocab_lines, 'vocab_lines.txt')
print("# 문장 {}개가 [vocab_lines.txt]로 저장되었습니다.".format(len(vocab_lines)))
filename = 'vocab_lines.txt'
vocab_lines = load_doc(filename)
vocab_lines = [i for i in vocab_lines.splitlines()]
vocab_vocab = set()
for i in vocab_lines:
    vocab_vocab.update(i)
print("# unique words in [vocab_lines.txt]: [{}]".format(len(vocab_vocab)))

# 문장 4832개가 [vocab_lines.txt]로 저장되었습니다.
# unique words in [vocab_lines.txt]: [28]


# III. word2vec

In [23]:
filename = "vocab_lines.txt"

file = open(filename, 'r', errors='replace')
text = file.read()
file.close()

vocab_lines = [i for i in text.splitlines()]

list_lines = []
for i in vocab_lines:
    i = i.split()
    list_lines.append(i)

print(list_lines[0])

['master', 'sat', 'lights', 'purring', 'first', 'light', 'birth', 'still', 'hour', 'stared', 'wall', 'fascinating', 'thing', 'realities', 'stapled', 'see', 'whenever', 'knot', 'wood', 'knot', 'two', 'eyes', 'mouth', 'looked', 'reason', 'reminded', 'boy', 'named.']


In [24]:
sentences = list_lines
print("Total training sentences:{}".format(len(sentences)))

wv_sz = 100
# word2vec 모델을 훈련시킵니다.
model = Word2Vec(sentences, size=wv_sz, window=5, workers=8, min_count=1)
# 모델의 vocabulary size를 요약합니다.
words = list(model.wv.vocab)
print("Vocabulary size: %d" % len(words))
print("Wordvector size: %d" % (wv_sz))
print("Embedding size: {}x{}".format(len(words), wv_sz))

# 모델을 ASCII 포맷으로 저장합니다.
filename = 'fantasy_embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)
print("\n# word2vec 파일 [{}]이 저장되었습니다.".format(filename))

Total training sentences:4832




Vocabulary size: 10065
Wordvector size: 100
Embedding size: 10065x100

# word2vec 파일 [fantasy_embedding_word2vec.txt]이 저장되었습니다.


# IV. Use pre-trained word vector

In [58]:
import tensorflow as tf
import numpy as np

from tensorflow.contrib import learn

max_length = max([len(s.split()) for s in vocab_lines])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_length)
encoded_lines = np.array(list(vocab_processor.fit_transform(vocab_lines)))
print("-"*80,"# [vocab_lines]가 [encoded_lines]로 인코딩 및 패딩 되었습니다. (max_length:{})".format(max_length), "-"*80, sep='\n')
print("BEFORE: \n{}".format(vocab_lines[0]))
print("\nAFTER: \n{}".format(encoded_lines[0]))

x_data = np.array(list(encoded_lines))
print("\n", "-"*80,"# 최종 [x_data] (max_length: {})".format(max_length), "-"*80, sep='\n')
print("EXAMPLE: \n{}".format(x_data[0]))

vocab_dict = vocab_processor.vocabulary_._mapping
vocab_size = len(vocab_dict.keys())
print("\n", "-"*80,"# 최종 [vocab_dict] (vocab_size: {})".format(vocab_size), "-"*80, sep='\n')
print("EXAMPLE: \n[{}] is mapped to [{}].".format(vocab_lines[0].split()[0], vocab_dict[vocab_lines[0].split()[0]]))

--------------------------------------------------------------------------------
# [vocab_lines]가 [encoded_lines]로 인코딩 및 패딩 되었습니다. (max_length:62)
--------------------------------------------------------------------------------
BEFORE: 
master sat lights purring first light birth still hour stared wall fascinating thing realities stapled see whenever knot wood knot two eyes mouth looked reason reminded boy named.

AFTER: 
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 18 20 21 22 23 24
 25 26 27  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0]


--------------------------------------------------------------------------------
# 최종 [x_data] (max_length: 62)
--------------------------------------------------------------------------------
EXAMPLE: 
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 18 20 21 22 23 24
 25 26 27  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0

In [74]:
def load_word2vec(filename):
    vocab = []
    embd = []
    file = open(filename,'r')
    lines = file.readlines()[1:]
    for line in lines:
        row = line.strip().split(' ')
        vocab.append(row[0])
        embd.append(row[1:])
    print('Loaded {}!'.format(filename))
    file.close()
    return vocab,embd

filename = '../CNN-pairwise/fantasy_embedding_word2vec.txt'
vocab,embd = load_word2vec(filename)
vocab_size = len(vocab)
embedding_dim = len(embd[0])
embedding = np.asarray(embd)
print(len(embedding))

Loaded ../CNN-pairwise/fantasy_embedding_word2vec.txt!
10065


# V. tensorflow로 모델 구성하기

## 01 Embedding Layer

**hyperparameters**

+ `wv_sz` = 100 (III. word2vec)

In [80]:
sequence_length = max_length

**layer input**

In [81]:
input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")

**embedding layer and embedding lookup**

In [82]:
dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

with tf.device('/cpu:0'), tf.name_scope("embedding"):
    W = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]), trainable=False, name="W")
    embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_dim])
    embedding_init = W.assign(embedding_placeholder)
    
    embedded_chars = tf.nn.embedding_lookup(W, input_x) 
    embedded_chars_expanded = tf.expand_dims(embedded_chars, -1) 

**layer output**

In [83]:
embedded_chars_expanded.shape

TensorShape([Dimension(None), Dimension(62), Dimension(100), Dimension(1)])

## 02 Convolution and Max-pooling Layers

...라인바이라인

In [91]:
filter_sizes = [3, 4, 5]
num_filters = 128

pooled_outputs = []
for i, filter_size in enumerate(filter_sizes):
    with tf.name_scope("conv-maxpool-%s" % filter_size):
        # Convolution Layer
        filter_shape = [filter_size, wv_sz, 1, num_filters]
        W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
        b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
        conv = tf.nn.conv2d(
            embedded_chars_expanded,
            W,
            strides=[1, 1, 1, 1],
            padding="VALID",
            name="conv")
        # Apply nonlinearity
        h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
        # Max-pooling over the outputs
        pooled = tf.nn.max_pool(
            h,
            ksize=[1, sequence_length - filter_size + 1, 1, 1],
            strides=[1, 1, 1, 1],
            padding='VALID',
            name="pool")
        pooled_outputs.append(pooled)
        
# Combine all the pooled features
num_filters_total = num_filters * len(filter_sizes)
h_pool = tf.concat(pooled_outputs, axis=3)
h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

In [92]:
print(h_pool)
print(h_pool_flat)

Tensor("concat_6:0", shape=(?, 1, 1, 384), dtype=float32)
Tensor("Reshape_3:0", shape=(?, 384), dtype=float32)


...함수

In [104]:
def conv_and_pool(embedded_chars_expanded):
    """
    arg : embedded_chars_expanded means embedding layer's output
    """
    
    #convolution layer
    
    filter_sizes = [3, 4, 5]
    num_filters = 128

    pooled_outputs = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            # Convolution Layer
            filter_shape = [filter_size, wv_sz, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            conv = tf.nn.conv2d(
                embedded_chars_expanded,
                W,
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="conv")
            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
            # Max-pooling over the outputs
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, sequence_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
            pooled_outputs.append(pooled)

    num_filters_total = num_filters * len(filter_sizes)
    h_pool = tf.concat(pooled_outputs, 3)
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

    return h_pool_flat

In [121]:
output = conv_and_pool(embedded_chars_expanded)
print(output)

Tensor("Reshape_7:0", shape=(?, 384), dtype=float32)


In [122]:
_, output_y_size = output.shape
output_y_size = int(output_y_size)
print(output_y_size)

384


In [123]:
type(output_y_size)

int

## 03 Loss

...라인바이라인

In [107]:
print(sequence_length)

62


In [108]:
print(wv_sz)

100


In [127]:
s_center = [output[i] for i in range(1, output_y_size-1)]

In [128]:
s_left = [output[i] for i in range(0, output_y_size-2)]

In [129]:
s_right = [output[i] for i in range(2, output_y_size)]

In [144]:
s_zip = list(zip(s_left, s_center, s_right))

In [145]:
len(s_zip)

382

In [None]:
for i in range(len(s_zip)):
    for j in range(i+1, len(s_zip)):
        ( cosine(s_zip[i][0], ) + cosine(s_zip[i][2], ) )/2 - cosine(s_zip[i][1], )

In [147]:
loss = 0
for i in range(len(s_zip)):
    for j in range(len(s_zip)):
        if j > i:
            loss = \
            ( cosine(s_zip[i][0], s_zip[j][0]) + cosine(s_zip[i][2], s_zip[j][2]) )/2 \
            - cosine(s_zip[i][1], s_zip[j][1])
            loss += loss
            
print(loss)

ValueError: setting an array element with a sequence.

...함수

In [None]:
from sklearn.metrics.pairwise import cosine_similarity as cos

def loss(output):
    s_center = [output[i] for i in range(1, output_y_size-1)]
    s_left = [output[i] for i in range(0, output_y_size-2)]
    s_right = [output[i] for i in range(2, output_y_size)]
    
    s_zip = list(zip(s_left, s_center, s_right))
    
    loss = 0
    for i in range(len(s_zip)):
        for j in range(len(s_zip)):
            if j > i:
                loss = \
                ( cosine(s_zip[i][0], s_zip[j][0]) + cosine(s_zip[i][2], s_zip[j][2]) )/2 \
                - cosine(s_zip[i][1], s_zip[j][1])
                loss += loss
            else:
                continue
    
    return loss

loss = loss(output)

In [152]:
from sklearn.metrics.pairwise import cosine_similarity as cos

def loss(output):
    s_center = [output[i] for i in range(1, output_y_size-1)]
    s_left = [output[i] for i in range(0, output_y_size-2)]
    s_right = [output[i] for i in range(2, output_y_size)]
    
    s_zip = list(zip(s_left, s_center, s_right))
    
    loss = 0
    for i in range(len(s_zip)):
        for j in range(len(s_zip)):
            if j > i:
                loss = \
                ( cos(s_zip[i][0], s_zip[j][0]) + cos(s_zip[i][2], s_zip[j][2]) )/2 \
                - cos(s_zip[i][1], s_zip[j][1])
                loss += loss
            else:
                continue
    
    return loss

loss = loss(output)

ValueError: setting an array element with a sequence.

---

In [None]:
optimizer = tf.train.AdamOptimizer(learning_rate=0.01).minimize(loss)
sess = tf.Session()
sess.run(tf.global_variables_initializer())

epoch = 1

for i in range(epoch):
    

In [None]:
import math

def cos_s(v1, v2):
    """
    compute cosine similarity of v1 to v2: (v1 dot v2)/{||v1||*||v2||)
    """
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(output_y_size):
        x = v1[i]; y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    return sumxy/math.sqrt(sumxx*sumyy)

def loss(cnn_output):
    """
    Side cosine_similarity - Center cosine_similarity
    Args:
    logits: cnn_output ( sentence_number * 384 : 3 kernel * 128 kernel number )
    Returns:
    Loss tensor of type float.
    """
    unstack_cnn_output = tf.unstack(cnn_output, axis=0)
    unstack_fin = session.run(unstack_cnn_output)
    
    newinput=[]
    cos_s_sum = 0
    
    num_sentense = len(unstack_fin)
    
    for si in range(num_sentense - 3):
        for ssi in range(si, num_sentense - 2):
            if not si == ssi:
                calc = (cos_s(unstack_fin[si], unstack_fin[ssi]) + cos_s(unstack_fin[si+2], unstack_fin[ssi+2]))/2 - cos_s(unstack_fin[si+1], unstack_fin[ssi+1])
                newinput.append(calc)
                cos_s_sum += calc
    
    return tf.convert_to_tensor(cos_s_sum)
    

In [None]:
logits = inference(embedded_chars_expanded)
loss = loss(cnn_output)
train_op = train(loss, global_step)

In [71]:
print(W)

Tensor("conv-maxpool-5/W/read:0", shape=(5, 100, 1, 128), dtype=float32)


In [None]:
train_op.minimize(cost)