In [None]:
# Path of the training corpus. It is passed to the loader in a later cell and
# also to gensim's LineSentence at the end of the notebook.
filepath = 'corpus/input_zh.txt' # 'corpus/text8.txt'

In [None]:
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
    
def load(filepath, window_size, vocab_size=None):
    """Build n-gram training windows from the first line of a corpus file.

    Only the first line of ``filepath`` is read (``readline``) and tokenized
    with NLTK's ``word_tokenize``. Every run of ``window_size`` consecutive
    tokens becomes one example: the first ``window_size - 1`` tokens are the
    context and the last token is the prediction target.

    Args:
        filepath: Path of a UTF-8 encoded text file.
        window_size: Number of tokens per window (context plus target).
        vocab_size: Keep only this many most frequent tokens in the
            vocabulary; ``None`` keeps all of them. The windows themselves
            are not filtered by the vocabulary.

    Returns:
        A tuple ``(contexts, targets, vocab, word2id)`` where ``contexts`` is
        an array of token windows, ``targets`` is a column array of target
        tokens, ``vocab`` is an array of tokens ordered by descending
        frequency and ``word2id`` maps each vocabulary token to its index.
    """
    with open(filepath, 'r', encoding='utf8') as corpus:
        tokens = word_tokenize(corpus.readline())

    context_len = window_size - 1
    starts = range(len(tokens) - context_len)
    contexts = [tokens[start: start + context_len] for start in starts]
    targets = [tokens[start + context_len] for start in starts]

    # most_common() orders by frequency, so id 0 is the most frequent token.
    vocab = [token for token, _ in Counter(tokens).most_common(vocab_size)]
    word2id = {token: index for index, token in enumerate(vocab)}

    return np.array(contexts), np.array(targets)[:, None], np.array(vocab), word2id

def load_zh(filepath, window_size, vocab_size=None):
    """Build n-gram training windows from every line of a corpus file.

    Each line of ``filepath`` is stripped and tokenized with NLTK's
    ``word_tokenize``; the tokens of all lines are concatenated into a single
    stream, so windows may span line boundaries. Every run of ``window_size``
    consecutive tokens becomes one example: the first ``window_size - 1``
    tokens are the context and the last token is the prediction target.

    Args:
        filepath: Path of a UTF-8 encoded text file.
        window_size: Number of tokens per window (context plus target).
        vocab_size: Keep only this many most frequent tokens in the
            vocabulary; ``None`` keeps all of them. The windows themselves
            are not filtered by the vocabulary.

    Returns:
        A tuple ``(contexts, targets, vocab, word2id)`` where ``contexts`` is
        an array of token windows, ``targets`` is a column array of target
        tokens, ``vocab`` is an array of tokens ordered by descending
        frequency and ``word2id`` maps each vocabulary token to its index.
    """
    tokens = []
    with open(filepath, 'r', encoding='utf8') as corpus:
        for raw_line in corpus:
            tokens.extend(word_tokenize(raw_line.strip()))

    context_len = window_size - 1
    contexts, targets = [], []
    for start in range(len(tokens) - context_len):
        stop = start + context_len
        contexts.append(tokens[start:stop])
        targets.append(tokens[stop])

    # most_common() orders by frequency, so id 0 is the most frequent token.
    ranked = Counter(tokens).most_common(vocab_size)
    vocab = [token for token, _ in ranked]
    word2id = dict(zip(vocab, range(len(vocab))))

    return np.array(contexts), np.array(targets)[:, None], np.array(vocab), word2id
            
def convert_to_id(x_train, y_train, vocab):
    """Translate word windows and targets into integer vocabulary ids.

    The ids are written into freshly allocated integer arrays. Writing them
    back into the string-typed inputs would make NumPy silently truncate any
    id with more digits than the longest word (id 11 stored in a ``<U1``
    array becomes ``'1'``), and would leave the inputs unusable for a second
    call.

    Args:
        x_train: Array of context windows, one row of words per example.
        y_train: Column array of target words; the word of example ``i`` is
            read from ``y_train[i][0]``.
        vocab: Sequence of words; a word's position is its id.

    Returns:
        ``(x_ids, y_ids)``: integer arrays with the same shapes as
        ``x_train`` and ``y_train``.

    Raises:
        KeyError: If a word in ``x_train`` or ``y_train`` is not in ``vocab``.
    """
    word_to_id = {word: index for index, word in enumerate(vocab)}

    x_words = np.asarray(x_train)
    y_words = np.asarray(y_train)
    x_ids = np.empty(x_words.shape, dtype=int)
    y_ids = np.empty(y_words.shape, dtype=int)

    for row in range(len(x_words)):
        x_ids[row] = [word_to_id[word] for word in x_words[row]]
        y_ids[row] = word_to_id[y_words[row][0]]

    return x_ids, y_ids


def next_batch(x_train, y_train, batch_size):
    """Yield consecutive ``(x_batch, y_batch)`` slices of the training data.

    Batches are taken in order, ``batch_size`` examples at a time. The last
    batch is shorter when the number of examples is not a multiple of
    ``batch_size``. No empty batch is produced: an exact multiple ends on a
    full batch and empty input yields nothing.

    Args:
        x_train: Sliceable sequence of inputs.
        y_train: Sliceable sequence of targets, aligned with ``x_train``.
        batch_size: Maximum number of examples per batch.

    Yields:
        Tuples ``(x_batch, y_batch)`` of matching slices.
    """
    for offset in range(0, len(x_train), batch_size):
        stop = offset + batch_size
        yield x_train[offset:stop], y_train[offset:stop]
        
# def convert_to_word(x_train, y_train, id_to_word):
#     for i in range(len(x_train)):
#         print(x_train[i])
#         x_train[i] = id_to_word[x_train[i]]
#         y_train[i] = id_to_word[y_train[i]]
#     return x_train, y_train

In [None]:
# Hyperparameters
# Examples per training step; also the fixed batch dimension of the input placeholders.
batch_size = 32
# Tokens per n-gram window: window_size - 1 context words predict the last word.
window_size = 5
# None keeps every distinct word; overwritten with len(vocab) once the corpus is loaded.
vocab_size = None
# Number of units in the tanh hidden layer.
hidden_size = 10
# Dimensionality of each word embedding vector.
emb_dim = 50
# Learning rate handed to the Adagrad optimizer.
learning_rate = 0.8
# Number of passes over the training windows.
epoch_size = 1

In [None]:
# Tokenize the corpus and build the (context, target) word windows plus the vocabulary.
x_raw, y_raw, vocab, word2id = load_zh(filepath, window_size, vocab_size)
# From here on vocab_size is the actual vocabulary size; the model cells use it for tensor shapes.
vocab_size = len(vocab)
print('vocab_size: {}'.format(vocab_size))

In [None]:
# Replace the words in the raw windows with their integer vocabulary ids.
x_train, y_train = convert_to_id(x_raw, y_raw, vocab)
print('Length: {}'.format(len(x_train)))
# Integer division: the training loop skips any batch that is not exactly
# batch_size long, so only whole batches count.
print('Number of batch: {}'.format(len(x_train) // batch_size))

In [None]:
import tensorflow as tf

In [None]:
# Model Parameter Definition


# Input && Output
# Word ids of the window_size - 1 context words and of the target word.
# The batch dimension is fixed, so only batches of exactly batch_size can be fed.
input_words = tf.placeholder(dtype=tf.int32, shape=(batch_size, window_size-1))
output_word = tf.placeholder(dtype=tf.int32, shape=(batch_size, 1))


# Word Features
# One emb_dim-dimensional row per vocabulary id. The initializer is zero-centred
# with a positive standard deviation: a negative stddev is not a meaningful
# parameter, and a mean of -1 would start every embedding shifted by the same offset.
C = tf.Variable(tf.truncated_normal(shape=(vocab_size, emb_dim), mean=0.0, stddev=1.0), name='word_embedding')


# Hidden Layer Weight && Bias
H = tf.Variable(tf.random_normal(shape=(hidden_size, (window_size - 1 ) * emb_dim)))
d = tf.Variable(tf.random_normal(shape=(hidden_size, )))

# Hidden-to-Output Weight && Bias
U = tf.Variable(tf.random_normal(shape=(vocab_size, hidden_size)))
b = tf.Variable(tf.random_normal(shape=(vocab_size, )))

# Projection-to-Output Weight (direct connection from the concatenated embeddings to the output)
W = tf.Variable(tf.random_normal(shape=(vocab_size, (window_size - 1) * emb_dim)))

In [None]:
# logits = b + Wx + U tanh(d + Hx),  y = softmax(logits)

# x = (C(w(t-1)), C(w(t-2)), ..., C(w(t-n+1))), n == window_size
with tf.name_scope('Projection_Layer'):
    x  = tf.nn.embedding_lookup(C, input_words) # (batch_size, window_size-1, emb_dim)
    # Concatenate the context embeddings of each example into one row.
    x  = tf.reshape(x, shape=(batch_size, (window_size - 1) * emb_dim))

with tf.name_scope('Hidden_Layer'):
    Hx = tf.matmul(x, tf.transpose(H)) # (batch_size, hidden_size)
    o  = tf.add(d, Hx) # (batch_size, hidden_size)
    a  = tf.nn.tanh(o)  # (batch_size, hidden_size)

with tf.name_scope('Output_Layer'):
    Ua = tf.matmul(a, tf.transpose(U)) # (batch_size, vocab_size)
    Wx = tf.matmul(x, tf.transpose(W)) # (batch_size, vocab_size)
    logits = tf.add(b, tf.add(Wx, Ua)) # (batch_size, vocab_size)
    # Predicted distribution over the vocabulary, kept for inspection.
    # The logits are no longer clipped: clipping to [0, 10] zeroes the gradient
    # of every logit outside that range.
    y  = tf.nn.softmax(logits) # (batch_size, vocab_size)


with tf.name_scope('Loss'):
    # Squeeze only the trailing axis so the batch axis survives for any batch size.
    target_ids = tf.squeeze(output_word, axis=1)  # (batch_size,)
    # Mean negative log-likelihood of the target word. The fused op computes
    # log-softmax in a numerically stable way, so no clipping is needed to
    # avoid log(0).
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_ids, logits=logits))

# Minimizing the negative log-likelihood maximizes the likelihood of the data.
optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)


In [None]:
# Train the model and pull the learned embedding matrix out of the session.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=True)) as sess:
    sess.run(tf.global_variables_initializer())

    log_every = 100   # report the running loss every this many steps
    step = 0
    loss_sum = 0.0    # sum of batch losses since the last report
    loss_count = 0    # number of batches in loss_sum
    for epoch in range(epoch_size):
        for x_batch, y_batch in next_batch(x_train, y_train, batch_size):
            # The placeholders have a fixed batch dimension, so a batch of any
            # other length cannot be fed and is skipped.
            if len(x_batch) != batch_size:
                continue
            feed_dict = {input_words: x_batch, output_word: y_batch}
            batch_loss, _ = sess.run([loss, optimizer], feed_dict)
            loss_sum += batch_loss
            loss_count += 1
            if step % log_every == 0:
                # Divide by the number of batches actually accumulated; the
                # first report (step 0) covers a single batch, not log_every.
                print('Step {}, Loss: {}'.format(step, loss_sum / loss_count))
                loss_sum, loss_count = 0.0, 0

            step += 1

    print('Training Done.')
    # Must run inside the session: eval() reads the trained variable values.
    word_embedding = C.eval()
    
    

In [None]:
from pprint import pprint
# from sklearn.metrics.pairwise import cosine_similarity
def cosine_similarity(wordvec1, wordvec2):
    """Return the cosine similarity of two vectors.

    Computed as the dot product divided by the product of the two norms.
    """
    dot_product = np.dot(wordvec1, wordvec2)
    norm_product = np.linalg.norm(wordvec1) * np.linalg.norm(wordvec2)
    return dot_product / norm_product

def distance(wordvec1, wordvec2):
    """Return the norm (``np.linalg.norm`` default) of the difference of two vectors."""
    difference = wordvec1 - wordvec2
    return np.linalg.norm(difference)
    
def most_similar(word_embedding, word, topn=10):
    """Pretty-print the words whose embeddings lie closest to ``word``.

    Closeness is the Euclidean distance between embedding rows (smaller is
    closer). The query word itself is excluded. At most ``topn`` entries are
    printed, and fewer when the vocabulary is smaller, so no placeholder rows
    appear and no distance is excluded for being large.

    Reads the notebook-level ``word2id`` (word -> row index) and ``vocab``
    (row index -> word).

    Args:
        word_embedding: 2-D array with one embedding row per vocabulary id.
        word: Query word; must be a key of ``word2id``.
        topn: Maximum number of neighbours to print.

    Raises:
        KeyError: If ``word`` is not a key of ``word2id``.
    """
    target = word2id[word]
    embeddings = np.asarray(word_embedding)

    # Distance from the query row to every row, computed in one vectorized call.
    distances = np.linalg.norm(embeddings - embeddings[target], axis=1)

    # A stable sort keeps equal distances in vocabulary order; drop the query itself.
    ranked = [idx for idx in np.argsort(distances, kind='stable') if idx != target]
    nearest = [(vocab[idx], distances[idx]) for idx in ranked[:topn]]

    pprint(nearest)

# Print the nearest neighbours of the query word in the trained embedding space.
# Raises KeyError if the query word is not a key of word2id.
most_similar(word_embedding, '九月')

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline

# Reduce the learned embeddings to two PCA components so they can be drawn on a plane.
pca = PCA(n_components=2)
wordemb_2D = pca.fit_transform(word_embedding)

In [None]:
# Scatter the first n rows of the 2-D projection and label each point with its
# word. vocab is ordered by descending frequency, so these are the n most
# frequent words.
n = 100
x_coords = wordemb_2D[:n, 0]
y_coords = wordemb_2D[:n, 1]

plt.scatter(x_coords, y_coords, c='b')
# The loop variables are deliberately not called `x` and `y`: those names hold
# the model's projection and softmax tensors in the notebook namespace and
# would be overwritten with plain floats.
for label, x_pos, y_pos in zip(vocab[:n], x_coords, y_coords):
    plt.annotate(label, xy=(x_pos, y_pos), xytext=(0, 0), textcoords='offset points')
plt.show()

In [None]:
from gensim.models import word2vec

# Train a gensim Word2Vec model on the same corpus file, presumably as a
# baseline to compare against the embeddings learned above.
# The vector size of 50 matches emb_dim in the hyperparameter cell.
# NOTE(review): `size` is the keyword used by older gensim releases; newer
# releases appear to call it `vector_size` — confirm against the installed version.
sentences = word2vec.LineSentence(filepath)
model = word2vec.Word2Vec(sentences, size=50)

In [None]:
# Ask the gensim model for the neighbours of the same query word passed to most_similar() above.
model.wv.most_similar(positive='九月')