**Set up the library**

In [1]:
import jieba
import string
import re

In [2]:
# Read the whole corpus into memory, one list entry per line of the file.
# The context manager closes the handle as soon as the lines are loaded, and
# the explicit encoding keeps decoding independent of the platform default.
with open("zhwiki-20190401_all.txt", 'r', encoding='utf-8') as f_read:
    all_text_str = f_read.readlines()
# Flat list that the segmentation step fills with tokens.
all_words = []

**Chinese Word Segmentation**

In [3]:
%%time

# Segment every line with jieba (trailing newline removed first) and collect
# the resulting tokens into the single flat list all_words.
for raw_line in all_text_str:
    all_words.extend(jieba.lcut(raw_line.strip('\n')))

len(all_words)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.593 seconds.
Prefix dict has been built succesfully.


CPU times: user 9min 18s, sys: 2.66 s, total: 9min 21s
Wall time: 9min 21s


In [4]:
# Peek at the last ten tokens to sanity-check the segmentation.
all_words[-10:]

['谈判', '修改', '现有', '的', '条约', '这不应', '直接', '违背', '其', '精神']

In [5]:
# Distinct tokens across the whole corpus; the size is the raw vocabulary count.
words_set = set(all_words)
len(words_set)

1687575

**Get most common words**

In [20]:
import collections

# Frequency table over every token in the corpus.
word_counters = collections.Counter(all_words)

# most_common yields (token, count) pairs; keep only the tokens of the
# 500000 highest-count entries.
most_common_words = {token for token, _count in word_counters.most_common(500000)}

len(most_common_words)

500000

**Replace less frequent words with the `uncommon` token**

In [21]:
# Collapse every token outside the most-common vocabulary into the single
# placeholder token 'uncommon', rewriting the list in place.
for position, token in enumerate(all_words):
    if token not in most_common_words:
        all_words[position] = 'uncommon'

all_words.count('uncommon')

1399913

**Convert text to int**

In [22]:
def str_to_int(lst):
    """Encode a sequence of tokens as integer ids.

    Ids are handed out in order of first appearance, starting at 0, so the
    first distinct token becomes 0, the next new one 1, and so on. Repeated
    tokens reuse the id they were given the first time.

    Args:
        lst: iterable of hashable tokens (e.g. a list of words).

    Returns:
        A list with one integer id per input token, in input order.
    """
    ids = {}
    encoded = []

    for token in lst:
        # len(ids) is the next unused id; setdefault stores it only when the
        # token has not been seen before and otherwise returns the known id.
        encoded.append(ids.setdefault(token, len(ids)))

    return encoded

In [23]:
# NOTE: despite the name, word2int is not a mapping. str_to_int returns a
# list holding one integer id per token of all_words, in corpus order.
word2int = str_to_int(all_words)

In [24]:
# Smallest id in the encoded corpus; str_to_int numbers ids from 0.
min(word2int)

0

In [25]:
# Largest id in the encoded corpus; ids are assigned consecutively from 0.
max(word2int)

499999

**Set up the constants**

In [26]:
# Rows produced per data_generate call, i.e. the mini-batch size.
n_Train = 50
# Vocabulary size: ids start at 0, so it is the largest id plus one.
n_Voc = max(word2int) + 1
# Total number of tokens in the encoded corpus.
n_Corpus = len(word2int)
# Number of context positions taken on each side of the sampled position.
WINDOW_SIZE = 2

In [27]:
# Display the vocabulary size as a sanity check.
n_Voc

500000

**Initialize two arrays**

In [28]:
import numpy as np

def data_generate(n_Train, n_Voc, n_Corpus, WINDOW_SIZE, word2int):
    """Draw one random training batch of context/centre-word vectors.

    For each of the n_Train rows a corpus position is sampled with
    np.random.choice. The id found at that position is marked in Y, and the
    ids found at the surrounding positions (up to WINDOW_SIZE on each side,
    clipped to the corpus bounds, excluding the position itself) are marked
    in X.

    Args:
        n_Train: number of rows (samples) to generate.
        n_Voc: number of columns, i.e. the vocabulary size.
        n_Corpus: number of positions that may be sampled from word2int.
        WINDOW_SIZE: how many neighbours to take on each side.
        word2int: sequence of integer ids, indexable by corpus position.

    Returns:
        Tuple (X, Y) of float arrays shaped (n_Train, n_Voc). Each X row is a
        multi-hot vector of context ids; each Y row is one-hot for the centre.
    """
    X = np.zeros((n_Train, n_Voc))
    Y = np.zeros((n_Train, n_Voc))

    for row in range(n_Train):
        # One draw per row, made before anything else touches the RNG.
        center = np.random.choice(n_Corpus, 1)[0]
        Y[row, word2int[center]] = 1

        # Clamp the window to valid corpus positions up front.
        first = max(center - WINDOW_SIZE, 0)
        last = min(center + WINDOW_SIZE, n_Corpus - 1)
        for pos in range(first, last + 1):
            if pos == center:
                continue
            X[row, word2int[pos]] = 1

    return X, Y

In [29]:
# Smoke test: draw one batch and display it. Each returned array is
# n_Train x n_Voc float64, so this allocates two large dense arrays, and the
# result is not assigned to a name.
data_generate(n_Train, n_Voc, n_Corpus, WINDOW_SIZE, word2int)

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]))

In [30]:
import tensorflow as tf

ONE_HOT_DIM = n_Voc

# Placeholders for one training batch. The batches fed during training come
# from data_generate: each row of x is a multi-hot vector over the vocabulary
# marking the context words, each row of y_label a one-hot vector marking the
# centre word.
x = tf.placeholder(tf.float64, shape=(None, ONE_HOT_DIM))
y_label = tf.placeholder(tf.float64, shape=(None, ONE_HOT_DIM))

# dimensionality of the learned word vectors
EMBEDDING_DIM = 30

# hidden layer: its weights are what eventually represent the word vectors
W1 = tf.Variable(tf.random_normal([ONE_HOT_DIM, EMBEDDING_DIM], dtype=tf.float64))
# bias is a single scalar (shape [1]) broadcast over every embedding dimension
b1 = tf.Variable(tf.random_normal([1], dtype=tf.float64))
hidden_layer = tf.add(tf.matmul(x, W1), b1)

# output layer: project back to one score (logit) per vocabulary entry
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, ONE_HOT_DIM], dtype=tf.float64))
b2 = tf.Variable(tf.random_normal([1], dtype=tf.float64))
logits = tf.add(tf.matmul(hidden_layer, W2), b2)
prediction = tf.nn.softmax(logits)

# loss function: mean cross entropy over the batch.
# It is computed from the logits rather than as y * log(softmax(...)): once a
# softmax probability underflows to 0, log gives -inf and 0 * -inf turns the
# whole loss into NaN. The fused op evaluates the same quantity stably.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_label, logits=logits)
)

# training operation: plain gradient descent with learning rate 0.05
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

**Train**

In [31]:
%%time

# The session is deliberately left open: later cells evaluate tensors with it.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init) 

iteration = 20000
for i in range(iteration + 1):
    # Draw a fresh random batch on every step.
    X, Y = data_generate(n_Train, n_Voc, n_Corpus, WINDOW_SIZE, word2int)
    # X rows are multi-hot vectors marking the context words around a sampled position;
    # Y rows are one-hot vectors marking the word at that position.
    sess.run(train_op, feed_dict={x: X, y_label: Y})
    if i % 500 == 0:
        # The loss is re-evaluated on the same batch after the update step.
        print('iteration '+str(i)+' loss is : ', sess.run(loss, feed_dict={x: X, y_label: Y}))

iteration 0 loss is :  53.79406934925481
iteration 500 loss is :  47.01017057432335
iteration 1000 loss is :  42.0574215200344
iteration 1500 loss is :  44.18546820230902
iteration 2000 loss is :  43.17851355761522
iteration 2500 loss is :  41.7592424140059
iteration 3000 loss is :  41.14403758967832
iteration 3500 loss is :  41.210695901329515
iteration 4000 loss is :  39.1890421073057
iteration 4500 loss is :  38.269837078076314
iteration 5000 loss is :  38.75588280891252
iteration 5500 loss is :  40.72979611129072
iteration 6000 loss is :  40.69517773504901
iteration 6500 loss is :  40.66577103356729
iteration 7000 loss is :  38.374525806967384
iteration 7500 loss is :  38.09929523931144
iteration 8000 loss is :  37.754595317570725
iteration 8500 loss is :  40.52318791816758
iteration 9000 loss is :  38.61011647155477
iteration 9500 loss is :  40.13962325479208
iteration 10000 loss is :  35.796861674327864
iteration 10500 loss is :  38.6545392371024
iteration 11000 loss is :  34.528

**Visualization**

In [32]:
import sklearn
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.manifold import TSNE

# Toy check of the t-SNE API on a hand-written 4x3 array; Z is not derived
# from the trained word vectors.
Z = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
Z_embedded = TSNE(n_components=2).fit_transform(Z)
Z_embedded.shape

(4, 2)

In [33]:
# Evaluate W1 + b1 in the training session: one row per vocabulary id and
# EMBEDDING_DIM columns, with the scalar hidden-layer bias added to every entry.
vectors = sess.run(W1 + b1)
vectors

array([[ 1.41398457,  0.480239  ,  0.30743152, ..., -0.01250384,
         2.06692195,  1.43803316],
       [ 0.7081365 ,  0.67223525,  0.65581915, ...,  0.72718272,
         0.76164327,  0.67042133],
       [ 1.35394486,  0.83903413,  0.26914348, ...,  0.93283867,
         0.84879975,  0.75797476],
       ...,
       [ 2.08022564, -1.88437571, -0.71724912, ...,  0.87569395,
         0.60750555,  1.91521527],
       [ 0.7604972 ,  1.21063999, -0.44965853, ...,  2.02046053,
         1.05567307,  1.61227256],
       [ 0.56006262,  0.94461503,  1.24609762, ...,  1.79657295,
        -0.37421855, -0.19310459]])

**Cosine Similarity**

In [5]:
import pandas as pd

source_file = pd.read_excel("trial_data_ws_50(submitted).xlsx")