**Set up the library**

In [1]:
import jieba
import string
import re

In [2]:
# Read the whole corpus into memory, one string per line (newlines kept).
# The context manager closes the file handle as soon as the read finishes,
# and the explicit encoding keeps the Chinese text from being decoded with
# whatever the platform default happens to be.
with open("zhwiki-20190401_all.txt", 'r', encoding='utf-8') as f_read:
    all_text_str = f_read.readlines()
# Flat list that the segmentation step fills with tokens.
all_words = []

**Chinese Word Segmentation**

In [3]:
%%time

# Start from an empty list every time this cell runs, so that re-running it
# does not append the whole corpus to all_words a second time.
all_words = []

for line in all_text_str:
    # Remove newline characters from both ends of the line, then let jieba
    # split it into a list of tokens.
    all_words.extend(jieba.lcut(line.strip('\n')))

# Total number of tokens in the corpus.
len(all_words)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.593 seconds.
Prefix dict has been built succesfully.


CPU times: user 9min 18s, sys: 2.66 s, total: 9min 21s
Wall time: 9min 21s


In [4]:
# Show the last ten tokens as a quick sanity check of the segmentation.
all_words[-10:]

['谈判', '修改', '现有', '的', '条约', '这不应', '直接', '违背', '其', '精神']

In [5]:
# Distinct tokens in the corpus; the trailing expression displays how many
# there are.
words_set = set(all_words)
len(words_set)

1687575

**Get most common words**

In [6]:
import collections

# Tally how often every token occurs in the corpus.
word_counters = collections.Counter(all_words)

# most_common() yields (token, count) pairs; keep only the tokens of the
# 1,000,000 most frequent entries.
most_common_words = {
    token for token, _count in word_counters.most_common(1000000)
}

len(most_common_words)

1000000

**Change less frequent words to uncommon**

In [7]:
# Overwrite, in place, every token outside the frequent set with the single
# placeholder token 'uncommon'.
for position, token in enumerate(all_words):
    if token in most_common_words:
        continue
    all_words[position] = 'uncommon'

# Number of tokens that ended up as the placeholder.
all_words.count('uncommon')

687575

**Convert text to int**

In [8]:
def str_to_int(lst):
    """Encode every item of ``lst`` as an integer id.

    Ids are handed out in order of first appearance, starting at 0, so equal
    items always map to the same id.

    Args:
        lst: iterable of hashable items (tokens).

    Returns:
        A list of ints, one per item of ``lst``, in the same order.
    """
    ids = {}
    # setdefault() stores len(ids) -- the next unused id -- only when the
    # token has not been seen yet, and returns the stored id either way.
    return [ids.setdefault(token, len(ids)) for token in lst]

In [9]:
# Integer-encoded corpus: word2int[k] is the id of the k-th token of all_words.
word2int = str_to_int(all_words)

In [10]:
# Smallest id that occurs in the encoded corpus.
min(word2int)

0

In [11]:
# Largest id that occurs in the encoded corpus.
max(word2int)

1000000

**Set up the constants**

In [12]:
# Number of training examples generated per batch.
n_Train = 50
# Vocabulary size: one slot for every id from 0 up to the largest id used.
n_Voc = max(word2int) + 1
# Total number of tokens in the encoded corpus.
n_Corpus = len(word2int)
# Number of neighbouring positions looked at on each side of a chosen position.
WINDOW_SIZE = 2

In [13]:
# Display the vocabulary size.
n_Voc

1000001

**Generate training batches**

In [14]:
import numpy as np

def data_generate(n_Train, n_Voc, n_Corpus, WINDOW_SIZE, word2int):
    """Draw one random batch of (context, centre) training examples.

    For each of the ``n_Train`` rows a corpus position is sampled uniformly
    at random. ``Y`` gets a 1 in the column of the word id at that position;
    ``X`` gets a 1 in the column of every word id found within
    ``WINDOW_SIZE`` positions on either side of it (the position itself is
    excluded, and positions outside the corpus are ignored).

    Args:
        n_Train: number of rows (examples) to generate.
        n_Voc: number of columns, i.e. the vocabulary size.
        n_Corpus: number of corpus positions to sample from.
        WINDOW_SIZE: how many positions to look at on each side.
        word2int: sequence mapping corpus position -> integer word id.

    Returns:
        Tuple ``(X, Y)`` of float arrays, each of shape ``(n_Train, n_Voc)``.
    """
    batch_shape = (n_Train, n_Voc)
    X = np.zeros(batch_shape)
    Y = np.zeros(batch_shape)

    for row in range(n_Train):
        # choice() returns a length-1 array here; take its only element.
        centre = np.random.choice(n_Corpus, 1)[0]
        Y[row, word2int[centre]] = 1

        # Clip the window so it never reaches outside the corpus.
        first = max(centre - WINDOW_SIZE, 0)
        last = min(centre + WINDOW_SIZE, n_Corpus - 1)
        for pos in range(first, last + 1):
            if pos == centre:
                continue
            X[row, word2int[pos]] = 1

    return X, Y

In [15]:
# Generate one sample batch and display the resulting (X, Y) pair.
data_generate(
    n_Train=n_Train,
    n_Voc=n_Voc,
    n_Corpus=n_Corpus,
    WINDOW_SIZE=WINDOW_SIZE,
    word2int=word2int,
)

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]))

In [16]:
import tensorflow as tf

# Width of the input and output layers: one unit per vocabulary id.
ONE_HOT_DIM = n_Voc

# Placeholders for one batch of inputs and labels; the leading dimension is
# left open so that any batch size can be fed.
x = tf.placeholder(tf.float64, shape=(None, ONE_HOT_DIM))
y_label = tf.placeholder(tf.float64, shape=(None, ONE_HOT_DIM))

# Width of the hidden layer, i.e. the dimensionality of each word vector.
EMBEDDING_DIM = 30

# Hidden layer: its weights (plus bias) are what is later read out as the
# word vectors. The bias is a single scalar broadcast over all units.
W1 = tf.Variable(tf.random_normal([ONE_HOT_DIM, EMBEDDING_DIM], dtype=tf.float64))
b1 = tf.Variable(tf.random_normal([1], dtype=tf.float64))
hidden_layer = tf.add(tf.matmul(x, W1), b1)

# Output layer: unnormalised scores over the vocabulary, plus their softmax.
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, ONE_HOT_DIM], dtype=tf.float64))
b2 = tf.Variable(tf.random_normal([1], dtype=tf.float64))
logits = tf.add(tf.matmul(hidden_layer, W2), b2)
prediction = tf.nn.softmax(logits)

# Loss: mean cross entropy over the batch. It is computed from the logits by
# the fused op rather than as log(softmax(...)): with randomly initialised
# weights a softmax probability can underflow to 0, and log(0) would turn the
# loss into inf/NaN.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_label, logits=logits)
)

# Training operation: plain gradient descent with learning rate 0.05.
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


**Train**

In [17]:
%%time

# Open a session and initialise every variable of the graph.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

iteration = 20000
for i in range(iteration + 1):
    # Draw a fresh random batch: each row of X marks the words surrounding a
    # randomly chosen corpus position, the matching row of Y marks the word
    # at that position.
    X, Y = data_generate(n_Train, n_Voc, n_Corpus, WINDOW_SIZE, word2int)
    batch = {x: X, y_label: Y}
    sess.run(train_op, feed_dict=batch)
    # Every 500 steps, report the loss on the batch that was just trained on.
    if not i % 500:
        print('iteration '+str(i)+' loss is : ', sess.run(loss, feed_dict=batch))

iteration 0 loss is :  51.86691596610763
iteration 500 loss is :  49.97241429931691
iteration 1000 loss is :  47.666399378094866
iteration 1500 loss is :  48.96725584069278
iteration 2000 loss is :  43.60268463901727
iteration 2500 loss is :  45.62289750878261
iteration 3000 loss is :  44.846478017937756
iteration 3500 loss is :  42.86972258313058
iteration 4000 loss is :  40.70336120461813
iteration 4500 loss is :  43.81406950035069
iteration 5000 loss is :  42.75522304353021
iteration 5500 loss is :  40.35201593363885
iteration 6000 loss is :  41.687316811872535
iteration 6500 loss is :  38.85005114050509
iteration 7000 loss is :  39.00162116204112
iteration 7500 loss is :  41.84470323988038
iteration 8000 loss is :  33.88606290894088
iteration 8500 loss is :  42.168658714458125
iteration 9000 loss is :  40.01659987190755
iteration 9500 loss is :  41.00725540925132
iteration 10000 loss is :  39.801117146318255
iteration 10500 loss is :  37.79810159786446
iteration 11000 loss is :  39

**Visualization**

In [18]:
import sklearn
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.manifold import TSNE

# Smoke test of t-SNE on four toy points in 3-D.
Z = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
# The perplexity has to be smaller than the number of samples; the default of
# 30 is rejected for a 4-point input by recent scikit-learn releases, so a
# small value is passed explicitly.
Z_embedded = TSNE(n_components=2, perplexity=2).fit_transform(Z)
Z_embedded.shape

(4, 2)

In [19]:
# Read the trained embedding matrix out of the session: the hidden-layer
# weights with the (broadcast) hidden-layer bias added. Row i is the vector
# for word id i.
embedding_op = tf.add(W1, b1)
vectors = sess.run(embedding_op)
vectors

array([[-1.59156672, -0.57026222, -0.69728516, ..., -0.62832601,
        -0.31088168, -1.35810445],
       [-0.63528738, -0.66813511, -0.6124754 , ..., -0.57947239,
        -0.58757997, -0.64819941],
       [-1.09588446, -1.36926045, -0.44086647, ..., -1.29533656,
        -1.16729813, -1.18687859],
       ...,
       [-0.68109982, -1.9117866 ,  0.75411158, ..., -0.01693344,
        -0.14862241,  0.02020211],
       [-0.88872772,  0.27664483, -0.46467466, ..., -0.76794293,
         0.62109894, -2.96886936],
       [-0.98949307,  0.33704659,  1.30515209, ...,  0.64238983,
        -2.31984718, -1.4030897 ]])

In [0]:
# Project the word vectors to two dimensions with t-SNE (perplexity 100).
tsne_2d = TSNE(perplexity=100)
vector_embedded_2d = tsne_2d.fit_transform(vectors)
vector_embedded_2d.shape

In [0]:
# Vocabulary in id order. str_to_int hands out ids in order of first
# appearance in all_words, so de-duplicating all_words while keeping that
# order gives a list in which words[i] is the word with id i (and therefore
# the label of row i of the embedding matrix).
words = list(collections.OrderedDict.fromkeys(all_words))
len(words)

In [0]:
def tsne_plot_2d(label, embeddings, words=None, a=1, filename="hhh.png"):
    """Scatter-plot 2-D embeddings, optionally annotating points with words.

    The figure is saved as a PNG and then shown.

    Args:
        label: legend label for the scatter series.
        embeddings: array whose first two columns are the x and y coordinates.
        words: optional sequence of annotations; ``words[i]`` is drawn next
            to point ``i``. ``None`` (the default) draws no annotations.
        a: alpha (opacity) of the scatter points.
        filename: path of the PNG file to write.
    """
    # None instead of a mutable [] default; treat it as "no annotations".
    if words is None:
        words = ()

    plt.figure(figsize=(16, 9))
    # A single RGBA row: every point is drawn in the same colour.
    colors = cm.rainbow(np.linspace(0, 1, 1))
    xs = embeddings[:, 0]
    ys = embeddings[:, 1]
    plt.scatter(xs, ys, c=colors, alpha=a, label=label)
    for i, word in enumerate(words):
        plt.annotate(word, alpha=0.3, xy=(xs[i], ys[i]), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom', size=10)
    plt.legend(loc=4)
    plt.grid(True)
    plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()

# Labels must line up with the embedding rows: row i belongs to word id i,
# and str_to_int assigns ids in order of first appearance in all_words.
# De-duplicating all_words while keeping that order therefore yields the
# words in id order (an unordered set could not provide this).
id_ordered_words = list(collections.OrderedDict.fromkeys(all_words))
tsne_plot_2d('Visualizing Embeddings using t-SNE', vector_embedded_2d, id_ordered_words, a=0.1)

In [0]:
# Project the word vectors to three dimensions with t-SNE.
tsne_3d = TSNE(n_components=3)
vector_embedded_3d = tsne_3d.fit_transform(vectors)
vector_embedded_3d.shape

In [0]:
from mpl_toolkits.mplot3d import Axes3D

In [0]:
def tsne_plot_3d(title, label, embeddings, a=1):
    """Scatter-plot 3-D embeddings on a 3-D axes and show the figure.

    Args:
        title: title drawn above the axes.
        label: legend label for the scatter series.
        embeddings: array whose first three columns are the x, y and z
            coordinates.
        a: alpha (opacity) of the scatter points.
    """
    fig = plt.figure()
    # Create the 3-D axes through the figure so that it is attached to it.
    ax = fig.add_subplot(111, projection='3d')
    # A single RGBA row: every point is drawn in the same colour.
    colors = cm.rainbow(np.linspace(0, 1, 1))
    # Call scatter on the 3-D axes itself: pyplot's scatter treats its third
    # positional argument as the marker size, not as the z coordinate.
    ax.scatter(embeddings[:, 0], embeddings[:, 1], embeddings[:, 2],
               c=colors, alpha=a, label=label)
    ax.legend(loc=4)
    ax.set_title(title)
    plt.show()


# The legend label names the corpus file this notebook actually reads.
tsne_plot_3d('Visualizing Embeddings using t-SNE', 'zhwiki-20190401_all.txt', vector_embedded_3d, a=0.1)

**Remove stop words**

In [0]:
# import nltk
# nltk.download('stopwords')

In [0]:
# stopwords = nltk.corpus.stopwords.words('english')
# cleaned_words = []

# for word in words:
#     if word not in stopwords:
#         cleaned_words.append(word)