In [9]:
# 导包
import collections
import math
import os
import random
import zipfile
import numpy as np
import urllib
import tensorflow as tf
import pandas as pd

In [10]:
def get_words(data_df, feature = 'words'):
    """Flatten one text column of ``data_df`` into a single list of tokens.

    Every cell is processed as ``cell[1:-1].split(';')``: the first and last
    characters are dropped (presumably enclosing brackets or quotes -- confirm
    against the CSV files) and the remainder is split on ``';'``.

    Args:
        data_df: pandas DataFrame containing the column ``feature``.
        feature: Name of the column that holds the ';'-separated tokens.

    Returns:
        list[str]: every non-empty token, in row order.
    """
    tokens = []
    for sent in data_df[feature].values:
        # Empty CSV cells come back from pandas as NaN (a float) and None is
        # also possible in object columns; neither can be sliced, so skip them
        # instead of failing with a TypeError.
        if not isinstance(sent, str):
            continue
        tokens.extend(word for word in sent[1:-1].split(';') if word != '')
    return tokens

In [16]:
# "hancks" word segmentation (name kept from the original note)
model_path = '../input/'

# Read the training split and the prediction split from the same directory.
train_word, test_word = [
    pd.read_csv(model_path + csv_name)
    for csv_name in ('train_word.csv', 'predict_word.csv')
]
# Stack both splits into one frame before extracting the tokens.
data_word = pd.concat([train_word, test_word])

words = get_words(data_word, feature = 'words')
print(len(words), words[0:100])

3169331 ['好', '大', '的', '一个', '游乐', '公园', '已经', '去', '了', '2', '次', '但', '感觉', '还', '没有', '玩', '够', '似的', '会', '有', '第', '三', '第', '四', '次', '的', '新中国', '成立', '也', '是', '在', '这', '举行', '对', '我们', '中国人', '来说', '有些', '重要', '及', '深刻', '的', '意义', '庐山', '瀑布', '非常', '有名', '也', '有', '非常', '多', '个', '瀑布', '只是', '最', '好看', '的', '非', '三叠泉', '莫', '属', '推荐', '一', '去', '个人', '觉得', '颐和园', '是', '北京', '最', '值', '的', '一起', '的', '地方', '不过', '相比', '下', '门票', '也', '是', '最贵', '的', '比起', '故宫', '的', '雄伟', '与', '气势磅礴', '颐和园', '的', '宁静', '与', '波光粼粼', '更加', '美', '吧', '~', '迪斯尼', '一日游']


In [17]:
vocabulary_size = 50000
def build_dataset(words):
    """Encode a token list as integer ids over a frequency-capped vocabulary.

    Id 0 is reserved for the placeholder ``'UNK'``; ids 1.. are assigned to
    the ``vocabulary_size - 1`` most frequent tokens in descending frequency
    order. Tokens outside that set are encoded as 0.

    A token that is literally ``'UNK'`` is treated as unknown. Letting it into
    the frequency table would overwrite the reserved entry, leaving two words
    with the same id and id 0 without a reverse mapping.

    Args:
        words: list of tokens (iterated twice, so it must not be a one-shot
            iterator).

    Returns:
        tuple ``(data, count, dictionary, reverse_dictionary)`` where
        ``data`` is the list of ids for ``words``;
        ``count`` is ``[['UNK', n_unknown], (word, freq), ...]``;
        ``dictionary`` maps word -> id;
        ``reverse_dictionary`` maps id -> word.
    """
    # Keep the placeholder out of the frequency table so that ids stay unique.
    frequencies = collections.Counter(word for word in words if word != 'UNK')
    count = [['UNK', -1]]
    count.extend(frequencies.most_common(vocabulary_size - 1))
    # Position in `count` is the id: 'UNK' -> 0, most frequent word -> 1, ...
    dictionary = {word: index for index, (word, _) in enumerate(count)}
    data = [dictionary.get(word, 0) for word in words]
    # Only unknown tokens map to 0, so counting zeros yields the UNK frequency.
    count[0][1] = data.count(0)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words) # Store the corpus as integer ids, which saves memory

In [18]:
# The raw token list is dropped here; `data` holds the same corpus as ids.
del words
print('Most common words (+UNK)', count[:5])
# Show the first ten ids next to the words they decode to.
sample_ids = data[:10]
print('Sample data', sample_ids, [reverse_dictionary[word_id] for word_id in sample_ids])

Most common words (+UNK) [['UNK', 46942], ('的', 206104), ('是', 54711), ('了', 54278), ('去', 41036)]
Sample data [25, 66, 1, 22, 1178, 143, 161, 4, 3, 171] ['好', '大', '的', '一个', '游乐', '公园', '已经', '去', '了', '2']


In [19]:
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
    """Produce one skip-gram training batch from the module-level ``data``.

    A window of ``2 * skip_window + 1`` ids slides over ``data`` (wrapping at
    the end). For each window the middle id is the center word, and
    ``num_skips`` distinct other positions in the window are drawn at random
    as its context words.

    The read position is kept in the module-level ``data_index`` so that
    successive calls continue through the corpus.

    Args:
        batch_size: Number of (center, context) pairs; must be a multiple of
            ``num_skips``.
        num_skips: Pairs generated per center word; at most ``2 * skip_window``.
        skip_window: Number of words considered on each side of the center.

    Returns:
        tuple ``(batch, labels)``: ``batch`` is an int32 array of shape
        ``(batch_size,)`` holding center ids, ``labels`` an int32 array of
        shape ``(batch_size, 1)`` holding the matching context ids.

    Raises:
        AssertionError: if the constraints on ``num_skips`` are violated.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    # Every slot is written below, so uninitialised storage is fine.
    batch = np.empty(shape=(batch_size,), dtype=np.int32)
    labels = np.empty(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Every window position except the center is a candidate context word.
    context_positions = [pos for pos in range(span) if pos != skip_window]
    for i in range(batch_size // num_skips):
        # Sampling without replacement gives num_skips distinct context words.
        for j, pos in enumerate(random.sample(context_positions, num_skips)):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[pos]
        # maxlen=span makes append drop the oldest id: the window slides by one.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # The window currently in `buffer` was loaded but never used as a center.
    # Step back by `span` so the next call starts with exactly that window
    # instead of skipping `span` center words between batches.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels



In [20]:
# Smoke test: draw a tiny batch and print each (center -> context) pair
# both as ids and as the words they decode to.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for center_id, context_id in zip(batch, labels[:, 0]):
    print(center_id, reverse_dictionary[center_id], '->', context_id, reverse_dictionary[context_id])

66 大 -> 25 好
66 大 -> 1 的
1 的 -> 22 一个
1 的 -> 66 大
22 一个 -> 1178 游乐
22 一个 -> 1 的
1178 游乐 -> 22 一个
1178 游乐 -> 143 公园


In [21]:
batch_size = 128       # (center, context) pairs fed per training step
embedding_size = 128   # width of each embedding vector
skip_window = 1        # words considered on each side of the center word
num_skips = 2          # pairs generated per center word

valid_size = 16        # number of word ids used for the nearest-neighbour check
valid_window = 100     # validation ids are drawn from [0, valid_window)
# valid_size distinct random ids from [0, valid_window). The name is spelled
# with an ASCII underscore so it matches every later use of valid_examples.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64       # passed as num_sampled to tf.nn.nce_loss

In [22]:
# Build the skip-gram graph (TensorFlow 1.x graph-mode API).
graph = tf.Graph()
with graph.as_default():
    # Inputs fed on every training step: center-word ids and context-word ids.
    train_inputs = tf.placeholder(tf.int32, shape = [batch_size])
    train_labels = tf.placeholder(tf.int32, shape = [batch_size, 1])
    # Fixed set of word ids whose nearest neighbours are printed during training.
    valid_dataset = tf.constant(valid_examples, dtype = tf.int32)
    # Embedding table: one row per vocabulary id, initialised uniformly in [-1, 1).
    embeddings = tf.Variable(
                tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    # Rows of the embedding table selected by the ids in train_inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    # Weights and biases handed to tf.nn.nce_loss below.
    nce_weights = tf.Variable(
                tf.truncated_normal([vocabulary_size, embedding_size], 
                                   stddev = 1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    # Mean NCE loss over the batch, using num_sampled sampled classes.
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=train_labels,
                                         inputs=embed,
                                         num_sampled=num_sampled,
                                         num_classes=vocabulary_size))
    # Plain SGD with learning rate 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    # Row-wise L2 norm, kept as a column so the division below broadcasts per row.
    # NOTE(review): `keep_dims` looks like the older spelling of `keepdims` --
    # confirm against the installed TensorFlow version before renaming.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings =tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    # Dot products of unit-length rows, i.e. cosine similarity between each
    # validation word and every vocabulary word.
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)
    init = tf.global_variables_initializer()

In [23]:
num_steps = 100001

with tf.Session(graph=graph) as session:
    # Variables have to be initialised before the first training step.
    init.run()
    print('Initialized')

    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        # One SGD step; the optimizer op's own result is not needed.
        _, loss_val = session.run(
            [optimizer, loss],
            feed_dict={train_inputs : batch_inputs, train_labels : batch_labels})
        average_loss += loss_val

        # Every 2000 steps report the mean loss since the previous report
        # (at step 0 only a single loss value has been accumulated).
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            print('Average loss at step', step, ': ', average_loss)
            average_loss = 0

        # Every 10000 steps list the closest words to each validation word.
        if step % 10000 == 0:
            sim = similarity.eval()
            top_k = 8
            for i, valid_id in enumerate(valid_examples):
                valid_word = reverse_dictionary[valid_id]
                # Sort by descending similarity; position 0 is skipped, which
                # is expected to be the validation word itself.
                nearest = np.argsort(-sim[i, :])[1:top_k + 1]
                log_str = 'Nearest to % s: ' % valid_word
                for k in range(top_k):
                    closed_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s, " % (log_str, closed_word)
                print(log_str)
    # Snapshot of the unit-length embeddings after the last step.
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step 0 :  294.903564453125
Nearest to 等:  西对,  SPA,  吹发,  灯会,  白马涧,  對於,  大元帅,  花雨, 
Nearest to 去:  水殿,  1.4,  朗朗上口,  坐船来,  上下,  慎,  氧气瓶,  拆卸, 
Nearest to 这个:  外人,  2.3,  昆明,  雪花,  宝冠,  黄线,  短途,  穿衣服, 
Nearest to 西湖:  自然保护区,  里能,  灵兽,  够劲,  母亲节,  宋真宗,  南广场,  整个, 
Nearest to 最:  藉,  翘角,  武当,  没赶上,  摸着,  溜溜,  中小,  适应, 
Nearest to 有:  鎏金,  揣,  沟沟,  形制,  基本相同,  砥砺,  那一次,  西部, 
Nearest to 但:  捐,  伤痕,  铁栏杆,  腹心,  吴隐,  古塔,  设立,  有客栈, 
Nearest to 这里:  慵懒,  有机,  水乡好点,  不卖,  夸张,  岱山,  胎,  雄壮, 
Nearest to 两:  乡试,  >n--,  回头,  少少,  基金,  堆成,  太阳石,  桃, 
Nearest to 不错:  冰上,  上官,  三十三,  太酷,  负,  仅,  稍,  航空公司网站, 
Nearest to 特别:  土砖,  山谷,  以备,  节日期间,  拒人千里,  小王子,  岭南,  比赛, 
Nearest to 看看:  耳熟能详,  联想起,  扬州,  爱情,  石景山,  那多,  贪吃,  山间, 
Nearest to 了:  坝顶,  哪知,  井冈山革命博物馆,  中国,  西魏,  倒插,  彷佛,  滑润, 
Nearest to 能:  平台,  地理书,  参天大树,  吉普车,  逐,  附身,  盐池,  6666, 
Nearest to 因为:  总投资,  族规,  秦勇,  嫂,  飞来为,  降临,  快照,  调味, 
Nearest to 不是:  山和云,  抽走,  大佛像,  玉渊潭公园,  淮北,  翻造,  梨木,  大华山镇, 
Averag