In [1]:
import collections
import math
import os
import sys
import argparse
import random
from tempfile import gettempdir
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

from tensorflow.contrib.tensorboard.plugins import projector

  return f(*args, **kwds)


In [33]:
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words."""
  with zipfile.ZipFile(filename) as f:
    # zipfile读取文件后，得到文件名，再用f.read得到bytes。python3中bytes是对应编码的，这里应该是默认的utf-8
    # 再使用tensorflow的as_str进行解码后，切分为单词list。as_str将bytes或者unicode均解码为unicode，且能兼容不同的python版本
    # compat的意思就是compatible
#     print(type(f.read(f.namelist()[0])))
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  return data

In [56]:
# 读取文件
filename = 'data/text8.zip'
vocabulary = read_data(filename)
print('Data size', len(vocabulary))

Data size 17005207


In [45]:
# 通过给定的词典大小，将出现次数较少的单词设置为UNK词，从而缩小词典
# 返回各个词在词典中的编号（codes），以及各词的词频等信息
# data - list of codes (integers from 0 to vocabulary_size-1).
#   This is the original text but words are replaced by their codes
# count - map of words(strings) to count of occurrences
# dictionary - map of words(strings) to their codes(integers)
# reverse_dictionary - maps codes(integers) to words(strings)
def build_dataset(words, n_words):
  """Process raw inputs into a dataset."""
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(n_words - 1))
  # dictionary中存储每个单词到编号的映射
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  # data为存储所有单词编号的list
  data = list()
  # 将所有低频词的频率相加，作为UNK的频率
  unk_count = 0
  for word in words:
    index = dictionary.get(word, 0)
    if index == 0:  # dictionary['UNK']
      unk_count += 1
    data.append(index)
  # 设置UNK的词频
  count[0][1] = unk_count
  reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reversed_dictionary

In [57]:
# 生成词典和编号与词的映射
vocabulary_size = 50000
data, count, dictionary, reverse_dictionary = build_dataset(
    vocabulary, vocabulary_size)
# del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])



Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [59]:
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  # 为何batch_size一定要能整除num_skips?
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  # 生成一个大小为span的双端队列
  buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
  # 当前数据索引 + 滑动窗口大小如果大于了data的长度，则从头开始
  if data_index + span > len(data):
    data_index = 0
  # 初始化双端队列，设置为data中的第一个窗口[0 ~ span]，不包括span, list的特性是大于等于第一个索引，小于最后一个索引  
  buffer.extend(data[data_index:data_index + span])
  data_index += span
  # 总共需要生成batch_size个（contest，target)样本，分为batch_size // num_skips组，每组里面有num_skip个sample
  for i in range(batch_size // num_skips): # 取整
    # 由于 span一定是奇数，且一定以skip_window所在的位置为中心，所以可以以此获取context
    context_words = [w for w in range(span) if w != skip_window]
    # 从上下文词语中随机抽取num_skips个词语，用于生成一个batch中的一组sample
    words_to_use = random.sample(context_words, num_skips)
    for j, context_word in enumerate(words_to_use):
      # batch里面的索引公式，提现了每组有nums_skips个sample，总共i组的推断
      # COBOW由contexts推断target，即包含多个(contexts, target)组成的sample，如：([the, brown], quick), ([quick, fox], brown)
      # 而skip-gram正好相反，由target预测某一个contex的概率，即：(quick, the), (quick, brown), (brown, quick), (brown, fox)
      # 这里batch存储的是target单测的索引，lables存储的是对应的context word的索引，每一对batch-labels为一个正样本
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[context_word]
    # 若滑动窗口到了数据的结尾，则双端队列的滑动窗口又从0开始
    if data_index == len(data):
      buffer.extend(data[0:span])
      data_index = span
    else: # 否则的话，双端队列对应的窗口往前移动一位
      buffer.append(data[data_index])
      data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
  # 将data_index往前回溯span个位置，防止下次采样跳过上次batch中结尾的单词
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels

In [61]:
data_index = 0
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
        reverse_dictionary[labels[i, 0]])
len(batch)


3081 originated -> 5234 anarchism
3081 originated -> 12 as
12 as -> 3081 originated
12 as -> 6 a
6 a -> 12 as
6 a -> 195 term
195 term -> 6 a
195 term -> 2 of


8

In [66]:
# Step 4: Build and train a skip-gram model.

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
num_sampled = 64  # Number of negative examples to sample.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

graph = tf.Graph()

with graph.as_default():

  # Input data.
  with tf.name_scope('inputs'):
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Ops and variables pinned to the CPU because of missing GPU implementation
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    with tf.name_scope('embeddings'):
      embeddings = tf.Variable(
          tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
      embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss
    with tf.name_scope('weights'):
      nce_weights = tf.Variable(
          tf.truncated_normal(
              [vocabulary_size, embedding_size],
              stddev=1.0 / math.sqrt(embedding_size)))
    with tf.name_scope('biases'):
      nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  # Explanation of the meaning of NCE loss:
  #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
  with tf.name_scope('loss'):
    loss = tf.reduce_mean(
        tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=train_labels,
            inputs=embed,
            num_sampled=num_sampled,
            num_classes=vocabulary_size))

  # Add the loss value as a scalar to summary.
  tf.summary.scalar('loss', loss)

  # Construct the SGD optimizer using a learning rate of 1.0.
  with tf.name_scope('optimizer'):
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between minibatch examples and all embeddings.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                            valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

  # Merge all summaries.
  merged = tf.summary.merge_all()

  # Add variable initializer.
  init = tf.global_variables_initializer()

  # Create a saver.
  saver = tf.train.Saver()

TypeError: reduce_sum() got an unexpected keyword argument 'keepdims'

[1B 90177[17;26H[31m[1m38.3G [m[36m5064M 2450M [m[mS 15.4  3.1 [31m[1m11h[m[m02:15 [32mpython train_ima[18;1H[m[m 90184[18;26H[31m[1m38.3G [m[36m5064M 2450M [m[mS 14.7  3.1 [31m[1m11h[m[m02:19 [32mpython train_ima[19;4H[m[m218[19;35H[36m4[19;47H[m[m4.7[6C[31m[1m 7h[m[m01:2[20;5H72[20;35H[36m4[20;47H[m[m4.7[20;62H36[21;4H210[21;35H[36m4[21;44H[32mR [m[m14.0[6C[31m[1m 7h[m[m01:54[22;5H69[22;35H[36m4[22;47H[m[m4.0[22;62H19[23;5H83[23;35H[36m4[23;44H[32mR [m[m14.0[6C[31m[1m11h[m[m02:47[H [m[36m5060M 2450M [m[mS 104.  3.1 [31m[1m 6h[m[m59:15 [32mpython train_ima[24;1H[m[mF1[30m[46mHelp  [m[mF2[30m[46mSetup [m[mF3[30m[46mSearch[m[mF4[30m[46mFilter[m[mF5[30m[46mTree  [m[mF6[30m[46mSortBy[m[mF7[30m[46mNice -[m[mF8[30m[46mNice +[m[mF9[30m[46mKill  [m[mF10[30m[46mQui[4ht[4l[H[m[m|||[30m[1m50.0%[m][m   [36m17 [m[1m[[m[32m|||||100.0%[m[1m][m   [36m23 

[2;12H[31m|[30m[1m44.0[2;35H5[2;47H[m[31m|[30m[1m   21[2;70H30.0[3;14H5.0[3;31H[m[31m|[30m[1m49.3[3;47H[m[31m||[30m[1m  26.7[3;67H[m[32m|[31m||[30m[1m40.4[4;9H[m[31m|[30m[1m   21.3[4;33H9.3[4;47H[m[32m|[31m||[30m[1m 28.1[4;71H4.8[5;13H33.8[5;32H31.5[5;47H[m[32m|[31m|[30m[1m  24.1[5;67H[m[31m|[6;14H[30m[1m2.9[6;29H[m[32m|[31m||[30m[1m43.7[6;48H[m[32m|[31m||[30m[1m42[6;70H28.6[7;12H[m[31m|[30m[1m43.5[7;32H32.9[7;52H9.2[7;67H[m[31m||[30m[1m 35.2[8;30H[m[33m1[8;61H[32m[1m16[9;59H[m8 [36m13.35 [m[36m12.84[10;68H[1m40[13;29H[m[30m[46m4G 5055[13;47H77[14;33H[m[36m371[14;48H[m[m6.  2.1  0:52.51[15;29H[31m[1m4G [m[36m5055[15;44H[32mR [m[m21.2[16;6H8[16;29H[31m[1m4G [m[36m5055[16;44H[m[mS 17.7[16;60H2:08[17;6H2[17;29H[31m[1m4G [m[36m5055[17;44H[m[mS 17.0[17;60H1:38[18;5H69[18;29H[31m[1m4G [m[36m5055[18;44H[m[mS 17.0[18;60H1[19;5H86[19;29H[31m[1m4G [m

[2;10H[31m|[30m[1m  33.6[2;28H[m[31m||[30m[1m  32.7[2;47H[m[31m||[30m[1m  26.0[2;65H[m[31m|[30m[1m     6.2[3;9H[m[31m||[30m[1m  29.1[3;30H  26[3;49H  31.1[3;67H[m[31m||[30m[1m 37.0[4;9H[m[31m|[30m[1m   15.0[4;31H[m[31m|[30m[1m47[4;50H[m[32m|80.[30m[1m3[4;68H  30.8[5;10H[m[31m||[30m[1m 35.2[5;26H       0.0[5;48H[m[31m|[30m[1m  27.7[6;10H[m[31m||[30m[1m 34.5[6;32H[m[32m70[31m.[30m[1m1[6;51H[m[31m6[30m[1m0.8[6;68H  28.1[7;14H9[7;30H[m[31m|[30m[1m 42[7;47H[m[31m|[30m[1m   20.8[7;67H[m[31m|[30m[1m  21.9[8;30H[m[33m7[8;62H[32m[1m7[10;68H[36m51[13;34H[m[30m[46m65[13;46H652.[14;33H[m[36m95[14;47H[m[m03.  2.5  1:03.10[15;6H4[15;34H[36m65[15;47H[m[m7.9[7C[31m[1m6h[m[m59:02[16;6H2[16;34H[36m65[16;46H[m[m60.9[16;60H0:49[17;5H18[17;34H[36m65[17;44H[m[mS 42.5[7C[31m[1m7h[m[m01:25[18;5H01[18;34H[36m65[18;44H[m[mS 15.1[7C[31m[1m7h[m[m09:34[19;3H0168[19

[?1l>[30m[1m5.9[2;33H8.4[2;51H38[2;68H  29.2[3;9H[m[31m||||66[30m[1m.7[3;28H[m[32m|[31m|[30m[1m  22.1[3;54H2[3;70H24[4;14H0.4[4;28H[m[32m|[31m|[30m[1m  25.4[4;52H4.8[4;67H[m[31m|[30m[1m  32.6[5;12H[m[31m|[30m[1m43.0[5;31H[m[31m|[30m[1m37.7%[5;52H8[5;64H[m[32m|[31m|||[30m[1m  31.8[6;14H8.6[6;28H[m[31m|[30m[1m   22.8[6;52H2.9[6;68H  28.0[7;9H[m[31m||[30m[1m  22.3[7;29H[m[32m|[31m||[30m[1m39.6[7;48H[m[32m|[31m|[30m[1m 33.8[7;71H8.7[8;30H[m[33m8[10;69H[36m[1m1[13;35H[m[30m[46m2[13;46H49[14;32H[m[36m409[14;48H[m[m1.  2.5  1:13.55[15;4H220[15;35H[36m2[15;46H[m[m26.4[7C[31m[1m2h[m[m23:10[16;5H70[16;35H[36m2[16;44H[32mR [m[m17.4[7C[31m[1m1h[m[m01:50[17;4H177[17;35H[36m2[17;49H[m[m7[7C[31m[1m1h[m[m02:1[18;35H[36m2[18;47H[m[m6.0[19;6H9[19;35H[36m2[19;44H[32mR [m[m15.3[19;60H8:39[20;5H86[20;35H[36m2[20;47H[m[m3.2[20;63H0[21;6H2[21;35H[36m2[21;47H[

In [64]:
!ls


Basic_word2vector.ipynb		 book
Recommander  System.xmind	 cf
Recommender System.ipynb	 data
__init__.py			 tensorFlow DeepFM实验.ipynb
algo.qq.com_641013010_testa.zip  util
