In [1]:
import sys
import os

import tensorflow as tf
print('TensorFlow:{}'.format(tf.__version__))
import numpy as np
print('NumPy:{}'.format(np.__version__))
import pandas as pd
print("Pandas:{}".format(pd.__version__))

import matplotlib.pyplot as plt

# Put the parent of the working directory on the import path
# (presumably so the project-local `datasetslib` package imported below
# can be found -- confirm against the repository layout).
current_path = os.getcwd()
base_dir = os.path.dirname(current_path)
if base_dir not in sys.path:
    sys.path.append(base_dir)

# Show the effective search path for debugging import problems.
print(sys.path)

%reload_ext autoreload
%autoreload 2
import datasetslib


from keras.models import Model
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.sequence import skipgrams 
from keras.layers import Input, Dense, Reshape, Dot, merge
import keras

from datasetslib.ptb import PTBSimple
from datasetslib import util as dsu
from datasetslib import nputil

ptb = PTBSimple()
# Load the data: words are converted to ids and the files to lists of ids.
ptb.load_data()

# Start from an empty TensorFlow graph / Keras session so that re-running
# this cell does not pile up duplicate layers in the default graph.
tf.reset_default_graph()
keras.backend.clear_session()

# Seed NumPy so the validation word ids drawn below are the same on every run.
# NOTE(review): this does not make the skip-gram sampling itself deterministic;
# Keras' skipgrams() presumably draws from Python's `random` module -- confirm.
np.random.seed(123)

# Pick `valid_size` distinct word ids out of the first `valid_size * 10` ids;
# these are the query words used later when inspecting nearest neighbours.
valid_size = 8
x_valid = np.random.choice(valid_size * 10, valid_size, replace=False)
print('valid: ', x_valid)

# Hyper-parameters of the skip-gram model.
batch_size = 1024
embedding_size = 512
n_negative_samples = 64
ptb.skip_window = 2

# Build (target, context) id pairs from the training ids together with a
# 0/1 label per pair; the sampling table is passed to skipgrams() to
# sub-sample words.
sample_table = sequence.make_sampling_table(ptb.vocab_len)
pairs, labels = sequence.skipgrams(ptb.part['train'], ptb.vocab_len, window_size=ptb.skip_window, sampling_table=sample_table)

# Print the first few pairs as "id word" together with their label.
print('The skip-gram pairs: target, context')
for i in range(5 * ptb.skip_window):
    # `word_id` instead of `id` to avoid shadowing the builtin.
    print(['{} {}'.format(word_id, ptb.id2word[word_id]) for word_id in pairs[i]], ':', labels[i])
    
    

TensorFlow:1.13.1
NumPy:1.16.3
Pandas:0.24.2
['c:\\python36\\python36.zip', 'c:\\python36\\DLLs', 'c:\\python36\\lib', 'c:\\python36', '', 'c:\\python36\\lib\\site-packages', 'c:\\python36\\lib\\site-packages\\pip-9.0.1-py3.6.egg', 'c:\\python36\\lib\\site-packages\\IPython\\extensions', 'C:\\Users\\WJ\\.ipython', 'D:\\Work\\tensorflow_test']


Using TensorFlow backend.


Already exists: ./datasets\ptb-simple\simple-examples.tgz
valid:  [63 60 46  3 33 68 22 26]
The skip-gram pairs: target, context
['691 pacific', '9508 trademark'] : 0
['5684 guests', '4261 discounts'] : 0
['9058 insider-trading', '2735 forms'] : 0
['349 without', '314 until'] : 1
['3163 controlling', '866 acquire'] : 1
['4779 reopen', '779 failed'] : 0
['4086 buick', '3658 whitbread'] : 0
['716 chemical', '88 such'] : 1
['211 among', '1233 death'] : 1
['6097 lag', '1 <unk>'] : 1


In [1]:
# Convert the target words, context words and labels to 2-D arrays.
# Requires `pairs` and `labels` from the skip-gram cell above: run that cell
# first, otherwise this cell fails with a NameError on `pairs`.
target_ids, context_ids = zip(*pairs)

x = nputil.to2d(np.array(target_ids, dtype=np.int32), unit_axis=1)
y = nputil.to2d(np.array(context_ids, dtype=np.int32), unit_axis=1)
labels = nputil.to2d(np.array(labels, dtype=np.int32), unit_axis=1)

# Target-word branch: one word id -> embedding, reshaped to (embedding_size, 1).
target_in = Input(shape=(1,), name='target_in')
target_embedding = Embedding(input_dim=ptb.vocab_len,
                             output_dim=embedding_size,
                             input_length=1,
                             name='target_em')
target = target_embedding(target_in)
target = Reshape(target_shape=(embedding_size, 1), name='target_re')(target)

# Context-word branch: same structure, with its own embedding matrix.
context_in = Input(shape=(1,), name='context_in')
context_embedding = Embedding(input_dim=ptb.vocab_len,
                              output_dim=embedding_size,
                              input_length=1,
                              name='context_em')
context = context_embedding(context_in)
context = Reshape(target_shape=(embedding_size, 1), name='context_re')(context)

# Take the inner product of the two embeddings as a similarity score and
# squash it through a single sigmoid unit to predict the 0/1 pair label.
dot_layer = Dot(axes=1, name='output_dot')
output = dot_layer([target, context])
output = Reshape(target_shape=(1,), name='output_re')(output)
output = Dense(units=1, activation='sigmoid', name='output_sig')(output)

# Functional model used to learn the word vectors.
model = Model(inputs=[target_in, context_in], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

# Second model sharing the same embeddings, used to read out the cosine
# similarity between a target word and a context word.
# The dot product must run over the embedding axis (axis 1 of the
# (batch, embedding_size, 1) tensors). With axes=0, `normalize=True`
# L2-normalises along the batch axis, so the result is not a per-sample
# cosine similarity, and newer Keras versions reject a dot over axis 0.
similarity = Dot(axes=1, normalize=True, name='sim_dot')([target, context])
similarity_model = Model(inputs=[target_in, context_in], outputs=similarity)

# Training configuration.
n_epochs = 5
batch_size = 1024

# Fit on (target id, context id) -> 0/1 label.
model.fit(x=[x, y], y=labels, epochs=n_epochs, batch_size=batch_size)

# For each validation word, print the `top_k` vocabulary words with the
# highest similarity score.
top_k = 5
batch_size = 1024

# Every word id in the vocabulary, as a column, used as the "context" input.
y_val = np.arange(ptb.vocab_len, dtype=np.int32)
y_val = nputil.to2d(y_val, unit_axis=1)

for i in range(valid_size):
    # Query word of this iteration. The original always used x_valid[1],
    # so every row reported neighbours of the same word.
    query_id = x_valid[i]

    # Pair the query word with every vocabulary word and score each pair.
    x_val = np.full(shape=(ptb.vocab_len, 1), fill_value=query_id, dtype=np.int32)
    similarity_scores = similarity_model.predict([x_val, y_val])
    similarity_scores = similarity_scores.flatten()

    # Rank all ids from most to least similar and drop the query word itself.
    # It is excluded explicitly rather than by skipping rank 0: target and
    # context use separate embedding matrices, so the query word is not
    # guaranteed to be its own best match.
    ranked_ids = (-similarity_scores).argsort()
    similar_words = ranked_ids[ranked_ids != query_id][:top_k]

    similar_str = 'Similar to {0:}:'.format(ptb.id2word[query_id])
    for word_id in similar_words:
        similar_str = '{0:} {1:}'.format(similar_str, ptb.id2word[word_id])
    print(similar_str)
    
    

NameError: name 'pairs' is not defined