<a href="https://colab.research.google.com/github/forest1102/-/blob/master/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Requirements:

In [11]:
# Install the notebook's dependencies into the runtime: TensorFlow GPU is pinned
# to the 2.0.0b1 beta; cupy and numpy are left unpinned.
!pip install tensorflow-gpu==2.0.0b1 cupy numpy

Collecting tensorflow-gpu==2.0.0b1
[?25l  Downloading https://files.pythonhosted.org/packages/2b/53/e18c5e7a2263d3581a979645a185804782e59b8e13f42b9c3c3cfb5bb503/tensorflow_gpu-2.0.0b1-cp36-cp36m-manylinux1_x86_64.whl (348.9MB)
[K     |████████████████████████████████| 348.9MB 42kB/s 
[?25hCollecting cupy
[?25l  Downloading https://files.pythonhosted.org/packages/dc/89/99f980706c61e6b96a579a81dea3eb68c22df1b526bf357673be5e18fe31/cupy-7.0.0.tar.gz (3.7MB)
[K     |████████████████████████████████| 3.7MB 30.5MB/s 
Collecting tf-estimator-nightly<1.14.0.dev2019060502,>=1.14.0.dev2019060501
[?25l  Downloading https://files.pythonhosted.org/packages/32/dd/99c47dd007dcf10d63fd895611b063732646f23059c618a373e85019eb0e/tf_estimator_nightly-1.14.0.dev2019060501-py2.py3-none-any.whl (496kB)
[K     |████████████████████████████████| 501kB 37.7MB/s 
Collecting tb-nightly<1.14.0a20190604,>=1.14.0a20190603
[?25l  Downloading https://files.pythonhosted.org/packages/a4/96/571b875cd81dda9d5dfa1422

In [2]:
import tensorflow as tf
# Show which GPU (if any) TensorFlow sees. The call's return value must be
# printed explicitly: it is not the last expression of the cell, so it would
# otherwise be silently discarded. An empty string means no GPU was found.
print('GPU device:', tf.test.gpu_device_name() or '(none found)')

# Smoke test: run one small op end to end to confirm TensorFlow works.
print(tf.reduce_sum(tf.random.normal([1000, 1000])))

tf.Tensor(-67.05016, shape=(), dtype=float32)


In [0]:
!mkdir dataset
!wget https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt -P /content/dataset
!wget https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.test.txt -P /content/dataset

In [1]:
import pickle

import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Dense, Embedding, Lambda, Input, Reshape, Dot, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model

from dataset import ptb
from negative_sampling import generate_with_negative_sample
from util import create_contexts_target, most_similar

# TensorBoard callback handed to training; it logs under logs/cbow.
tensorboard_callback = TensorBoard(log_dir='logs/cbow')

# Hyperparameters for the run.
# window_size is passed to create_contexts_target; the model's context input
# takes window_size * 2 ids per sample.
window_size = 10
# Width of both embedding tables.
hidden_size = 100
# Batch size given to the generator and used to derive steps_per_epoch.
batch_size = 100
# Number of training epochs.
max_epoch = 15
# Forwarded to generate_with_negative_sample -- presumably the number of
# negative samples drawn per positive example; TODO confirm against that module.
sample_size = 5

# Training split: corpus plus both vocabulary mappings.
corpus, word_to_id, id_to_word = ptb.load_data('train')
# Test split: only the first element of the returned value is kept.
test_corpus = ptb.load_data('test')[0]

# Vocabulary size sets the number of rows in the embedding tables.
vocab_size = len(word_to_id)

# Build (contexts, target) pairs for each split using the same window.
contexts, target = create_contexts_target(corpus, window_size)
test_contexts, test_target = create_contexts_target(test_corpus, window_size)

# Sanity check: one "<name> <shape>" line per training array.
print('\n'.join(
    f'{label} {array.shape}'
    for label, array in (('corpus', corpus), ('contexts', contexts), ('target', target))
))


# Two integer-id inputs per sample: window_size * 2 context ids and 1 target id.
contexts_input = Input(shape=(window_size * 2,), name='contexts_input')
target_input = Input(shape=(1,), name='target_input')

# Context-side embedding table (vocab_size rows, hidden_size columns). It is
# kept in its own variable so its weights can be read back after training.
embed = Embedding(vocab_size, hidden_size, input_length=window_size * 2)

# Embed every context id, then average over axis 1 (the context positions)
# to get a single hidden vector per sample.
contexts_embed = embed(contexts_input)
contexts_hidden = Lambda(lambda arr: K.mean(arr, axis=1))(contexts_embed)

# The target id uses a second, separate embedding table; Reshape drops the
# length-1 sequence axis so the result is a flat hidden_size vector.
target_embed = Embedding(vocab_size, hidden_size, input_length=1)(target_input)
target_hidden = Reshape((hidden_size, ))(target_embed)

# Score each (contexts, target) pair by the dot product of the two hidden
# vectors, then pass it through a 1-unit sigmoid layer.
embed_dot = Dot(axes=1)([contexts_hidden, target_hidden])
output = Dense(1, activation='sigmoid')(embed_dot)

model = Model(inputs=[contexts_input, target_input], outputs=output)

# Binary cross-entropy pairs with the model's single sigmoid output.
model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['acc']
)

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

# Train from batches yielded by generate_with_negative_sample. One epoch is
# len(contexts) // batch_size generator steps; progress is logged through the
# TensorBoard callback.
# NOTE(review): fit_generator is deprecated in later TensorFlow releases in
# favour of fit(); it is kept here because the notebook pins 2.0.0b1.
hist = model.fit_generator(
    generate_with_negative_sample(
        corpus, contexts, target, batch_size, sample_size=sample_size),
    steps_per_epoch=len(contexts) // batch_size,
    initial_epoch=0,
    epochs=max_epoch, callbacks=[tensorboard_callback],
)

# Persist the learned context-side word vectors and the vocabulary mappings.
# The weights are read from the `embed` layer directly: model.get_weights()[0]
# relies on the order in which Keras lists the two embedding layers, whereas
# the layer's own get_weights() always returns this specific table.
word_vecs = embed.get_weights()[0]
params = {
    # Stored as float16 to halve the size of the saved file.
    'word_vecs': word_vecs.astype(np.float16),
    'word_to_id': word_to_id,
    'id_to_word': id_to_word,
}
pkl_file = 'cbow_params.pkl'  # or 'skipgram_params.pkl'
with open(pkl_file, 'wb') as f:
    # HIGHEST_PROTOCOL is what the previous protocol argument of -1 selected.
    pickle.dump(params, f, pickle.HIGHEST_PROTOCOL)



ModuleNotFoundError: ignored