In [34]:
import codecs
import random
import time

import gensim
import numpy as np

from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input
from keras.layers.merge import Dot
from keras.optimizers import SGD
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams

In [86]:
# Seed every RNG involved so a fresh "Restart & Run All" is repeatable.
# Keras' skipgrams() draws its negative samples with Python's `random` module,
# which np.random.seed() does not affect, so both generators are seeded.
np.random.seed(13)
random.seed(13)

# path = get_file('alice.txt', origin='http://www.gutenberg.org/files/11/11-0.txt')
# NOTE(review): machine-specific absolute path -- point this at your own copy of the corpus.
path = '/home/junsoo/PycharmProjects/word2vec_sample/corpus.txt'

# Read the corpus one line per entry; undecodable bytes are dropped (errors='ignore').
# The context manager guarantees the file handle is closed even if reading fails.
with codecs.open(path, "r", encoding='utf-8', errors='ignore') as corpus_file:
    corpus = corpus_file.readlines()

# Keep only lines containing at least two spaces; shorter lines are discarded.
corpus = [sentence for sentence in corpus if sentence.count(' ') >= 2]

In [87]:
# Build the word -> integer index mapping from the filtered corpus.
# Punctuation and \r, \t, \n are stripped; apostrophes are kept.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\r\t\n')
tokenizer.fit_on_texts(corpus)

# Size of the embedding tables: one row per known word plus one extra row,
# because the Keras Tokenizer never assigns index 0 to a word.
V = 1 + len(tokenizer.word_index)
V

12195

In [88]:
# Output dimension passed to both Embedding layers (target-word and context-word).
# The name is misspelled (three d's) but is referenced by other cells, so it is kept.
dim_embedddings = 128

In [89]:
# Target-word branch: each sample is a single integer word index, looked up in
# its own embedding table with V rows and dim_embedddings columns.
word_inputs = Input(shape=(1,), dtype='int32')
w = Embedding(input_dim=V, output_dim=dim_embedddings)(word_inputs)

In [90]:
# Context-word branch: same layout as the target branch, but with a separate
# embedding table so target and context vectors are learned independently.
context_inputs = Input(shape=(1,), dtype='int32')
context = Embedding(input_dim=V, output_dim=dim_embedddings)(context_inputs)
print(context_inputs.shape)
print(context.shape)

# Dot product of the two embeddings along the embedding axis (axis 2).
output_layer = Dot(axes=2)([w, context])
print(output_layer.shape)

# Collapse the (1, 1) score to a single value per sample and squash it with a sigmoid.
output_layer = Reshape(target_shape=(1,), input_shape=(1, 1))(output_layer)
output_layer = Activation(activation='sigmoid')(output_layer)

(?, 1)
(?, 1, 128)
(?, 1, 1)


In [91]:
# Two-input model: (target index, context index) -> sigmoid score for the pair.
SkipGram = Model(
    inputs=[word_inputs, context_inputs],
    outputs=output_layer,
)
SkipGram.summary()

# Binary cross-entropy against the 0/1 pair labels; accuracy is tracked so that
# train_on_batch returns [loss, accuracy].
SkipGram.compile(
    loss='binary_crossentropy',
    optimizer='Adadelta',
    metrics=['accuracy'],
)


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_13 (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
input_14 (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_13 (Embedding)         (None, 1, 128)        1560960     input_13[0][0]                   
____________________________________________________________________________________________________
embedding_14 (Embedding)         (None, 1, 128)        1560960     input_14[0][0]                   
___________________________________________________________________________________________

In [93]:
# Walkthrough of one training pass.
# NOTE: only the first sentence (corpus[:1]) is fed to the model, and every
# intermediate structure is printed, so this cell inspects the data pipeline
# rather than performing a full training run.
epochs = 1

t2s = tokenizer.texts_to_sequences(corpus[:1])
len_t2s = len(t2s)
separator = '----------------'

for cur_epoch in range(epochs):
    loss = 0.
    accuracy = 0.
    # Number of documents that actually produced a training batch; used as the
    # denominator for the averages so that skipped documents do not bias them
    # and an empty sequence list cannot cause a ZeroDivisionError.
    trained_batches = 0

    start_time = time.time()
    for doc in t2s:
        print(separator)
        print(doc)
        # data: list of [target, context] index pairs; labels: 1 for pairs seen
        # within the window, 0 for randomly drawn negative pairs.
        data, labels = skipgrams(sequence=doc, vocabulary_size=V, window_size=2, negative_samples=5., shuffle=False)
        print(separator)
        print(data)
        print(separator)
        print(labels)
        # Transpose the pair list into two arrays: all targets, all contexts.
        x = [np.array(column) for column in zip(*data)]
        y = np.array(labels, dtype=np.int32)
        print(separator)
        print(x)
        print(separator)
        print(y)
        print(separator)
        print(separator)
        if x:  # a document that yields no pairs gives an empty list; skip it
            # train_on_batch returns [loss, accuracy] because the model was
            # compiled with metrics=['accuracy'].
            train_result = SkipGram.train_on_batch(x, y)
            loss += train_result[0]
            accuracy += train_result[1]
            trained_batches += 1

    if trained_batches:
        avg_loss = loss / trained_batches
        avg_acc = accuracy / trained_batches
    else:
        # Nothing was trained this epoch; report NaN instead of crashing.
        avg_loss = avg_acc = float('nan')
    end_time = time.time()
    duration = end_time - start_time
    # Epochs are reported 1-based ("1/1" rather than "0/1").
    print("\t%d/%d: %s\t%s\t%f sec" % (cur_epoch + 1, epochs, avg_loss, avg_acc, duration))
    


----------------
[7018, 5098, 964, 1296, 6, 1863, 28, 1115, 3953]
----------------
[[7018, 5098], [7018, 964], [5098, 7018], [5098, 964], [5098, 1296], [964, 7018], [964, 5098], [964, 1296], [964, 6], [1296, 5098], [1296, 964], [1296, 6], [1296, 1863], [6, 964], [6, 1296], [6, 1863], [6, 28], [1863, 1296], [1863, 6], [1863, 28], [1863, 1115], [28, 6], [28, 1863], [28, 1115], [28, 3953], [1115, 1863], [1115, 28], [1115, 3953], [3953, 28], [3953, 1115], [6, 5326], [1296, 5528], [7018, 9905], [1863, 5903], [1296, 7329], [6, 2947], [5098, 1143], [1115, 2654], [6, 6974], [1115, 2138], [6, 6494], [28, 7361], [964, 11944], [1115, 9865], [7018, 3601], [964, 11339], [1296, 10932], [1863, 11756], [3953, 9605], [28, 1005], [1863, 9457], [1296, 963], [3953, 8206], [1863, 9156], [28, 9967], [5098, 6404], [28, 5013], [5098, 2294], [964, 4763], [964, 7569]]
----------------
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [95]:
# Expected number of negative pairs for the first document: presumably the 30
# positive pairs printed by the training cell times negative_samples=5 -- TODO confirm.
positive_pairs = 30
negatives_per_positive = 5
num_negative_samples = positive_pairs * negatives_per_positive
print(num_negative_samples)

150


In [17]:
# Export the learned vectors in word2vec text format:
#   line 1:  "<number of words> <vector dimension>"
#   then one line per word: "<word> <v1> <v2> ... <vN>"
print("Save weights...")
vector_filename = 'vectors_notebook.txt'

# First weight array of the model; presumably the target-word embedding
# matrix (V rows x dim_embedddings columns) -- verify against the layer order.
vectors = SkipGram.get_weights()[0]

# Write explicitly as UTF-8: the vocabulary contains non-ASCII tokens (e.g.
# curly quotes) and gensim reads word2vec text files as UTF-8 by default.
# The context manager closes the file even if a write fails part-way.
with codecs.open(vector_filename, 'w', encoding='utf-8') as f:
    # Row 0 of the matrix belongs to no word, so the header counts only real words.
    f.write('{} {}\n'.format(len(tokenizer.word_index), dim_embedddings))
    for word, i in tokenizer.word_index.items():
        f.write('{} {}\n'.format(word, ' '.join(map(str, vectors[i]))))

Save weights...


In [16]:
# TEST
def most_similar(positive=None, negative=None, topn=20):
    """Print the words closest to a combination of query words.

    The vectors are re-read from ``vector_filename`` (word2vec text format) on
    every call, so the result always reflects the most recently saved weights.

    Args:
        positive: words that contribute positively to the query
            (defaults to no words).
        negative: words that contribute negatively to the query
            (defaults to no words).
        topn: number of neighbours to print.

    Returns:
        None. Each ``(word, similarity)`` tuple is printed on its own line.
    """
    # None sentinels instead of mutable list defaults shared between calls.
    positive = [] if positive is None else positive
    negative = [] if negative is None else negative

    w2v = gensim.models.KeyedVectors.load_word2vec_format(vector_filename, binary=False)
    # Forward topn: previously it was accepted but never passed on, so gensim's
    # own default was always used regardless of what the caller asked for.
    for v in w2v.most_similar(positive=positive, negative=negative, topn=topn):
        print(v)


In [15]:
# Spot-check the saved vectors: (banner, positive words, negative words) per query.
similarity_checks = [
    ("Check for queen...", ['queen'], []),
    ("Check for alice...", ['alice'], []),
    ("Check for the...", ['the'], []),
    ("Check for king-he+she...", ['king', 'she'], ['he']),
]

for banner, positive_words, negative_words in similarity_checks:
    print(banner)
    most_similar(positive=positive_words, negative=negative_words, topn=10)

Check for queen...
('king', 0.7746679782867432)
('hearts', 0.7561852335929871)
('tarts', 0.749153733253479)
('suppressed', 0.7379329204559326)
('mock', 0.736181378364563)
('took', 0.73515385389328)
('march', 0.7342950105667114)
('white', 0.7261685132980347)
('lobster', 0.7245875597000122)
('end', 0.7231849431991577)
Check for alice...
('thought', 0.6647700071334839)
('glad', 0.658423125743866)
('curious', 0.6461477279663086)
('it’s', 0.6376360654830933)
('wasn’t', 0.6288649439811707)
('i’m', 0.6193730235099792)
('remarked', 0.6177995204925537)
('‘but', 0.6165159940719604)
('certainly', 0.6160486340522766)
('she’ll', 0.6087310314178467)
Check for the...
('queen', 0.7193769216537476)
('of', 0.6276025772094727)
('other', 0.5941148996353149)
('king', 0.5940526723861694)
('by', 0.5509505271911621)
('with', 0.5400514602661133)
('from', 0.5364124774932861)
('those', 0.5277484655380249)
('tax', 0.518311083316803)
('owner', 0.5155541300773621)
Check for king-he+she...
('beginning', 0.4409085214