In [14]:
import numpy as np

In [23]:
# Load the token file: every line becomes one entry, with trailing whitespace
# (including the newline) stripped.
# The `with` block closes the file handle deterministically (the bare open()
# used before never closed it), and UTF-8 is requested explicitly so decoding
# does not depend on the platform's locale default.
with open('ll_words.txt', encoding='utf-8') as tokens_file:
    ll_tokens = [line.rstrip() for line in tokens_file]

In [24]:
# Load the type list: every line becomes one entry, with trailing whitespace
# (including the newline) stripped.
# The `with` block closes the file handle deterministically (the bare open()
# used before never closed it), and UTF-8 is requested explicitly so decoding
# does not depend on the platform's locale default.
with open('ll_types.txt', encoding='utf-8') as types_file:
    ll_types = [line.rstrip() for line in types_file]

In [29]:
# Two-way lookup between types and integer ids.
# Ids are assigned in ll_types order starting at 1, which leaves 0 unused here.
index = {entry: type_id for type_id, entry in enumerate(ll_types, start=1)}
rev_index = {type_id: entry for type_id, entry in enumerate(ll_types, start=1)}

In [35]:
# Encode the token stream as integer ids; tokens missing from `index` map to 0.
seq = np.asarray([index.get(tok, 0) for tok in ll_tokens])

In [40]:
from keras.preprocessing.sequence import make_sampling_table, skipgrams

# Ids 1..len(ll_types) are assigned when `index` is built and unknown tokens are
# encoded as 0 (index.get(token, 0)), hence the extra slot.
vocab_size = len(ll_types) + 1
# Context window handed to skipgrams().
window_size = 4
# Embedding width; passed to build_word2vec_model() when the model is created.
vector_dim = 300


# NOTE(review): per the Keras documentation make_sampling_table() assumes that
# word ids are frequency ranks — confirm that ll_types.txt is ordered by
# frequency, otherwise the subsampling probabilities do not match the words.
sampling_table = make_sampling_table(vocab_size)
# NOTE(review): no seed is passed, so the sampled couples presumably differ
# between runs — confirm whether reproducibility matters here.
couples, labels = skipgrams(seq, vocab_size, window_size=window_size, sampling_table=sampling_table)

# Split the [target, context] pairs into two parallel sequences and store each
# as an int32 array, one model input per array.
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

# Peek at the first ten couples and their labels.
print(couples[:10], labels[:10])

[[5136, 115574], [23295, 97268], [1416, 911], [9154, 305], [12611, 48087], [125, 140654], [214472, 194463], [276, 543], [85994, 425], [19301, 51156]] [0, 0, 1, 1, 1, 0, 0, 1, 1, 0]


In [41]:
word_target.shape

(90538344,)

In [42]:
from keras import layers, Input
from keras.models import Model
import keras.backend as K

def build_word2vec_model(vocab_size, vector_dim):
    """Build and compile the two-input skip-gram classifier.

    Both inputs take a single word id and are looked up in the same
    embedding layer. The normalized dot product of the two embedded vectors
    is fed to a one-unit sigmoid layer.

    Args:
        vocab_size: Number of rows of the embedding layer.
        vector_dim: Width of each embedding vector.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the
        ``rmsprop`` optimizer and the ``acc`` metric.
    """
    target_ids = layers.Input((1,))
    context_ids = layers.Input((1,))

    # One embedding layer, applied to both inputs.
    shared_embedding = layers.Embedding(vocab_size, vector_dim, input_length=1, trainable=True)

    def _embed_flat(word_ids):
        # Look the id up, then drop the length-1 axis: (1, vector_dim) -> (vector_dim,).
        looked_up = shared_embedding(word_ids)
        return layers.Reshape((vector_dim,))(looked_up)

    target_vec = _embed_flat(target_ids)
    context_vec = _embed_flat(context_ids)

    similarity = layers.dot([target_vec, context_vec], axes=1, normalize=True)
    prediction = layers.Dense(1, activation='sigmoid')(similarity)

    word2vec = Model([target_ids, context_ids], prediction)
    word2vec.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
    return word2vec

    

In [43]:
# Reset the Keras backend state before building, so a re-run of this cell
# starts from a clean session.
K.clear_session()
# Build the compiled model from the vocabulary size and embedding width set
# in the skip-gram preparation cell.
m = build_word2vec_model(vocab_size, vector_dim)
# Print the layer table as a sanity check of the architecture.
m.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 300)       78000300    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 300)          0           embedding_1[0][0]                
__________

In [None]:
# Train on the (target, context) id arrays against the labels from skipgrams().
m.fit(
    x=[word_target, word_context],
    y=labels,
    epochs=20,
    batch_size=102400,
)

Epoch 1/20

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

In [84]:
# Snapshot of the model's weights; sim() indexes this as w[0][row].
# presumably w[0] is the embedding matrix (one row per word id) — TODO confirm
w = m.get_weights()

In [85]:
def sim(w, x, y):
    """Return the cosine similarity between rows ``x`` and ``y`` of ``w[0]``.

    Args:
        w: Sequence whose first element is a 2-D array-like of vectors
            (for example the list returned by ``m.get_weights()``).
        x: Row index of the first vector.
        y: Row index of the second vector.

    Returns:
        ``dot(w[0][x], w[0][y]) / (|w[0][x]| * |w[0][y]|)``. When either vector
        has zero norm the similarity is undefined; ``0.0`` is returned instead
        of dividing by zero (which used to yield ``nan`` and a RuntimeWarning).
    """
    vec_x = w[0][x]
    vec_y = w[0][y]
    denom = np.linalg.norm(vec_x) * np.linalg.norm(vec_y)
    if denom == 0:
        # An all-zero row has no direction; treat it as dissimilar to everything.
        return 0.0
    return np.dot(vec_x, vec_y) / denom

In [117]:
# NOTE(review): `rev_vocab` is not defined in any visible cell; the visible
# id -> word mapping is `rev_index` — confirm which one is intended before
# running this on a fresh kernel.
# NOTE(review): this builds one tuple per ordered pair (i, j) with i != j,
# i.e. roughly vocab_size ** 2 entries, each via a Python-level call to sim().
# The recorded output of this cell is a KeyboardInterrupt.
[(rev_vocab[i], rev_vocab[j], sim(w, i, j)) for i in range(vocab_size) for j in range(vocab_size) if i != j]

KeyboardInterrupt: 

In [111]:
# NOTE(review): `max`, `best` and `rev_vocab` are not assigned in any visible
# cell, so this cell depends on state from cells that are not shown. A variable
# named `max` also shadows the builtin for the rest of the kernel session.
max, rev_vocab[best]


(0.48255926, 'þaceaflas')

In [112]:
import pickle

In [118]:
# Persist the trained weights to disk.
# NOTE(review): the file name says 'w2z' while the architecture file written in
# this notebook is 'oe_w2v.model' — possibly a typo; confirm before relying on it.
m.save_weights('oe_w2z.bin')

In [115]:
# Serialise the architecture BEFORE opening the file: opening with mode 'w'
# truncates 'oe_w2v.model' immediately, so an exception raised by to_yaml()
# would otherwise leave an empty file behind.
# NOTE(review): to_yaml() availability depends on the installed Keras version —
# confirm it is still supported in the target environment.
model_yaml = m.to_yaml()
with open('oe_w2v.model', 'w') as out:
    out.write(model_yaml)

In [121]:
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load 'oe_vectors.bin' when it comes from a trusted source.
# NOTE(review): no visible cell writes 'oe_vectors.bin' — presumably produced
# elsewhere; confirm its origin and contents.
with open('oe_vectors.bin', 'rb') as infile:
    oe_vectors = pickle.load(infile)

In [7]:
# Write the vocabulary as one "<id>\t<word>" line per entry.
# - The loop variable is no longer called `id`: at notebook top level that name
#   would shadow the builtin for every later cell.
# - UTF-8 is requested explicitly so non-ASCII words are written the same way
#   regardless of the platform's locale default.
# NOTE(review): `vocab` is not defined in any visible cell — presumably a
# word -> id mapping created elsewhere; confirm.
with open('oe_types.txt', 'w', encoding='utf-8') as outfile:
    for word, word_id in vocab.items():
        outfile.write('{0}\t{1}\n'.format(word_id, word))