In [1]:
import numpy as np

def load_corpus(filename, tagged=True):
    with open(filename, "r") as f:
        sequence = []
        
        for line in f:
            pairs = line.rstrip('\n').split(' ')
            for pair in pairs:
                if pair != '':
                    if tagged == True:
                        word, _ = pair.split('/')             
                    else:
                        word = pair
                    sequence.append(word)
            sequence.append('.')
                
        return sequence

In [2]:
s = load_corpus('../corpora/oe/oe.all_features')

In [4]:
s1 = load_corpus('../texts/oe/oe_all.txt', tagged=False)

In [5]:
def indexify(string_list):
    word_types = list(set(string_list))
    vocab = dict(zip(word_types, range(len(word_types))))
    rev_vocab = dict(zip(range(len(word_types)), word_types))
    
    word_ids = np.asarray([vocab[elem] for elem in string_list])
    return word_ids, vocab, rev_vocab

In [6]:
seq, vocab, rev_vocab = indexify(s+s1)

In [7]:
rev_vocab

{0: 'tomiddes.',
 1: 'brihte',
 2: 'æmyrian',
 3: 'rind',
 4: 'Nichodemus.',
 5: 'Geæaðmedod',
 6: 'asecgan,',
 7: 'teoþ',
 8: 'cyreað;',
 9: 'aweorpan',
 10: 'deaðe',
 11: 'Paradisum.',
 12: 'unmyndlenga',
 13: 'uenisti.',
 14: 'Aðweah',
 15: 'Besencte',
 16: 'ræt.',
 17: 'rixodon',
 18: 'bigang',
 19: 'unfullfremednesse,',
 20: 'eallne;',
 21: 'forbugon.',
 22: 'Helchana',
 23: 'GOD',
 24: 'ceapiað.',
 25: 'hordað.',
 26: 'adylgiað.',
 27: 'laðian,',
 28: 'foldwela',
 29: 'cuið',
 30: 'geglenged',
 31: 'Andetten',
 32: 'hyder,',
 33: 'ældo',
 34: 'gecyrreþ.',
 35: 'þeodland',
 36: 'guðwudu',
 37: 'Hofan',
 38: 'ongyteð',
 39: 'behyldon',
 40: 'brymmas,',
 41: 'stocce,',
 42: 'idellic,',
 43: 'Wilfreðincg',
 44: 'unbesprecæn',
 45: 'forslea,',
 46: 'adwæscð',
 47: 'gelicode',
 48: 'fostormeder',
 49: 'mereweard',
 50: 'preoste,',
 51: 'spell.',
 52: 'forforan.',
 53: 'wyrhta',
 54: 'forwurde;',
 55: 'Wicganbeorhge,',
 56: 'æðelfrið,',
 57: 'hæþengild,',
 58: 'Speciosus,',
 59: 'hopedo

In [8]:
from keras.preprocessing.sequence import make_sampling_table, skipgrams

vocab_size = len(vocab)
window_size = 4
vector_dim = 300


sampling_table = make_sampling_table(vocab_size)
couples, labels = skipgrams(seq, vocab_size, window_size=window_size, sampling_table=sampling_table)

word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

print(couples[:10], labels[:10])

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


[[64739, 113594], [90185, 182520], [122213, 47707], [197118, 168765], [58046, 137293], [83130, 187165], [27939, 60499], [133532, 88634], [79233, 5789], [146081, 220840]] [0, 1, 1, 1, 0, 0, 1, 0, 0, 1]


In [50]:
word_target.shape

(39006248,)

In [9]:
from keras import layers, Input
from keras.models import Model
import keras.backend as K

def build_word2vec_model(vocab_size, vector_dim):
    input_target = layers.Input((1,))
    input_context = layers.Input((1,))

    embed = layers.Embedding(vocab_size, vector_dim, input_length=1, trainable=True)
    
    target = embed(input_target)
    target = layers.Reshape((vector_dim,))(target)
    context = embed(input_context)
    context = layers.Reshape((vector_dim,))(context)
    
    dot = layers.dot([target, context], axes=1, normalize=True)
    
    out = layers.Dense(1, activation='sigmoid')(dot)
    
    model = Model([input_target, input_context], out)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
    
    return model

    

In [10]:
K.clear_session()
m = build_word2vec_model(vocab_size, vector_dim)
m.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 300)       66871200    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 300)          0           embedding_1[0][0]                
__________

In [82]:
m.fit([word_target, word_context], labels, epochs=20, batch_size=102400)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f46d5ab8828>

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

In [84]:
w = m.get_weights()

In [85]:
def sim(w, x, y):
    dot = np.sum(w[0][x] * w[0][y])
    norm_x = np.linalg.norm(w[0][x])
    norm_y = np.linalg.norm(w[0][y])
    return dot/(norm_x * norm_y)

In [117]:
[(rev_vocab[i], rev_vocab[j], sim(w, i, j)) for i in range(vocab_size) for j in range(vocab_size) if i != j]

KeyboardInterrupt: 

In [111]:
max, rev_vocab[best]


(0.48255926, 'þaceaflas')

In [112]:
import pickle

In [118]:
m.save_weights('oe_w2z.bin')

In [115]:
with open('oe_w2v.model', 'w') as out:
    out.write(m.to_yaml())

In [121]:
with open('oe_vectors.bin', 'rb') as infile:
    oe_vectors = pickle.load(infile)

In [7]:
with open('oe_types.txt', 'w') as outfile:
    for word, id in vocab.items():
        outfile.write('{0}\t{1}\n'.format(id, word))