In [18]:
import numpy as np

from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input
from keras.layers.merge import Dot
from keras.optimizers import SGD
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams

import gensim
import codecs   

In [19]:
np.random.seed(13)

path = get_file('alice.txt', origin='http://www.gutenberg.org/files/11/11-0.txt')
corpus = codecs.open(path, "r", encoding='utf-8', errors='ignore').readlines()

corpus = [sentence for sentence in corpus if sentence.count(' ') >= 2]

for sentence in corpus:
    print(sentence)

﻿Project Gutenberg’s Alice’s Adventures in Wonderland, by Lewis Carroll

This eBook is for the use of anyone anywhere at no cost and with

almost no restrictions whatsoever.  You may copy it, give it away or

re-use it under the terms of the Project Gutenberg License included

with this eBook or online at www.gutenberg.org

Title: Alice’s Adventures in Wonderland

Author: Lewis Carroll

Posting Date: June 25, 2008 [EBook #11]

Release Date: March, 1994

Last Updated: October 6, 2016

Character set encoding: UTF-8

*** START OF THIS PROJECT GUTENBERG EBOOK ALICE’S ADVENTURES IN WONDERLAND ***

ALICE’S ADVENTURES IN WONDERLAND

THE MILLENNIUM FULCRUM EDITION 3.0

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on the

bank, and of having nothing to do: once or twice she had peeped into the

book her sister was reading, but it had no pictures or conversations in

it, ‘and what is the use of a book,’ thought Alice ‘without p

In [20]:
#print(corpus[0:10])
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\r\t\n')
tokenizer.fit_on_texts(corpus)

print(tokenizer.word_docs)


V = len(tokenizer.word_index) + 1
V



3388

In [21]:
dim_embedddings = 128

In [22]:
# inputs
word_inputs = Input(shape=(1,), dtype='int32')
w = Embedding(V, dim_embedddings)(word_inputs)

In [23]:
# context
context_inputs = Input(shape=(1,), dtype='int32')
context  = Embedding(V, dim_embedddings)(context_inputs)
output_layer = Dot(axes=2)([w, context])
output_layer = Reshape((1,), input_shape=(1, 1))(output_layer)
output_layer = Activation('sigmoid')(output_layer)

In [24]:
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9)
SkipGram = Model(inputs=[word_inputs, context_inputs], outputs=output_layer)
SkipGram.summary()
SkipGram.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_3 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
input_4 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_3 (Embedding)          (None, 1, 128)        433664      input_3[0][0]                    
____________________________________________________________________________________________________
embedding_4 (Embedding)          (None, 1, 128)        433664      input_4[0][0]                    
___________________________________________________________________________________________

In [32]:
epochs = 5
for cur_epoch in range(epochs):
    loss = 0.
    accuracy = 0.
        
    t2s = tokenizer.texts_to_sequences(corpus)
    len_t2s = len(t2s)
    print(len(tokenizer.texts_to_sequences(corpus)))
    
    for i, doc in enumerate(t2s):
        data, labels = skipgrams(sequence=doc, vocabulary_size=V, window_size=5, negative_samples=5.)
        x = [np.array(x) for x in zip(*data)]
        y = np.array(labels, dtype=np.int32)
        if x:
            # print(SkipGram.train_on_batch(x, y))
            train_result = SkipGram.train_on_batch(x, y)
            loss += train_result[0]
            accuracy += train_result[1]
    
    avg_loss = loss / len_t2s
    avg_acc = accuracy / len_t2s
    
    print("\t%d/%d: %s\t%s" % (cur_epoch, epochs, avg_loss, avg_acc))
    


2665
	0/5: 0.683245722058	0.576898686177
2665
	1/5: 0.681792932089	0.600158494532
2665
	2/5: 0.679818605974	0.628200187714
2665
	3/5: 0.6771518128	0.656888829924
2665
	4/5: 0.673711164785	0.687313264966


In [17]:
f = open('vectors.txt' ,'w')
f.write('{} {}\n'.format(V-1, dim_embedddings))
vectors = SkipGram.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

print("Save weights...")

Save weights...


In [16]:
def most_similar(positive=[], negative=[], topn=20):
    w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)
    for v in w2v.most_similar(positive=positive, negative=negative):
        print(v)

In [15]:
print("Check for queen...")
most_similar(positive=['queen'], topn=10)
print("Check for alice...")
most_similar(positive=['alice'], topn=10)
print("Check for the...")
most_similar(positive=['the'], topn=10)
print("Check for king-he+she...")
most_similar(positive=['king', 'she'], negative=['he'], topn=10)

Check for queen...
('king', 0.7746679782867432)
('hearts', 0.7561852335929871)
('tarts', 0.749153733253479)
('suppressed', 0.7379329204559326)
('mock', 0.736181378364563)
('took', 0.73515385389328)
('march', 0.7342950105667114)
('white', 0.7261685132980347)
('lobster', 0.7245875597000122)
('end', 0.7231849431991577)
Check for alice...
('thought', 0.6647700071334839)
('glad', 0.658423125743866)
('curious', 0.6461477279663086)
('it’s', 0.6376360654830933)
('wasn’t', 0.6288649439811707)
('i’m', 0.6193730235099792)
('remarked', 0.6177995204925537)
('‘but', 0.6165159940719604)
('certainly', 0.6160486340522766)
('she’ll', 0.6087310314178467)
Check for the...
('queen', 0.7193769216537476)
('of', 0.6276025772094727)
('other', 0.5941148996353149)
('king', 0.5940526723861694)
('by', 0.5509505271911621)
('with', 0.5400514602661133)
('from', 0.5364124774932861)
('those', 0.5277484655380249)
('tax', 0.518311083316803)
('owner', 0.5155541300773621)
Check for king-he+she...
('beginning', 0.4409085214