In [1]:
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, Reshape, Activation, Input
from keras.models import Model
from keras.layers.merge import Dot
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import skipgrams
import numpy as np
from google.colab import files
from keras.utils.data_utils import get_file
from keras.utils import np_utils
import gensim

Using TensorFlow backend.


In [2]:
# Download (or reuse the cached copy of) the Project Gutenberg text of
# "Alice's Adventures in Wonderland" and read it as a list of lines.
path = get_file('alice.txt', origin='http://www.gutenberg.org/files/11/11-0.txt')
# Read with an explicit encoding so the result does not depend on the machine's
# locale; 'utf-8-sig' also drops a leading byte-order mark if the file has one,
# so it cannot end up glued to the first token. The context manager guarantees
# the file handle is closed.
with open(path, encoding='utf-8-sig') as corpus_file:
    all_lines = corpus_file.readlines()

Downloading data from http://www.gutenberg.org/files/11/11-0.txt


In [0]:
# Keep only lines containing at least three space characters; shorter lines
# (blank lines, short headings) are discarded.
# NOTE(review): nothing here strips Project Gutenberg header/licence text, so it
# presumably ends up in the training corpus -- confirm whether that is intended.
corpus = list(filter(lambda text_line: text_line.count(" ") > 2, all_lines))

In [0]:
# Create a Keras Tokenizer with default settings and fit it on the filtered
# lines; its word_index (word -> integer id) is used below to size the
# embeddings and to export the vectors.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [0]:
# Two model inputs, each carrying a single int32 token id per sample:
# the target word first, then the context word (created in that order).
word_input, context_input = (Input(shape=(1, ), dtype='int32') for _ in range(2))

In [0]:
# Vocabulary size is the number of indexed words plus one extra row for id 0.
embedding_input_dim = len(tokenizer.word_index) + 1
# Dimensionality of each learned word vector.
embedding_output_dim = 128

# Separate embedding tables for the target word and for the context word.
word_embedding = Embedding(
    input_dim=embedding_input_dim,
    output_dim=embedding_output_dim,
)(word_input)
context_embedding = Embedding(
    input_dim=embedding_input_dim,
    output_dim=embedding_output_dim,
)(context_input)

In [0]:
# Dot product of the two embeddings along the embedding axis.
merged = Dot(axes=2)([word_embedding, context_embedding])
# Collapse the dot-product result to one value per sample, then squash it
# through a sigmoid to get the output score.
flat_score = Reshape((1,), input_shape=(1, 1))(merged)
output = Activation('sigmoid')(flat_score)

In [0]:
# Assemble the two-input model and train it as a binary classifier.
skipgram_model = Model(
    inputs=[word_input, context_input],
    outputs=output,
)
skipgram_model.compile(optimizer='adam', loss='binary_crossentropy')

In [10]:
# Print the layer-by-layer architecture (output shapes and parameter counts).
skipgram_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 128)       432384      input_3[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1, 128)       432384      input_4[0][0]                    
____________________________________________________________________________________________

In [0]:
# Build the training set: one batch per corpus line.
# Each entry is ([target_ids, context_ids], labels).
batches = []
for token_ids in tokenizer.texts_to_sequences(corpus):
  pairs, pair_labels = skipgrams(sequence=token_ids, vocabulary_size=embedding_input_dim, window_size=5, negative_samples=5.)
  # Lines that yield no pairs contribute no batch.
  if not pairs:
    continue
  # Transpose the list of (target, context) pairs into two parallel columns.
  target_ids, context_ids = zip(*pairs)
  model_inputs = [np.array(target_ids), np.array(context_ids)]
  batches.append((model_inputs, np.array(pair_labels, dtype=np.int32)))

In [12]:

# Train the model one line-batch at a time and record, per epoch, the
# sum of the batch losses (a running total, not an average).
epochs = 40
losses = []
for epoch in range(1, epochs + 1):
  loss = sum(
    skipgram_model.train_on_batch(model_inputs, targets)
    for model_inputs, targets in batches
  )
  print("Epoch: {} -> loss: {}".format(epoch, loss))
  losses.append(loss)









Epoch: 1 -> loss: 1054.6984449252486
Epoch: 2 -> loss: 724.8388156890869
Epoch: 3 -> loss: 671.0902065634727
Epoch: 4 -> loss: 640.2406772375107
Epoch: 5 -> loss: 610.096641805023
Epoch: 6 -> loss: 579.3902470842004
Epoch: 7 -> loss: 548.8557943329215
Epoch: 8 -> loss: 520.2734726909548
Epoch: 9 -> loss: 493.78132791956887
Epoch: 10 -> loss: 469.7628724973183
Epoch: 11 -> loss: 448.0848562103929
Epoch: 12 -> loss: 428.9148502268945
Epoch: 13 -> loss: 411.8761678931769
Epoch: 14 -> loss: 396.8759813544748
Epoch: 15 -> loss: 383.67520884303667
Epoch: 16 -> loss: 372.0865664871526
Epoch: 17 -> loss: 362.0074434807175
Epoch: 18 -> loss: 353.14048891247876
Epoch: 19 -> loss: 345.53245187430184
Epoch: 20 -> loss: 338.88994501821617
Epoch: 21 -> loss: 333.1728288323875
Epoch: 22 -> loss: 328.23556815011125
Epoch: 23 -> loss: 323.9927028423208
Epoch: 24 -> loss: 320.3210468927973
Epoch: 25 -> loss: 317.1504904914095
Epoch: 26 -> loss: 314.42360822773935
Epoch: 27 -> loss: 312.102456021

In [0]:
# Export the learned embeddings as a word2vec-style text file: a
# "<word count> <vector size>" header line followed by one
# "<word> <v1> ... <vN>" line per vocabulary word.
filename = 'vectors.txt'
# First weight array of the model -- presumably the target-word embedding
# matrix, since that Embedding layer was created first; verify if layers change.
vectors = skipgram_model.get_weights()[0]
# The vocabulary contains non-ASCII tokens (e.g. curly quotes), so write UTF-8
# explicitly instead of relying on the platform default encoding. The context
# manager closes the file even if a write fails.
with open(filename, 'w', encoding='utf-8') as f:
    # Row 0 of the embedding is never written (word ids start at 1), hence the -1.
    f.write('{} {}\n'.format(embedding_input_dim-1, embedding_output_dim))
    for word, i in tokenizer.word_index.items():
        f.write('{} {}\n'.format(word, ' '.join(map(str, vectors[i, :]))))

In [0]:
# Hand the exported vectors file to the Colab file-download helper
# (`files` comes from google.colab, so this cell only works inside Colab).
files.download(filename)

In [15]:
# Load the exported text-format vectors back with gensim for similarity queries.
# NOTE(review): the path is hard-coded here rather than reusing `filename`.
word2vector = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [24]:
# Rank vocabulary words by similarity to 'computer'; returns (word, score) pairs.
word2vector.most_similar(positive=['computer'])

  if np.issubdtype(vec.dtype, np.int):


[('codes', 0.82143235206604),
 ('damage', 0.8130426406860352),
 ('virus', 0.745389461517334),
 ('types', 0.4282836318016052),
 ('exclusion', 0.4189019799232483),
 ('limitation', 0.41098594665527344),
 ('performing', 0.4026317000389099),
 ('warranties', 0.39886412024497986),
 ('incomplete', 0.39837589859962463),
 ('f', 0.39460140466690063)]

In [25]:
# Passing the word as `negative` makes it count against similarity, so this
# lists words ranked as closest to the opposite of 'computer'.
word2vector.most_similar(negative=['computer'])

  if np.issubdtype(vec.dtype, np.int):


[('opportunity', 0.25291144847869873),
 ('farmer', 0.23624968528747559),
 ('stick', 0.23007938265800476),
 ('hear', 0.21194015443325043),
 ('nine', 0.20398567616939545),
 ('‘stand', 0.20273800194263458),
 ('“‘tis', 0.190240278840065),
 ('knowledge', 0.18969321250915527),
 ('prosecute', 0.17776885628700256),
 ('‘they', 0.176834374666214)]

In [27]:
# Rank vocabulary words by similarity to 'money'; returns (word, score) pairs.
word2vector.most_similar(positive=['money'])

  if np.issubdtype(vec.dtype, np.int):


[('notifies', 0.6260496377944946),
 ('refund', 0.596167802810669),
 ('electronically', 0.5760830044746399),
 ('lieu', 0.5704810619354248),
 ('specific', 0.5474052429199219),
 ('entity', 0.5354718565940857),
 ('fee', 0.5305605530738831),
 ('reported', 0.5083633661270142),
 ('defect', 0.5044872760772705),
 ('60', 0.49787914752960205)]

In [29]:
# Rank vocabulary words by similarity to 'sound'; returns (word, score) pairs.
word2vector.most_similar(positive=['sound'])

  if np.issubdtype(vec.dtype, np.int):


[('footsteps', 0.44984138011932373),
 ('tremulous', 0.4198402166366577),
 ('cartwheels', 0.4021366238594055),
 ('timid', 0.3783169388771057),
 ('ledge', 0.3741518259048462),
 ('walks', 0.37401431798934937),
 ('happy', 0.37210342288017273),
 ('pattering', 0.3638741970062256),
 ('dismay', 0.36237549781799316),
 ('rumbling', 0.3395119905471802)]

In [34]:
# Rank vocabulary words by similarity to 'grass'; returns (word, score) pairs.
word2vector.most_similar(positive=['grass'])

  if np.issubdtype(vec.dtype, np.int):


[('merely', 0.4346730411052704),
 ('crawled', 0.427352637052536),
 ('blades', 0.4190056324005127),
 ('reality', 0.40309464931488037),
 ('delay', 0.39584994316101074),
 ('flowers', 0.37092721462249756),
 ('teacups', 0.36846277117729187),
 ('pool', 0.33596664667129517),
 ('bells', 0.31079035997390747),
 ('favourite', 0.31042033433914185)]