In [1]:
from keras.models import Model
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.sequence import skipgrams
from keras.layers import Input,Dense,Reshape,Dot,merge
import keras
import tensorflow as tf

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
import pandas as pd
import numpy as np

In [3]:
# reset the jupyter buffers
tf.reset_default_graph()
keras.backend.clear_session()

In [4]:
DATASETSLIB_HOME = os.path.join(os.path.expanduser('~'),'dl-ts','datasetslib')


In [5]:
import sys
if not DATASETSLIB_HOME in sys.path:
    sys.path.append(DATASETSLIB_HOME)
%reload_ext autoreload
%autoreload 2
import datasetslib
from datasetslib import util as dsu
from datasetslib import nputil
datasetslib.datasets_root = os.path.join(os.path.expanduser('~'),'datasets')

In [6]:
from datasetslib.ptb import PTBSimple
ptb = PTBSimple()
# downloads data, converts words to ids, converts files to a list of ids
ptb.load_data()
print('Train :', ptb.part['train'][0:5])
print('Test: ', ptb.part['test'][0:5])
print('Valid: ', ptb.part['valid'][0:5])
print('Vocabulary Length = ', ptb.vocab_len)

Already exists: C:\Users\ledra\datasets\ptb-simple\simple-examples.tgz
Train : [9970 9971 9972 9974 9975]
Test:  [102  14  24  32 752]
Valid:  [1132   93  358    5  329]
Vocabulary Length =  10000


In [7]:
ptb.skip_window = 2
ptb.reset_index()
# in CBOW input is the context word and output is the target word
y_batch, x_batch = ptb.next_batch_cbow()

print('The CBOW pairs : context,target')
for i in range(5 * ptb.skip_window):
    print('(', [ptb.id2word[x_i] for x_i in x_batch[i]],
          ',', y_batch[i], ptb.id2word[y_batch[i]], ')')

The CBOW pairs : context,target
( ['aer', 'banknote', 'calloway', 'centrust'] , 9972 berlitz )
( ['banknote', 'berlitz', 'centrust', 'cluett'] , 9974 calloway )
( ['berlitz', 'calloway', 'cluett', 'fromstein'] , 9975 centrust )
( ['calloway', 'centrust', 'fromstein', 'gitano'] , 9976 cluett )
( ['centrust', 'cluett', 'gitano', 'guterman'] , 9980 fromstein )
( ['cluett', 'fromstein', 'guterman', 'hydro-quebec'] , 9981 gitano )
( ['fromstein', 'gitano', 'hydro-quebec', 'ipo'] , 9982 guterman )
( ['gitano', 'guterman', 'ipo', 'kia'] , 9983 hydro-quebec )
( ['guterman', 'hydro-quebec', 'kia', 'memotec'] , 9984 ipo )
( ['hydro-quebec', 'ipo', 'memotec', 'mlx'] , 9986 kia )


In [8]:
ptb.skip_window = 2
ptb.reset_index()
# in skip-gram input is the target word and output is the context word
x_batch, y_batch = ptb.next_batch_sg()

print('The skip-gram pairs : target,context')
for i in range(5 * ptb.skip_window):
    print('(', x_batch[i], ptb.id2word[x_batch[i]],
          ',', y_batch[i], ptb.id2word[y_batch[i]], ')')

The skip-gram pairs : target,context
( 9972 berlitz , 9970 aer )
( 9972 berlitz , 9971 banknote )
( 9972 berlitz , 9974 calloway )
( 9972 berlitz , 9975 centrust )
( 9974 calloway , 9971 banknote )
( 9974 calloway , 9972 berlitz )
( 9974 calloway , 9975 centrust )
( 9974 calloway , 9976 cluett )
( 9975 centrust , 9972 berlitz )
( 9975 centrust , 9974 calloway )


In [9]:
valid_size = 8
x_valid = np.random.choice(valid_size * 10, valid_size, replace=False)
print('valid: ',x_valid)

valid:  [72 20 44 64 45 67  9  2]


In [10]:
batch_size=1024
embedding_size=512
n_negative_samples=64
ptb.skip_window=2

In [11]:
sample_table=sequence.make_sampling_table(ptb.vocab_len)
pairs,labels=sequence.skipgrams(ptb.part['train'],ptb.vocab_len,window_size=ptb.skip_window,sampling_table=sample_table)

In [12]:
print('the skip-gram pairs: target,context' )
for i in range(5*ptb.skip_window):
    print(['{}{}'.format(id,ptb.id2word[id]) for id in pairs[i]],':',labels[i])

the skip-gram pairs: target,context
['1383option', '2<eos>'] : 1
['27its', '315contract'] : 0
['1313hands', '10that'] : 1
['4689entering', '2750restaurants'] : 1
['5045wellington', '7343corrected'] : 0
['9173modify', '5902appetite'] : 0
['8548widow', '6012earns'] : 1
['10that', '25be'] : 1
['936receive', '5to'] : 1
['1852ground', '1280southern'] : 1


In [13]:
x,y=zip(*pairs)
x=np.array(x,dtype=np.int32)
x=nputil.to2d(x,unit_axis=1)
y=np.array(y,dtype=np.int32)
y=nputil.to2d(y,unit_axis=1)

In [14]:
#build the target world
target_in=Input(shape=(1,),name='target_in')
target=Embedding(ptb.vocab_len,embedding_size,input_length=1,name='target_em')(target_in)
target=Reshape((embedding_size,1),name='target_re')(target)

In [15]:
# build context model
context_in=Input((1,),name='context_in')
context=Embedding(ptb.vocab_len,embedding_size,input_length=1,name='context_em')(context_in)
context=Reshape((embedding_size,1),name='context_re')(context)

In [16]:
#merge the models with the dot product to check for  similarity and add sigmoid layer
output=Dot(axes=1,name='output_dot')([target,context])
output=Reshape((1,),name='output_re')(output)
output=Dense(1,activation='sigmoid',name='output_sig')(output)

In [17]:
#create the functional model for finding word vectors
model=Model(inputs=[target_in,context_in],outputs=output)
model.compile(loss='binary_crossentropy',optimizer='adam')

In [18]:
# merge the models and create model to check for cosine similarity
similarity=Dot(axes=0,normalize=True,name='sim_dot')([target_in, context_in])
similarity_model=Model(inputs=[target_in,context_in],outputs=similarity)

In [19]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
target_in (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
context_in (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
target_em (Embedding)           (None, 1, 512)       5120000     target_in[0][0]                  
__________________________________________________________________________________________________
context_em (Embedding)          (None, 1, 512)       5120000     context_in[0][0]                 
__________________________________________________________________________________________________
target_re 

In [20]:
n_epochs=5
batch_size=1024
model.fit([x,y],labels,batch_size=batch_size,epochs=n_epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1ab0f3ebe48>

In [22]:
# print closest words to validation set at end of training
top_k = 5  
y_val = np.arange(ptb.vocab_len, dtype=np.int32)
y_val = nputil.to2d(y_val,unit_axis=1)
for i in range(valid_size):
    x_val = np.full(shape=(ptb.vocab_len,1),fill_value=x_valid[i], dtype=np.int32)
    similarity_scores = similarity_model.predict([x_val,y_val])
    similarity_scores=similarity_scores.flatten()
    similar_words = (-similarity_scores).argsort()[1:top_k + 1]
    similar_str = 'Similar to {0:}:'.format(ptb.id2word[x_valid[i]])
    for k in range(top_k):
        similar_str = '{0:} {1:},'.format(similar_str, ptb.id2word[similar_words[k]])
    print(similar_str)

Similar to years: ipo, isi, kia, memotec, mlx,
Similar to from: ipo, isi, kia, memotec, mlx,
Similar to says: ipo, isi, kia, memotec, mlx,
Similar to we: swapo, isi, kia, memotec, mlx,
Similar to more: ipo, isi, kia, memotec, mlx,
Similar to when: swapo, isi, kia, memotec, mlx,
Similar to 's: ipo, isi, kia, memotec, mlx,
Similar to <eos>: swapo, isi, kia, memotec, mlx,
