# Word Embedding Representation with Keras

In [1]:
from tensorflow.keras.preprocessing.text import hashing_trick, one_hot

In [2]:
# Toy corpus used throughout the notebook: short feedback phrases first,
# followed by a handful of longer everyday sentences.

sentences = [
    "Well done!",
    "Good work",
    "Great effort",
    "nice work",
    "Excellent!",
    "Weak",
    "Poor effort!",
    "not good",
    "poor work",
    "Could have done better.",
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good person",
    "I am a good developer",
    "understand the meaning of words",
    "your presentation is good",
]

In [3]:
# Display the corpus to confirm its contents.
sentences

['Well done!',
 'Good work',
 'Great effort',
 'nice work',
 'Excellent!',
 'Weak',
 'Poor effort!',
 'not good',
 'poor work',
 'Could have done better.',
 'the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good person',
 'I am a good developer',
 'understand the meaning of words',
 'your presentation is good']

In [4]:
# Size of the dictionary that word indices are drawn from.
# Every encoded word index produced below is smaller than this value.

vocab_size = 30_000

## One Hot Representation

In [5]:
# Convert every sentence into a list of integer word indices.
# "One hot representation" here means the index of the word in the dictionary
# (the position of the 1 in its one-hot vector), not the full one-hot vector.
# Every index is less than vocab_size.
#
# Keras' `one_hot` hashes words with Python's built-in `hash`, which is salted
# per process for strings, so the indices would change on every kernel restart.
# `hashing_trick` with hash_function='md5' produces the same kind of encoding
# with indices that are stable across runs.
# Note: this is still a hash, so two different words may share one index.

onehot_representation = [
    hashing_trick(sentence, vocab_size, hash_function='md5')
    for sentence in sentences
]
print(onehot_representation)

[[23413, 6085], [7558, 1685], [8964, 20883], [25625, 1685], [5766], [23953], [2744, 20883], [14943, 7558], [2744, 1685], [25696, 11160, 6085, 7544], [15461, 4271, 15976, 12702], [15461, 4271, 15976, 2105], [15461, 20776, 15976, 12268], [7707, 19320, 1023, 7558, 25857], [7707, 19320, 1023, 7558, 2696], [18136, 15461, 21648, 15976, 28643], [12718, 7412, 12373, 7558]]


## Word Embedding


In [6]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np

In [7]:
# Bring every index sequence to the same length (8) by prepending zeros,
# so the documents can be stacked into a single 2-D array.

maximum_sentence_length = 8
embedded_docs = pad_sequences(
    onehot_representation,
    maxlen=maximum_sentence_length,
    padding='pre',
)

In [8]:
# Inspect the padded index matrix: one row per sentence, zeros on the left.
embedded_docs

array([[    0,     0,     0,     0,     0,     0, 23413,  6085],
       [    0,     0,     0,     0,     0,     0,  7558,  1685],
       [    0,     0,     0,     0,     0,     0,  8964, 20883],
       [    0,     0,     0,     0,     0,     0, 25625,  1685],
       [    0,     0,     0,     0,     0,     0,     0,  5766],
       [    0,     0,     0,     0,     0,     0,     0, 23953],
       [    0,     0,     0,     0,     0,     0,  2744, 20883],
       [    0,     0,     0,     0,     0,     0, 14943,  7558],
       [    0,     0,     0,     0,     0,     0,  2744,  1685],
       [    0,     0,     0,     0, 25696, 11160,  6085,  7544],
       [    0,     0,     0,     0, 15461,  4271, 15976, 12702],
       [    0,     0,     0,     0, 15461,  4271, 15976,  2105],
       [    0,     0,     0,     0, 15461, 20776, 15976, 12268],
       [    0,     0,     0,  7707, 19320,  1023,  7558, 25857],
       [    0,     0,     0,  7707, 19320,  1023,  7558,  2696],
       [    0,     0,    

In [9]:
# Number of components in the dense vector learned for each word index.
dimension = 10

# Build and compile a model consisting of a single Embedding layer:
# a (vocab_size x dimension) lookup table applied to sequences of
# maximum_sentence_length word indices.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=dimension,
        input_length=maximum_sentence_length,
    ),
])
model.compile(optimizer='adam', loss='mse')

In [10]:
# Print the model's layers with their output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             300000    
                                                                 
Total params: 300,000
Trainable params: 300,000
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Look up the embedding vector of every word index in every padded sentence.
# NOTE: no training step appears in the cells shown here, so these are
# presumably the layer's initial weights.
all_sentence_embeddings = model.predict(embedded_docs)
print(all_sentence_embeddings)

[[[ 2.1711025e-02 -1.0425821e-03 -2.2798467e-02 ... -4.7315348e-02
   -5.4656491e-03  4.3474760e-02]
  [ 2.1711025e-02 -1.0425821e-03 -2.2798467e-02 ... -4.7315348e-02
   -5.4656491e-03  4.3474760e-02]
  [ 2.1711025e-02 -1.0425821e-03 -2.2798467e-02 ... -4.7315348e-02
   -5.4656491e-03  4.3474760e-02]
  ...
  [ 2.1711025e-02 -1.0425821e-03 -2.2798467e-02 ... -4.7315348e-02
   -5.4656491e-03  4.3474760e-02]
  [ 2.3266673e-03  4.8162136e-02  1.3865199e-02 ...  4.9666073e-02
   -2.7825499e-02  1.7601494e-02]
  [ 9.3919411e-03 -4.5011528e-03 -9.2045553e-03 ...  4.2869236e-02
    9.4975233e-03 -1.4600158e-03]]

 [[ 2.1711025e-02 -1.0425821e-03 -2.2798467e-02 ... -4.7315348e-02
   -5.4656491e-03  4.3474760e-02]
  [ 2.1711025e-02 -1.0425821e-03 -2.2798467e-02 ... -4.7315348e-02
   -5.4656491e-03  4.3474760e-02]
  [ 2.1711025e-02 -1.0425821e-03 -2.2798467e-02 ... -4.7315348e-02
   -5.4656491e-03  4.3474760e-02]
  ...
  [ 2.1711025e-02 -1.0425821e-03 -2.2798467e-02 ... -4.7315348e-02
   -5.4656

In [12]:
# Padded index sequence of the sentence at position 10 ('the glass of milk').
embedded_docs[10]

array([    0,     0,     0,     0, 15461,  4271, 15976, 12702])

In [13]:
# Embedding vectors for the sentence at position 10: one row per token
# position (padding included), one column per embedding dimension.
sentence_embedding = model.predict(embedded_docs)[10]
print(sentence_embedding)

[[ 0.02171103 -0.00104258 -0.02279847  0.02940999 -0.01806449  0.04067775
  -0.00288896 -0.04731535 -0.00546565  0.04347476]
 [ 0.02171103 -0.00104258 -0.02279847  0.02940999 -0.01806449  0.04067775
  -0.00288896 -0.04731535 -0.00546565  0.04347476]
 [ 0.02171103 -0.00104258 -0.02279847  0.02940999 -0.01806449  0.04067775
  -0.00288896 -0.04731535 -0.00546565  0.04347476]
 [ 0.02171103 -0.00104258 -0.02279847  0.02940999 -0.01806449  0.04067775
  -0.00288896 -0.04731535 -0.00546565  0.04347476]
 [-0.00745826  0.03311503  0.01934205  0.02664414  0.03143356 -0.01541539
  -0.01924293 -0.01298719 -0.0237463  -0.00023611]
 [ 0.04405868 -0.00368507  0.03827066  0.01495269 -0.04886691  0.02305743
  -0.01137793 -0.00472318 -0.03809013  0.00666231]
 [ 0.01674834  0.03433552 -0.00462591  0.02586609 -0.04122503 -0.03808634
  -0.04500758  0.02301003  0.02769855  0.01091569]
 [ 0.00418583  0.02337858  0.01373689  0.04185222  0.02479756 -0.01643366
   0.01393462  0.01693881 -0.0058934   0.04838565]]