### A word embedding is a dense vector representation of a word that captures its semantic attributes

In [1]:
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
# Toy corpus: seven short sentences used throughout the notebook.
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]

In [3]:
# Echo the corpus to confirm its contents.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [4]:
# Vocabulary size: upper bound for the integer index assigned to each word.
voc_size = 10_000

### One Hot Representation

In [5]:
# Encode every sentence as a list of integer word indices bounded by voc_size.
# The same word gets the same index within a run (e.g. "the" in several sentences).
onehot_repr = [one_hot(sentence, voc_size) for sentence in sent]
print(onehot_repr)

[[3947, 9160, 2638, 8231], [3947, 9160, 2638, 8004], [3947, 5568, 2638, 9199], [2677, 5072, 4474, 8814, 7174], [2677, 5072, 4474, 8814, 8880], [2054, 3947, 2645, 2638, 519], [1498, 2630, 5790, 8814]]


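##### Each number is the index assigned to a word within a vocabulary of size `voc_size`. `one_hot` derives the index by hashing the word, so the same word always maps to the same number within a run, but two different words may occasionally collide.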

### Word Embedding Representation

In [7]:
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [9]:
# Fixed length (in tokens) that every encoded sentence is padded to.
sen_length = 8

# padding="pre"  -> zeros are inserted at the beginning of each sequence
# padding="post" -> zeros are appended at the end of each sequence
embedded_docs = pad_sequences(onehot_repr, maxlen=sen_length, padding="pre")

In [10]:
# Show the padded index matrix: one row per sentence, sen_length columns.
print(embedded_docs)

[[   0    0    0    0 3947 9160 2638 8231]
 [   0    0    0    0 3947 9160 2638 8004]
 [   0    0    0    0 3947 5568 2638 9199]
 [   0    0    0 2677 5072 4474 8814 7174]
 [   0    0    0 2677 5072 4474 8814 8880]
 [   0    0    0 2054 3947 2645 2638  519]
 [   0    0    0    0 1498 2630 5790 8814]]


In [11]:
dim = 10
# Number of features (dimensions) in the vector learned for each word.
# Intuitively, each dimension can be thought of as a semantic attribute
# (e.g. royalty, gender, age) along which words such as "king", "queen",
# "man", "woman" and "child" differ.

In [12]:
# Single-layer model that maps each token id to a `dim`-dimensional vector.
# The input shape is declared with an explicit Input layer rather than the
# deprecated `input_length` argument of Embedding, so the model is built
# (and `model.summary()` shows concrete shapes) on both older and newer Keras.
model = Sequential([
    Input(shape=(sen_length,), dtype="int32"),  # one padded sentence of token ids
    Embedding(voc_size, dim),                   # voc_size x dim lookup table
])
# Optimizer and loss are set here, but no fit() call appears in the cells shown;
# the embeddings printed later come from the randomly initialised weights.
model.compile('adam', 'mse')

In [13]:
# Print the layer table; the Embedding layer holds voc_size * dim parameters.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             100000    
                                                                 
Total params: 100000 (390.62 KB)
Trainable params: 100000 (390.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
# Look up the embedding vector of every token id in every padded sentence.
# The result has one (sen_length x dim) matrix per sentence.
print(model.predict(embedded_docs))

[[[ 0.01929991 -0.01931752 -0.03332885 -0.04497341 -0.03710966
   -0.03820052  0.02985601 -0.0407258   0.03910153 -0.03611892]
  [ 0.01929991 -0.01931752 -0.03332885 -0.04497341 -0.03710966
   -0.03820052  0.02985601 -0.0407258   0.03910153 -0.03611892]
  [ 0.01929991 -0.01931752 -0.03332885 -0.04497341 -0.03710966
   -0.03820052  0.02985601 -0.0407258   0.03910153 -0.03611892]
  [ 0.01929991 -0.01931752 -0.03332885 -0.04497341 -0.03710966
   -0.03820052  0.02985601 -0.0407258   0.03910153 -0.03611892]
  [-0.00327891 -0.01320316  0.03180157  0.01029927  0.0275546
   -0.02618942 -0.01489148  0.04263851  0.0230594  -0.02719582]
  [ 0.00785841 -0.02304766  0.02876702 -0.00981153 -0.03408442
    0.04852648 -0.00597327  0.02305721  0.03439984  0.04572621]
  [-0.03562672  0.00141467 -0.00279667 -0.03886837 -0.03762393
   -0.01827069 -0.04061228 -0.00567155 -0.00152367 -0.04590945]
  [ 0.03281764  0.03856781  0.00762818  0.03332916 -0.04795057
    0.00073183  0.00904219  0.00182887  0.0056756

In [15]:
# Padded token ids of the first sentence.
embedded_docs[0]

array([   0,    0,    0,    0, 3947, 9160, 2638, 8231], dtype=int32)

In [17]:
# predict() expects a leading batch axis. Slicing with [:1] keeps it, giving
# shape (1, sen_length); passing the bare 1-D row embedded_docs[0] would make
# Keras treat the 8 token ids as 8 separate samples. The trailing [0] drops the
# batch axis again so the printed result is the (sen_length x dim) matrix of
# the first sentence.
print(model.predict(embedded_docs[:1])[0])

[[ 0.01929991 -0.01931752 -0.03332885 -0.04497341 -0.03710966 -0.03820052
   0.02985601 -0.0407258   0.03910153 -0.03611892]
 [ 0.01929991 -0.01931752 -0.03332885 -0.04497341 -0.03710966 -0.03820052
   0.02985601 -0.0407258   0.03910153 -0.03611892]
 [ 0.01929991 -0.01931752 -0.03332885 -0.04497341 -0.03710966 -0.03820052
   0.02985601 -0.0407258   0.03910153 -0.03611892]
 [ 0.01929991 -0.01931752 -0.03332885 -0.04497341 -0.03710966 -0.03820052
   0.02985601 -0.0407258   0.03910153 -0.03611892]
 [-0.00327891 -0.01320316  0.03180157  0.01029927  0.0275546  -0.02618942
  -0.01489148  0.04263851  0.0230594  -0.02719582]
 [ 0.00785841 -0.02304766  0.02876702 -0.00981153 -0.03408442  0.04852648
  -0.00597327  0.02305721  0.03439984  0.04572621]
 [-0.03562672  0.00141467 -0.00279667 -0.03886837 -0.03762393 -0.01827069
  -0.04061228 -0.00567155 -0.00152367 -0.04590945]
 [ 0.03281764  0.03856781  0.00762818  0.03332916 -0.04795057  0.00073183
   0.00904219  0.00182887  0.0056756  -0.028571  ]]

Each word (token) in the sentence is mapped to its own vector of `dim` = 10 features, so a padded sentence of 8 tokens becomes an 8 × 10 matrix.