<a href="https://colab.research.google.com/github/feniltailor22/Natural-Language-Processing/blob/main/Word_Embedding_Techniques_using_Embedding_Layer_in_Keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from tensorflow.keras.preprocessing.text import one_hot
import numpy as np

In [6]:
# sentences
sent=[  'the glass of milk',
     'the glass of juice',
     'the cup of tea',
    'I am a good boy',
     'I am a good developer',
     'understand the meaning of words',
     'your videos are good',]

In [7]:
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [8]:
#Vocabulary size
voc_size=10000

In [9]:
#one hot representation
#getting index from the dictionary
onehot_repr= [one_hot(words, voc_size) for words in sent]

In [10]:
onehot_repr

[[2004, 2692, 8641, 2299],
 [2004, 2692, 8641, 8558],
 [2004, 9917, 8641, 6496],
 [2261, 1580, 9186, 7729, 8152],
 [2261, 1580, 9186, 7729, 7037],
 [3592, 2004, 4593, 8641, 6204],
 [1827, 87, 8380, 7729]]

In [11]:
#passing one hot repr to embedding layers to form embedding matrices

In [13]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

#here pas_sequences is used to equal the length of each sentence by adding extra zeroes.

In [14]:
sent_len=8

In [15]:
embedded_docs= pad_sequences(sequences=onehot_repr, padding='pre', maxlen=sent_len)

In [16]:
embedded_docs

array([[   0,    0,    0,    0, 2004, 2692, 8641, 2299],
       [   0,    0,    0,    0, 2004, 2692, 8641, 8558],
       [   0,    0,    0,    0, 2004, 9917, 8641, 6496],
       [   0,    0,    0, 2261, 1580, 9186, 7729, 8152],
       [   0,    0,    0, 2261, 1580, 9186, 7729, 7037],
       [   0,    0,    0, 3592, 2004, 4593, 8641, 6204],
       [   0,    0,    0,    0, 1827,   87, 8380, 7729]], dtype=int32)

In [18]:
#passing indexes of words to embedding layers

In [21]:
model= Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=10, input_length=sent_len))
model.compile(optimizer='adam', loss='mse')

In [22]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 8, 10)             100000    
                                                                 
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [23]:
print(model.predict(embedded_docs))

[[[ 4.85629328e-02 -1.08713880e-02  6.35166094e-03  2.51858942e-02
    2.27284320e-02 -2.16272827e-02  4.97870706e-02 -3.65713947e-02
    4.70908321e-02  3.77242081e-02]
  [ 4.85629328e-02 -1.08713880e-02  6.35166094e-03  2.51858942e-02
    2.27284320e-02 -2.16272827e-02  4.97870706e-02 -3.65713947e-02
    4.70908321e-02  3.77242081e-02]
  [ 4.85629328e-02 -1.08713880e-02  6.35166094e-03  2.51858942e-02
    2.27284320e-02 -2.16272827e-02  4.97870706e-02 -3.65713947e-02
    4.70908321e-02  3.77242081e-02]
  [ 4.85629328e-02 -1.08713880e-02  6.35166094e-03  2.51858942e-02
    2.27284320e-02 -2.16272827e-02  4.97870706e-02 -3.65713947e-02
    4.70908321e-02  3.77242081e-02]
  [-1.45706646e-02 -3.93941030e-02  2.84135342e-04  2.36130618e-02
    2.08941214e-02 -4.73699942e-02 -1.41201727e-02 -7.48322159e-03
    6.48499653e-03 -4.86319140e-03]
  [-3.21066007e-02 -4.54469435e-02 -2.90343519e-02  4.42026369e-02
    3.86946276e-03  2.92630121e-03  4.11619805e-02 -4.21628356e-02
    4.98082153e-

In [24]:
embedded_docs[0]

array([   0,    0,    0,    0, 2004, 2692, 8641, 2299], dtype=int32)

In [25]:
print(model.predict(embedded_docs[0]))

[[ 0.04856293 -0.01087139  0.00635166  0.02518589  0.02272843 -0.02162728
   0.04978707 -0.03657139  0.04709083  0.03772421]
 [ 0.04856293 -0.01087139  0.00635166  0.02518589  0.02272843 -0.02162728
   0.04978707 -0.03657139  0.04709083  0.03772421]
 [ 0.04856293 -0.01087139  0.00635166  0.02518589  0.02272843 -0.02162728
   0.04978707 -0.03657139  0.04709083  0.03772421]
 [ 0.04856293 -0.01087139  0.00635166  0.02518589  0.02272843 -0.02162728
   0.04978707 -0.03657139  0.04709083  0.03772421]
 [-0.01457066 -0.0393941   0.00028414  0.02361306  0.02089412 -0.04736999
  -0.01412017 -0.00748322  0.006485   -0.00486319]
 [-0.0321066  -0.04544694 -0.02903435  0.04420264  0.00386946  0.0029263
   0.04116198 -0.04216284  0.04980822 -0.03875817]
 [ 0.02343393  0.01404599  0.02975738  0.01966264 -0.04604946 -0.04613369
  -0.00381849 -0.02828096  0.04174156 -0.01918566]
 [ 0.02895174  0.02779008 -0.01136323 -0.02217522 -0.02374947  0.03061583
  -0.02898167 -0.00667221 -0.0421777   0.04392565]]
