# Word embedding

Word embeddings give a dense representation of words and their relative meanings. They are an improvement over the sparse representations used in simpler bag-of-words models. Word embeddings can be learned from text data and reused across projects. They can also be learned as part of fitting a neural network on text data.

Here we will try to learn about word embeddings and implement one.

In [3]:
from tensorflow.keras.preprocessing.text import one_hot #tensorflow >2.0
#from keras.preprocessing.text import one_hot tensorflow<2.0

In [4]:
# Toy corpus: seven short sentences that the rest of the notebook encodes,
# pads and embeds.
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [5]:
# Display the corpus to confirm its contents.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [6]:
# Vocabulary size: handed to one_hot() as the size of the index space and to
# the Embedding layer as its input dimension.
voc_size = 10000

# One-hot representation

In [10]:
# Encode every sentence as a list of integer word indices.
#
# The Keras helper is imported under an alias because this notebook stores the
# encoded result in a variable that is also named `one_hot`. Calling the bare
# name would shadow the function, and re-running this cell would then fail with
# "TypeError: 'list' object is not callable".
#
# NOTE: the Keras docs describe one_hot() as hash-based, so two different words
# are not guaranteed to receive different indices.
from tensorflow.keras.preprocessing.text import one_hot as keras_one_hot

# Later cells read the encoded sentences through the name `one_hot`, so the
# result keeps that name.
one_hot = [keras_one_hot(sentence, voc_size) for sentence in sent]
print(one_hot)

[[7949, 8604, 3877, 2293], [7949, 8604, 3877, 8643], [7949, 2990, 3877, 974], [9455, 9244, 8938, 3365, 5034], [9455, 9244, 8938, 3365, 6338], [4209, 7949, 9459, 3877, 6661], [6650, 7287, 7903, 3365]]


# Word-embedding representation

In [12]:
import numpy as np
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [15]:
# Pad every encoded sentence to the same length so they stack into a single
# 2-D matrix; with 'pre' padding the zeros go in front of the word indices.
sent_size = 8
embedded_docs = pad_sequences(
    one_hot,
    maxlen=sent_size,
    padding='pre',
)
print(embedded_docs)

[[   0    0    0    0 7949 8604 3877 2293]
 [   0    0    0    0 7949 8604 3877 8643]
 [   0    0    0    0 7949 2990 3877  974]
 [   0    0    0 9455 9244 8938 3365 5034]
 [   0    0    0 9455 9244 8938 3365 6338]
 [   0    0    0 4209 7949 9459 3877 6661]
 [   0    0    0    0 6650 7287 7903 3365]]


In [18]:
# Number of features in the vector the Embedding layer produces for each word.
dim = 10

# A one-layer model: it maps each of the `sent_size` word indices in a padded
# sentence to a `dim`-long vector.
model = Sequential([
    Embedding(voc_size, dim, input_length=sent_size),
])
model.compile(optimizer='adam', loss='mse')

In [20]:
# Print the model's layers with their output shapes and parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8, 10)             100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Run the padded sentences through the model: every word index is converted to
# a vector of 10 values (the embedding dimension `dim`).
print(model.predict(embedded_docs))

[[[ 2.99354233e-02  7.55918026e-03 -3.84537801e-02 -2.43669283e-02
    2.39693001e-03  1.32949986e-02  1.12077482e-02  1.84354931e-03
   -1.09404810e-02  3.35662439e-03]
  [ 2.99354233e-02  7.55918026e-03 -3.84537801e-02 -2.43669283e-02
    2.39693001e-03  1.32949986e-02  1.12077482e-02  1.84354931e-03
   -1.09404810e-02  3.35662439e-03]
  [ 2.99354233e-02  7.55918026e-03 -3.84537801e-02 -2.43669283e-02
    2.39693001e-03  1.32949986e-02  1.12077482e-02  1.84354931e-03
   -1.09404810e-02  3.35662439e-03]
  [ 2.99354233e-02  7.55918026e-03 -3.84537801e-02 -2.43669283e-02
    2.39693001e-03  1.32949986e-02  1.12077482e-02  1.84354931e-03
   -1.09404810e-02  3.35662439e-03]
  [ 2.02933438e-02 -3.11214849e-03 -4.54806089e-02 -2.21575424e-03
   -3.62314098e-02 -3.77105474e-02  2.58757956e-02 -4.74439971e-02
   -2.67519709e-02 -4.26830426e-02]
  [ 4.90444787e-02 -2.54021883e-02  1.97733082e-02 -4.97432128e-02
   -2.50096451e-02  4.36573960e-02  3.67261097e-03 -3.87204662e-02
   -2.77972817e-

In [22]:
# Padded index sequence of the first sentence, for comparison with its
# embedded vectors printed above.
embedded_docs[0]

array([   0,    0,    0,    0, 7949, 8604, 3877, 2293])