#### NLP: Word Embedding Techniques using Embedding Layer in Keras
##### Saurabh Chatterjee

In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Sanity check: list the GPUs TensorFlow can see (an empty list means CPU only)
tf.config.list_physical_devices(device_type='GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
from keras.preprocessing.text import one_hot       # One-Hot Encoder (Keras)

In [4]:
# Toy corpus: each sentence below is later turned into a sequence of word indices
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [5]:
# Vocabulary size: passed to one_hot() as the size of the word-index range and
# to Embedding() as the number of rows of its weight matrix.
voc_size = 500          # number of distinct word indices available

##### One Hot Representation

In [6]:
# Despite its name, one_hot() does not build 0/1 vectors: for every sentence it
# returns a list with one integer index per word, all drawn from the voc_size (500) range.
# NOTE(review): Keras documents one_hot() as a hashing trick based on Python's hash(),
# so two different words can share an index and indices may change between kernel restarts — confirm for the installed version.
onehot_repr = list(map(lambda sentence: one_hot(sentence, voc_size), sent))
print(onehot_repr)

[[491, 292, 35, 136], [491, 292, 35, 474], [491, 193, 35, 429], [235, 236, 53, 416, 445], [235, 236, 53, 416, 425], [209, 491, 359, 35, 333], [43, 380, 113, 416]]


#### Word Embedding Representation

In [7]:
from keras.layers import Embedding
from keras_preprocessing.sequence import pad_sequences      # Pre and Post Padding
from keras.models import Sequential

In [8]:
# PADDING: bring every index sequence to the same length so they stack into one 2-D array
sent_length = 8     # fixed sentence length after padding
embedded_docs = pad_sequences(
    sequences=onehot_repr,
    maxlen=sent_length,
    padding='pre',      # pre-padding: zeros are inserted in front of the real word indices
)
print(embedded_docs)

[[  0   0   0   0 491 292  35 136]
 [  0   0   0   0 491 292  35 474]
 [  0   0   0   0 491 193  35 429]
 [  0   0   0 235 236  53 416 445]
 [  0   0   0 235 236  53 416 425]
 [  0   0   0 209 491 359  35 333]
 [  0   0   0   0  43 380 113 416]]


In [9]:
# Length of the dense feature vector that represents EACH WORD (comparable to a Word2Vec vector size)
embedding_dim = 10        # number of columns of the Embedding weight matrix

In [15]:
# Seed TensorFlow's global RNG so the randomly initialised embedding weights
# (and therefore every vector printed further down) are identical on each run.
tf.random.set_seed(42)

model = Sequential()        # linear stack of layers
# Embedding is a trainable lookup table of shape (voc_size, embedding_dim) = (500, 10);
# input_length fixes the number of word indices expected per sentence.
embedding_layer = Embedding(voc_size, embedding_dim, input_length=sent_length)
model.add(embedding_layer)
model.compile(loss='mse', optimizer='adam')     # loss/optimizer are set, but no cell shown here calls fit()

# The model takes as input an integer matrix of size (batch, input_length). Every word
# index in the input must lie in [0, voc_size - 1], i.e. be strictly smaller than the vocabulary size.
# model.output_shape is (None, input_length, embedding_dim), where input_length = sent_length
# and `None` is the batch dimension.

' The model will take as input an integer matrix of size (batch, input_length), and the largest integer (i.e. word index) in the input \nshould be no larger than vocabulary size. Now model.output_shape is (None, input_length, embedding_dim), where (input_length = sent_length) and `None` is the batch dimension.'

In [16]:
# Print per-layer output shapes and parameter counts
# (the embedding table holds voc_size * embedding_dim = 500 * 10 = 5,000 weights)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 8, 10)             5000      
                                                                 
Total params: 5,000
Trainable params: 5,000
Non-trainable params: 0
_________________________________________________________________


In [23]:
# The first weight array of the layer is the embedding matrix: one row per word index, one column per feature
embedding_matrix = embedding_layer.get_weights()[0]
print(embedding_matrix.shape)       # (voc_size, embedding_dim)

(500, 10)


In [17]:
# Example: the padded index sequence of the first sentence, 'the glass of milk'
embedded_docs[0]

array([  0,   0,   0,   0, 491, 292,  35, 136])

In [18]:
# predict() expects a batch of shape (n_sentences, sent_length). Slicing with [:1] keeps the
# batch axis (shape (1, 8)); passing embedded_docs[0] (shape (8,)) would make Keras treat the
# 8 word indices as 8 separate one-word samples, which does not match input_length=sent_length.
model.predict(embedded_docs[:1])[0]     # one embedding vector of size 10 for each of the 8 positions



array([[ 0.03296963, -0.02528973,  0.01243914,  0.0450856 ,  0.04724941,
         0.00741047, -0.00646191, -0.04928155,  0.04532487, -0.04230431],
       [ 0.03296963, -0.02528973,  0.01243914,  0.0450856 ,  0.04724941,
         0.00741047, -0.00646191, -0.04928155,  0.04532487, -0.04230431],
       [ 0.03296963, -0.02528973,  0.01243914,  0.0450856 ,  0.04724941,
         0.00741047, -0.00646191, -0.04928155,  0.04532487, -0.04230431],
       [ 0.03296963, -0.02528973,  0.01243914,  0.0450856 ,  0.04724941,
         0.00741047, -0.00646191, -0.04928155,  0.04532487, -0.04230431],
       [-0.02349428, -0.02669402, -0.04313287,  0.01782418, -0.02239888,
        -0.03605409,  0.04890485,  0.00409625, -0.00211704,  0.03888908],
       [-0.03141445,  0.03040603,  0.01714956,  0.00808753,  0.03363431,
         0.01334251,  0.01038833,  0.0342663 ,  0.04216622, -0.01509238],
       [-0.02581209,  0.01779388, -0.03909098, -0.04414156, -0.00132989,
         0.04970746, -0.03879775, -0.01216512

In [19]:
# Embed every padded sentence in one batch: result is (n_sentences, sent_length, embedding_dim)
all_sentence_vectors = model.predict(embedded_docs)
print(all_sentence_vectors)

[[[ 3.29696275e-02 -2.52897274e-02  1.24391429e-02  4.50855978e-02
    4.72494103e-02  7.41046667e-03 -6.46190718e-03 -4.92815487e-02
    4.53248657e-02 -4.23043147e-02]
  [ 3.29696275e-02 -2.52897274e-02  1.24391429e-02  4.50855978e-02
    4.72494103e-02  7.41046667e-03 -6.46190718e-03 -4.92815487e-02
    4.53248657e-02 -4.23043147e-02]
  [ 3.29696275e-02 -2.52897274e-02  1.24391429e-02  4.50855978e-02
    4.72494103e-02  7.41046667e-03 -6.46190718e-03 -4.92815487e-02
    4.53248657e-02 -4.23043147e-02]
  [ 3.29696275e-02 -2.52897274e-02  1.24391429e-02  4.50855978e-02
    4.72494103e-02  7.41046667e-03 -6.46190718e-03 -4.92815487e-02
    4.53248657e-02 -4.23043147e-02]
  [-2.34942790e-02 -2.66940240e-02 -4.31328677e-02  1.78241841e-02
   -2.23988779e-02 -3.60540859e-02  4.89048474e-02  4.09624726e-03
   -2.11703777e-03  3.88890766e-02]
  [-3.14144492e-02  3.04060318e-02  1.71495564e-02  8.08752701e-03
    3.36343087e-02  1.33425109e-02  1.03883259e-02  3.42662968e-02
    4.21662219e-

In [None]:
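''' When an Embedding layer is part of a larger network, its weights are trained together with the other layers by backpropagating the task loss (e.g. a classification loss). Through this training it learns semantic relations between words, so similar words end up with nearby vectors.
Note: the cells above only build the model and call predict(); fit() is never called, so the vectors printed here are untrained initial weights.'''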