In [166]:
from tensorflow.keras.preprocessing.text import one_hot

In [167]:
# Toy corpus: eleven short sentences used to demonstrate word embeddings
sentences = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
    "I like cats",
    "I like dogs",
    "I like cats and dogs",
    "enjoy the videos",
]

In [168]:
# Echo the corpus to confirm its contents
sentences

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good',
 'I like cats',
 'I like dogs',
 'I like cats and dogs',
 'enjoy the videos']

In [169]:
# Vocabulary size: passed to one_hot as the size of the index space, and to the
# Embedding layer as the number of rows in its lookup table
vocabulary_size = 10_000

In [170]:
# Encode every sentence as a list of integer word indices.
# one_hot() splits the sentence into words and hashes each word to an index
# below vocabulary_size, so the same word always gets the same index
# (different words may collide because the mapping is a hash).
one_hot_representations = [
    one_hot(sentence, vocabulary_size) for sentence in sentences
]

In [171]:
# Each sentence is now a list of integer word indices (one per word).
# In the equivalent one-hot vector of length vocabulary_size, only the position
# given by that index would be 1 and every other position would be 0.
one_hot_representations

[[7002, 8010, 4338, 8372],
 [7002, 8010, 4338, 4254],
 [7002, 2896, 4338, 1214],
 [8159, 8205, 4249, 1553, 3557],
 [8159, 8205, 4249, 1553, 4370],
 [5019, 7002, 7344, 4338, 5836],
 [6747, 3802, 1882, 1553],
 [8159, 6608, 1820],
 [8159, 6608, 431],
 [8159, 6608, 1820, 5438, 431],
 [2003, 7002, 3802]]

In [172]:
# Word embeddings representation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential

In [173]:
import numpy as np

In [174]:
# Pad every sentence to the same length so the batch forms a rectangular array.
# padding="pre" inserts zeros in front of the words; padding="post" would
# append them after the words instead.
sent_length = 8
embedded_docs = pad_sequences(
    one_hot_representations,
    maxlen=sent_length,
    padding="pre",
)

In [175]:
# Padded index matrix: one row per sentence, sent_length columns, zeros on the left
embedded_docs

array([[   0,    0,    0,    0, 7002, 8010, 4338, 8372],
       [   0,    0,    0,    0, 7002, 8010, 4338, 4254],
       [   0,    0,    0,    0, 7002, 2896, 4338, 1214],
       [   0,    0,    0, 8159, 8205, 4249, 1553, 3557],
       [   0,    0,    0, 8159, 8205, 4249, 1553, 4370],
       [   0,    0,    0, 5019, 7002, 7344, 4338, 5836],
       [   0,    0,    0,    0, 6747, 3802, 1882, 1553],
       [   0,    0,    0,    0,    0, 8159, 6608, 1820],
       [   0,    0,    0,    0,    0, 8159, 6608,  431],
       [   0,    0,    0, 8159, 6608, 1820, 5438,  431],
       [   0,    0,    0,    0,    0, 2003, 7002, 3802]], dtype=int32)

In [176]:
# Feature representation: length of the dense vector learned for each word
dim = 10

In [177]:
# Minimal model: a single Embedding layer that maps each integer word index
# to a trainable dense vector of length `dim`.
model = Sequential()
# `input_length` is deprecated in Keras 3 (it only triggers a warning), so it is
# not passed here; the sequence length is supplied through `model.build` below.
model.add(Embedding(input_dim=vocabulary_size, output_dim=dim))
# Compiling is not required for `predict`; it is kept so the model is ready to fit.
model.compile(optimizer="adam", loss="mse")
# Build with an explicit (batch, sent_length) input shape so the weights are
# created and `model.summary()` can report concrete output shapes.
model.build(input_shape=(None, sent_length))

In [178]:
# Print the layer table; the Embedding layer holds vocabulary_size * dim weights
model.summary()

In [179]:
# Padded index sequence of the first sentence (leading zeros are padding)
embedded_docs[0]

array([   0,    0,    0,    0, 7002, 8010, 4338, 8372], dtype=int32)

In [180]:
# Look up the embedding vector for every index in the batch.
# Result shape: (11, 8, 10) = sentences x sent_length positions x dim values.
# The model has not been trained, so these are just the initial weights.
model.predict(embedded_docs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step


array([[[ 0.0204379 , -0.04776145, -0.00076778, -0.04742973,
         -0.01413868, -0.00364381,  0.00251291,  0.02663611,
         -0.03154528, -0.00735273],
        [ 0.0204379 , -0.04776145, -0.00076778, -0.04742973,
         -0.01413868, -0.00364381,  0.00251291,  0.02663611,
         -0.03154528, -0.00735273],
        [ 0.0204379 , -0.04776145, -0.00076778, -0.04742973,
         -0.01413868, -0.00364381,  0.00251291,  0.02663611,
         -0.03154528, -0.00735273],
        [ 0.0204379 , -0.04776145, -0.00076778, -0.04742973,
         -0.01413868, -0.00364381,  0.00251291,  0.02663611,
         -0.03154528, -0.00735273],
        [-0.0417323 , -0.02240251,  0.03762772,  0.03328974,
         -0.02434893,  0.03413801,  0.03690621,  0.00889935,
         -0.03863577, -0.02338661],
        [ 0.04046107, -0.02345911, -0.01565255, -0.0023985 ,
         -0.01216146, -0.03182361, -0.00313447, -0.01661786,
         -0.0155654 ,  0.04542508],
        [ 0.03661192,  0.03028686, -0.04170884, -0.0

In [181]:
# First sentence of the corpus, for comparison with its embedding below
sentences[0]

'the glass of milk'

In [192]:
# Embedding matrix of the first sentence only: 8 positions x 10 dimensions.
# The leading rows are identical because they all embed the padding index 0.
model.predict(embedded_docs)[0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step


array([[ 0.0204379 , -0.04776145, -0.00076778, -0.04742973, -0.01413868,
        -0.00364381,  0.00251291,  0.02663611, -0.03154528, -0.00735273],
       [ 0.0204379 , -0.04776145, -0.00076778, -0.04742973, -0.01413868,
        -0.00364381,  0.00251291,  0.02663611, -0.03154528, -0.00735273],
       [ 0.0204379 , -0.04776145, -0.00076778, -0.04742973, -0.01413868,
        -0.00364381,  0.00251291,  0.02663611, -0.03154528, -0.00735273],
       [ 0.0204379 , -0.04776145, -0.00076778, -0.04742973, -0.01413868,
        -0.00364381,  0.00251291,  0.02663611, -0.03154528, -0.00735273],
       [-0.0417323 , -0.02240251,  0.03762772,  0.03328974, -0.02434893,
         0.03413801,  0.03690621,  0.00889935, -0.03863577, -0.02338661],
       [ 0.04046107, -0.02345911, -0.01565255, -0.0023985 , -0.01216146,
        -0.03182361, -0.00313447, -0.01661786, -0.0155654 ,  0.04542508],
       [ 0.03661192,  0.03028686, -0.04170884, -0.03038125, -0.04663254,
        -0.0095051 , -0.02830768,  0.02085412

In [None]:
# Padded index sequence of the first sentence: the input row discussed in the notes below
embedded_docs[0]

array([   0,    0,    0,    0, 7002, 8010, 4338, 8372], dtype=int32)

### Input given:
- A single sentence is represented as a sequence of integer word indices (`one_hot` hashes each word to an index, then the sequence is pre-padded with zeros).
- For example, "the glass of milk" might become [0, 0, 0, 0, 7002, 8010, 4338, 8372] after padding to length sent_length = 8.
- Shape of this input: (1, 8) (1 sentence, 8 tokens).
- The Embedding layer then turns each of these 8 positions into a 10-dimensional vector (see the output shape below).

### Output shape for one sentence: (1, 8, 10)
- 1 — batch size (one sentence).  
- 8 — sequence length (sent_length).  
- 10 — embedding dimension (dim).  

### The Embedding layer takes this sequence and maps each integer to a dense vector of size dim = 10.
- So, for each of the 8 positions in the sentence, you get a 10-dimensional vector.
- Output shape for one sentence: (1, 8, 10), where
  - 1 is the batch size (one sentence),
  - 8 is the sequence length (sent_length),
  - 10 is the embedding dimension (dim).

Illustrative values (rounded; the real numbers are in the `model.predict` output above and change with every random initialisation):

```text
[
  [0.02, -0.04, ..., -0.01],  # Vector for 0 (padding)
  [0.02, -0.04, ..., -0.01],  # Vector for 0 (padding)
  [0.02, -0.04, ..., -0.01],  # Vector for 0 (padding)
  [0.02, -0.04, ..., -0.01],  # Vector for 0 (padding)
  [0.15, -0.22, ..., 0.08],   # Vector for 7002 ("the")
  [0.04, -0.02, ..., 0.05],   # Vector for 8010 ("glass")
  [-0.03, 0.01, ..., -0.06],  # Vector for 4338 ("of")
  [0.25, 0.10, ..., -0.05]    # Vector for 8372 ("milk")
]
```

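- Above is the (8, 10) matrix for this one sentence; with the leading batch dimension it becomes a (1, 8, 10) tensor.
- Each of the 8 rows is a 10-dimensional vector, and all padding positions (index 0) share the same vector.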