In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot

In [3]:
# Toy corpus for the embedding walkthrough: seven short sentences that
# deliberately share words ("the", "of", "good", ...) so repeated words can be
# spotted in the encoded output.
sent = [
    "the glass of milk",
    "the glass of juice",
    "a cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]
sent

['the glass of milk',
 'the glass of juice',
 'a cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [4]:
# Vocabulary size: upper bound of the integer index space that words are
# hashed into, and the number of rows in the embedding table.
voc_size = 10_000

In [5]:
# Encode every sentence as a list of integer word indices (one index per word,
# bounded by voc_size). Despite the helper's name, the result is a sequence of
# indices, not a one-hot vector; identical words map to the same index.
# NOTE(review): the indices come from hashing, so distinct words can collide
# and the exact values may differ between kernel sessions - confirm before
# relying on specific numbers.
one_hot_repr = [one_hot(sentence, voc_size) for sentence in sent]
one_hot_repr

[[4288, 2754, 6127, 4778],
 [4288, 2754, 6127, 3590],
 [3633, 6353, 6127, 4921],
 [3323, 2048, 3633, 203, 717],
 [3323, 2048, 3633, 203, 1449],
 [3518, 4288, 7429, 6127, 8136],
 [8403, 3948, 5579, 203]]

In [7]:
# Word Embedding representation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np

In [10]:
# Sentence padding: left-pad every encoded sentence with zeros so that all
# rows have the same length and can be stacked into one 2-D integer array.
#
# The target length is taken from the encoded sequences themselves rather than
# from a whitespace split of the raw text: one_hot tokenises with its own
# punctuation filters, so e.g. "well-known" yields two indices but only one
# whitespace-separated word. A length based on str.split() could therefore be
# too short, and pad_sequences would then silently truncate the sequence.
sent_length = max(len(encoded) for encoded in one_hot_repr)
embedded_docs = pad_sequences(one_hot_repr, padding='pre', maxlen=sent_length)
embedded_docs

array([[   0, 4288, 2754, 6127, 4778],
       [   0, 4288, 2754, 6127, 3590],
       [   0, 3633, 6353, 6127, 4921],
       [3323, 2048, 3633,  203,  717],
       [3323, 2048, 3633,  203, 1449],
       [3518, 4288, 7429, 6127, 8136],
       [   0, 8403, 3948, 5579,  203]])

In [11]:
# Embedding dimension: the number of features in the dense vector that the
# Embedding layer produces for each word index.
dim = 10

In [15]:
# Seed Python, NumPy and TensorFlow so the randomly initialised embedding
# weights (and therefore the predictions below) are identical after every
# Restart & Run All.
tf.keras.utils.set_random_seed(42)

# A single Embedding layer that maps each word index to a `dim`-dimensional
# vector; the output has shape (batch, sent_length, dim).
model = Sequential(
    [
        # Declaring the input explicitly replaces the deprecated
        # `input_length` argument of Embedding and builds the model
        # immediately, so summary() reports real output shapes and
        # parameter counts.
        tf.keras.Input(shape=(sent_length,), dtype='int32'),
        Embedding(voc_size, dim),
    ]
)
# The model is never trained in this notebook; optimizer and loss are only
# placeholders so the model is in a compiled state.
model.compile(optimizer='adam', loss='mse')
model.summary()

In [16]:
# Look up the embedding vector of every word index in every padded sentence.
# The result is a float array of shape (len(sent), sent_length, dim): one
# `dim`-sized vector per token position, padding zeros included.
model.predict(embedded_docs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step


array([[[ 4.29049879e-03,  4.39310409e-02,  3.64124440e-02,
         -2.87249219e-02,  1.33801140e-02,  2.44941749e-02,
         -3.14559229e-02,  1.25096925e-02,  3.78169678e-02,
         -3.07033304e-02],
        [ 3.65601443e-02,  1.14391223e-02,  1.55194737e-02,
          9.45228338e-03, -4.65438738e-02,  3.05731222e-03,
         -6.15990162e-03, -4.40470837e-02,  4.31880616e-02,
         -3.43180522e-02],
        [-3.33623961e-03, -4.86960672e-02, -4.58011739e-02,
         -9.50324535e-03, -1.52309425e-02, -2.48106010e-02,
          1.85945742e-02,  4.86974791e-03, -4.30100933e-02,
         -4.37429436e-02],
        [-2.91158687e-02,  4.39164080e-02,  2.59749629e-02,
         -2.13661045e-03,  1.17639303e-02,  2.92528383e-02,
         -7.16930628e-03, -7.31180981e-03,  3.03586386e-02,
         -2.42861509e-02],
        [ 3.67857516e-04,  3.64608429e-02,  3.90128046e-03,
          1.38993002e-02,  4.08270843e-02,  1.58443786e-02,
         -5.40462881e-03,  1.94102861e-02, -1.237958

In [17]:
# First padded sentence ('the glass of milk'): one leading 0 of padding
# followed by the four word indices.
embedded_docs[0]

array([   0, 4288, 2754, 6127, 4778])

In [19]:
# Embed only the first sentence. Slicing with [:1] keeps the leading batch
# axis, giving the model a (1, sent_length) input; the result matches the
# first entry of the full-batch prediction.
model.predict(embedded_docs[:1])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step


array([[[ 0.0042905 ,  0.04393104,  0.03641244, -0.02872492,
          0.01338011,  0.02449417, -0.03145592,  0.01250969,
          0.03781697, -0.03070333],
        [ 0.03656014,  0.01143912,  0.01551947,  0.00945228,
         -0.04654387,  0.00305731, -0.0061599 , -0.04404708,
          0.04318806, -0.03431805],
        [-0.00333624, -0.04869607, -0.04580117, -0.00950325,
         -0.01523094, -0.0248106 ,  0.01859457,  0.00486975,
         -0.04301009, -0.04374294],
        [-0.02911587,  0.04391641,  0.02597496, -0.00213661,
          0.01176393,  0.02925284, -0.00716931, -0.00731181,
          0.03035864, -0.02428615],
        [ 0.00036786,  0.03646084,  0.00390128,  0.0138993 ,
          0.04082708,  0.01584438, -0.00540463,  0.01941029,
         -0.01237959, -0.01854386]]], dtype=float32)