In [18]:
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.models import Sequential

In [4]:
# Minimal two-sentence corpus used to build the first vocabulary.
sentences = [
    "I love my dog",
    "I love my cat",
]

In [5]:
# Fit a word-level tokenizer on the corpus and show the word -> integer mapping.
# The printed mapping shows that words are lower-cased and indices start at 1.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}


In [6]:
# Extend the corpus with a sentence that ends in punctuation ("dog!").
sentences = [
    "I love my dog",
    "I love my cat",
    "you love my dog!",
]

In [7]:
# Refit a fresh tokenizer on the extended corpus.
# In the printed mapping "dog!" appears as plain 'dog' (punctuation is stripped)
# and the indices are reassigned compared with the previous fit.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


In [8]:
# Add a longer sentence so the corpus contains sequences of different lengths.
sentences = [
    "I love my dog",
    "I love my cat",
    "you love my dog!",
    "do you think my dog is amazing?",
]

In [9]:
# Refit on the four-sentence corpus; the vocabulary grows to ten words.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}


In [10]:
# Encode every sentence as a list of word indices (one list per sentence).
# The lists have different lengths because the sentences do.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


In [11]:
# Held-out sentences containing words that are not in the fitted vocabulary
# ("really", "loves", "uncle").
test_data = [
    "I really love my dog",
    "my dog loves my uncle",
]

In [12]:
# Words the tokenizer never saw while fitting are silently dropped here,
# so the encoded sequences come out shorter than the original sentences.
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[4, 2, 1, 3], [1, 3, 1]]


In [13]:
# Reserve an out-of-vocabulary token so that unseen words are mapped to a
# placeholder index instead of being dropped; this keeps the encoded sequence
# the same length as the input text. The printed mapping shows '<OOV>' at index 1.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


In [14]:
# Re-encode the held-out sentences with the OOV-aware tokenizer:
# unknown words now show up as index 1 rather than disappearing.
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


In [15]:
# Pad the sequences so that every input has the same length.
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
# With the defaults, shorter sequences are left-padded with zeros up to the
# length of the longest sequence (see the printed matrix).
# Optional arguments to try: padding='post', maxlen=5, truncating='post'.
padded = pad_sequences(sequences)
print(padded)

[[ 0  0  0  4  2  1  3]
 [ 0  0  0  4  2  1  6]
 [ 0  0  0  5  2  1  3]
 [ 7  5  8  1  3  9 10]]


Layer By Layer View

In [17]:
# Tiny movie-review corpus for the layer-by-layer walkthrough.
sentences = [
    "I love this movie",
    "This movie is terrible",
    "Best film ever",
]

In [21]:
# Fit a tokenizer with no vocabulary cap and no OOV token on the review corpus.
# The resulting mapping has nine words, indexed 1..9.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

{'this': 1, 'movie': 2, 'i': 3, 'love': 4, 'is': 5, 'terrible': 6, 'best': 7, 'film': 8, 'ever': 9}


In [22]:
# Encode the reviews as index lists, then pad them into one rectangular matrix.
# The three-word review gets a leading 0 so every row has four entries.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences)

print(padded)

[[3 4 1 2]
 [1 2 5 6]
 [0 7 8 9]]


In [26]:

# Parameters
# Derive the vocabulary size from the fitted tokenizer instead of hard-coding 10,
# so the embedding table cannot drift out of sync when `sentences` changes.
# +1 because word indices start at 1 and index 0 is the value pad_sequences
# fills in for padding.
vocab_size = len(word_index) + 1
embedding_dim = 4  # Dimension of the embedding vectors

# Building the model
model = Sequential([
    # Lookup table with one embedding_dim-sized vector per vocabulary index.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    # Averages the per-token vectors over the sequence axis.
    GlobalAveragePooling1D(),
    # Single sigmoid unit producing one score per input sequence.
    Dense(1, activation='sigmoid')
])

# Model summary
model.summary()


In [28]:

# Pull the three layers out of the model so each one can be called on its own.
embedding_layer, pooling_layer, dense_layer = model.layers

# Example input: the padded index matrix as a tensor.
padded_sequences = tf.constant(padded)

# Manual forward pass, one layer at a time.
# The embedding output has one vector per token; the pooling layer averages
# those vectors over the sequence axis (the padding index 0 is included in
# that average, as the printed values show); the dense layer maps each pooled
# vector to a single sigmoid score.
embedding_output = embedding_layer(padded_sequences)
pooled_output = pooling_layer(embedding_output)
dense_output = dense_layer(pooled_output)

# Print every intermediate result. No training happens in the cells shown,
# so the dense scores sit close to 0.5.
for heading, tensor in (
    ("Input sequences:\n", padded_sequences),
    ("\nEmbedding output:\n", embedding_output),
    ("\nPooled output:\n", pooled_output),
    ("\nDense output:\n", dense_output),
):
    print(heading, tensor.numpy())


Input sequences:
 [[3 4 1 2]
 [1 2 5 6]
 [0 7 8 9]]

Embedding output:
 [[[-0.00515059 -0.04360912 -0.03211959  0.00638112]
  [ 0.01303181 -0.02852996 -0.03487139 -0.02223028]
  [ 0.02221173 -0.03948899 -0.00527262  0.00718025]
  [ 0.04398812 -0.00190734  0.03003671 -0.0405668 ]]

 [[ 0.02221173 -0.03948899 -0.00527262  0.00718025]
  [ 0.04398812 -0.00190734  0.03003671 -0.0405668 ]
  [-0.02892005  0.03679902 -0.04293055  0.00268785]
  [ 0.01666312  0.03365344  0.0338567   0.01567891]]

 [[-0.00232523 -0.04276546  0.00192988 -0.00284358]
  [ 0.01780419  0.00768463  0.01692016 -0.04765263]
  [-0.02265648 -0.0179026   0.02033291 -0.03539554]
  [ 0.02030182 -0.02486253 -0.0161811  -0.00883853]]]

Pooled output:
 [[ 0.01852027 -0.02838385 -0.01055672 -0.01230893]
 [ 0.01348573  0.00726403  0.00392256 -0.00375495]
 [ 0.00328108 -0.01946149  0.00575046 -0.02368257]]

Dense output:
 [[0.5012482 ]
 [0.49754938]
 [0.5057437 ]]
