# Word Embeddings & Sequence Models

In [1]:
import io
import numpy as np

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow_datasets as tfds

2023-10-31 10:01:26.636161: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


## Tokenization & Padding

In [2]:
# Toy corpus used to demonstrate tokenization and padding.
sentences = ['ML is good', 'DL is awesome', 'ML is explainable']

# Build the vocabulary from the corpus, with "<OOV>" configured as the
# out-of-vocabulary token.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode every sentence as a list of integer ids, then pad the lists
# (padding='post') into one array.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')

# Report the vocabulary first, then the encoded forms.
print('number of words in word_index', len(word_index))
print('word_index', word_index)
print()
print('sequences', sequences)
print('padded', padded)

number of words in word_index 7
word_index {'<OOV>': 1, 'is': 2, 'ml': 3, 'good': 4, 'dl': 5, 'awesome': 6, 'explainable': 7}

sequences [[3, 2, 4], [5, 2, 6], [3, 2, 7]]
padded [[3 2 4]
 [5 2 6]
 [3 2 7]]


## IMDB Review Dataset

### Train Model

In [3]:
# Load the IMDB reviews dataset as (text, label) pairs, together with the
# dataset metadata object.
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

In [4]:
# Display the loaded splits and the dataset metadata, with a dashed
# separator string between them in the resulting tuple.
imdb, '-'*50, info

({'train': <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
  'test': <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
  'unsupervised': <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>},
 '--------------------------------------------------',
 tfds.core.DatasetInfo(
     name='imdb_reviews',
     full_name='imdb_reviews/plain_text/1.0.0',
     description="""
     Large Movie Review Dataset. This is a dataset for binary sentiment
     classification containing substantially more data than previous benchmark
     datasets. We provide a set of 25,000 highly polar movie reviews for training,
     and 25,000 for testing. There is additional unlabeled data for use as well.
     """,
     config_description="""
     Plain text
     """,
     hom

In [5]:
# Pick out the 'train' and 'test' splits from the loaded dataset.
train_data = imdb['train']
test_data = imdb['test']

In [6]:
# Print two raw examples from each split, pairwise, followed by a dashed
# separator line after every pair.
for train_example, test_example in zip(train_data.take(2), test_data.take(2)):
    print(train_example, test_example, '-'*50, sep='\n')

(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"There are films that make careers. For George Romero, it was NIGHT OF THE LIVING DEAD; for Kevin Smith, CLERKS; for Robert Rodriguez, EL MARIACHI. Add t

2023-10-31 10:01:42.705837: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-10-31 10:01:42.706304: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [7]:
def transform_raw_data(data):
    """Convert an iterable of (text, label) tensor pairs into Python/NumPy data.

    Each text element must expose `.numpy()` returning UTF-8 encoded bytes,
    and each label element must expose `.numpy()` returning the label value.

    Args:
        data: iterable yielding `(text, label)` pairs.

    Returns:
        A tuple `(sentences, labels)` where `sentences` is a list of decoded
        strings and `labels` is a NumPy array of the label values, both in
        iteration order.
    """
    # Single pass over `data`, so one-shot iterables are consumed only once.
    decoded_pairs = [
        (text.numpy().decode('utf8'), label.numpy())
        for text, label in data
    ]

    texts = [text for text, _ in decoded_pairs]
    targets = np.array([label for _, label in decoded_pairs])

    return (texts, targets)


In [8]:
# Convert both splits into a list of decoded review strings plus a NumPy
# array of labels.
training_sentences, training_labels = transform_raw_data(train_data)
testing_sentences, testing_labels = transform_raw_data(test_data)

In [9]:
# Peek at the first five decoded reviews and labels of each split.
training_sentences[:5], training_labels[:5], testing_sentences[:5], testing_labels[:5]

(["This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
  'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development 

In [10]:
def pre_process_text(train_sentences, test_sentences,
                     vocab_size=10000, max_len=120, trunc_type='post', oov_token='<OOV>',
                     padding='pre'):
    """Tokenize and pad train and test sentences with one shared tokenizer.

    The tokenizer is fitted on `train_sentences` only; `test_sentences` are
    encoded with that already-fitted tokenizer.

    Args:
        train_sentences: texts used to fit the tokenizer; also encoded.
        test_sentences: texts encoded with the fitted tokenizer.
        vocab_size: forwarded to `Tokenizer(num_words=...)`.
        max_len: forwarded to `pad_sequences(maxlen=...)`.
        trunc_type: forwarded to `pad_sequences(truncating=...)`.
        oov_token: forwarded to `Tokenizer(oov_token=...)`.
        padding: forwarded to `pad_sequences(padding=...)`. The default
            'pre' is the value `pad_sequences` used implicitly before this
            parameter existed, so existing callers get the same result.

    Returns:
        A tuple `(tokenizer, train_padded, test_padded)`.
    """
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
    tokenizer.fit_on_texts(train_sentences)

    def _encode(sentences):
        # Same encode-then-pad settings for both splits so they stay aligned.
        return pad_sequences(
            tokenizer.texts_to_sequences(sentences),
            maxlen=max_len,
            padding=padding,
            truncating=trunc_type,
        )

    train_padded = _encode(train_sentences)
    test_padded = _encode(test_sentences)

    return tokenizer, train_padded, test_padded

In [11]:
# Hyperparameters for the IMDB model in this notebook.
# Passed to the Embedding layer as its first argument; also bounds the
# embedding export loop.
vocab_size=10000
# Passed to the Embedding layer as its second argument.
embedding_dim=16
# Passed to the Embedding layer as `input_length`.
max_len=120
# Same value as the `trunc_type` default of `pre_process_text`.
trunc_type='post'
# Same value as the `oov_token` default of `pre_process_text`.
oov_token='<OOV>'

In [12]:
# Forward the notebook-level hyperparameters explicitly. Without this the
# function falls back to its own defaults, so editing `vocab_size`,
# `max_len`, `trunc_type` or `oov_token` would change the model but not the
# preprocessing.
tokenizer, training_padded, testing_padded = pre_process_text(
    training_sentences,
    testing_sentences,
    vocab_size=vocab_size,
    max_len=max_len,
    trunc_type=trunc_type,
    oov_token=oov_token,
)

In [13]:
# Display the first padded sequence of the training and test sets.
training_padded[0], testing_padded[0]

(array([   0,    0,    0,   12,   14,   33,  425,  392,   18,   90,   28,
           1,    9,   32, 1366, 3585,   40,  486,    1,  197,   24,   85,
         154,   19,   12,  213,  329,   28,   66,  247,  215,    9,  477,
          58,   66,   85,  114,   98,   22, 5675,   12, 1322,  643,  767,
          12,   18,    7,   33,  400, 8170,  176, 2455,  416,    2,   89,
        1231,  137,   69,  146,   52,    2,    1, 7577,   69,  229,   66,
        2933,   16,    1, 2904,    1,    1, 1479, 4940,    3,   39, 3900,
         117, 1584,   17, 3585,   14,  162,   19,    4, 1231,  917, 7917,
           9,    4,   18,   13,   14, 4139,    5,   99,  145, 1214,   11,
         242,  683,   13,   48,   24,  100,   38,   12, 7181, 5515,   38,
        1366,    1,   50,  401,   11,   98, 1197,  867,  141,   10],
       dtype=int32),
 array([  48,   24,  106,   13,   95, 4066,   16,  740, 5065,   10,   14,
         312,    5,    2,  579,  349,   16, 1847, 1257,    1,   16,  668,
        7666, 5531,   

In [14]:
# Embedding -> Flatten -> 6-unit ReLU layer -> single sigmoid output unit.
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(6, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [15]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked
# as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [16]:
# Print the model's layers, output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 16)           160000    
                                                                 
 flatten (Flatten)           (None, 1920)              0         
                                                                 
 dense (Dense)               (None, 6)                 11526     
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 171533 (670.05 KB)
Trainable params: 171533 (670.05 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
num_epochs = 10

# The test split is passed as validation data for every epoch. The call is
# the cell's last expression, so its return value is displayed.
model.fit(
    x=training_padded,
    y=training_labels,
    validation_data=(testing_padded, testing_labels),
    epochs=num_epochs,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x144ee73d0>

### Viz Model

In [18]:
# List the model's layers; the Embedding layer is the first one (index 0).
model.layers

[<keras.src.layers.core.embedding.Embedding at 0x144ca3250>,
 <keras.src.layers.reshaping.flatten.Flatten at 0x144f1b850>,
 <keras.src.layers.core.dense.Dense at 0x144ca3c10>,
 <keras.src.layers.core.dense.Dense at 0x145b63990>]

In [19]:
# Take the first weight array of the first layer (the Embedding layer).
embedding_weights = model.layers[0].get_weights()[0]

# Display the shape of the embedding matrix.
embedding_weights.shape

(10000, 16)

In [20]:
reverse_word_index = tokenizer.index_word

# Index `0` is just for the padding, so the export starts at `1`. The upper
# bound is clamped to the rows that actually exist in the embedding matrix
# and to the words the tokenizer actually learned; otherwise a corpus with
# fewer than `vocab_size - 1` distinct words would raise a KeyError.
num_rows = min(vocab_size, embedding_weights.shape[0], len(reverse_word_index) + 1)

# `with` guarantees both files are closed even if the loop raises.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:

    for word_num in range(1, num_rows):

        # Word and embedding vector associated with the current index
        word_name = reverse_word_index[word_num]
        word_embedding = embedding_weights[word_num]

        # One word per line in the metadata file
        out_m.write(word_name + "\n")

        # One tab-separated vector per line, in the same order as the words
        out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

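Upload the generated `vecs.tsv` (embedding vectors) and `meta.tsv` (word labels) files to the [TensorFlow Embedding Projector](https://projector.tensorflow.org/) to visualize the learned word embeddings.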