(predefined-word-embeddings)=
# Encoding Text - Using Predefined Word Embeddings

## Download Word Embeddings


In [None]:
# One-time setup: uncomment the two shell commands below to download the
# GloVe 6B archive (about 822 MB) and unpack it into the working directory.
# They are left commented out so that re-running the notebook does not
# trigger the download again.
#!curl -L -o glove.6B.zip https://nlp.stanford.edu/data/glove.6B.zip 
#!unzip -q glove.6B.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0   346    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  822M  100  822M    0     0  5208k      0  0:02:41  0:02:41 --:--:-- 5088k


In [4]:
import numpy as np 

# Plain-text GloVe file: one token per line followed by its coefficients.
path_to_glove_file = "glove.6B.100d.txt"

# Maps each token to its coefficient vector (1-D float32 NumPy array).
embeddings_index = {}

# Read the file as UTF-8 explicitly. Without `encoding`, open() uses the
# platform's locale encoding, which on non-UTF-8 systems (e.g. cp1252 on
# Windows) either raises UnicodeDecodeError or silently mis-decodes any
# non-ASCII token in the vocabulary.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Split only once: the first field is the token, the remainder is the
        # space-separated list of coefficients.
        word, coefs = line.split(maxsplit=1)
        # "f" selects float32; `sep=" "` puts fromstring in text-parsing mode.
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

Found 400000 word vectors.


In [21]:
from tensorflow.keras import layers
from tensorflow import keras

# Hyper-parameters shared by the data pipeline and the model.
batch_size = 32        # examples per batch
max_length = 600       # every review is padded / truncated to this many tokens
max_tokens = 20_000    # vocabulary size cap

# Step 1: load the labelled raw-text datasets, one directory per split.
train_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
)
val_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/val",
    batch_size=batch_size,
)
test_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/test",
    batch_size=batch_size,
)

# Step 2: build the vectoriser and fit its vocabulary on the training text.
# adapt() must only see the text, so the labels are dropped first.
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)
text_only_ds = train_ds.map(lambda text, _label: text)
text_vectorization.adapt(text_only_ds)

# Quick sanity check of the learned vocabulary.
vocabulary = text_vectorization.get_vocabulary()
print("Top 10 tokens:", vocabulary[:10])


# Step 3: convert every split from raw strings to integer token sequences.
def _to_token_ids(text, label):
    """Vectorise a batch of raw text and pass its labels through unchanged."""
    return text_vectorization(text), label


int_train_ds = train_ds.map(_to_token_ids, num_parallel_calls=4)
int_val_ds = val_ds.map(_to_token_ids, num_parallel_calls=4)
int_test_ds = test_ds.map(_to_token_ids, num_parallel_calls=4)


Found 22500 files belonging to 2 classes.
Found 2500 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.
Top 10 tokens: ['', '[UNK]', 'the', 'a', 'and', 'of', 'to', 'is', 'in', 'it']


In [22]:
# Width of each word vector; must match the GloVe file that was loaded.
embedding_dim = 100

# Token -> integer index, in the order the vectoriser assigned them.
vocabulary = text_vectorization.get_vocabulary()
word_index = {token: position for position, token in enumerate(vocabulary)}

# Row i holds the pretrained vector of the token with index i. Rows stay
# all-zero for tokens that have no entry in `embeddings_index`.
embedding_matrix = np.zeros((max_tokens, embedding_dim))

for token, row in word_index.items():
    if row >= max_tokens:
        continue  # index falls outside the matrix
    vector = embeddings_index.get(token)
    if vector is None:
        continue  # no pretrained vector for this token
    embedding_matrix[row] = vector

In [29]:
import tensorflow as tf 

# Embedding layer initialised from the pretrained matrix and frozen, so the
# vectors are not updated during training. mask_zero=True makes index 0
# (the padding token) a mask value for downstream layers.
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

# --- model: frozen embeddings -> BiLSTM -> dropout -> sigmoid -------------
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
bidirectional_lstm = layers.Bidirectional(layers.LSTM(32))
x = bidirectional_lstm(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

# --- checkpointing --------------------------------------------------------
# Only the weights are saved (not the full model), and only for the epoch
# with the best validation accuracy seen so far. Per the original author's
# note, weights-only saving sidesteps a problem with the native .keras saver.
best_weights_path = "glove_seq_best.weights.h5"
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=best_weights_path,
    monitor="val_accuracy",
    save_best_only=True,
    save_weights_only=True,
)

# --- training -------------------------------------------------------------
model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=10,
    callbacks=[checkpoint],
)

# --- restore the best epoch and score it on the held-out test split -------
model.load_weights(best_weights_path)
test_metrics = model.evaluate(int_test_ds, verbose=0)  # [loss, accuracy]
test_acc = test_metrics[1]
print(f"Test acc: {test_acc:.3f}")


Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 100)         2000000   
                                                                 
 bidirectional_4 (Bidirecti  (None, 64)                34048     
 onal)                                                           
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2034113 (7.76 MB)
Trainable params: 34113 (133.25 KB)
Non-trainable params: 2000000 (7.63 MB)
_________________