# Chp11: Part 2 Sequence Models

### 11.3.3 Processing words as a sequence: the Sequence Model approach

In [47]:
def HR():
    """Print a horizontal rule: one line of 80 dash characters."""
    rule = ''.ljust(80, '-')
    print(rule)

In [48]:
# Mount Google Drive (Colab-only API) so that files stored there are
# reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# List the data folder to check what is available; the GloVe file
# glove.6B.100d.txt used later in this notebook is read from this folder.
!ls /content/drive/MyDrive/data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
glove.6B.100d.txt


In [49]:
def listing11_12():
    """Download and unpack the IMDB dataset into ./aclImdb if it is missing.

    Does nothing when the ``aclImdb`` directory already exists. Otherwise it
    downloads ``aclImdb_v1.tar.gz``, extracts it in the current working
    directory and removes the ``aclImdb/train/unsup`` sub-directory.

    The commands are run through ``subprocess.run(..., check=True)`` rather
    than IPython ``!`` magics: a ``!`` command does not raise when the shell
    command fails, so the surrounding ``try``/``except`` could never report
    an error.
    """
    import os
    import subprocess

    dirpath = 'aclImdb'
    if os.path.isdir(dirpath):
        return

    print(f'{dirpath} not found, creating directory')
    HR()

    # Same three shell steps as before, expressed as argument lists so no
    # shell parsing is involved.
    commands = [
        ['curl', '-O', 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'],
        ['tar', '-xf', 'aclImdb_v1.tar.gz'],
        ['rm', '-r', 'aclImdb/train/unsup'],
    ]
    try:
        for command in commands:
            # check=True raises CalledProcessError on a non-zero exit status;
            # a missing executable raises FileNotFoundError.
            subprocess.run(command, check=True)
    except Exception as ex:
        print(f"Not able to create directory due to error {ex}")
    else:
        print(f"Finished creating {dirpath}")
            
listing11_12()

--------------------------------------------------------------------------------
Finished creating aclImdb


In [50]:
# Shared settings read by the listing functions in this notebook.
options = dict(
    batch_size=32,      # batch size used when loading the text datasets
    max_length=600,     # output_sequence_length given to TextVectorization
    max_tokens=20_000,  # vocabulary size (TextVectorization / Embedding / one-hot depth)
)


In [51]:
def listing11_13(base_dir="aclImdb", val_fraction=0.2, seed=1337):
    """Carve a validation set out of the training files, once.

    If ``<base_dir>/val`` already exists nothing is done. Otherwise, for each
    of the ``neg`` and ``pos`` categories, ``val_fraction`` of the files in
    ``<base_dir>/train/<category>`` are moved to ``<base_dir>/val/<category>``.

    Args:
        base_dir: Dataset root containing the ``train`` directory.
        val_fraction: Fraction of each category's files to move.
        seed: Seed for the shuffle that picks the validation files.
    """
    import os, pathlib, shutil, random

    base_dir = pathlib.Path(base_dir)
    val_dir = base_dir / "val"
    train_dir = base_dir / "train"
    dirpath = val_dir.as_posix()

    if val_dir.is_dir():
        print(f"{dirpath} already exists")
        return

    print(f"Prepare a validation set by setting apart {val_fraction:.0%} of the training text files in a new directory, {dirpath}")
    for category in ("neg", "pos"):
        os.makedirs(val_dir / category, exist_ok=True)

        # os.listdir returns names in arbitrary order, so sort first:
        # otherwise the seeded shuffle below would not select the same
        # validation files on every machine / run.
        files = sorted(os.listdir(train_dir / category))
        random.Random(seed).shuffle(files)

        # Take the last `num_val_samples` shuffled files for validation.
        # Slice from an explicit start index: `files[-0:]` would select
        # *every* file when the count rounds down to zero.
        num_val_samples = int(val_fraction * len(files))
        val_files = files[len(files) - num_val_samples:]

        # Move the files to <base_dir>/val/neg and <base_dir>/val/pos
        for fname in val_files:
            shutil.move(str(train_dir / category / fname),
                        str(val_dir / category / fname))

# This should be its own function, since the action is conditional
# and we want to be able to treat it as so via control-flow eventually

listing11_13()

aclImdb/val already exists


In [52]:
def listing11_14():
    """Load the IMDB text datasets and turn them into integer sequences.

    Reads the ``aclImdb/train``, ``aclImdb/val`` and ``aclImdb/test``
    directories with the batch size from ``options``, adapts a
    ``TextVectorization`` layer on the training texts only, and maps every
    split through that layer.

    Returns:
        Tuple ``(int_train_ds, int_val_ds, int_test_ds)`` of datasets whose
        elements are ``(vectorized_text, label)`` pairs.
    """
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers

    batch_size = options['batch_size']

    # Load the splits in train / val / test order.
    train_ds, val_ds, test_ds = [
        keras.preprocessing.text_dataset_from_directory(
            split_dir, batch_size=batch_size
        )
        for split_dir in ("aclImdb/train", "aclImdb/val", "aclImdb/test")
    ]

    # The vocabulary is learned from the training texts only (labels dropped).
    text_vectorization = layers.experimental.preprocessing.TextVectorization(
        output_mode="int",
        max_tokens=options['max_tokens'],
        output_sequence_length=options['max_length'],
    )
    train_texts = train_ds.map(lambda text, label: text)
    text_vectorization.adapt(train_texts)

    # Replace each raw text batch with its integer-sequence encoding.
    int_train_ds, int_val_ds, int_test_ds = [
        split.map(lambda text, label: (text_vectorization(text), label))
        for split in (train_ds, val_ds, test_ds)
    ]

    HR()
    print(type(int_train_ds))
    print(int_train_ds)
    HR()
    return int_train_ds, int_val_ds, int_test_ds

int_train_ds, int_val_ds, int_test_ds = listing11_14()

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.
--------------------------------------------------------------------------------
<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>
<MapDataset shapes: ((None, 600), (None,)), types: (tf.int64, tf.int32)>
--------------------------------------------------------------------------------


In [53]:
# This runs VERY SLOWLY
def listing11_15(int_train_ds, int_val_ds, int_test_ds):
    """Train and evaluate a bidirectional LSTM on one-hot encoded tokens.

    Args:
        int_train_ds: Training dataset of (integer sequence, label) batches.
        int_val_ds: Validation dataset, same structure.
        int_test_ds: Test dataset, same structure.

    Saves the best checkpoint to ``one_hot_bidir_lstm.keras``, reloads it and
    prints its test accuracy.
    """
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers

    checkpoint_path = "one_hot_bidir_lstm.keras"

    # Integer token ids -> one-hot vectors of depth max_tokens -> BiLSTM.
    token_ids = keras.Input(shape=(None,), dtype="int64")
    one_hot_tokens = tf.one_hot(token_ids, depth=options['max_tokens'])
    features = layers.Bidirectional(layers.LSTM(32))(one_hot_tokens)
    features = layers.Dropout(0.5)(features)
    prediction = layers.Dense(1, activation="sigmoid")(features)

    model = keras.Model(token_ids, prediction)
    model.compile(
        loss="binary_crossentropy",
        optimizer="rmsprop",
        metrics=["accuracy"],
    )
    model.summary()

    # save_best_only=True is passed so only the best epoch's model is kept.
    checkpoint = keras.callbacks.ModelCheckpoint(
        checkpoint_path, save_best_only=True
    )
    # A single epoch is run here; epochs=10 is the commented-out alternative.
    model.fit(
        int_train_ds,
        validation_data=int_val_ds,
        epochs=1,
        callbacks=[checkpoint],
    )

    # Evaluate the checkpointed model rather than the in-memory one.
    best_model = keras.models.load_model(checkpoint_path)
    test_metrics = best_model.evaluate(int_test_ds)
    print(f"Test acc: {test_metrics[1]:.3f}")

# listing11_15(int_train_ds, int_val_ds, int_test_ds)

# 625/625 [==============================] - 482s 766ms/step - loss: 0.5439 - accuracy: 0.7372 - val_loss: 0.7386 - val_accuracy: 0.7542
# 782/782 [==============================] - 338s 431ms/step - loss: 0.7305 - accuracy: 0.7550
# Test acc: 0.755


In [54]:
# Listing 11.18 Model that uses an Embedding layer trained from scratch
# This should train much faster than the one-hot model (since the LSTM only 
# has to process 256-dimensional vectors instead of 20,000-dimensional).
def listing11_18(int_train_ds, int_val_ds, int_test_ds):
    """Train and evaluate a bidirectional LSTM on a learned Embedding layer.

    Args:
        int_train_ds: Training dataset of (integer sequence, label) batches.
        int_val_ds: Validation dataset, same structure.
        int_test_ds: Test dataset, same structure.

    Saves the best checkpoint to ``embeddings_bidir_gru.keras``, reloads it
    and prints its test accuracy.
    """
    from tensorflow import keras
    from tensorflow.keras import layers

    # Single Embedding layer, trained from scratch: token ids in
    # [0, max_tokens) are mapped to 256-dimensional vectors. (Previously a
    # second, identical layer was built inline and this one was left unused.)
    embedding_layer = layers.Embedding(
        input_dim=options['max_tokens'],
        output_dim=256
    )

    inputs = keras.Input(shape=(None,), dtype="int64")
    embedded = embedding_layer(inputs)
    x = layers.Bidirectional(layers.LSTM(32))(embedded)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )

    model.summary()

    # NOTE: the file name says "gru" although the recurrent layer is an LSTM;
    # the name is kept unchanged so existing checkpoints are still found.
    checkpoint_path = "embeddings_bidir_gru.keras"
    callbacks = [
        keras.callbacks.ModelCheckpoint(
            checkpoint_path,
            save_best_only=True
        )
    ]

    model.fit(
        int_train_ds,
        validation_data=int_val_ds,
        #epochs=10,
        epochs=1,
        callbacks=callbacks
    )

    # Evaluate the checkpointed model rather than the in-memory one.
    model = keras.models.load_model(checkpoint_path)
    print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")


listing11_18(int_train_ds, int_val_ds, int_test_ds)

# 625/625 [==============================] - 162s 253ms/step - loss: 0.4543 - accuracy: 0.8001 - val_loss: 0.3996 - val_accuracy: 0.8496
# 782/782 [==============================] - 72s 91ms/step - loss: 0.4129 - accuracy: 0.8429
# Test acc: 0.843

# Layer (type)                 Output Shape              Param #   
# =================================================================
# input_7 (InputLayer)         [(None, None)]            0         
# _________________________________________________________________
# embedding_8 (Embedding)      (None, None, 256)         5120000   
# _________________________________________________________________
# bidirectional_6 (Bidirection (None, 64)                73984     
# _________________________________________________________________
# dropout_6 (Dropout)          (None, 64)                0         
# _________________________________________________________________
# dense_6 (Dense)              (None, 1)                 65        
# =================================================================
# Total params: 5,194,049
# Trainable params: 5,194,049
# Non-trainable params: 0


Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_8 (Embedding)      (None, None, 256)         5120000   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 64)                73984     
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 65        
Total params: 5,194,049
Trainable params: 5,194,049
Non-trainable params: 0
_________________________________________________________________
Test acc: 0.860


In [55]:
#  Listing 11.19 Model that uses an Embedding layer trained from scratch, with masking enabled
def listing11_19(int_train_ds, int_val_ds, int_test_ds):
    """Train and evaluate the Embedding + BiLSTM model with masking enabled.

    Same architecture as the from-scratch Embedding model, except that the
    Embedding layer is created with ``mask_zero=True``.

    Args:
        int_train_ds: Training dataset of (integer sequence, label) batches.
        int_val_ds: Validation dataset, same structure.
        int_test_ds: Test dataset, same structure.

    Saves the best checkpoint to ``embeddings_bidir_gru_with_masking.keras``,
    reloads it and prints its test accuracy.
    """
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers

    checkpoint_path = "embeddings_bidir_gru_with_masking.keras"

    token_ids = keras.Input(shape=(None,), dtype="int64")
    masked_embedding = layers.Embedding(
        input_dim=options['max_tokens'],
        output_dim=256,
        mask_zero=True,
    )
    features = masked_embedding(token_ids)
    features = layers.Bidirectional(layers.LSTM(32))(features)
    features = layers.Dropout(0.5)(features)
    prediction = layers.Dense(1, activation="sigmoid")(features)

    model = keras.Model(token_ids, prediction)
    model.compile(
        loss="binary_crossentropy",
        optimizer="rmsprop",
        metrics=["accuracy"],
    )
    model.summary()

    checkpoint = keras.callbacks.ModelCheckpoint(
        checkpoint_path, save_best_only=True
    )
    # A single epoch is run here; epochs=10 is the commented-out alternative.
    model.fit(
        int_train_ds,
        validation_data=int_val_ds,
        epochs=1,
        callbacks=[checkpoint],
    )

    # Evaluate the checkpointed model rather than the in-memory one.
    best_model = keras.models.load_model(checkpoint_path)
    test_metrics = best_model.evaluate(int_test_ds)
    print(f"Test acc: {test_metrics[1]:.3f}")

listing11_19(int_train_ds, int_val_ds, int_test_ds)

# 625/625 [==============================] - 200s 262ms/step - loss: 0.4168 - accuracy: 0.8129 - val_loss: 0.3127 - val_accuracy: 0.8696
# 782/782 [==============================] - 79s 97ms/step - loss: 0.3221 - accuracy: 0.8604
# Test acc: 0.860

# _________________________________________________________________
# Layer (type)                 Output Shape              Param #   
# =================================================================
# input_8 (InputLayer)         [(None, None)]            0         
# _________________________________________________________________
# embedding_9 (Embedding)      (None, None, 256)         5120000   
# _________________________________________________________________
# bidirectional_7 (Bidirection (None, 64)                73984     
# _________________________________________________________________
# dropout_7 (Dropout)          (None, 64)                0         
# _________________________________________________________________
# dense_7 (Dense)              (None, 1)                 65        
# =================================================================
# Total params: 5,194,049
# Trainable params: 5,194,049
# Non-trainable params: 0

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_9 (Embedding)      (None, None, 256)         5120000   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 64)                73984     
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 5,194,049
Trainable params: 5,194,049
Non-trainable params: 0
_________________________________________________________________
Test acc: 0.854


---
# Using pretrained word embeddings

In [56]:
# def listing11_20():
#     import os

#     filepath = "glove.6B.100d.txt"
#     if not os.path.isfile(filepath):
#         try:
#             # OLD: https://nlp.stanford.edu/projects/glove/
#             # 822 MB zip file
#             #!wget https://web.archive.org/web/20181130213045/https://nlp.stanford.edu/data/glove.6B.zip --no-check-certificate
#             !unzip -q glove.6B.zip
#         except Exception as ex:
#             print(f"Encountered error: {ex}")

# listing11_20()

In [57]:
# csv.QUOTE_MINIMAL means only when required, for example, when a
#     field contains either the quotechar or the delimiter
# csv.QUOTE_ALL means that quotes are always placed around fields.
# csv.QUOTE_NONNUMERIC means that quotes are always placed around
#     fields which do not parse as integers or floating point
#     numbers.
# csv.QUOTE_NONE means that quotes are never placed around fields.

# Without quoting=csv.QUOTE_NONE, read_csv fails with 'EOF inside string',
# because the file contains a row whose token is a lone double-quote character:
# " -0.30457 -0.23645 0.17576 -0.72854 -0.28343 -0.2564 0.26587 0.025309 
# Passing quoting=csv.QUOTE_NONE to read_csv fixes it.

# GB: Examining file contents with pandas

data_pathway = '/content/drive/MyDrive/data/glove.6B.100d.txt'

def test():
    import pandas as pd
    import csv

    df = pd.read_csv(data_pathway, sep=" ", header=None, quoting=csv.QUOTE_NONE)
    print(df.info())
    print()
    print(df.head().T)

test()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Columns: 101 entries, 0 to 100
dtypes: float64(100), object(1)
memory usage: 308.2+ MB
None

            0         1         2        3         4
0         the         ,         .       of        to
1   -0.038194  -0.10767  -0.33979  -0.1529   -0.1897
2    -0.24487   0.11053   0.20941 -0.24279  0.050024
3     0.72812   0.59812   0.46348  0.89837   0.19084
4    -0.39961  -0.54361  -0.64792  0.16996 -0.049184
..        ...       ...       ...      ...       ...
96   -0.51058   0.61214   0.31802 -0.34839  -0.41548
97   -0.52028  -0.35111  -0.39242 -0.56094 -0.038175
98    -0.1459  -0.83155  -0.23394   -0.591  -0.39804
99     0.8278   0.45293   0.47298   1.0039   0.47647
100   0.27062  0.082577 -0.028803  0.20664  -0.15983

[101 rows x 5 columns]


In [None]:
# Listings 11.20-11.22: parse the GloVe word-embeddings file, build the embedding matrix, and train a model on the pretrained Embedding layer
def _load_glove_index(path_to_glove_file):
    """Parse a GloVe text file into a ``{word: coefficient vector}`` dict."""
    import numpy as np

    embeddings_index = {}
    # Explicit encoding so parsing does not depend on the platform default.
    with open(path_to_glove_file, encoding="utf-8") as f:
        for line in f:
            # Each line is "<word> <coef> <coef> ..."; split off the word only.
            word, coefs = line.split(maxsplit=1)
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")
    return embeddings_index


def _build_embedding_matrix(vocabulary, embeddings_index, max_tokens, embedding_dim):
    """Build a ``(max_tokens, embedding_dim)`` matrix; row i holds the vector of vocabulary[i]."""
    import numpy as np

    embedding_matrix = np.zeros((max_tokens, embedding_dim))
    # Only the first `max_tokens` entries have a row in the matrix. Words
    # without a pretrained vector keep their all-zero row.
    for i, word in enumerate(vocabulary[:max_tokens]):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


def listing11_20(path_to_glove_file='/content/drive/MyDrive/data/glove.6B.100d.txt'):
    """Train and evaluate a BiLSTM on top of frozen pretrained GloVe embeddings.

    Steps: parse the GloVe file, load and vectorize the IMDB text datasets,
    build the embedding matrix for the vectorizer's vocabulary, train the
    model, then reload the best checkpoint
    (``glove_embeddings_sequence_model.keras``) and print its test accuracy.

    Args:
        path_to_glove_file: Location of the 100-dimensional GloVe text file.
    """
    from tensorflow import keras
    from tensorflow.keras import layers

    embeddings_index = _load_glove_index(path_to_glove_file)

    print(f"Found {len(embeddings_index)} word vectors.")
    print()

    #####

    # Setup
    train_ds = keras.preprocessing.text_dataset_from_directory(
        "aclImdb/train", batch_size=options['batch_size']
    )
    val_ds = keras.preprocessing.text_dataset_from_directory(
        "aclImdb/val", batch_size=options['batch_size']
    )
    test_ds = keras.preprocessing.text_dataset_from_directory(
        "aclImdb/test", batch_size=options['batch_size']
    )

    HR()

    text_only_train_ds = train_ds.map(lambda x, y: x)

    # Preparing integer sequence datasets
    text_vectorization = layers.experimental.preprocessing.TextVectorization(
        max_tokens=options['max_tokens'],
        output_mode="int",
        output_sequence_length=options['max_length'],
    )
    text_vectorization.adapt(text_only_train_ds)

    int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
    int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
    int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))

    #####

    # Preparing the GloVe word-embeddings matrix.
    # Must match the width of the vectors in the GloVe file (100 for
    # glove.6B.100d.txt).
    embedding_dim = 100

    embedding_matrix = _build_embedding_matrix(
        text_vectorization.get_vocabulary(),
        embeddings_index,
        options['max_tokens'],
        embedding_dim,
    )

    # Pretrained weights are loaded via a Constant initializer and frozen
    # (trainable=False).
    embedding_layer = layers.Embedding(
        options['max_tokens'],
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
        mask_zero=True,
    )

    #####

    # Model that uses a pretrained Embedding layer
    inputs = keras.Input(shape=(None,), dtype="int64")
    embedded = embedding_layer(inputs)
    x = layers.Bidirectional(layers.LSTM(32))(embedded)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)

    model.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )

    model.summary()

    checkpoint_path = "glove_embeddings_sequence_model.keras"
    callbacks = [
        keras.callbacks.ModelCheckpoint(
            checkpoint_path,
            save_best_only=True
        )
    ]

    model.fit(
        int_train_ds,
        validation_data=int_val_ds,
        #epochs=10,
        epochs=1,
        callbacks=callbacks
    )

    # Evaluate the checkpointed model rather than the in-memory one.
    model = keras.models.load_model(checkpoint_path)

    print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

    #####
    print("Done..")

listing11_20()

# 625/625 [==============================] - 352s 552ms/step - loss: 0.5749 - accuracy: 0.6982 - val_loss: 0.4861 - val_accuracy: 0.7674
# 782/782 [==============================] - 100s 125ms/step - loss: 0.4894 - accuracy: 0.7669
# Test acc: 0.767

# Layer (type)                 Output Shape              Param #   
# =================================================================
# input_9 (InputLayer)         [(None, None)]            0         
# _________________________________________________________________
# embedding_10 (Embedding)     (None, None, 100)         2000000   
# _________________________________________________________________
# bidirectional_8 (Bidirection (None, 64)                34048     
# _________________________________________________________________
# dropout_8 (Dropout)          (None, 64)                0         
# _________________________________________________________________
# dense_8 (Dense)              (None, 1)                 65        
# =================================================================
# Total params: 2,034,113
# Trainable params: 34,113
# Non-trainable params: 2,000,000


Found 400000 word vectors.

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.
--------------------------------------------------------------------------------
Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_10 (Embedding)     (None, None, 100)         2000000   
_________________________________________________________________
bidirectional_8 (Bidirection (None, 64)                34048     
_________________________________________________________________
dropout_8 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 65        
Total params: 2,034,113

### Up to Listing 11.22 Model that uses a pretrained Embedding layer, pg.382