In [1]:
import io
import os
import re 
import shutil
import string
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [3]:
# Download the IMDB review archive into the current working directory and
# unpack it there.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The reviews live in an `aclImdb` folder located beside the downloaded file.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [5]:
# Path of the training split inside the extracted dataset; listing it shows
# the class folders (`pos`, `neg`) alongside the extra `unsup` folder.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [6]:
# Remove the unlabeled `unsup` reviews from the train folder so that only the
# `pos` and `neg` class folders remain for `text_dataset_from_directory`.
remove_dir = os.path.join(train_dir, 'unsup')
# Guard the delete: on a re-run the folder is already gone, and an
# unconditional rmtree would raise FileNotFoundError.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [7]:
batch_size = 1024
seed = 123

# Both subsets must be built with the same directory, split fraction and seed
# so that the training and validation files do not overlap.
split_kwargs = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

# Read from `train_dir` (the folder that was just cleaned of `unsup`) instead
# of a second, hard-coded copy of the path.
train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, subset='training', **split_kwargs)
val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, subset='validation', **split_kwargs)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [8]:
# Show the label and raw review bytes of the first five examples of one batch.
for text_batch, label_batch in train_ds.take(1):
    # Convert each tensor to NumPy once rather than on every iteration.
    texts = text_batch.numpy()
    labels = label_batch.numpy()
    for i in range(5):
        print(labels[i], texts[i])

0 b"Oh My God! Please, for the love of all that is holy, Do Not Watch This Movie! It it 82 minutes of my life I will never get back. Sure, I could have stopped watching half way through. But I thought it might get better. It Didn't. Anyone who actually enjoyed this movie is one seriously sick and twisted individual. No wonder us Australians/New Zealanders have a terrible reputation when it comes to making movies. Everything about this movie is horrible, from the acting to the editing. I don't even normally write reviews on here, but in this case I'll make an exception. I only wish someone had of warned me before I hired this catastrophe"
1 b'This movie is SOOOO funny!!! The acting is WONDERFUL, the Ramones are sexy, the jokes are subtle, and the plot is just what every high schooler dreams of doing to his/her school. I absolutely loved the soundtrack as well as the carefully placed cynicism. If you like monty python, You will love this film. This movie is a tad bit "grease"esk (without

In [9]:
# Let tf.data pick the prefetch buffer size at runtime.
AUTOTUNE = tf.data.AUTOTUNE

# cache() keeps the loaded batches after the first pass over the data, and
# prefetch() lets the input pipeline prepare upcoming batches while the
# current one is being consumed.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [10]:
# The dimensionality of an embedding is a parameter you can tune to find what works best for you.

# Embed a 1000-word vocabulary into 5 dimensions.
embedding_layer = tf.keras.layers.Embedding(1000, 5)

## Using an Embedding layer

The Embedding layer can be understood as a lookup table that maps from integer indices (which stand for specific words) to dense vectors (their embeddings). The dimensionality (or width) of the embedding is a parameter you can experiment with to see what works well for your problem, much in the same way you would experiment with the number of neurons in a Dense layer.

When you create an Embedding layer, the weights for the embedding are randomly initialized (just like any other layer). During training, they are gradually adjusted via backpropagation. Once trained, the learned word embeddings will roughly encode similarities between words (as they were learned for the specific problem your model is trained on).

In [15]:
# Show the embeddings for an example: a 2x3 batch of integer indices is mapped
# to a 2x3x5 array, one 5-dimensional vector per index.
result = embedding_layer(tf.constant([[0,1,2], [3,4,5]]))
result.numpy()

array([[[-0.02119352,  0.00638729,  0.00343934, -0.03916281,
         -0.03570843],
        [ 0.02280325,  0.04825342, -0.0207951 , -0.02805907,
         -0.02214397],
        [-0.0069988 ,  0.04791379,  0.04976752,  0.04081602,
         -0.03029866]],

       [[ 0.00366359,  0.00313182, -0.04499685,  0.0176713 ,
          0.00181502],
        [-0.03580976, -0.02015543, -0.00210682, -0.03422099,
          0.03230492],
        [ 0.04889331, -0.02149493,  0.03135312,  0.03694633,
         -0.01918172]]], dtype=float32)

In [28]:
# Lowercase the text, strip HTML line breaks and remove punctuation.
def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Args:
        input_data: String tensor of raw review text.

    Returns:
        String tensor that is lowercased, has every ``<br />`` tag replaced
        by a space, and has all ``string.punctuation`` characters removed.
    """
    lowercase = tf.strings.lower(input_data)
    # Replace the tag with a space rather than an empty string; otherwise the
    # words on either side of a line break are glued into a single token
    # (e.g. "great.<br /><br />The" would become "greatthe").
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html, '[%s]' % re.escape(string.punctuation), '')

# Vocabulary size and number of tokens kept per review.
vocab_size = 10000
sequence_length = 100

# Turn raw strings into fixed-length sequences of integer token ids, using
# the custom standardization defined above.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the review text only (labels are dropped).
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

### Now using the embeddings for a classification task

In [31]:
embedding_dim = 16

# Pipeline: raw strings -> token ids -> per-token vectors -> averaged vector
# -> small dense head.
model = Sequential()
model.add(vectorize_layer)
model.add(Embedding(vocab_size, embedding_dim, name="embedding"))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
# Single output unit with no activation: the model emits a raw score (logit).
model.add(Dense(1))

### compiling and training model, learning tensorboard

In [33]:
# Callback that writes TensorBoard logs into the relative `logs` directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [34]:
# The final Dense(1) layer has no activation, so the model outputs logits:
# the loss is told so via from_logits=True, and the accuracy metric must
# threshold at 0.0 (the logit equivalent of probability 0.5). The plain
# 'accuracy' string would threshold the raw logits at 0.5 instead.
# The metric keeps the name 'accuracy' so logged keys are unchanged.
model.compile(optimizer='adam',
             loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

In [35]:
# Train for 15 epochs on the training split, evaluating on the validation
# split and logging to TensorBoard through the callback. The returned History
# object is the cell's displayed output.
model.fit(train_ds,
         validation_data=val_ds,
         epochs=15,
         callbacks=[tensorboard_callback])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x19c2b99e6a0>

### visualization on tensorboard

In [36]:
# docs_infra: no_execute
# Load the TensorBoard notebook extension and open it on the `logs`
# directory, the same directory the training callback writes to.
%load_ext tensorboard
%tensorboard --logdir logs

### saving the weights of embedding layer

In [37]:
# First (and only used) weight array of the layer named "embedding";
# presumably one row per token id and `embedding_dim` columns.
weights = model.get_layer('embedding').get_weights()[0]
# Vocabulary list learned by the vectorizer; a token's position in this list
# is used below as its row index into `weights`.
vocab = vectorize_layer.get_vocabulary()


### Saving the embedding vectors and their corresponding words so they can be loaded into the Embedding Projector (projector.tensorflow.org) to view the UMAP, t-SNE and PCA projections

In [38]:
# Write one embedding vector per line to vectors.tsv (tab-separated values)
# and the matching word per line to metadata.tsv.
# The `with` block closes both files even if a write fails part-way through.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")
