This Colab notebook contains code for sentiment analysis of IMDB movie reviews using TensorFlow. It is part of the TensorFlow Developer course by DeepLearning.AI.

# Download IMDB dataset



In [None]:
import tensorflow_datasets as tfds

# Load the IMDB reviews dataset as supervised (text, label) pairs and also
# request the accompanying dataset metadata object.
imdb, info = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    with_info=True,
)

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteO8CPZL/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteO8CPZL/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteO8CPZL/imdb_reviews-unsupervised.t…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [None]:
#Print the dataset metadata returned by tfds.load (description, features, splits, ...)
print(info)

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset. This is a dataset for binary sentiment
    classification containing substantially more data than previous benchmark
    datasets. We provide a set of 25,000 highly polar movie reviews for training,
    and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_dir=PosixGPath('/tmp/tmpl0hfftubtfds'),
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'text': Text(shape=(), dtype=string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train': <SplitInfo

# Split the dataset

In [None]:
#Show the loaded object: a mapping from split name to dataset
print(imdb)

{Split('train'): <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>, Split('test'): <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>, Split('unsupervised'): <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>}


In [None]:
#peek at the first two training examples; each one is a (review text, label) pair
preview=imdb['train'].take(2)
for review_and_label in preview:
  print(review_and_label)

(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on

**The next cell below will take all the train and test sentences and labels into separate lists so you can preprocess the text and feed it to the model later.**

In [None]:
import numpy as np

def _split_to_lists(split):
  """Collect one dataset split into two parallel Python lists.

  Iterates the split once, decoding each sentence from UTF-8 bytes to ``str``
  and converting each label with ``.numpy()``.

  Returns:
    A ``(sentences, labels)`` tuple of two equally long lists.
  """
  texts, targets = [], []
  for text_tensor, target_tensor in split:
    texts.append(text_tensor.numpy().decode('utf8'))
    targets.append(target_tensor.numpy())
  return texts, targets


# Pull the train and test splits out of the loaded dataset
train_data, test_data = imdb['train'], imdb['test']

# Materialise each split as a list of sentences and a list of labels
training_sentences, training_labels = _split_to_lists(train_data)
testing_sentences, testing_labels = _split_to_lists(test_data)

# Convert the label lists to numpy arrays
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)




# Generate padded sequences

In [None]:
# Hyperparameters shared by the tokenizer, the padding step and the model
vocab_size = 1000      # Tokenizer num_words and first Embedding argument
max_length = 120       # maxlen for pad_sequences and the Embedding input_length
embedding_dim = 16     # second Embedding argument (size of each word vector)
trunc_type = 'post'    # value passed as pad_sequences' truncating argument
oov_token = '<OOV>'    # value passed as the Tokenizer's oov_token argument

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Create the tokenizer with the configured vocabulary limit and OOV token
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)

# Build the word index from the training sentences only
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Turn the training sentences into integer sequences, then pad/truncate them
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(
    sequences,
    truncating=trunc_type,
    maxlen=max_length,
)

# Apply the same tokenizer and padding settings to the testing sentences
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    truncating=trunc_type,
    maxlen=max_length,
)

# Build and compile the model

With the data already preprocessed, you can proceed to building your sentiment classification model. The input will be an Embedding layer. The main idea here is to represent each word in your vocabulary with vectors. These vectors have trainable weights so as your neural network learns, words that are most likely to appear in a positive review will converge towards similar weights. Similarly, words in negative reviews will be clustered more closely together. You can read more about word embeddings here.

After the Embedding layer, you will flatten its output and feed it into a Dense layer. You will explore other architectures for these hidden layers in the next labs.

The output layer would be a single neuron with a sigmoid activation to distinguish between the 2 classes. As is typical with binary classifiers, you will use the binary_crossentropy as your loss function while training.

In [None]:
import tensorflow as tf

# Assemble the classifier: embedding -> flatten -> small dense layer -> sigmoid output
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_length,
    ),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=6, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Display the layer output shapes and parameter counts
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 120, 16)           16000     
                                                                 
 flatten_2 (Flatten)         (None, 1920)              0         
                                                                 
 dense_3 (Dense)             (None, 6)                 11526     
                                                                 
 dense_4 (Dense)             (None, 1)                 7         
                                                                 
Total params: 27533 (107.55 KB)
Trainable params: 27533 (107.55 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# Train the model

In [None]:
num_epochs = 5

# Fit on the padded training sequences; the padded test set is used as validation data
model.fit(
    x=padded,
    y=training_labels_final,
    validation_data=(testing_padded, testing_labels_final),
    epochs=num_epochs,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7e97397b05b0>

# Visualize word embeddings

**After training, you can visualize the trained weights in the Embedding layer to see words that are clustered together. The Tensorflow Embedding Projector is able to reduce the 16-dimension vectors you defined earlier into fewer components so it can be plotted in the projector. First, you will need to get these weights and you can do that with the cell below:**

In [None]:
# The embedding layer is the first layer of the model
embedding_layer = model.layers[0]

# get_weights() returns a list; take its first entry as the embedding matrix
embedding_weights = embedding_layer.get_weights()[0]

# Expected shape: (vocab_size, embedding_dim)
print(embedding_weights.shape)

(1000, 16)


You will need to generate two files:

- `vecs.tsv` - contains the vector weights of each word in the vocabulary
- `meta.tsv` - contains the words in the vocabulary

For this, it is useful to have a `reverse_word_index` dictionary so you can quickly look up a word based on a given index. For example, `reverse_word_index[1]` will return your OOV token because it is always at index = 1. Fortunately, the Tokenizer class already provides this dictionary through its `index_word` property. Yes, as the name implies, it is the reverse of the `word_index` property which you used earlier!

In [None]:
#get the index-to-word dictionary from the tokenizer (the reverse of its word_index)
reverse_word_index=tokenizer.index_word

In [None]:
#preview only the first 20 entries; dumping the whole dictionary floods the cell output
dict(list(reverse_word_index.items())[:20])

{1: '<OOV>',
 2: 'the',
 3: 'and',
 4: 'a',
 5: 'of',
 6: 'to',
 7: 'is',
 8: 'br',
 9: 'in',
 10: 'it',
 11: 'i',
 12: 'this',
 13: 'that',
 14: 'was',
 15: 'as',
 16: 'for',
 17: 'with',
 18: 'movie',
 19: 'but',
 20: 'film',
 21: 'on',
 22: 'not',
 23: 'you',
 24: 'are',
 25: 'his',
 26: 'have',
 27: 'he',
 28: 'be',
 29: 'one',
 30: 'all',
 31: 'at',
 32: 'by',
 33: 'an',
 34: 'they',
 35: 'who',
 36: 'so',
 37: 'from',
 38: 'like',
 39: 'her',
 40: 'or',
 41: 'just',
 42: 'about',
 43: "it's",
 44: 'out',
 45: 'if',
 46: 'has',
 47: 'some',
 48: 'there',
 49: 'what',
 50: 'good',
 51: 'more',
 52: 'when',
 53: 'very',
 54: 'up',
 55: 'no',
 56: 'time',
 57: 'she',
 58: 'even',
 59: 'my',
 60: 'would',
 61: 'which',
 62: 'only',
 63: 'story',
 64: 'really',
 65: 'see',
 66: 'their',
 67: 'had',
 68: 'can',
 69: 'were',
 70: 'me',
 71: 'well',
 72: 'than',
 73: 'we',
 74: 'much',
 75: 'been',
 76: 'bad',
 77: 'get',
 78: 'will',
 79: 'do',
 80: 'also',
 81: 'into',
 82: 'people',
 8

**Now you can start the loop to generate the files. You will loop vocab_size-1 times, skipping the 0 key because it is just for the padding.**

In [None]:
import io

# Write the embedding vectors (vecs.tsv) and their words (meta.tsv) as two
# files whose lines correspond one-to-one.
#
# Counting starts at 1 because index 0 is just for padding. The upper bound is
# clamped so the loop never requests an index that has no word in
# reverse_word_index or no row in embedding_weights (e.g. a corpus with fewer
# distinct words than vocab_size).
last_index = min(vocab_size, embedding_weights.shape[0], len(reverse_word_index) + 1)

# `with` guarantees both files are closed even if the loop raises part-way
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, last_index):
    # word and embedding row associated with the current index
    word_name = reverse_word_index[word_num]
    word_embedding = embedding_weights[word_num]

    # one word per line in the metadata file
    out_m.write(word_name + "\n")

    # one tab-separated vector per line in the vectors file
    out_v.write('\t'.join([str(x) for x in word_embedding]) + '\n')

In [None]:
#try to import the Colab download helper; if the import fails (presumably
#because the notebook is not running in Colab) the download step is skipped
try:
  from google.colab import files
except ImportError:
  pass
else:
  #download each generated file in turn
  for tsv_name in ('vecs.tsv','meta.tsv'):
    files.download(tsv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>