In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

An __embedding__ maps each word to a vector representation (one vector per word). For instance, the word *dog* can be represented by the vector $(w_1, w_2, \dots, w_n)$, where $n$ is the dimension of the embedding space.

# Load data

In [2]:
###########################################
### Just run this cell to load the data ###
###########################################

import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import text_to_word_sequence

def load_data(percentage_of_sentences=None):
    """Load the IMDB reviews and split every review into a list of words.

    Args:
        percentage_of_sentences: optional number in (0, 100]. When given, only
            the first `percentage_of_sentences` percent of the train split and
            of the test split are kept. When None, both splits are kept whole.

    Returns:
        A tuple `(X_train, y_train, X_test, y_test)` where `X_train` / `X_test`
        are lists of word lists (one list of words per review) and
        `y_train` / `y_test` are the matching labels.

    Raises:
        AssertionError: if `percentage_of_sentences` is outside (0, 100].
    """
    raw_train, raw_test = tfds.load(
        name="imdb_reviews",
        split=["train", "test"],
        batch_size=-1,
        as_supervised=True,
    )

    train_sentences, y_train = tfds.as_numpy(raw_train)
    test_sentences, y_test = tfds.as_numpy(raw_test)

    # Optionally keep only the leading part of each split
    if percentage_of_sentences is not None:
        assert 0 < percentage_of_sentences <= 100

        fraction = percentage_of_sentences / 100

        train_head = slice(None, int(fraction * len(train_sentences)))
        train_sentences = train_sentences[train_head]
        y_train = y_train[train_head]

        test_head = slice(None, int(fraction * len(test_sentences)))
        test_sentences = test_sentences[test_head]
        y_test = y_test[test_head]

    def _to_words(sentences):
        # Reviews are bytes: decode them, then split them into words
        return [text_to_word_sequence(review.decode("utf-8")) for review in sentences]

    X_train = _to_words(train_sentences)
    X_test = _to_words(test_sentences)

    return X_train, y_train, X_test, y_test

# Work with the first 10% of the train and test splits only
X_train, y_train, X_test, y_test = load_data(percentage_of_sentences=10)

2024-03-12 09:18:05.553691: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-12 09:18:05.938894: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-03-12 09:18:05.938911: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-03-12 09:18:05.993630: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-12 09:18:07.069747: W tensorflow/stream_executor/platform/de

In [3]:
# Inspect the label and the list of words of the first training review
print(y_train[0], X_train[0])

0 ['this', 'was', 'an', 'absolutely', 'terrible', 'movie', "don't", 'be', 'lured', 'in', 'by', 'christopher', 'walken', 'or', 'michael', 'ironside', 'both', 'are', 'great', 'actors', 'but', 'this', 'must', 'simply', 'be', 'their', 'worst', 'role', 'in', 'history', 'even', 'their', 'great', 'acting', 'could', 'not', 'redeem', 'this', "movie's", 'ridiculous', 'storyline', 'this', 'movie', 'is', 'an', 'early', 'nineties', 'us', 'propaganda', 'piece', 'the', 'most', 'pathetic', 'scenes', 'were', 'those', 'when', 'the', 'columbian', 'rebels', 'were', 'making', 'their', 'cases', 'for', 'revolutions', 'maria', 'conchita', 'alonso', 'appeared', 'phony', 'and', 'her', 'pseudo', 'love', 'affair', 'with', 'walken', 'was', 'nothing', 'but', 'a', 'pathetic', 'emotional', 'plug', 'in', 'a', 'movie', 'that', 'was', 'devoid', 'of', 'any', 'real', 'meaning', 'i', 'am', 'disappointed', 'that', 'there', 'are', 'movies', 'like', 'this', 'ruining', "actor's", 'like', 'christopher', "walken's", 'good', 'nam

In [4]:
# Inspect the label and the list of words of the training review at index 8
print(y_train[8], X_train[8])

0 ['i', 'really', 'love', 'the', 'sexy', 'action', 'and', 'sci', 'fi', 'films', 'of', 'the', 'sixties', 'and', 'its', 'because', 'of', 'the', "actress's", 'that', 'appeared', 'in', 'them', 'they', 'found', 'the', 'sexiest', 'women', 'to', 'be', 'in', 'these', 'films', 'and', 'it', "didn't", 'matter', 'if', 'they', 'could', 'act', 'remember', 'candy', 'the', 'reason', 'i', 'was', 'disappointed', 'by', 'this', 'film', 'was', 'because', 'it', "wasn't", 'nostalgic', 'enough', 'the', 'story', 'here', 'has', 'a', 'european', 'sci', 'fi', 'film', 'called', 'dragonfly', 'being', 'made', 'and', 'the', 'director', 'is', 'fired', 'so', 'the', 'producers', 'decide', 'to', 'let', 'a', 'young', 'aspiring', 'filmmaker', 'jeremy', 'davies', 'to', 'complete', 'the', 'picture', "they're", 'is', 'one', 'real', 'beautiful', 'woman', 'in', 'the', 'film', 'who', 'plays', 'dragonfly', 'but', "she's", 'barely', 'in', 'it', 'film', 'is', 'written', 'and', 'directed', 'by', 'roman', 'coppola', 'who', 'uses', 's

__LABELS__: the task is a binary classification problem:

- label __0__ corresponds to a negative movie review
- label __1__ corresponds to a positive movie review

__INPUTS__:

- data has been partially cleaned

# Tokenize data

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Initialize the Keras utility that does all the tokenization for you
tokenizer = Tokenizer()

# Fitting the tokenizer learns a dictionary that maps each word to a token (an integer)
# It must be fitted on the train set only - we are not supposed to know the test set!
# NOTE(review): X_train already holds lists of words produced by text_to_word_sequence;
# check the Keras docs for which lowercasing / filtering the Tokenizer applies to such inputs
tokenizer.fit_on_texts(X_train)

# Apply the learned word -> token mapping to the train and test sets
X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)

In [6]:
# Index of the training review to inspect
sentence_number = 100

input_raw = X_train[sentence_number]
input_token = X_train_token[sentence_number]

# Show at most the first 40 (word, token) pairs of the review.
# Slicing + zip stops at the end of the review, so a review shorter than
# 40 words is printed in full instead of raising an IndexError.
for word, token in zip(input_raw[:40], input_token[:40]):
    print(f'Word: {word} -> Token {token}')

Word: i -> Token 9
Word: enjoyed -> Token 579
Word: this -> Token 11
Word: movie -> Token 18
Word: and -> Token 3
Word: after -> Token 104
Word: watching -> Token 155
Word: it -> Token 10
Word: it -> Token 10
Word: made -> Token 90
Word: me -> Token 65
Word: wonder -> Token 574
Word: just -> Token 40
Word: how -> Token 84
Word: many -> Token 107
Word: 'caitlin -> Token 17238
Word: rose's' -> Token 17239
Word: exist -> Token 1632
Word: in -> Token 8
Word: the -> Token 1
Word: world -> Token 189
Word: how -> Token 84
Word: many -> Token 107
Word: other -> Token 82
Word: girls -> Token 526
Word: have -> Token 25
Word: been -> Token 76
Word: subjected -> Token 5574
Word: to -> Token 5
Word: this -> Token 11
Word: sort -> Token 406
Word: of -> Token 4
Word: sexual -> Token 991
Word: abuse -> Token 2393
Word: and -> Token 3
Word: torment -> Token 8200
Word: by -> Token 31
Word: classmates -> Token 7069
Word: and -> Token 3
Word: have -> Token 25


The dictionary that maps each word to a token can be accessed with __tokenizer.word_index__

Let's add a __vocab_size__ variable that stores the number of different words (each word has its own token) in the train set. This number is called the size of the vocabulary.

In [7]:
# Size of the vocabulary: number of entries in the word -> token dictionary
# learned by the tokenizer on the train set
vocab_size = len(tokenizer.word_index)

print(f'There are {vocab_size} different words in the train set')

There are 30419 different words in the train set


__X_train_token__ and __X_test_token__ contain sequences of different lengths.


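However, a neural network needs a tensor as input, i.e. all sequences of a batch must have the same length. For this reason, we have to pad the data: shorter sequences are filled with a padding value until they all share the same length.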

# Padding

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokens are integer indices into the vocabulary, so the padded arrays are kept
# as integers (the Embedding layer expects integer indices) rather than floats.
# padding='post' appends the padding value (0 by default) after each sequence.
# No maxlen is given: per the Keras docs, each array is padded to the length of
# its own longest sequence.
X_train_pad = pad_sequences(X_train_token, dtype='int32', padding='post')
X_test_pad = pad_sequences(X_test_token, dtype='int32', padding='post')

# RNN model

Let's write a model that has:

- an embedding layer whose __input_dim__ is the size of the vocabulary (= __vocab_size__), and whose __output_dim__ is the size of the embedding space we want to have
- a RNN (SimpleRNN, LSTM, GRU) layer
- a Dense layer
- an output layer


Here we don't need a separate masking layer, because __layers.Embedding__ can mask the padded positions itself when we set __mask_zero=True__. This also means that the data __HAS TO__ be padded with __0__ (which is the default padding value).


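__input_dim__ should equal the size of the vocabulary + 1: the word tokens start at 1 and the value 0 is reserved for padding, so the embedding layer must handle indices from 0 to __vocab_size__.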

In [9]:
from tensorflow.keras import Sequential, layers

# Size of the vector that represents each word in the embedding space
embedding_dimension = 50

# Embedding -> LSTM -> Dense -> sigmoid output for binary classification
model = Sequential([
    # +1 because index 0 is used for padding; mask_zero=True makes the
    # following layers ignore the padded positions
    layers.Embedding(
        input_dim=vocab_size + 1,
        output_dim=embedding_dimension,
        mask_zero=True,
    ),
    layers.LSTM(20),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [10]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the monitored quantity has not improved for 4 epochs and
# restore the weights of the best epoch
# NOTE(review): no `monitor` is given, so the Keras default applies
# (`val_loss` per the Keras docs) - confirm this is the intended metric
es = EarlyStopping(patience=4, restore_best_weights=True)

# Train on the padded train set, holding out 30% of it for validation
model.fit(X_train_pad, y_train,
         epochs=20,
         batch_size=16,
         validation_split=0.3,
         callbacks=[es])

Epoch 1/20
Epoch 2/20
 11/110 [==>...........................] - ETA: 36s - loss: 0.5942 - accuracy: 0.7670

KeyboardInterrupt: 