## End to end Deep Learning Project Using Simple RNN

In [26]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,SimpleRNN,Dense
from tensorflow.keras.layers import Dropout

In [43]:
## Load the imdb dataset

from tensorflow.keras.datasets import imdb
from sklearn.model_selection import train_test_split

# Load the IMDB dataset
max_features = 10000  # Vocabulary size
# Only the first (data, labels) pair returned by load_data is kept; the second
# pair is unpacked into throwaway names and never used.
(X_data, y_data), (_, _) = imdb.load_data(num_words=max_features)  # Ignore default split

# Split the dataset into 80% training and 20% testing
# random_state is fixed so that re-running the cell yields the same split.
# NOTE(review): no `stratify=` argument is passed, so class proportions in the
# two parts are left to chance — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=42
)

# Print the shape of the data
print(f'Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}')
print(f'Testing data shape: {X_test.shape}, Testing labels shape: {y_test.shape}')

Training data shape: (20000,), Training labels shape: (20000,)
Testing data shape: (5000,), Testing labels shape: (5000,)


In [44]:
# Check whether the dataset is balanced: count how often each label value
# occurs in the training labels and in the test labels.
from collections import Counter
print(Counter(y_train))
print(Counter(y_test))

Counter({0: 10063, 1: 9937})
Counter({1: 2563, 0: 2437})


In [45]:
# Display the first training review (a list of integer token ids) with its label.
X_train[0],y_train[0]

([1,
  73,
  89,
  81,
  25,
  60,
  967,
  6,
  20,
  141,
  17,
  14,
  31,
  127,
  12,
  60,
  28,
  1360,
  1107,
  66,
  45,
  6,
  20,
  15,
  497,
  8,
  79,
  17,
  491,
  8,
  112,
  6,
  6683,
  20,
  17,
  614,
  691,
  4,
  436,
  20,
  9,
  2855,
  6,
  762,
  7,
  493,
  8621,
  6,
  185,
  250,
  24,
  55,
  2276,
  5,
  23,
  350,
  7,
  15,
  82,
  24,
  15,
  821,
  66,
  10,
  10,
  45,
  578,
  15,
  4,
  20,
  805,
  8,
  30,
  17,
  821,
  5,
  1621,
  17,
  614,
  190,
  4,
  20,
  9,
  43,
  32,
  99,
  1214,
  18,
  15,
  8,
  157,
  46,
  17,
  1436,
  4,
  2,
  5,
  2,
  9,
  32,
  1796,
  5,
  1214,
  267,
  17,
  73,
  17,
  4413,
  36,
  26,
  400,
  43,
  4562,
  83,
  4,
  1873,
  247,
  74,
  83,
  4,
  250,
  540,
  82,
  4,
  96,
  4,
  250,
  8306,
  8,
  32,
  4,
  2,
  9,
  184,
  3966,
  13,
  384,
  48,
  14,
  16,
  147,
  1348,
  59,
  62,
  69,
  9420,
  12,
  46,
  50,
  9,
  53,
  2,
  74,
  1930,
  11,
  14,
  31,
  151,
  10,
  10,
  4,
 

In [46]:
## Inspect a sample review and its label
# Keep the first training example around; sample_review is decoded back to
# text in a later cell.
sample_review=X_train[0]
sample_label=y_train[0]

print(f"Sample review (as integers):{sample_review}")
print(f'Sample label: {sample_label}')


Sample review (as integers):[1, 73, 89, 81, 25, 60, 967, 6, 20, 141, 17, 14, 31, 127, 12, 60, 28, 1360, 1107, 66, 45, 6, 20, 15, 497, 8, 79, 17, 491, 8, 112, 6, 6683, 20, 17, 614, 691, 4, 436, 20, 9, 2855, 6, 762, 7, 493, 8621, 6, 185, 250, 24, 55, 2276, 5, 23, 350, 7, 15, 82, 24, 15, 821, 66, 10, 10, 45, 578, 15, 4, 20, 805, 8, 30, 17, 821, 5, 1621, 17, 614, 190, 4, 20, 9, 43, 32, 99, 1214, 18, 15, 8, 157, 46, 17, 1436, 4, 2, 5, 2, 9, 32, 1796, 5, 1214, 267, 17, 73, 17, 4413, 36, 26, 400, 43, 4562, 83, 4, 1873, 247, 74, 83, 4, 250, 540, 82, 4, 96, 4, 250, 8306, 8, 32, 4, 2, 9, 184, 3966, 13, 384, 48, 14, 16, 147, 1348, 59, 62, 69, 9420, 12, 46, 50, 9, 53, 2, 74, 1930, 11, 14, 31, 151, 10, 10, 4, 20, 9, 540, 364, 352, 5, 45, 6, 2, 589, 33, 269, 8, 2715, 142, 1621, 5, 821, 17, 73, 17, 204, 5, 2908, 19, 55, 1763, 4697, 92, 66, 104, 14, 20, 93, 76, 1488, 151, 33, 4, 58, 12, 188, 626, 151, 12, 215, 69, 224, 142, 73, 237, 6, 964, 7, 1446, 2289, 188, 626, 103, 14, 31, 10, 10, 451, 7, 1465, 5

In [47]:
### Mapping of word indices back to words (for understanding)
word_index=imdb.get_word_index()
#word_index
# Invert the word -> index mapping so an integer index looks up its word.
reverse_word_index = {value: key for key, value in word_index.items()}
# NOTE(review): displaying the whole dictionary produces a very long output;
# consider showing only a handful of entries.
reverse_word_index

{34701: 'fawn',
 52006: 'tsukino',
 52007: 'nunnery',
 16816: 'sonja',
 63951: 'vani',
 1408: 'woods',
 16115: 'spiders',
 2345: 'hanging',
 2289: 'woody',
 52008: 'trawling',
 52009: "hold's",
 11307: 'comically',
 40830: 'localized',
 30568: 'disobeying',
 52010: "'royale",
 40831: "harpo's",
 52011: 'canet',
 19313: 'aileen',
 52012: 'acurately',
 52013: "diplomat's",
 25242: 'rickman',
 6746: 'arranged',
 52014: 'rumbustious',
 52015: 'familiarness',
 52016: "spider'",
 68804: 'hahahah',
 52017: "wood'",
 40833: 'transvestism',
 34702: "hangin'",
 2338: 'bringing',
 40834: 'seamier',
 34703: 'wooded',
 52018: 'bravora',
 16817: 'grueling',
 1636: 'wooden',
 16818: 'wednesday',
 52019: "'prix",
 34704: 'altagracia',
 52020: 'circuitry',
 11585: 'crotch',
 57766: 'busybody',
 52021: "tart'n'tangy",
 14129: 'burgade',
 52023: 'thrace',
 11038: "tom's",
 52025: 'snuggles',
 29114: 'francesco',
 52027: 'complainers',
 52125: 'templarios',
 40835: '272',
 52028: '273',
 52130: 'zaniacs',

In [48]:
# Decode the sample review back into readable text.
# i-3 because 0,1,2 are special tokens (padding, SOS, unknown)
# Any shifted index missing from reverse_word_index is rendered as '?'.
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in sample_review])
decoded_review

"? well how do you even rate a movie such as this one does it even have cinematic value really it's a movie that tries to get as close to being a snuff movie as possible basically the entire movie is purely a bunch of guys torturing a young girl not very appealing and on top of that also not that realistic really br br it's obvious that the movie tried to be as realistic and shocking as possible however the movie is just all too fake for that to work out as intended the ? and ? is all soft and fake looking as well as sounding they are often just kicking into the floor rather than into the girl obviously also the way the girl responds to all the ? is pretty tame i mean if this was real surely she would had screamed it out there is more ? than screaming in this one though br br the movie is obviously low budget and it's a ? attempt at trying to achieve something shocking and realistic as well as original and provoking with very limited resources don't really think this movie made much im

In [49]:
from tensorflow.keras.preprocessing import sequence

# Common length that every review is padded/truncated to.
max_len=500

# NOTE(review): X_train and X_test are overwritten with their padded versions,
# so the original variable-length reviews are no longer reachable under these
# names in later cells (and earlier display cells show different data on re-run).
X_train=sequence.pad_sequences(X_train,maxlen=max_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_len)
X_train

array([[   0,    0,    0, ...,   10,  470,  158],
       [  86,  541,  173, ...,    6, 2761,  632],
       [   0,    0,    0, ..., 1689,  798,   12],
       ...,
       [   0,    0,    0, ...,    7,   14,  509],
       [   0,    0,    0, ...,   25,  170, 2241],
       [   0,    0,    0, ...,   25,  194,  757]], dtype=int32)

In [50]:
# Display the first review after padding.
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [30]:
## Define and compile the Simple RNN (training happens in a later cell)
# Architecture: Embedding -> SimpleRNN -> Dropout -> Dense(1, sigmoid).
model=Sequential()
model.add(Embedding(max_features,128,input_length=max_len)) ## Embedding Layers
# NOTE(review): the recorded training log shows an epoch-1 loss of ~5242, which
# may point to unstable activations from the relu recurrence — consider
# comparing against the default activation.
model.add(SimpleRNN(128,activation='relu',unroll=True))
model.add(Dropout(0.5))  # Add dropout to reduce overfitting
model.add(Dense(1,activation="sigmoid"))
# Build with an explicit (batch, max_len) input shape; presumably so that
# model.summary() in the next cell can report layer shapes — confirm.
model.build(input_shape=(None, max_len))
# Binary sentiment target: sigmoid output paired with binary cross-entropy.
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [31]:
# Print the layer-by-layer summary of the baseline model.
model.summary()

In [None]:
# ## Create an instance of EarlyStopping Callback
# from tensorflow.keras.callbacks import EarlyStopping
# earlystopping=EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True)
# earlystopping

<keras.src.callbacks.early_stopping.EarlyStopping at 0x1736b3010>

In [None]:
# ## Train the model with early stopping
# history=model.fit(
#     X_train,y_train,epochs=10,batch_size=32,
#     validation_split=0.2,
#     callbacks=[earlystopping]
# )

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 278ms/step - accuracy: 0.5766 - loss: 5242.6489 - val_accuracy: 0.6670 - val_loss: 0.5889
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 256ms/step - accuracy: 0.7274 - loss: 0.7535 - val_accuracy: 0.7730 - val_loss: 0.4845
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 259ms/step - accuracy: 0.8358 - loss: 0.3625 - val_accuracy: 0.7922 - val_loss: 0.4684
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 256ms/step - accuracy: 0.8863 - loss: 0.2787 - val_accuracy: 0.7894 - val_loss: 0.5149
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 258ms/step - accuracy: 0.9095 - loss: 0.2317 - val_accuracy: 0.7954 - val_loss: 0.5281
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 259ms/step - accuracy: 0.9317 - loss: 0.1823 - val_accuracy: 0.7890 - val_loss: 0.5610
E

In [34]:
## Save model file
# NOTE(review): the training cell earlier in the notebook is commented out, so
# on a fresh run this would save an unfitted model — confirm `model` has been
# trained before relying on this file.
model.save('simple_rnn_imdb.h5')



In [51]:
import keras_tuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Constants (Update these based on your dataset)
max_features = 10000  # Size of the vocabulary
max_len = 500         # Max length of sequences (padded)
# NOTE(review): X_train / X_test were already padded to max_len in an earlier
# cell, so this second padding is presumably a no-op — confirm.
X_train_padded = pad_sequences(X_train, maxlen=max_len)
X_test_padded = pad_sequences(X_test, maxlen=max_len)

# Define the model builder function
def build_model(hp):
    """Build and compile a SimpleRNN sentiment classifier for KerasTuner.

    Args:
        hp: KerasTuner hyperparameter container. This function samples
            ``rnn_layers`` (1 or 2), ``units`` (32 or 64) and
            ``dropout_rate`` (0.2 or 0.3) from it.

    Returns:
        A compiled ``Sequential`` model: Embedding, then ``rnn_layers``
        repetitions of SimpleRNN + Dropout, then a single sigmoid unit.
        It is compiled with Adam, binary cross-entropy and the accuracy metric.
    """
    model = Sequential()
    # Embedding layer with fixed vocabulary size and sequence length
    model.add(Embedding(input_dim=max_features, output_dim=128, input_length=max_len))

    # Sample every hyperparameter exactly once; the same units / dropout value
    # is shared by all recurrent layers (as before, since the names are shared).
    num_rnn_layers = hp.Int("rnn_layers", 1, 2)  # Search for 1 or 2 RNN layers
    units = hp.Choice("units", [32, 64])  # Search for 32 or 64 units
    dropout_rate = hp.Choice("dropout_rate", [0.2, 0.3])  # Search for dropout rate

    for layer_idx in range(num_rnn_layers):
        # Every recurrent layer except the last must emit the full sequence;
        # otherwise the next SimpleRNN receives a 2-D tensor and cannot be
        # built, which made every 2-layer trial fail.
        is_last_rnn = layer_idx == num_rnn_layers - 1
        model.add(SimpleRNN(
            units=units,
            activation="relu",
            unroll=True,
            return_sequences=not is_last_rnn
        ))
        model.add(Dropout(rate=dropout_rate))

    # Output layer
    model.add(Dense(1, activation="sigmoid"))

    # Compile the model
    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )
    return model

# Define the Bayesian Tuner
# NOTE(review): results are written under my_dir/rnn_tuning; a previous run's
# trials in that directory may be picked up again on re-execution — confirm
# whether a fresh search is intended.
tuner = kt.BayesianOptimization(
    hypermodel=build_model,
    objective="val_accuracy",  # Optimize for validation accuracy
    max_trials=10,             # Number of hyperparameter combinations to try
    directory="my_dir",        # Directory to save logs
    project_name="rnn_tuning"  # Name of the project
)

# Define EarlyStopping callback
# NOTE(review): early stopping watches val_loss while the tuner ranks trials by
# val_accuracy — confirm the mismatch is intentional.
earlystopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

# Perform the hyperparameter search using validation_split
tuner.search(
    X_train_padded, y_train,
    epochs=10,
    validation_split=0.2,  # Use 20% of training data for validation
    callbacks=[earlystopping]
)

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Report the three hyperparameters sampled in build_model.
print("Best Hyperparameters:")
print(f"Units: {best_hps.get('units')}")
print(f"Dropout Rate: {best_hps.get('dropout_rate')}")
print(f"RNN Layers: {best_hps.get('rnn_layers')}")

# Retrieve the best model
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

# Evaluate the best model on the test set
# The held-out test split is not passed to tuner.search above, so this is the
# first time the selected model sees it.
test_loss, test_accuracy = best_model.evaluate(X_test_padded, y_test)
print(f"Test Accuracy: {test_accuracy}")

Trial 8 Complete [00h 00m 01s]

Best val_accuracy So Far: 0.8510000109672546
Total elapsed time: 00h 54m 41s
Best Hyperparameters:
Units: 32
Dropout Rate: 0.3
RNN Layers: 1


  saveable.load_own_variables(weights_store.get(inner_path))


[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 737ms/step - accuracy: 0.8562 - loss: 0.3531
Test Accuracy: 0.853600025177002


In [52]:
# Retrieve the best model
# (best_model is reused from the tuning cell; the line below is kept commented
# out as the way to re-fetch it from the tuner.)
# best_model = tuner.get_best_models(num_models=1)[0]

# Save the best model to an .h5 file
best_model.save("simple_rnn_imdb_optimized.h5")

print("Best model saved as simple_rnn_imdb_optimized.h5")



Best model saved as simple_rnn_imdb_optimized.h5


In [None]:
# check to make sure model is mapping text correctly and performing as expected

import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Decode the original text of an IMDB review
def decode_review(encoded_review, word_index):
    """Turn an integer-encoded review back into a space-separated string.

    Token ids of 3 or lower are dropped. Every remaining id is shifted down
    by 3 and looked up in the inverted ``word_index``; ids without a match
    are rendered as "?".
    """
    # Invert word -> index into index -> word.
    index_to_word = dict(zip(word_index.values(), word_index.keys()))

    words = []
    for token_id in encoded_review:
        if token_id > 3:
            words.append(index_to_word.get(token_id - 3, "?"))
    return " ".join(words)

# Example: Validation split setup (adjust index as needed)
# NOTE(review): this assumes the validation rows used during the search are the
# trailing 20% of X_train_padded, in original order — confirm against the
# Keras Model.fit `validation_split` documentation.
validation_split = 0.2
num_val_samples = int(len(X_train_padded) * validation_split)
val_index = 0  # Pick a validation record (adjust as needed)

# Get the validation record and label
val_record = X_train_padded[-num_val_samples:][val_index]
val_label = y_train[-num_val_samples:][val_index]

# Reshape the record for prediction
# (adds a leading axis so a single review is passed as a batch of one)
val_record_reshaped = np.expand_dims(val_record, axis=0)

# Get the prediction score for the validation record
# NOTE(review): this is a fresh predict() call on best_model, not a score
# captured while training was running, despite the printed label.
validation_score = best_model.predict(val_record_reshaped)[0][0]
print(f"Validation score during training: {validation_score}")
print(f"Actual label: {val_label}")

# Decode the original review text
# X_train holds padded sequences at this point; decode_review skips ids <= 3,
# which removes the zero padding.
word_index = imdb.get_word_index()
original_text = decode_review(X_train[-num_val_samples:][val_index], word_index)
print(f"Original review text: {original_text}")

# Predict again to compare
new_prediction = best_model.predict(val_record_reshaped)[0][0]
print(f"New prediction score: {new_prediction}")

# Compare the scores
# NOTE(review): both scores come from the same model and the same input, so
# this only checks that predict() is repeatable — not that the model or the
# text mapping is correct.
if np.isclose(validation_score, new_prediction, atol=1e-6):
    print("Validation and new prediction scores match.")
else:
    print("Validation and new prediction scores do not match.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 671ms/step
Validation score during training: 0.9996938705444336
Actual label: 1
Original review text: antonio aka castle of blood of 1964 is a beautiful and incredibly haunting masterpiece of italian gothic horror and after mario la del aka black sunday of 1960 and roger pit and the of starring the great vincent price another must see that earned the wonderful barbara steele her more than deserved fame as the most important female horror icon in the history of motion pictures but not only is the beautiful and brilliant barbara steele one of my favorite actresses of all time the screenplay to was co written by no one less than the cinematic genius sergio who directed such ingenious spaghetti western as 1966 and the great silence 1968 number 2 in the field right after mario director antonio is one of the all time masters of gothic horror and castle of blood is his greatest achievement hardly another film works so brilliantly i