# RNN
We used an RNN, this time on the preprocessed text of the review.

In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


### Import libraries

In [2]:
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
import ast
from sklearn.model_selection import train_test_split

2024-06-19 16:57:28.980674: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [3]:
from tensorflow.keras.layers import Input, Dense,Embedding, Bidirectional, Attention, LSTM, Concatenate

In [4]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [5]:
import pandas as pd

### Check GPU

In [6]:
## CHECK GPU

from tensorflow.python.client import device_lib

def get_available_devices():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos]

print(get_available_devices())

['/device:CPU:0', '/device:GPU:0']


In [7]:
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


### Functions
The first function serves to convert a string into a Python object, the second to create text from tokens, and the last to split the dataset into training and testing sets.

In [8]:
#FUNCTIONS DEFINITION

#READ SPLIT TOKENS
def safe_literal_eval(val):
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError) as e:
        print(f"Error parsing value {val}: {e}")
        return val  # Return the original value if there is an error

#MERGE TOKENS AS A WHOLE TEXT
def join_tokens(token_list):
    if isinstance(token_list, list):
        return ' '.join(token_list)
    return token_list


#SPLIT TRAIN + TEST 80-20
def split_train_test(df, label_name):
    train, test= train_test_split(df, test_size=0.2, stratify=df[label_name],random_state=42)
    return train, test


### Function metrics
Here we have defined the various functions to calculate the metrics.

In [9]:
from tensorflow.keras import backend as K

def precision_m(y_true, y_pred):     
    y_pred = tf.nn.sigmoid(y_pred)  # Apply sigmoid to get probabilities   
    y_pred = K.round(y_pred)  # Convert probabilities to 0 or 1    
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))     
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))     
    precision = true_positives / (predicted_positives + K.epsilon())     
    return precision

# Custom metric for recall
def recall_m(y_true, y_pred):
    y_pred = tf.nn.sigmoid(y_pred)  # Apply sigmoid to get probabilities
    y_pred = K.round(y_pred)  # Convert probabilities to 0 or 1
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

# Custom metric for F1 score
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

def false_negative_rate_m(y_true, y_pred):
    # Apply sigmoid to get probabilities
    y_pred = tf.nn.sigmoid(y_pred)
    # Convert probabilities to binary predictions
    y_pred = K.round(y_pred)
    # Calculate False Negatives
    false_negatives = K.sum(K.cast(y_true, dtype='float32') * (1 - y_pred))
    # Calculate True Positives + False Negatives (total actual positives)
    possible_positives = K.sum(K.cast(y_true, dtype='float32'))
    # Calculate False Negative Rate
    fnr = false_negatives / (possible_positives + K.epsilon())
    return fnr

###  Read the dataset

In [10]:
CleanData=pd.read_csv("../Dataset/datiClean.csv")

In [11]:
CleanData=CleanData[["clean_review","is_spoiler"]]

In [12]:
CleanData

Unnamed: 0,clean_review,is_spoiler
0,"['oscar', 'year', 'shawshank', 'redemption', '...",True
1,"['shawshank', 'redemption', 'without', 'doubt'...",True
2,"['believe', 'film', 'best', 'story', 'ever', '...",True
3,"['yes', 'spoiler', 'film', 'emotional', 'impac...",True
4,"['heart', 'extraordinary', 'movie', 'brilliant...",True
...,...,...
573908,"['go', 'wise', 'fast', 'pure', 'entertainment'...",False
573909,"['well', 'shall', 'say', 'one', 'fun', 'rate',...",False
573910,"['go', 'best', 'movie', 'ever', 'seen', 'seen'...",False
573911,"['call', '1999', 'teenage', 'version', 'pulp',...",False


In [13]:
target = CleanData['is_spoiler']

In [14]:
CleanData["clean_review"] = CleanData["clean_review"].apply(safe_literal_eval)

In [15]:
CleanData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573913 entries, 0 to 573912
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   clean_review  573913 non-null  object
 1   is_spoiler    573913 non-null  bool  
dtypes: bool(1), object(1)
memory usage: 4.9+ MB


Create a dummy text thanks to the token

In [16]:
CleanData["whole__text"] = CleanData["clean_review"].apply(join_tokens)

In [17]:
features = CleanData['whole__text']
features.head()

0    oscar year shawshank redemption written direct...
1    shawshank redemption without doubt one brillia...
2    believe film best story ever told film tell ti...
3    yes spoiler film emotional impact find hard wr...
4    heart extraordinary movie brilliant indelible ...
Name: whole__text, dtype: object

map the Boolean values in values 0 and 1

In [18]:
CleanData['is_spoiler_numeric'] = np.where(CleanData['is_spoiler'] == True, 1, 0)

In [19]:
CleanData = CleanData.rename(columns={'is_spoiler_numeric': 'label','whole__text':'text'})

In [20]:
train, test = split_train_test(CleanData, 'label')

In [21]:
train = train[['text','label']]
test = test[['text','label']]

In [22]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 459130 entries, 94625 to 221631
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    459130 non-null  object
 1   label   459130 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 10.5+ MB


### Transform the dataset

Let's transform the pandas dataset into a TensorFlow dataset.

In [23]:
training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(train['text'].values, tf.string),
            tf.cast(train['label'].values, tf.int64)
        )
    )
)

In [24]:
test_dataset =( 
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(test['text'].values, tf.string),
            tf.cast(test['label'].values, tf.int64)
        )
    )
)

In [25]:
training_dataset

<_TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [26]:
for example, label in training_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

text:  b'maltese falcon film noir based novel title dashiell hammett directed john huston feature humphrey bogart private investigator sam spade mary astor femme fatale client gladys george peter lorre sydney greenstreet co star key supporting role story follows san francisco private detective dealing three unscrupulous adventurer competing obtain jewel encrusted falcon statuette sam spade hard boiled san francisco private eye unscrupulous next guy also adheres personal code honor office spade archer detective agency sweep miss wonderly offer large retainer sam partner mile archer protect someone named floyd thursby detective believe neither miss wonderly story believe money since archer saw first take case later evening shot death mysterious thursby miss wonderly real name turn brigid shaughnessey story continues sam also introduced effeminate joel cairo fat erudite kasper gutman turn brigid cairo gutman international scoundrel involved search foot high jewel encrusted statuette shape

## RNN MODEL
Next, we define the model with its various layers.

In [27]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64


This code prepares the training and test datasets for training machine learning models using TensorFlow

In [28]:
training_dataset = training_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [29]:
training_dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>


Once adapted, the encoder can be used to convert textual input data (such as reviews, titles, or any other text) into numerical tensors that can be processed by the model.

In [31]:
VOCAB_SIZE = 10000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(training_dataset.map(lambda text, label: text))

In [32]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'movie', 'film', 'one', 'like', 'character', 'time',
       'good', 'story', 'see', 'really', 'make', 'great', 'well', 'would',
       'scene', 'get', 'even', 'much'], dtype='<U16')

In [33]:

model = tf.keras.Sequential([
    
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=256,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [34]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
             metrics=['accuracy',recall_m,precision_m,f1_m,false_negative_rate_m])

In [35]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVe  (None, None)              0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, None, 256)         2560000   
                                                                 
 bidirectional (Bidirection  (None, 512)               1050624   
 al)                                                             
                                                                 
 dense (Dense)               (None, 256)               131328    
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 3742209 (14.28 MB)
Trainable params: 37422

In [36]:
history = model.fit(training_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [43]:
print("Accuracy values for each epoch:")
for epoch in range(len(history.history['accuracy'])):
    accuracy = history.history['accuracy'][epoch]
    precision = history.history['precision_m'][epoch]
    recall = history.history['recall_m'][epoch]
    f1_score = history.history['f1_m'][epoch]
    print(f"Epoch {epoch}: {accuracy} " f"recall: {recall} " f"precision: {precision} " f"f1-score: {f1_score} ")

Accuracy values for each epoch:
Epoch 0: 0.7632086873054504 recall: 0.2814015746116638 precision: 0.61093670129776 f1-score: 0.3615213632583618 
Epoch 1: 0.7793631553649902 recall: 0.3670152723789215 precision: 0.6765910387039185 f1-score: 0.46051326394081116 
Epoch 2: 0.7864766120910645 recall: 0.40077152848243713 precision: 0.6854116320610046 f1-score: 0.4926777184009552 
Epoch 3: 0.792590320110321 recall: 0.43183839321136475 precision: 0.6927367448806763 f1-score: 0.5193272829055786 
Epoch 4: 0.8007187247276306 recall: 0.4804462194442749 precision: 0.6993305683135986 f1-score: 0.5576528906822205 


In [38]:
training_accuracy_mean = np.mean(history.history['accuracy'])
training_precision_mean = np.mean(history.history['precision_m'])
training_recall_mean = np.mean(history.history['recall_m'])
training_f1_score_mean = np.mean(history.history['f1_m'])

print(f"    Media Training accuracy: {training_accuracy_mean}")
print(f"    Media Training precision: {training_precision_mean}")
print(f"    Media Training recall: {training_recall_mean}")
print(f"    Media Training F1 score: {training_f1_score_mean}")

    Media Training accuracy: 0.7844714999198914
    Media Training precision: 0.6730013370513916
    Media Training recall: 0.39229459762573243
    Media Training F1 score: 0.47833850383758547


### Result obtained
Accuracy values for each epoch:
Epoch 0: 0.7632086873054504 recall: 0.2814015746116638 precision: 0.61093670129776 f1-score: 0.3615213632583618 
Epoch 1: 0.7793631553649902 recall: 0.3670152723789215 precision: 0.6765910387039185 f1-score: 0.46051326394081116 
Epoch 2: 0.7864766120910645 recall: 0.40077152848243713 precision: 0.6854116320610046 f1-score: 0.4926777184009552 
Epoch 3: 0.792590320110321 recall: 0.43183839321136475 precision: 0.6927367448806763 f1-score: 0.5193272829055786 
Epoch 4: 0.8007187247276306 recall: 0.4804462194442749 precision: 0.6993305683135986 f1-score: 0.5576528906822205

Media Training accuracy: 0.7844714999198914
    Media Training precision: 0.6730013370513916
    Media Training recall: 0.39229459762573243
    Media Training F1 score: 0.47833850383758547


In [44]:
resultTest=model.evaluate(test_dataset)



In [46]:
with open("../Output/outputRNN.txt", "a") as f:
   for epoch in range(len(history.history['accuracy'])):
        accuracy = history.history['accuracy'][epoch]
        precision = history.history['precision_m'][epoch]
        recall = history.history['recall_m'][epoch]
        f1_score = history.history['f1_m'][epoch]
        print(f"Epoch {epoch}: {accuracy} " f"recall: {recall} " f"precision: {precision} " f"f1-score: {f1_score} ",file=f)
    

In [47]:
with open("../Output/outputRNN.txt", "a") as f:
    print("Test Result",file=f)
    print(f"  Loss: {resultTest[0]}, Accuracy: {resultTest[1]}, F1: {resultTest[4]}, Precision: {resultTest[3]}, Recall: {resultTest[2]}",file=f)

In [50]:
print(resultTest)

[0.5010486841201782, 0.7816836833953857, 0.4375437796115875, 0.5841323733329773, 0.4914727210998535, 0.5624560713768005]


In [49]:
print(f"  Loss: {resultTest[0]}, Accuracy: {resultTest[1]}, F1: {resultTest[4]}, Precision: {resultTest[3]}, Recall: {resultTest[2]}")

  Loss: 0.5010486841201782, Accuracy: 0.7816836833953857, F1: 0.4914727210998535, Precision: 0.5841323733329773, Recall: 0.4375437796115875


In [None]:
resultTest