# RNN
We used an RNN, this time on the preprocessed text of the review.

In [37]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


### Import libraries

In [38]:
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
import ast
from sklearn.model_selection import train_test_split

In [76]:
from tensorflow.keras.layers import Input, Dense,Embedding, Bidirectional, Attention, LSTM, Concatenate

In [40]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [41]:
import pandas as pd

### Check GPU

In [42]:
## CHECK GPU

from tensorflow.python.client import device_lib

def get_available_devices():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos]

print(get_available_devices())

['/device:CPU:0', '/device:GPU:0']


In [43]:
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


### Functions
The first function serves to convert a string into a Python object, the second to create text from tokens, and the last to split the dataset into training and testing sets.

In [44]:
#FUNCTIONS DEFINITION

#READ SPLIT TOKENS
def safe_literal_eval(val):
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError) as e:
        print(f"Error parsing value {val}: {e}")
        return val  # Return the original value if there is an error

#MERGE TOKENS AS A WHOLE TEXT
def join_tokens(token_list):
    if isinstance(token_list, list):
        return ' '.join(token_list)
    return token_list


#SPLIT TRAIN + TEST 80-20
def split_train_test(df, label_name):
    train, test= train_test_split(df, test_size=0.2, stratify=df[label_name],random_state=42)
    return train, test


### Function metrics
Here we have defined the various functions to calculate the metrics.

In [45]:
from tensorflow.keras import backend as K

def precision_m(y_true, y_pred):     
    y_pred = tf.nn.sigmoid(y_pred)  # Apply sigmoid to get probabilities   
    y_pred = K.round(y_pred)  # Convert probabilities to 0 or 1    
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))     
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))     
    precision = true_positives / (predicted_positives + K.epsilon())     
    return precision

# Custom metric for recall
def recall_m(y_true, y_pred):
    y_pred = tf.nn.sigmoid(y_pred)  # Apply sigmoid to get probabilities
    y_pred = K.round(y_pred)  # Convert probabilities to 0 or 1
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

# Custom metric for F1 score
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

def false_negative_rate_m(y_true, y_pred):
    # Apply sigmoid to get probabilities
    y_pred = tf.nn.sigmoid(y_pred)
    # Convert probabilities to binary predictions
    y_pred = K.round(y_pred)
    # Calculate False Negatives
    false_negatives = K.sum(K.cast(y_true, dtype='float32') * (1 - y_pred))
    # Calculate True Positives + False Negatives (total actual positives)
    possible_positives = K.sum(K.cast(y_true, dtype='float32'))
    # Calculate False Negative Rate
    fnr = false_negatives / (possible_positives + K.epsilon())
    return fnr

###  Read the dataset

In [46]:
CleanData=pd.read_csv("../Dataset/datiClean.csv")

In [47]:
CleanData=CleanData[["clean_review","is_spoiler"]]

In [48]:
CleanData

Unnamed: 0,clean_review,is_spoiler
0,"['oscar', 'year', 'shawshank', 'redemption', '...",True
1,"['shawshank', 'redemption', 'without', 'doubt'...",True
2,"['believe', 'film', 'best', 'story', 'ever', '...",True
3,"['yes', 'spoiler', 'film', 'emotional', 'impac...",True
4,"['heart', 'extraordinary', 'movie', 'brilliant...",True
...,...,...
573908,"['go', 'wise', 'fast', 'pure', 'entertainment'...",False
573909,"['well', 'shall', 'say', 'one', 'fun', 'rate',...",False
573910,"['go', 'best', 'movie', 'ever', 'seen', 'seen'...",False
573911,"['call', '1999', 'teenage', 'version', 'pulp',...",False


In [49]:
target = CleanData['is_spoiler']

In [50]:
CleanData["clean_review"] = CleanData["clean_review"].apply(safe_literal_eval)

In [51]:
CleanData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573913 entries, 0 to 573912
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   clean_review  573913 non-null  object
 1   is_spoiler    573913 non-null  bool  
dtypes: bool(1), object(1)
memory usage: 4.9+ MB


Create a dummy text thanks to the token

In [52]:
CleanData["whole__text"] = CleanData["clean_review"].apply(join_tokens)

In [53]:
features = CleanData['whole__text']
features.head()

0    oscar year shawshank redemption written direct...
1    shawshank redemption without doubt one brillia...
2    believe film best story ever told film tell ti...
3    yes spoiler film emotional impact find hard wr...
4    heart extraordinary movie brilliant indelible ...
Name: whole__text, dtype: object

map the Boolean values in values 0 and 1

In [54]:
CleanData['is_spoiler_numeric'] = np.where(CleanData['is_spoiler'] == True, 1, 0)

In [55]:
CleanData = CleanData.rename(columns={'is_spoiler_numeric': 'label','whole__text':'text'})

In [56]:
train, test = split_train_test(CleanData, 'label')

In [57]:
train = train[['text','label']]
test = test[['text','label']]

In [58]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 459130 entries, 94625 to 221631
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    459130 non-null  object
 1   label   459130 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 10.5+ MB


### Transform the dataset

Let's transform the pandas dataset into a TensorFlow dataset.

In [59]:
training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(train['text'].values, tf.string),
            tf.cast(train['label'].values, tf.int64)
        )
    )
)

In [60]:
test_dataset =( 
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(test['text'].values, tf.string),
            tf.cast(test['label'].values, tf.int64)
        )
    )
)

In [61]:
training_dataset

<_TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [62]:
for example, label in training_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

text:  b'maltese falcon film noir based novel title dashiell hammett directed john huston feature humphrey bogart private investigator sam spade mary astor femme fatale client gladys george peter lorre sydney greenstreet co star key supporting role story follows san francisco private detective dealing three unscrupulous adventurer competing obtain jewel encrusted falcon statuette sam spade hard boiled san francisco private eye unscrupulous next guy also adheres personal code honor office spade archer detective agency sweep miss wonderly offer large retainer sam partner mile archer protect someone named floyd thursby detective believe neither miss wonderly story believe money since archer saw first take case later evening shot death mysterious thursby miss wonderly real name turn brigid shaughnessey story continues sam also introduced effeminate joel cairo fat erudite kasper gutman turn brigid cairo gutman international scoundrel involved search foot high jewel encrusted statuette shape

## RNN MODEL
Next, we define the model with its various layers.

In [63]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64


This code prepares the training and test datasets for training machine learning models using TensorFlow

In [64]:
training_dataset = training_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [65]:
training_dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [66]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])


Once adapted, the encoder can be used to convert textual input data (such as reviews, titles, or any other text) into numerical tensors that can be processed by the model.

In [67]:
VOCAB_SIZE = 10000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(training_dataset.map(lambda text, label: text))

In [68]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'movie', 'film', 'one', 'like', 'character', 'time',
       'good', 'story', 'see', 'really', 'make', 'great', 'well', 'would',
       'scene', 'get', 'even', 'much'], dtype='<U16')

In [108]:

model = tf.keras.Sequential([
    
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=256,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [109]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
             metrics=['accuracy',recall_m,precision_m,f1_m,false_negative_rate_m])

In [110]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (Text  (None, None)              0         
 Vectorization)                                                  
                                                                 
 embedding_15 (Embedding)    (None, None, 256)         2560000   
                                                                 
 bidirectional_11 (Bidirect  (None, 512)               1050624   
 ional)                                                          
                                                                 
 dense_22 (Dense)            (None, 256)               131328    
                                                                 
 dense_23 (Dense)            (None, 1)                 257       
                                                                 
Total params: 3742209 (14.28 MB)
Trainable params: 374

In [None]:
history = model.fit(training_dataset, epochs=3)

Epoch 1/3
  17/7174 [..............................] - ETA: 15:33 - loss: 0.6840 - accuracy: 0.7381 - recall_m: 0.0361 - precision_m: 0.0470 - f1_m: 0.0366 - false_negative_rate_m: 0.9639

In [None]:
print("Accuracy values for each epoch:")
for epoch in range(len(history.history['accuracy'])):
    accuracy = history.history['accuracy'][epoch]
    precision = history.history['precision_m'][epoch]
    recall = history.history['recall_m'][epoch]
    f1_score = history.history['f1_m'][epoch]
    print(f"Epoch {epoch}: {accuracy} " f"recall: {recall} " f"precision: {precision} " f"f1-score: {f1_score} ")

In [42]:
training_accuracy_mean = np.mean(history.history['accuracy'])
training_precision_mean = np.mean(history.history['precision_m'])
training_recall_mean = np.mean(history.history['recall_m'])
training_f1_score_mean = np.mean(history.history['f1_m'])

print(f"    Media Training accuracy: {training_accuracy_mean}")
print(f"    Media Training precision: {training_precision_mean}")
print(f"    Media Training recall: {training_recall_mean}")
print(f"    Media Training F1 score: {training_f1_score_mean}")

    Media Training accuracy: 0.7775285243988037
    Media Training precision: 0.662295420964559
    Media Training recall: 0.35786786675453186
    Media Training F1 score: 0.44650845726331073


### Result obtained
Accuracy values for each epoch:
Epoch 0: 0.7552915215492249 recall: 0.247200608253479 precision: 0.577804684638977 f1-score: 0.32188791036605835 
Epoch 1: 0.7683684229850769 recall: 0.30710819363594055 precision: 0.664779543876648 f1-score: 0.40216726064682007 
Epoch 2: 0.7724130153656006 recall: 0.315212219953537 precision: 0.6743733882904053 f1-score: 0.41411998867988586 

 Media Training accuracy: 0.7653576532999674
    Media Training precision: 0.6389858722686768
    Media Training recall: 0.28984034061431885
    Media Training F1 score: 0.3793917198975881

In [45]:
resultTest=model.evaluate(test_dataset)



In [50]:
with open("outputRNN.txt", "a") as f:
    print(f"    Media Training accuracy: {training_accuracy_mean}", file=f)
    print(f"    Media Training precision: {training_precision_mean}", file=f)
    print(f"    Media Training recall: {training_recall_mean}", file=f)
    print(f"    Media Training F1 score: {training_f1_score_mean}", file=f)
    

In [57]:
with open("outputRNN.txt", "a") as f:
    print("Test Result",file=f)
    print(f"  Loss: {resultTest[0]}, Accuracy: {resultTest[1]}, F1: {resultTest[4]}, Precision: {resultTest[3]}, Recall: {resultTest[2]}",file=f)

  Loss: 0.4872026741504669, Accuracy: 0.7834609746932983, F1: 0.5242252945899963, Precision: 0.5631215572357178, Recall: 0.5051186084747314


In [None]:
print(f"Eval Loss: {log_history['eval_loss']}, Accuracy: {log_history['eval_accuracy']}, F1: {log_history['eval_f1']}, Precision: {log_history['eval_precision']}, Recall: {log_history['eval_recall']}",file=f)

In [46]:
resultTest

[0.4872026741504669, 0.7834609746932983, 0.5051186084747314, 0.5631215572357178, 0.5242252945899963, 0.49488136172294617]
