# RNN
We train a recurrent neural network (a bidirectional LSTM), this time on the preprocessed text of the reviews.

In [None]:
# Set the CUDA environment variables for this kernel: order devices by PCI bus ID
# and make only device 0 visible.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

### Import libraries

In [None]:
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
import ast
from sklearn.model_selection import train_test_split

In [None]:
from tensorflow.keras.layers import Input, Dense,Embedding, Bidirectional, Attention, LSTM, Concatenate

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [None]:
import pandas as pd

### Check GPU

In [None]:
## CHECK GPU

from tensorflow.python.client import device_lib

def get_available_devices():
    """Return the names of all devices reported by ``device_lib.list_local_devices()``."""
    device_names = []
    for device_proto in device_lib.list_local_devices():
        device_names.append(device_proto.name)
    return device_names

print(get_available_devices())

In [None]:
# Cross-check through the public tf.config API: list the physical devices of type 'GPU'.
print(tf.config.list_physical_devices('GPU'))

### Functions
This section defines three helpers: the first parses the string representation of a Python object (here, a token list) back into that object, the second joins a list of tokens into a single text string, and the last splits the dataset into training and test sets.

In [None]:
#FUNCTIONS DEFINITION

#READ SPLIT TOKENS
def safe_literal_eval(val):
    """Parse the string form of a Python literal back into the object it describes.

    Args:
        val: Value to parse, normally the ``str`` representation of a token list.

    Returns:
        The parsed object, or ``val`` unchanged when it cannot be parsed
        (an error message is printed in that case).
    """
    try:
        return ast.literal_eval(val)
    # Besides ValueError/SyntaxError, literal_eval raises TypeError for inputs such as
    # "{[1]: 2}" (unhashable key); without catching it a single bad row would abort
    # the whole DataFrame.apply call.
    except (ValueError, SyntaxError, TypeError) as e:
        print(f"Error parsing value {val}: {e}")
        return val  # Return the original value if there is an error

#MERGE TOKENS INTO A SINGLE TEXT
def join_tokens(token_list):
    """Join a list of tokens into one space-separated string.

    Any value that is not a ``list`` is handed back untouched.
    """
    if not isinstance(token_list, list):
        return token_list
    return ' '.join(token_list)


#SPLIT TRAIN + TEST (80-20 BY DEFAULT)
def split_train_test(df, label_name, test_size=0.2, random_state=42):
    """Split ``df`` into stratified train and test sets.

    Args:
        df: DataFrame to split.
        label_name: Name of the column used for stratification.
        test_size: Fraction of rows assigned to the test set (default 0.2).
        random_state: Seed passed to ``train_test_split`` (default 42).

    Returns:
        A ``(train, test)`` tuple of DataFrames.
    """
    train, test = train_test_split(
        df,
        test_size=test_size,
        stratify=df[label_name],
        random_state=random_state,
    )
    return train, test


###  Read the dataset

In [None]:
# Load the dataset from a CSV file; the relative path is resolved against the current working directory.
CleanData=pd.read_csv("../Dataset/datiClean.csv")

In [None]:
# Keep only the review column and the spoiler flag. The explicit .copy() makes CleanData an
# independent frame, so the column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning.
CleanData = CleanData[["clean_review", "is_spoiler"]].copy()

In [None]:
CleanData

In [None]:
# Keep a reference to the raw label column.
# NOTE(review): `target` is not referenced again in the visible notebook; confirm it is still needed.
target = CleanData['is_spoiler']

In [None]:
# Parse each stored review from its string form back into a Python object with
# safe_literal_eval; values that cannot be parsed are kept unchanged.
CleanData["clean_review"] = CleanData["clean_review"].apply(safe_literal_eval)

In [None]:
CleanData.info()

### Compute the vocabulary

In [None]:
from collections import Counter

In [None]:
# Accumulator for token frequencies; it is filled by the loop over the reviews in the next cell.
token_counts = Counter()

In [None]:
# Count tokens review by review. Rows that safe_literal_eval could not parse are not lists:
# updating the Counter with a string would count single characters, and a non-iterable
# value (e.g. a NaN float) would raise TypeError, so only real token lists are counted.
for token_list in CleanData["clean_review"]:
    if isinstance(token_list, list):
        token_counts.update(token_list)

In [None]:
token_counts.most_common(5)

In [None]:
# Minimum number of occurrences a token needs in order to stay in the vocabulary.
UNK_CUTOFF = 5

# Drop tokens that occur fewer than UNK_CUTOFF times.
# NOTE(review): `vocab` is rebound later from encoder.get_vocabulary(); in the visible cells
# this filtered dict is only inspected with len(). Confirm whether it should drive the encoder.
vocab = {token: count for token, count in token_counts.items() if count >= UNK_CUTOFF}

In [None]:
len(token_counts)

In [None]:
len(vocab)

### Divide in Train and Test

Rebuild a single text string for each review by joining its tokens with spaces.

In [None]:
# Join each review's tokens with spaces into one string, stored in the "whole__text" column;
# join_tokens returns non-list values unchanged.
CleanData["whole__text"] = CleanData["clean_review"].apply(join_tokens)

In [None]:
# Preview the first joined texts.
# NOTE(review): `features` is not used by the later cells visible here; confirm it is needed.
features = CleanData['whole__text']
features.head()

Map the Boolean `is_spoiler` values to the integers 0 and 1.

In [None]:
# Encode the spoiler flag as an integer: 1 where is_spoiler equals True, 0 everywhere else.
is_spoiler_mask = CleanData['is_spoiler'].eq(True)
CleanData['is_spoiler_numeric'] = np.where(is_spoiler_mask, 1, 0)

In [None]:
# Rename the columns consumed downstream to 'label' and 'text'.
column_renames = {'is_spoiler_numeric': 'label', 'whole__text': 'text'}
CleanData = CleanData.rename(columns=column_renames)

In [None]:
# Stratified train/test split on the 'label' column (see split_train_test for the split ratio and seed).
train, test = split_train_test(CleanData, 'label')

In [None]:
# Keep only the model input ('text') and its target ('label') in both splits.
model_columns = ['text', 'label']
train = train[model_columns]
test = test[model_columns]

In [None]:
train.info()

### Transform the dataset

Let's transform the pandas dataset into a TensorFlow dataset.

In [None]:
# Build the training tf.data.Dataset of (text, label) pairs from the pandas columns.
train_texts = tf.cast(train['text'].values, tf.string)
train_labels = tf.cast(train['label'].values, tf.int64)
training_dataset = tf.data.Dataset.from_tensor_slices((train_texts, train_labels))

In [None]:
# Build the test tf.data.Dataset of (text, label) pairs from the pandas columns.
test_texts = tf.cast(test['text'].values, tf.string)
test_labels = tf.cast(test['label'].values, tf.int64)
test_dataset = tf.data.Dataset.from_tensor_slices((test_texts, test_labels))

In [None]:
training_dataset

In [None]:
# Sanity check: print the first (text, label) element of the training dataset.
for example, label in training_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

## RNN MODEL
Next, we define the model with its various layers.

In [None]:
BUFFER_SIZE = 1000  # buffer size passed to Dataset.shuffle for the training set
BATCH_SIZE = 64  # number of examples per batch for both the training and test datasets


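Shuffle the training set, then split both the training and test sets into batches of `BATCH_SIZE` examples and prefetch them so that TensorFlow can prepare the next batch while the current one is being processed. The test set is not shuffled.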

In [None]:
# Training data: shuffle, then batch and prefetch. Test data: batch and prefetch only.
# NOTE(review): both names are rebound to their batched versions, so running this cell a
# second time would batch the already-batched datasets again.
shuffled_training = training_dataset.shuffle(BUFFER_SIZE)
training_dataset = shuffled_training.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

batched_test = test_dataset.batch(BATCH_SIZE)
test_dataset = batched_test.prefetch(tf.data.AUTOTUNE)

In [None]:
training_dataset


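We create a `TextVectorization` layer whose vocabulary is capped at `VOCAB_SIZE` entries and adapt it on the training text only. Once adapted, the encoder converts raw text into integer token-index tensors that can be processed by the model.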

In [None]:
VOCAB_SIZE = 10000


def _text_only(text, label):
    """Drop the label so that the encoder is adapted on the review text alone."""
    return text


# Learn the vocabulary (at most VOCAB_SIZE entries) from the training text.
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(training_dataset.map(_text_only))

In [None]:
# Inspect the first 20 entries of the vocabulary learned by the encoder.
# NOTE(review): this rebinds `vocab`, replacing the frequency-filtered dict built earlier.
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

In [None]:

# Pipeline: raw text -> token ids (encoder) -> embeddings -> bidirectional LSTM -> dense head -> 1 logit.
keras_layers = tf.keras.layers

embedding_layer = keras_layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=256,
    # Use masking to handle the variable sequence lengths
    mask_zero=True,
)
recurrent_layer = keras_layers.Bidirectional(keras_layers.LSTM(256))
hidden_layer = keras_layers.Dense(256, activation='relu')
# No activation on the last layer: the model outputs a raw logit.
logit_layer = keras_layers.Dense(1)

model = tf.keras.Sequential([
    encoder,
    embedding_layer,
    recurrent_layer,
    hidden_layer,
    logit_layer,
])

In [None]:
# The model's last layer has no activation, so it outputs logits (hence from_logits=True).
# The metrics must therefore threshold the predictions at 0.0 (sigmoid(0) == 0.5) rather than
# at the default 0.5, which on logits corresponds to a probability of about 0.62.
# Names and order are kept ('accuracy', 'recall', 'precision') so that the history keys and
# the positions in the evaluate() result are unchanged.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(1e-4, weight_decay=0.02),
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0),
        tf.keras.metrics.Recall(name='recall', thresholds=0.0),
        tf.keras.metrics.Precision(name='precision', thresholds=0.0),
    ],
)

In [None]:
model.summary()

In [None]:
# Train for 8 epochs on the batched training set. No validation data is passed, so
# `history` contains training metrics only.
history = model.fit(training_dataset, epochs=8)

## Results

In [None]:
# Per-epoch training metrics. F1 is derived from precision and recall because it is not
# tracked as a Keras metric.
print("Accuracy values for each epoch:")
epoch_metrics = zip(
    history.history['accuracy'],
    history.history['precision'],
    history.history['recall'],
)
for epoch, (accuracy, precision, recall) in enumerate(epoch_metrics):
    # precision + recall can be 0 (e.g. no positive prediction in an epoch); report F1 = 0
    # instead of raising ZeroDivisionError.
    pr_sum = precision + recall
    # Stored as epoch_f1 so that the imported sklearn `f1_score` is not overwritten by a float.
    epoch_f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0.0
    print(f"Epoch {epoch}: {accuracy} " f"recall: {recall} " f"precision: {precision} " f"f1-score: {epoch_f1} ")

In [None]:
# Average each training metric over all epochs.
epoch_history = history.history
training_accuracy_mean, training_precision_mean, training_recall_mean = (
    np.mean(epoch_history[metric_key]) for metric_key in ('accuracy', 'precision', 'recall')
)

metric_means = (
    ('accuracy', training_accuracy_mean),
    ('precision', training_precision_mean),
    ('recall', training_recall_mean),
)
for metric_label, metric_mean in metric_means:
    print(f"    Media Training {metric_label}: {metric_mean}")


### Result obtained



In [None]:
# Evaluate on the test set. The later cells index the result as
# [loss, accuracy, recall, precision], i.e. the loss followed by the metrics in compile() order.
resultTest=model.evaluate(test_dataset)

Save the results to a file.

In [None]:
# Append the per-epoch training metrics to the results file.
with open("../Output/outputRNN.txt", "a") as f:
    for epoch in range(len(history.history['accuracy'])):
        accuracy = history.history['accuracy'][epoch]
        precision = history.history['precision'][epoch]
        recall = history.history['recall'][epoch]
        # precision + recall can be 0 (e.g. no positive prediction in an epoch); write F1 = 0
        # instead of raising ZeroDivisionError.
        pr_sum = precision + recall
        # Stored as epoch_f1 so that the imported sklearn `f1_score` is not overwritten by a float.
        epoch_f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0.0
        print(f"Epoch {epoch}: {accuracy} " f"recall: {recall} " f"precision: {precision} " f"f1-score: {epoch_f1} ", file=f)
    

In [None]:
# Append the test-set metrics to the results file.
with open("../Output/outputRNN.txt", "a") as f:
    print("Test Result", file=f)
    # resultTest is indexed as [loss, accuracy, recall, precision].
    precision = resultTest[3]
    recall = resultTest[2]
    # precision + recall can be 0 (no true positives); use F1 = 0 instead of raising
    # ZeroDivisionError.
    pr_sum = precision + recall
    # The name f1_score is kept because a later cell prints this variable
    # (note that it rebinds the sklearn `f1_score` import to a float).
    f1_score = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0.0
    print(f"  Loss: {resultTest[0]}, Accuracy: {resultTest[1]}, F1: {f1_score}, Precision: {resultTest[3]}, Recall: {resultTest[2]}", file=f)

In [None]:
print(resultTest)

In [None]:
# Echo the test metrics in the notebook. `f1_score` is the value computed in the cell that
# writes the test results to file, so that cell must have been run first.
print(f"  Loss: {resultTest[0]}, Accuracy: {resultTest[1]}, F1: {f1_score}, Precision: {resultTest[3]}, Recall: {resultTest[2]}")

In [None]:
resultTest