In [1]:
# Set GPU selection before TensorFlow is imported in the next cell:
# order devices by PCI bus id and expose only device 0 to this kernel.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [2]:
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
import ast
from sklearn.model_selection import train_test_split

2024-05-30 22:59:14.251157: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-30 22:59:14.298689: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-30 22:59:14.298806: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-30 22:59:14.301025: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-30 22:59:14.314228: I tensorflow/core/platform/cpu_feature_guar

In [3]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [4]:
import pandas as pd

In [5]:
## CHECK GPU

from tensorflow.python.client import device_lib

def get_available_devices():
    """Return the names of every device reported by `device_lib.list_local_devices()`."""
    device_names = []
    for device in device_lib.list_local_devices():
        device_names.append(device.name)
    return device_names

print(get_available_devices())

['/device:CPU:0', '/device:GPU:0']


2024-05-30 22:59:16.255740: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /device:GPU:0 with 31134 MB memory:  -> device: 0, name: Tesla V100S-PCIE-32GB, pci bus id: 0000:af:00.0, compute capability: 7.0


In [6]:
# Cross-check through the public tf.config API: list the physical GPUs TensorFlow reports.
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [7]:
#FUNCTIONS DEFINITION

#READ SPLIT TOKENS
def safe_literal_eval(val):
    """Parse a string holding a Python literal (e.g. a stringified token list).

    Parameters
    ----------
    val : object
        Usually the text of one CSV cell such as "['a', 'b']". Values that are
        not strings are treated as already parsed.

    Returns
    -------
    object
        The parsed literal, or `val` itself when it is not a string or when
        parsing fails (the failure is printed).
    """
    # Non-string values (a list left by a previous run of the cell, NaN, ...)
    # have nothing to parse. Returning them quietly keeps the cell idempotent
    # instead of printing one error per row on a re-run.
    if not isinstance(val, str):
        return val
    try:
        return ast.literal_eval(val)
    # Besides ValueError/SyntaxError, literal_eval can raise TypeError (e.g. an
    # unhashable dict key), MemoryError or RecursionError on malformed input.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        print(f"Error parsing value {val}: {e}")
        return val  # Return the original value if there is an error

#MERGE TOKENS AS A WHOLE TEXT
def join_tokens(token_list):
    """Collapse a list of tokens into one space-separated string.

    Anything that is not a list is handed back unchanged.
    """
    return ' '.join(token_list) if isinstance(token_list, list) else token_list


#SPLIT TRAIN + TEST 80-20
def split_train_test(df, label_name, test_size=0.2, random_state=42):
    """Split `df` into train and test partitions, stratified on one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    label_name : str
        Column whose class proportions are preserved in both partitions.
    test_size : float, default 0.2
        Fraction of rows assigned to the test partition (80/20 by default).
    random_state : int, default 42
        Seed passed to `train_test_split` so the split is reproducible.

    Returns
    -------
    tuple
        `(train, test)` as returned by `train_test_split`.
    """
    train, test = train_test_split(
        df,
        test_size=test_size,
        stratify=df[label_name],
        random_state=random_state,
    )
    return train, test


In [42]:
from tensorflow.keras import backend as K

def precision_m(y_true, y_pred):
    """Precision of a binary classifier that outputs raw logits.

    Parameters
    ----------
    y_true : tensor
        Ground-truth labels (0/1); any numeric dtype.
    y_pred : tensor
        Raw logits with the same number of elements as `y_true`.

    Returns
    -------
    tensor
        Scalar: true positives / predicted positives, with `K.epsilon()` added
        to the denominator to avoid division by zero.
    """
    # Logits -> probabilities -> hard 0/1 predictions.
    y_pred = K.round(tf.nn.sigmoid(y_pred))
    # Flatten both tensors and give the labels the prediction dtype. Without
    # this, (batch,) labels against (batch, 1) predictions broadcast to
    # (batch, batch), and integer labels cannot be multiplied with floats.
    y_pred = tf.reshape(y_pred, [-1])
    y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), [-1])
    y_true = K.round(K.clip(y_true, 0, 1))
    true_positives = K.sum(y_true * y_pred)
    predicted_positives = K.sum(y_pred)
    return true_positives / (predicted_positives + K.epsilon())

# Custom metric for recall
def recall_m(y_true, y_pred):
    """Recall of a binary classifier that outputs raw logits.

    Parameters
    ----------
    y_true : tensor
        Ground-truth labels (0/1); any numeric dtype.
    y_pred : tensor
        Raw logits with the same number of elements as `y_true`.

    Returns
    -------
    tensor
        Scalar: true positives / actual positives, with `K.epsilon()` added to
        the denominator to avoid division by zero.
    """
    # Logits -> probabilities -> hard 0/1 predictions.
    y_pred = K.round(tf.nn.sigmoid(y_pred))
    # Flatten both tensors and give the labels the prediction dtype. Without
    # this, (batch,) labels against (batch, 1) predictions broadcast to
    # (batch, batch), and integer labels cannot be multiplied with floats.
    y_pred = tf.reshape(y_pred, [-1])
    y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), [-1])
    y_true = K.round(K.clip(y_true, 0, 1))
    true_positives = K.sum(y_true * y_pred)
    possible_positives = K.sum(y_true)
    return true_positives / (possible_positives + K.epsilon())

# Custom metric for F1 score
def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of `precision_m` and `recall_m` on the same inputs.

    `K.epsilon()` in the denominator keeps the result finite when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    half_f1 = (p * r) / (p + r + K.epsilon())
    return 2 * half_f1

def false_negative_rate_m(y_true, y_pred):
    """False-negative rate of a binary classifier that outputs raw logits.

    Parameters
    ----------
    y_true : tensor
        Ground-truth labels (0/1); any numeric dtype.
    y_pred : tensor
        Raw logits with the same number of elements as `y_true`.

    Returns
    -------
    tensor
        Scalar: false negatives / actual positives, with `K.epsilon()` added to
        the denominator to avoid division by zero.
    """
    # Logits -> probabilities -> hard 0/1 predictions.
    y_pred = K.round(tf.nn.sigmoid(y_pred))
    # Flatten both tensors and cast the labels to the prediction dtype, so
    # mismatched ranks cannot broadcast and non-float32 predictions still work.
    y_pred = tf.reshape(y_pred, [-1])
    y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), [-1])
    # Actual positives that the model predicted as negative.
    false_negatives = K.sum(y_true * (1 - y_pred))
    # True positives + false negatives (all actual positives).
    possible_positives = K.sum(y_true)
    return false_negatives / (possible_positives + K.epsilon())

In [9]:
# Load the review dataset from CSV; the path is relative to the notebook's working directory.
CleanData=pd.read_csv("../Dataset/datiClean.csv")

In [10]:
# Keep only the review tokens column and the spoiler flag; every other column is dropped.
CleanData=CleanData[["clean_review","is_spoiler"]]

In [11]:
# Preview the frame (pandas truncates the display of long frames).
CleanData

Unnamed: 0,clean_review,is_spoiler
0,"['oscar', 'year', 'shawshank', 'redemption', '...",True
1,"['shawshank', 'redemption', 'without', 'doubt'...",True
2,"['believe', 'film', 'best', 'story', 'ever', '...",True
3,"['yes', 'spoiler', 'film', 'emotional', 'impac...",True
4,"['heart', 'extraordinary', 'movie', 'brilliant...",True
...,...,...
573908,"['go', 'wise', 'fast', 'pure', 'entertainment'...",False
573909,"['well', 'shall', 'say', 'one', 'fun', 'rate',...",False
573910,"['go', 'best', 'movie', 'ever', 'seen', 'seen'...",False
573911,"['call', '1999', 'teenage', 'version', 'pulp',...",False


In [12]:
# Spoiler flag as a Series.
# NOTE(review): `target` is not referenced again in the visible cells — confirm it is still needed.
target = CleanData['is_spoiler']

In [13]:
# The CSV stores each token list as its string repr; parse it back into a Python list.
CleanData["clean_review"] = CleanData["clean_review"].apply(safe_literal_eval)

In [14]:
# Check column dtypes and non-null counts.
CleanData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573913 entries, 0 to 573912
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   clean_review  573913 non-null  object
 1   is_spoiler    573913 non-null  bool  
dtypes: bool(1), object(1)
memory usage: 4.9+ MB


In [15]:
# Re-join each token list into one space-separated string for the text vectorizer.
CleanData["whole__text"] = CleanData["clean_review"].apply(join_tokens)

In [16]:
# Joined review text as a Series; show the first rows as a sanity check.
# NOTE(review): `features` is not referenced again in the visible cells.
features = CleanData['whole__text']
features.head()

0    oscar year shawshank redemption written direct...
1    shawshank redemption without doubt one brillia...
2    believe film best story ever told film tell ti...
3    yes spoiler film emotional impact find hard wr...
4    heart extraordinary movie brilliant indelible ...
Name: whole__text, dtype: object

In [17]:
# Encode the boolean spoiler flag as an integer label (True -> 1, False -> 0).
CleanData['is_spoiler_numeric'] = CleanData['is_spoiler'].astype(int)

In [18]:
# Standardise column names: 'text' for the joined review, 'label' for the integer target.
CleanData = CleanData.rename(columns={'is_spoiler_numeric': 'label','whole__text':'text'})

In [19]:
# Stratified 80/20 split on the integer label (see split_train_test).
train, test = split_train_test(CleanData, 'label')

In [20]:
# Keep only the model input ('text') and the target ('label') in both partitions.
train = train[['text','label']]
test = test[['text','label']]

In [21]:
# Check the training partition's size, dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 459130 entries, 94625 to 221631
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    459130 non-null  object
 1   label   459130 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 10.5+ MB


In [22]:
# tf.data source of (review text, label) pairs taken from the training partition.
training_dataset = tf.data.Dataset.from_tensor_slices((
    tf.cast(train['text'].values, tf.string),
    tf.cast(train['label'].values, tf.int64),
))

2024-05-30 23:01:15.370802: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 31134 MB memory:  -> device: 0, name: Tesla V100S-PCIE-32GB, pci bus id: 0000:af:00.0, compute capability: 7.0


In [23]:
# tf.data source of (review text, label) pairs for evaluation.
# It must be built from the held-out `test` partition: building it from `train`
# would make model.evaluate() report training performance instead.
test_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(test['text'].values, tf.string),
            tf.cast(test['label'].values, tf.int64)
        )
    )
)

In [24]:
# Display the dataset repr to check its element_spec (scalar string, scalar int64).
training_dataset

<_TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [25]:
# Print one (text, label) element as a sanity check; `.numpy()` converts each tensor
# to its NumPy/Python value (the text is printed as bytes).
for example, label in training_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

text:  b'maltese falcon film noir based novel title dashiell hammett directed john huston feature humphrey bogart private investigator sam spade mary astor femme fatale client gladys george peter lorre sydney greenstreet co star key supporting role story follows san francisco private detective dealing three unscrupulous adventurer competing obtain jewel encrusted falcon statuette sam spade hard boiled san francisco private eye unscrupulous next guy also adheres personal code honor office spade archer detective agency sweep miss wonderly offer large retainer sam partner mile archer protect someone named floyd thursby detective believe neither miss wonderly story believe money since archer saw first take case later evening shot death mysterious thursby miss wonderly real name turn brigid shaughnessey story continues sam also introduced effeminate joel cairo fat erudite kasper gutman turn brigid cairo gutman international scoundrel involved search foot high jewel encrusted statuette shape

# RNN MODEL

In [26]:
# Input-pipeline constants used when the datasets are shuffled and batched.
BUFFER_SIZE = 10000  # size of the shuffle buffer passed to Dataset.shuffle
BATCH_SIZE = 64      # examples per batch

In [27]:
# Training data is shuffled, batched and prefetched; the evaluation data is only
# batched and prefetched, so its order is left untouched.
# NOTE: this rebinds both names, so re-running the cell would batch already-batched datasets.
training_dataset = training_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [28]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
  """Plot a metric per epoch, plus its validation curve when one was recorded.

  Parameters
  ----------
  history : object with a `history` dict (as returned by `model.fit`)
      Maps metric names to per-epoch value lists.
  metric : str
      Key of the training metric to plot, e.g. 'accuracy'.
  """
  val_key = 'val_' + metric
  plt.plot(history.history[metric])
  legend_labels = [metric]
  # The 'val_*' series exist only when fit() was given validation data;
  # skip the curve in that case instead of raising KeyError.
  if val_key in history.history:
    plt.plot(history.history[val_key], '')
    legend_labels.append(val_key)
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend(legend_labels)

In [29]:
# Text vectorizer whose vocabulary is capped at VOCAB_SIZE entries.
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
# Learn the vocabulary from the training texts only; the map drops the labels.
encoder.adapt(training_dataset.map(lambda text, label: text))

In [30]:
# Inspect the first learned vocabulary entries.
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'movie', 'film', 'one', 'like', 'character', 'time',
       'good', 'story', 'see', 'really', 'make', 'great', 'well', 'would',
       'scene', 'get', 'even', 'much'], dtype='<U14')

In [43]:
# Text classifier: TextVectorization -> Embedding -> bidirectional LSTM -> Dense -> single output.
model = tf.keras.Sequential([
    # Raw strings -> integer token ids, using the vocabulary adapted earlier.
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    # No activation: the model emits one raw logit per review.
    tf.keras.layers.Dense(1)
])

In [44]:
# The loss uses from_logits=True and the custom *_m metrics apply a sigmoid themselves,
# matching the model's activation-free output layer.
# NOTE(review): the built-in 'accuracy' metric is fed the raw logits — confirm it
# thresholds them as intended.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
             metrics=['accuracy',recall_m,precision_m,f1_m,false_negative_rate_m])

In [None]:
# Train for 3 epochs. No validation_data is passed, so `history.history` holds
# training metrics only (no 'val_*' keys).
history = model.fit(training_dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3

In [None]:
# Per-epoch training metrics recorded by fit(); keys are the metric function names.
print("Accuracy values for each epoch:")
for epoch in range(len(history.history['accuracy'])):
    accuracy = history.history['accuracy'][epoch]
    precision = history.history['precision_m'][epoch]
    recall = history.history['recall_m'][epoch]
    # Named `f1` rather than `f1_score` so the imported sklearn.metrics.f1_score
    # function is not overwritten by a float.
    f1 = history.history['f1_m'][epoch]
    print(f"Epoch {epoch}: {accuracy} " f"recall: {recall} " f"precision: {precision} " f"f1-score: {f1} ")

In [None]:
# Average each training metric over the recorded epochs.
training_accuracy_mean = np.mean(history.history['accuracy'])
training_precision_mean = np.mean(history.history['precision_m'])
training_recall_mean = np.mean(history.history['recall_m'])
training_f1_score_mean = np.mean(history.history['f1_m'])

# "Media" in the printed labels is Italian for "mean".
print(f"    Media Training accuracy: {training_accuracy_mean}")
print(f"    Media Training precision: {training_precision_mean}")
print(f"    Media Training recall: {training_recall_mean}")
print(f"    Media Training F1 score: {training_f1_score_mean}")

In [None]:
# Report the loss and the compiled metrics on `test_dataset`.
model.evaluate(test_dataset)