In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset
file_path = "/content/clean_movie_datasetV2.csv"  # Update with the actual file path
df = pd.read_csv(file_path)

In [3]:
# Ensure the 'plot' and 'averageRating' columns exist
df = df[['plot', 'averageRating']].dropna()

In [4]:
# Text Preprocessing
max_vocab_size = 10000  # Limit vocabulary size
max_length = 300  # Maximum sequence length for padding

tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df['plot'])

sequences = tokenizer.texts_to_sequences(df['plot'])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')


In [5]:
# Prepare target labels
ratings = np.array(df['averageRating'])

# Normalize ratings (optional, but helps with stability)
scaler = StandardScaler()
ratings = scaler.fit_transform(ratings.reshape(-1, 1)).flatten()

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, ratings, test_size=0.2, random_state=42)


In [7]:
# Build the model
embedding_dim = 128

from tensorflow.keras.regularizers import l2

model = Sequential([
    Embedding(input_dim=max_vocab_size, output_dim=embedding_dim),  # Removed input_length
    LSTM(64, return_sequences=True, kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    LSTM(32, kernel_regularizer=l2(0.01)),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(1)
])


In [10]:
import tensorflow as tf

# Define the threshold accuracy function
def threshold_accuracy(y_true, y_pred):
  """Calculates the accuracy based on a threshold."""
  threshold = 0  # Example threshold, adjust as needed
  return tf.reduce_mean(tf.cast(tf.math.greater_equal(y_pred, threshold), tf.float32))

# Now compile the model using the defined function
model.compile(optimizer='adam', loss='mse', metrics=['mae', threshold_accuracy])

In [11]:
# Train the model
epochs = 10
batch_size = 32

from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=3)  # Stop if val_loss doesn't improve for 3 epochs

history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    epochs=epochs, batch_size=batch_size, callbacks=[early_stopping])


Epoch 1/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 362ms/step - loss: 2.4965 - mae: 0.7604 - threshold_accuracy: 0.5076 - val_loss: 1.2091 - val_mae: 0.7868 - val_threshold_accuracy: 0.4417
Epoch 2/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 456ms/step - loss: 1.0473 - mae: 0.7382 - threshold_accuracy: 0.4653 - val_loss: 1.0732 - val_mae: 0.7885 - val_threshold_accuracy: 0.4167
Epoch 3/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 355ms/step - loss: 1.0638 - mae: 0.7654 - threshold_accuracy: 0.4069 - val_loss: 1.1705 - val_mae: 0.8064 - val_threshold_accuracy: 0.4167
Epoch 4/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 354ms/step - loss: 0.8104 - mae: 0.6360 - threshold_accuracy: 0.2755 - val_loss: 1.1759 - val_mae: 0.8302 - val_threshold_accuracy: 0.2460
Epoch 5/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 351ms/step - loss: 0.7321 - mae: 0.6104 - threshold_ac

In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Get predictions for the test set
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0).astype(int)  # Convert to binary predictions (0 or 1) based on a threshold (e.g., 0)

# Convert y_test to binary using the same threshold used for y_pred_binary
y_test_binary = (y_test > 0).astype(int) # Convert y_test to binary as well

precision = precision_score(y_test_binary, y_pred_binary) # Use y_test_binary instead of y_test
recall = recall_score(y_test_binary, y_pred_binary) # Use y_test_binary instead of y_test
f1 = f1_score(y_test_binary, y_pred_binary) # Use y_test_binary instead of y_test

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 155ms/step
Precision: 0.6068
Recall: 0.2752
F1 Score: 0.3787
