In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset
file_path = "/content/clean_movie_dataset.csv"  # Update with the actual file path
df = pd.read_csv(file_path)

In [3]:
# Ensure the 'plot' and 'averageRating' columns exist
df = df[['plot', 'averageRating']].dropna()

In [4]:
# Text Preprocessing
max_vocab_size = 10000  # Limit vocabulary size
max_length = 300  # Maximum sequence length for padding

tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df['plot'])

sequences = tokenizer.texts_to_sequences(df['plot'])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')


In [5]:
# Prepare target labels
ratings = np.array(df['averageRating'])

# Normalize ratings (optional, but helps with stability)
scaler = StandardScaler()
ratings = scaler.fit_transform(ratings.reshape(-1, 1)).flatten()

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, ratings, test_size=0.2, random_state=42)


In [6]:
# Build the model
embedding_dim = 128

from tensorflow.keras.regularizers import l2

model = Sequential([
    Embedding(input_dim=max_vocab_size, output_dim=embedding_dim, input_length=max_length),
    LSTM(64, return_sequences=True, kernel_regularizer=l2(0.01)),  # Add L2 regularization
    Dropout(0.3),
    LSTM(32, kernel_regularizer=l2(0.01)),  # Add L2 regularization
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(1)
])




In [7]:
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [8]:
# Train the model
epochs = 10
batch_size = 32

from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=3)  # Stop if val_loss doesn't improve for 3 epochs

history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    epochs=epochs, batch_size=batch_size, callbacks=[early_stopping])


Epoch 1/10
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 440ms/step - loss: 2.2095 - mae: 0.7656 - val_loss: 0.9640 - val_mae: 0.7424
Epoch 2/10
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 335ms/step - loss: 1.0187 - mae: 0.7725 - val_loss: 0.9431 - val_mae: 0.7394
Epoch 3/10
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 337ms/step - loss: 1.0095 - mae: 0.7610 - val_loss: 0.9602 - val_mae: 0.7438
Epoch 4/10
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 340ms/step - loss: 0.9519 - mae: 0.7229 - val_loss: 0.9696 - val_mae: 0.7480
Epoch 5/10
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 341ms/step - loss: 0.8749 - mae: 0.6870 - val_loss: 1.0431 - val_mae: 0.7720


In [9]:
# Evaluate the model
loss, mae = model.evaluate(X_test, y_test)
print(f"Test MAE: {mae}")

[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 72ms/step - loss: 1.0546 - mae: 0.7759
Test MAE: 0.771984875202179


In [10]:
# Function to predict rating from a new plot
def predict_rating(plot):
    sequence = tokenizer.texts_to_sequences([plot])
    padded = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
    predicted_rating = model.predict(padded)[0][0]
    return scaler.inverse_transform([[predicted_rating]])[0][0]  # Convert back to original scale


In [11]:
# Example prediction
sample_plot = "A young girl is chosen to participate in a deadly survival game broadcast to the nation."
predicted = predict_rating(sample_plot)
print(f"Predicted Rating: {predicted:.2f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 477ms/step
Predicted Rating: 6.23


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=3)  # Stop if val_loss doesn't improve for 3 epochs

history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    epochs=epochs, batch_size=batch_size, callbacks=[early_stopping])

Epoch 1/10
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 333ms/step - loss: 0.8113 - mae: 0.6564 - val_loss: 1.0823 - val_mae: 0.7933
Epoch 2/10
[1m179/194[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m4s[0m 321ms/step - loss: 0.8031 - mae: 0.6553