In [1]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, TFDistilBertModel
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras import mixed_precision
from sklearn.preprocessing import LabelEncoder

In [2]:
# Enable mixed precision training
policy = mixed_precision.Policy('mixed_float16')
# Changed set_policy to set_global_policy
mixed_precision.set_global_policy(policy)

In [3]:
# Load dataset
try:
    df = pd.read_csv('/content/clean_movie_datasetV3.csv')
except FileNotFoundError:
    print("Error: Dataset file not found. Please upload 'clean_movie_datasetV2.csv'.")
    exit()

In [4]:
# Preprocessing
df['plot'] = df['plot'].fillna('')
df['averageRating'] = df['averageRating'].astype(str)

In [5]:
# Encode the target labels
label_encoder = LabelEncoder()
df['averageRating'] = label_encoder.fit_transform(df['averageRating'])

In [6]:
# Tokenization
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [7]:
def tokenize_plot(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=256,
        truncation=True,
        padding='max_length',
        return_tensors='tf'
    )
    return encoding['input_ids'], encoding['attention_mask']

In [8]:
# Create a TensorFlow dataset


def create_dataset(df):
    input_ids = []
    attention_masks = []
    for plot in df['plot']:
        ids, masks = tokenize_plot(plot)
        input_ids.append(ids)
        attention_masks.append(masks)
    input_ids = tf.concat(input_ids, axis=0)
    attention_masks = tf.concat(attention_masks, axis=0)
    labels = tf.convert_to_tensor(df['averageRating'].values, dtype=tf.int32)
    return tf.data.Dataset.from_tensor_slices(({'input_ids': input_ids, 'attention_mask': attention_masks}, labels))


In [9]:
# Data Splitting
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = create_dataset(df_train).batch(
    32).prefetch(tf.data.experimental.AUTOTUNE)
test_dataset = create_dataset(df_test).batch(
    32).prefetch(tf.data.experimental.AUTOTUNE)

In [10]:
# Load DistilBERT model (with error handling)
try:
    distilbert_model = TFDistilBertModel.from_pretrained(
        'distilbert-base-uncased')
    print("DistilBERT model loaded successfully!")
except Exception as e:
    print(f"Error loading DistilBERT model: {e}")
    exit()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


DistilBERT model loaded successfully!


In [11]:
# Model Building
input_ids = tf.keras.layers.Input(
    shape=(256,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(
    shape=(256,), dtype=tf.int32, name='attention_mask')

In [13]:
# Use DistilBERT model


def distilbert_layer(inputs):

    return distilbert_model(input_ids=inputs[0], attention_mask=inputs[1])[0]
distilbert_output = tf.keras.layers.Lambda(
    distilbert_layer, output_shape=(256, 768))([input_ids, attention_mask])

In [14]:
# Use the [CLS] token representation
pooled_output = distilbert_output[:, 0, :]

x = tf.keras.layers.Dense(256, activation='relu')(pooled_output)
x = tf.keras.layers.Dropout(0.3)(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.Dropout(0.3)(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
output = tf.keras.layers.Dense(3, activation='softmax')(x)  # Use softmax for multi-class classification


In [15]:
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

In [16]:
# Optimizer and Compilation
optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-4, weight_decay=0.01)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [17]:
# Training with EarlyStopping
early_stopping = EarlyStopping(
    monitor='val_accuracy', patience=3, restore_best_weights=True)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6)

In [18]:
batch_size = 32
epochs = 10

history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=epochs,
    callbacks=[early_stopping, reduce_lr]
)

Epoch 1/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 403ms/step - accuracy: 0.4237 - loss: 1.0666 - val_accuracy: 0.5506 - val_loss: 0.9540 - learning_rate: 1.0000e-04
Epoch 2/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 312ms/step - accuracy: 0.5773 - loss: 0.9283 - val_accuracy: 0.5506 - val_loss: 0.9504 - learning_rate: 1.0000e-04
Epoch 3/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 313ms/step - accuracy: 0.5793 - loss: 0.9246 - val_accuracy: 0.5506 - val_loss: 0.9452 - learning_rate: 1.0000e-04
Epoch 4/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 313ms/step - accuracy: 0.5823 - loss: 0.9095 - val_accuracy: 0.5496 - val_loss: 0.9457 - learning_rate: 1.0000e-04


In [19]:
# Evaluate the model
loss, accuracy = model.evaluate(test_dataset, verbose=0)
print(f"Accuracy on test set: {accuracy}")

Accuracy on test set: 0.550632894039154


In [20]:
# Predict and calculate additional metrics
y_pred_prob = model.predict(test_dataset)
y_pred = np.argmax(y_pred_prob, axis=1)

precision = precision_score(df_test['averageRating'], y_pred, average='weighted')
recall = recall_score(df_test['averageRating'], y_pred, average='weighted')
f1 = f1_score(df_test['averageRating'], y_pred, average='weighted')

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 380ms/step
Precision: 0.30319660310847624
Recall: 0.5506329113924051
F1 Score: 0.391061741152157


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
