In [None]:
# Install Hugging Face Transformers (if not installed)
!pip install transformers

import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel
from transformers import BertTokenizer, TFBertModel
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter

# 1️⃣ Load Dataset
file_path = "/kaggle/input/imbd-dataset/IMBD.csv"
df = pd.read_csv(file_path)

# 2️⃣ Preprocess Dataset - only the description text and the genre label are
# used; rows missing either value are discarded.
df = df[['description', 'genre']].dropna()

# Split the comma-separated genre string into a list of trimmed names
# (e.g., "Drama, Action" → ["Drama", "Action"]).
df['genre'] = df['genre'].apply(
    lambda raw: [name.strip() for name in raw.split(',')]
)

# Tally how many movies carry each genre.
genre_counts = Counter(name for names in df['genre'] for name in names)

# Genres seen in fewer than `min_genre_count` movies are treated as too rare
# to keep.
min_genre_count = 500
valid_genres = {
    name for name, n_movies in genre_counts.items() if n_movies >= min_genre_count
}

# Strip the rare genres from every row. "Animation" is removed explicitly,
# regardless of how often it occurs.
df['genre'] = df['genre'].apply(
    lambda names: [name for name in names if name != "Animation" and name in valid_genres]
)

# Drop movies that are left with no genre at all and renumber the rows.
df = df[df['genre'].map(len).gt(0)].reset_index(drop=True)

# 3️⃣ Multi-Label Encoding of Genres
mlb = MultiLabelBinarizer()
# One 0/1 column per genre that occurs in df['genre'].
y = mlb.fit_transform(df['genre'])

# Show which genre each label column stands for.
print("Genres:", mlb.classes_)

# 4️⃣ Check Genre Imbalance
# Recount on the current df['genre'] lists so the chart matches the labels in `y`.
genre_counts = Counter(name for names in df['genre'] for name in names)

plt.figure(figsize=(12, 6))
plt.bar(genre_counts.keys(), genre_counts.values(), color='skyblue')
plt.title("Genre Distribution in Dataset")
plt.xlabel("Genres")
plt.ylabel("Count")
plt.xticks(rotation=90)  # rotate the genre names on the x axis
plt.show()

# 5️⃣ Tokenize Movie Descriptions using DistilBERT
# Load the tokenizer from the same checkpoint as the encoder created in
# build_model() ("distilbert-base-uncased") so vocabulary and special tokens
# are guaranteed to match. Unlike BertTokenizer, the DistilBERT tokenizer does
# not emit `token_type_ids`, which the model never consumes.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
max_length = 512  # Every description is padded/truncated to this many tokens

tokens = tokenizer(
    # The tokenizer only accepts strings; coerce any stray non-text values.
    df['description'].astype(str).tolist(),
    max_length=max_length,
    padding="max_length",   # fixed-width arrays, required by the Keras Input shapes
    truncation=True,        # cut descriptions longer than max_length
    return_tensors="np"
)

# 6️⃣ Prepare Train/Test Split
# Hold out 20% of all samples as the test set. Passing the three arrays in a
# single call keeps ids, masks and labels aligned row by row.
X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
    tokens["input_ids"], tokens["attention_mask"], y, test_size=0.2, random_state=42
)

# Now, split train into train and validation (20% of the remaining 80%,
# i.e. roughly 64% train / 16% validation / 20% test overall).
X_train_ids, X_val_ids, X_train_mask, X_val_mask, y_train, y_val = train_test_split(
    X_train_ids, X_train_mask, y_train, test_size=0.2, random_state=42
)

# Convert to dictionary format; the keys match the names of the Keras inputs.
X_train = {"input_ids": X_train_ids, "attention_mask": X_train_mask}
X_val = {"input_ids": X_val_ids, "attention_mask": X_val_mask}
X_test = {"input_ids": X_test_ids, "attention_mask": X_test_mask}

# Convert to TensorFlow Dataset
batch_size = 16

# model.fit() does not shuffle a tf.data.Dataset on its own, so without an
# explicit shuffle every epoch would see exactly the same batches in the same
# order. The buffer is capped to bound the extra memory it needs.
shuffle_buffer = max(1, min(len(y_train), 10_000))
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(shuffle_buffer, seed=42, reshuffle_each_iteration=True)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation and test data stay in their original order so predictions can be
# compared row by row against y_val / y_test.
val_dataset = (
    tf.data.Dataset.from_tensor_slices((X_val, y_val))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    tf.data.Dataset.from_tensor_slices((X_test, y_test))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)


from transformers import TFDistilBertModel, DistilBertTokenizer
import tensorflow as tf

def build_model(num_labels=None, max_length=512):
    """Build and compile the DistilBERT + BiLSTM multi-label genre classifier.

    The [CLS] hidden state of DistilBERT is concatenated with the output of a
    bidirectional LSTM that reads the full token sequence; two dense layers
    with dropout follow, and a sigmoid layer emits one independent probability
    per genre.

    Args:
        num_labels: Number of output units (genres). When ``None`` it is taken
            from the notebook-level label matrix ``y`` (``y.shape[1]``), which
            is what the original zero-argument call relied on.
        max_length: Token length of the ``input_ids`` / ``attention_mask``
            inputs. Must match the length used when tokenizing.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_ids, attention_mask]``
        and returning an array of shape ``(batch, num_labels)``.
    """
    if num_labels is None:
        # Backward-compatible default: size the output from the global labels.
        num_labels = y.shape[1]

    distilbert_model_name = "distilbert-base-uncased"
    distilbert_model = TFDistilBertModel.from_pretrained(distilbert_model_name)

    # Inputs
    input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

    # DistilBERT model (mask passed by keyword so argument order cannot be mixed up)
    distilbert_output = distilbert_model(input_ids, attention_mask=attention_mask)
    hidden_states = distilbert_output.last_hidden_state

    # Extract the hidden state of the [CLS] token (first token) from the output
    cls_token = hidden_states[:, 0, :]  # (None, 768)

    # Boolean padding mask for the recurrent layer. Without it the LSTM also
    # steps through every [PAD] position, so its final state is dominated by
    # padding rather than by the description itself.
    padding_mask = tf.keras.layers.Lambda(
        lambda mask: tf.cast(mask, tf.bool), name="padding_mask"
    )(attention_mask)

    # Bidirectional LSTM to capture the sequential context of the real tokens
    lstm_output = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(
        hidden_states, mask=padding_mask
    )

    # Combine both the LSTM output and CLS token
    combined_output = tf.keras.layers.concatenate([cls_token, lstm_output])

    # Additional layers
    x = tf.keras.layers.Dense(512, activation="relu")(combined_output)
    x = tf.keras.layers.Dropout(0.4)(x)
    x = tf.keras.layers.Dense(256, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.3)(x)

    # Output layer: multi-label, one independent sigmoid probability per genre
    output = tf.keras.layers.Dense(num_labels, activation="sigmoid")(x)

    # Model definition
    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

    # Compile the model.
    # The string "accuracy" would be resolved by Keras to categorical (argmax)
    # accuracy for an output with more than one unit, which is not meaningful
    # for multi-label targets. BinaryAccuracy scores every label independently;
    # it keeps the name "accuracy" so history keys and the two-value
    # `model.evaluate` result are unchanged.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss="binary_crossentropy",
        metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy", threshold=0.5)]
    )

    return model

# Seed TensorFlow before the model is created so the randomly initialised
# layers and the dropout masks start from the same state on every run
# (exact repeatability can still depend on the hardware/kernels in use).
tf.random.set_seed(42)

model = build_model()
model.summary()

# 8️⃣ Train Model
# Stop when validation loss has not improved for 2 consecutive epochs and
# roll the weights back to the best epoch seen.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)

history = model.fit(train_dataset, validation_data=val_dataset, epochs=10, callbacks=[early_stopping])

# 9️⃣ Evaluate Model
# evaluate() returns the loss followed by the single compiled metric.
loss, accuracy = model.evaluate(test_dataset)
print(f"Test Accuracy: {accuracy:.4f}")


# 1️⃣1️⃣ Function for Predicting Genres from New Descriptions
def predict_genres(description, threshold=0.3):
    """Return the genres the trained model assigns to a description.

    Relies on the notebook-level ``tokenizer``, ``max_length``, ``model`` and
    ``mlb`` objects.

    Args:
        description: Text to classify.
        threshold: A genre is reported only when its predicted probability is
            strictly greater than this value; adjust it to trade precision
            against recall.

    Returns:
        A list with the entries of ``mlb.classes_`` whose probability exceeds
        ``threshold``, in label order.
    """
    encoded = tokenizer(
        description,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="np",
    )
    model_inputs = {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
    }
    # predict() returns one row per input; take the first row.
    probabilities = model.predict(model_inputs)[0]

    selected = []
    for label_idx, prob in enumerate(probabilities):
        if prob > threshold:
            selected.append(mlb.classes_[label_idx])
    return selected

# Example Prediction: run predict_genres on one hand-written description
example_desc = (
    "A detective must solve a mystery in a futuristic city "
    "filled with crime and corruption."
)
print(
    "Predicted Genres:",
    predict_genres(example_desc),
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, hamming_loss
import numpy as np

def calculate_metrics(y_true, y_pred, threshold=0.5):
    """
    Compute multi-label classification metrics from predicted probabilities.

    Args:
        y_true: Ground truth labels (binary multi-label indicator matrix,
            one row per sample and one column per label). Any array-like.
        y_pred: Predicted probabilities with the same layout as ``y_true``.
            Any array-like.
        threshold: A label is predicted as 1 when its probability is strictly
            greater than this value.

    Returns:
        A dictionary with the keys:
            "accuracy":   subset accuracy (a sample counts only if every label matches),
            "precision":  micro-averaged precision,
            "recall":     micro-averaged recall,
            "f1_score":   micro-averaged F1,
            "f1_samples": F1 computed per sample and then averaged,
            "hamming":    Hamming loss (fraction of individual labels that are wrong).
    """
    # Accept plain lists as well as arrays; comparing a list to a float fails.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Binarize predictions based on the threshold
    y_pred_binarized = (y_pred > threshold).astype(int)

    # zero_division=0 keeps the value scikit-learn already falls back to when a
    # metric is undefined (e.g. nothing predicted positive) but makes the choice
    # explicit and avoids the UndefinedMetricWarning.
    accuracy = accuracy_score(y_true, y_pred_binarized)
    precision = precision_score(y_true, y_pred_binarized, average="micro", zero_division=0)
    recall = recall_score(y_true, y_pred_binarized, average="micro", zero_division=0)
    f1 = f1_score(y_true, y_pred_binarized, average="micro", zero_division=0)
    f1_samples = f1_score(y_true, y_pred_binarized, average="samples", zero_division=0)
    hamming = hamming_loss(y_true, y_pred_binarized)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "f1_samples": f1_samples,
        "hamming": hamming
    }

# Evaluate on test data: predict probabilities for the held-out set, then
# score them with calculate_metrics (default threshold).
test_inputs = {
    "input_ids": X_test["input_ids"],
    "attention_mask": X_test["attention_mask"],
}
y_test_pred = model.predict(test_inputs)
metrics = calculate_metrics(y_test, y_test_pred)

print("Evaluation Metrics on Test Set:")
# (printed label, key in the metrics dictionary), in display order
for label, key in (
    ("Accuracy", "accuracy"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1-Score", "f1_score"),
    ("F1-Score-Samples", "f1_samples"),
    ("Hamming Loss", "hamming"),
):
    print(f"{label}: {metrics[key]:.4f}")

Evaluation Metrics on Test Set:
Accuracy: 0.5804
Precision: 0.6643
Recall: 0.6107
F1-Score: 0.6364
F1-Score-Samples: 0.6190
Hamming Loss: 0.0265
