In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Embedding, Flatten, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2025-03-05 11:13:21.954485: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
data = pd.read_csv('data/data.csv', sep=";", dtype=str)

In [5]:
data.head()

Unnamed: 0,ID,Title,Year,Genres,Director,Cast,RunningTime,Rating,Votes
0,tt0000009,Miss Jerry,1894,Romance,Alexander Black,"Blanche Bayliss, William Courtenay, Chauncey D...",45,5.3,222
1,tt0000147,The Corbett-Fitzsimmons Fight,1897,"Documentary,News,Sport",Enoch J. Rector,Unknown,100,5.2,553
2,tt0000502,Bohemios,1905,Unknown,Ricardo de Baños,"Antonio del Pozo, El Mochuelo",100,3.8,21
3,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",Charles Tait,"Elizabeth Tait, John Tait, Nicholas Brierley, ...",70,6.0,976
4,tt0000591,The Prodigal Son,1907,Drama,Michel Carré,"Georges Wague, Henri Gouget, Christiane Mandel...",90,5.6,31


In [6]:
data = data[["Genres", "Director", "Cast", "RunningTime", "Rating", "Votes"]]

# Convert multi-entry columns from string to list (assuming '|' is the separator)
for col in ["Genres", "Director", "Cast"]:
    data[col] = data[col].apply(lambda x: x.split('|') if isinstance(x, str) else [])

In [7]:
# 1. **Multi-Label Encoding (Genres, Director, Cast)**
mlb_genres = MultiLabelBinarizer()
mlb_director = MultiLabelBinarizer()
mlb_cast = MultiLabelBinarizer()

genres_encoded = mlb_genres.fit_transform(data["Genres"])
directors_encoded = mlb_director.fit_transform(data["Director"])
cast_encoded = mlb_cast.fit_transform(data["Cast"])

# Convert back to DataFrame
genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)
directors_df = pd.DataFrame(directors_encoded, columns=mlb_director.classes_)
cast_df = pd.DataFrame(cast_encoded, columns=mlb_cast.classes_)

In [None]:
# 2. **Normalize Numerical Columns**
scaler = MinMaxScaler()
data[["RunningTime", "Rating", "Votes"]] = scaler.fit_transform(data[["RunningTime", "Rating", "Votes"]])

In [None]:
# 3. **Tokenize and Encode Cast & Director (Optional Alternative)**
# You can use a tokenizer instead of MultiLabelBinarizer if using embeddings
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data["Director"] + data["Cast"])  # Fit on both columns

data["Director_tokenized"] = tokenizer.texts_to_sequences(data["Director"])
data["Cast_tokenized"] = tokenizer.texts_to_sequences(data["Cast"])

# Pad sequences to make them of equal length
max_length = max(data["Cast_tokenized"].apply(len))  # Find longest sequence
data["Cast_tokenized"] = pad_sequences(data["Cast_tokenized"], maxlen=max_length, padding='post').tolist()

In [None]:
# 4. **Final Data Concatenation**
df_final = pd.concat([data[["RunningTime", "Rating", "Votes"]], genres_df, directors_df, cast_df], axis=1)

# Convert DataFrame to NumPy array for LSTM training
X = np.array(df_final)

# Display processed data
print(X.shape)  # (num_samples, num_features)

In [None]:
# Assume `X_cast` and `X_director` are padded sequences, and `X_other` is a NumPy array of numerical features.
# Assume `num_cast` and `num_directors` are the vocab sizes from tokenizer.word_index + 1

# Define input layers
cast_input = Input(shape=(X_cast.shape[1],), name="Cast_Input")
director_input = Input(shape=(X_director.shape[1],), name="Director_Input")
other_features_input = Input(shape=(X_other.shape[1],), name="Other_Features")

# Embedding layers for categorical text-based features
embedding_dim = 32  # You can adjust this

cast_embedding = Embedding(input_dim=num_cast, output_dim=embedding_dim, mask_zero=True)(cast_input)
director_embedding = Embedding(input_dim=num_directors, output_dim=embedding_dim, mask_zero=True)(director_input)

# LSTM layers to process sequences
cast_lstm = LSTM(32)(cast_embedding)
director_lstm = LSTM(32)(director_embedding)

# Combine numerical features
combined = Concatenate()([cast_lstm, director_lstm, other_features_input])

# Dense layers for learning feature interactions
dense1 = Dense(64, activation='relu')(combined)
dense2 = Dense(32, activation='relu')(dense1)
output = Dense(1, activation='sigmoid')  # Adjust based on task (e.g., regression/classification)

# Build and compile model
model = Model(inputs=[cast_input, director_input, other_features_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()