In [1]:
from typing import Dict
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

from helpers.utils import MovieMetadata

with open('movie_metadata.pkl', 'rb') as f:
    data_dict: Dict[str, MovieMetadata] = pickle.load(f)

# Convert data_dict to a DataFrame
df = pd.DataFrame.from_dict(data_dict, orient='index')

# Drop unique columns
unique_columns = ["title", "description", "rating", "original_language", "release_date_theater", "release_date_streaming"]
df = df.drop(columns=unique_columns)

# Convert Sentiment to numerical
def parse_sentiment(sentiment):
    if sentiment == "POSITIVE":
        return 1
    elif sentiment == "NEGATIVE":
        return -1
    return 0
df["critic_score_sentiment"] = df["critic_score_sentiment"].apply(parse_sentiment)
df["audience_score_sentiment"] = df["audience_score_sentiment"].apply(parse_sentiment)

# Convert Runtime to numerical
def parse_runtime(runtime):
    if pd.isnull(runtime):
        return None
    
    runtime_parts = runtime.split(" ")
    
    hours = 0
    minutes = 0
    
    for part in runtime_parts:
        if part.endswith("h"):
            hours = int(part[:-1])
        elif part.endswith("m"):
            minutes = int(part[:-1])

    return hours * 60 + minutes
df["runtime"] = df["runtime"].apply(parse_runtime)

# Fix Column Types
df["audience_score_average_rating"] = df["audience_score_average_rating"].astype(float)
df["audience_score_score"] = df["audience_score_score"].astype(float)
df["critic_score_average_rating"] = df["critic_score_average_rating"].apply(lambda x: None if x == '' else x).astype(float)
df["critic_score_score"] = df["critic_score_score"].astype(float)

# Types of columns
print(df.dtypes)
df.head()

2024-06-03 13:15:48.148282: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-03 13:15:48.148481: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-03 13:15:48.150247: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-03 13:15:48.171358: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


audience_score_average_rating     float64
audience_score_liked_count        float64
audience_score_not_liked_count    float64
audience_score_review_count       float64
audience_score_score              float64
audience_score_sentiment            int64
critic_score_average_rating       float64
critic_score_liked_count          float64
critic_score_not_liked_count      float64
critic_score_review_count         float64
critic_score_score                float64
critic_score_sentiment              int64
director                           object
producer                           object
screenwriter                       object
distributor                        object
production_company                 object
genre                              object
sound_mix                          object
runtime                           float64
box_office                        float64
dtype: object


Unnamed: 0,audience_score_average_rating,audience_score_liked_count,audience_score_not_liked_count,audience_score_review_count,audience_score_score,audience_score_sentiment,critic_score_average_rating,critic_score_liked_count,critic_score_not_liked_count,critic_score_review_count,...,critic_score_sentiment,director,producer,screenwriter,distributor,production_company,genre,sound_mix,runtime,box_office
xoxo_2016,3.3,211.0,192.0,76.0,52.0,-1,5.8,5.0,2.0,7.0,...,1,[Christopher Louie],"[Max Leitman, Joe Russell, Pete Tong, Daniel S...",[Dylan Meyer],,[Netflix],[Comedy],,92.0,
angelina_ballerina_sweet_valentine_2012,,,,,,0,,,,,...,0,[Unknown Director],,,,,,,,
fufu,,,,,,0,,,,,...,0,[Ng Toi Yung],,,,,"[Comedy, Drama]",,80.0,
outatime_saving_the_delorean_time_machine,,,,,,0,,,,,...,0,[Steve Concotelli],"[Marci Concotelli, Brian Thompson, Josh Turche...",[Steve Concotelli],,[Cricket Pictures],[Documentary],,64.0,
women_of_the_night,3.8,36.0,9.0,22.0,80.0,1,,,,,...,0,[Kenji Mizoguchi],[Hisao Itoya],[Yoshikata Yoda],,,[Drama],,105.0,


In [2]:
# Separate list columns
list_columns = ['director', 'producer', 'screenwriter', 'distributor', 'production_company', 'genre', 'sound_mix']
df_list = df[list_columns]
df_non_list = df.drop(columns=list_columns)

# Handle missing values for non-list columns
imputer = SimpleImputer()
df_non_list_imputed = pd.DataFrame(imputer.fit_transform(df_non_list), columns=df_non_list.columns)


# Encode categorical data for non-list columns
label_encoders = {}
for column in df_non_list_imputed.select_dtypes(include=['object', 'category']).columns:
    label_encoders[column] = LabelEncoder()
    df_non_list_imputed[column] = label_encoders[column].fit_transform(df_non_list_imputed[column].astype(str))

# Standardize numerical features for non-list columns
scaler = StandardScaler()
numeric_columns = df_non_list_imputed.select_dtypes(include=['float']).columns
df_non_list_imputed[numeric_columns] = scaler.fit_transform(df_non_list_imputed[numeric_columns])


# Prepare lists of categorical values using Tokenizer
tokenizer = Tokenizer()
for column in list_columns:
    df_list[column] = df_list[column].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')
    tokenizer.fit_on_texts(df_list[column])
    df_list[column] = tokenizer.texts_to_sequences(df_list[column])

# Pad sequences to ensure uniform length
for column in list_columns:
    df_list[column] = pad_sequences(df_list[column], padding='post').tolist()

# Convert padded sequences back to DataFrame
df_expanded_list = pd.DataFrame()
for column in list_columns:
    max_len = len(max(df_list[column], key=len))
    df_temp = pd.DataFrame(df_list[column].tolist(), index=df_list.index, columns=[f'{column}_{i}' for i in range(max_len)])
    df_expanded_list = pd.concat([df_expanded_list, df_temp], axis=1)


# Combine non-list and expanded list DataFrames
df_final = pd.concat([df_non_list_imputed, df_expanded_list], axis=1)

# Define target column
target_column = 'box_office'

# Drop rows with missing target values
df_final = df_final.dropna(subset=[target_column])

X = df_final.drop(columns=[target_column])
y = df_final[target_column]

# Check the final DataFrame shape
print(df_final.dtypes)
df_final.head()
# print(X.shape, y.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_list[column] = df_list[column].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_list[column] = tokenizer.texts_to_sequences(df_list[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_list[column] = pad_sequences(df_list[column]

audience_score_average_rating     float64
audience_score_liked_count        float64
audience_score_not_liked_count    float64
audience_score_review_count       float64
audience_score_score              float64
                                   ...   
sound_mix_10                      float64
sound_mix_11                      float64
sound_mix_12                      float64
sound_mix_13                      float64
sound_mix_14                      float64
Length: 302, dtype: object


Unnamed: 0,audience_score_average_rating,audience_score_liked_count,audience_score_not_liked_count,audience_score_review_count,audience_score_score,audience_score_sentiment,critic_score_average_rating,critic_score_liked_count,critic_score_not_liked_count,critic_score_review_count,...,sound_mix_5,sound_mix_6,sound_mix_7,sound_mix_8,sound_mix_9,sound_mix_10,sound_mix_11,sound_mix_12,sound_mix_13,sound_mix_14
0,-0.0883886,-0.1553182,-0.1312416,-0.049911,-0.2231884,-1.350061,-0.5757032,-1.012146,-0.967226,-1.175739,...,,,,,,,,,,
1,2.595372e-15,-3.7317630000000003e-17,-2.7997540000000004e-17,0.0,-3.994285e-16,0.0262,2.587089e-15,-3.063292e-16,1.432166e-16,0.0,...,,,,,,,,,,
2,2.595372e-15,-3.7317630000000003e-17,-2.7997540000000004e-17,0.0,-3.994285e-16,0.0262,2.587089e-15,-3.063292e-16,1.432166e-16,0.0,...,,,,,,,,,,
3,2.595372e-15,-3.7317630000000003e-17,-2.7997540000000004e-17,0.0,-3.994285e-16,0.0262,2.587089e-15,-3.063292e-16,1.432166e-16,0.0,...,,,,,,,,,,
4,0.8856546,-0.1840401,-0.221376,-0.052008,1.350819,1.402461,2.587089e-15,-3.063292e-16,1.432166e-16,0.0,...,,,,,,,,,,


In [4]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

# Define the model
model = Sequential()
model.add(Dense(256, input_dim=X.shape[1], activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))  # Assuming regression problem for continuous target variable

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Summary of the model
model.summary()

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=1, callbacks=[early_stopping])

# Evaluate the model
loss, mae = model.evaluate(X_test, y_test, verbose=1)
print(f'Test Loss: {loss}')
print(f'Test MAE: {mae}')

# Predict on test set
y_pred = model.predict(X_test)

# Calculate accuracy as a percentage
threshold_percentages = [0.05, 0.10, 0.5, 1.0, 5.0, 10.0]
for threshold in threshold_percentages:
    accuracy = np.mean(np.abs((y_test - y_pred.flatten()) / y_test) < threshold) * 100
    print(f'Accuracy within {threshold * 100}%: {accuracy:.4f}%')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m2836/2836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 920us/step - loss: 1.1338 - mae: 0.1780 - val_loss: 1.0969 - val_mae: 0.1778
Epoch 2/100
[1m2836/2836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 862us/step - loss: 1.0774 - mae: 0.1776 - val_loss: 1.0971 - val_mae: 0.1845
Epoch 3/100
[1m2836/2836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 787us/step - loss: 0.9637 - mae: 0.1757 - val_loss: 1.0968 - val_mae: 0.1753
Epoch 4/100
[1m2836/2836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 892us/step - loss: 1.1159 - mae: 0.1739 - val_loss: 1.0969 - val_mae: 0.1796
Epoch 5/100
[1m2836/2836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 810us/step - loss: 1.0490 - mae: 0.1744 - val_loss: 1.0968 - val_mae: 0.1768
Epoch 6/100
[1m2836/2836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 871us/step - loss: 1.2395 - mae: 0.1789 - val_loss: 1.0968 - val_mae: 0.1716
Epoch 7/100
[1m2836/2836[0m [32m━━━━━━━━━━━━━━━━━━━━[0