In [1]:
# Root directory of the raw input data; every input path below is built from it.
DATA_FOLDER = './data/'
# Directory holding the movie-summaries files referenced by the *_FILE constants below.
MOVIES_FOLDER = DATA_FOLDER + 'movies_summaries/'
# NOTE(review): unlike the other folder constants this one has no trailing '/',
# so plain string concatenation with a file name would not yield a valid path — confirm how it is used.
PLOT_SUMMARY_FOLDER = DATA_FOLDER + 'corenlp_plot_summaries'

# Paths under ./gen/ (presumably generated artifacts — verify against the cells that use them).
REPORT_FOLDER = './gen/reports/'
ETHNICITY_FILE = './gen/ethnicities.tsv'

# Individual input files located inside MOVIES_FOLDER.
CHARACTERS_FILE = MOVIES_FOLDER + 'character.metadata.tsv'
MOVIES_FILE = MOVIES_FOLDER + 'movie.metadata.tsv'
PLOT_SUMMARIES_FILE = MOVIES_FOLDER + 'plot_summaries.txt'
TROPES_FILE = MOVIES_FOLDER + 'tvtropes.clusters.txt'

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import datetime

from pandas_profiling import ProfileReport
%matplotlib inline

from data_wrangling_tools import *

# Programming tools
from tqdm.notebook import tqdm
from tqdm.keras import TqdmCallback

# Neural Networks
import tensorflow as tf

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

# Report how many GPU devices TensorFlow can see
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [4]:
# Load the movie metadata file, then run the two cleaning passes
# (clean_unknowns first, clean_jsons second) on the loaded table.
movies = clean_jsons(clean_unknowns(load_movies(MOVIES_FILE)))

# Preview the first rows of the cleaned table
movies.head()

Unnamed: 0,wiki_movie_id,freebase_movie_id,name,release_date,box_office_revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]"
2,28463795,/m/0crgdbh,Brun bitter,1988-01-01,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]"
3,9363483,/m/0285_cd,White Of The Eye,1987-01-01,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri..."
4,261236,/m/01mrr1,A Woman in Flames,1983-01-01,,106.0,[German Language],[Germany],[Drama]


In [9]:
# (rows, columns) of the cleaned movies table
movies.shape

(81741, 9)

In [10]:
# Keep only the movies that report a strictly positive box-office revenue
# (rows with a missing revenue compare as False and are excluded).
movies_bo = movies.loc[movies['box_office_revenue'].gt(0)]
movies_bo.shape

(8401, 9)

In [11]:
# Movies for which no box-office revenue is available
movies_na = movies.loc[movies['box_office_revenue'].isnull()]
movies_na.shape

(73340, 9)

In [13]:
def to_1D(series):
    """Flatten a collection of iterables into a single pandas Series.

    The elements keep their encounter order: all items of the first entry,
    then all items of the second entry, and so on. The returned Series has a
    fresh default index.
    """
    flattened = []
    for entry in series:
        flattened.extend(entry)
    return pd.Series(flattened)

In [22]:
# Distinct values that occur anywhere in each list-valued column
countries, genres, languages = (
    to_1D(movies[column]).unique()
    for column in ('countries', 'genres', 'languages')
)

# Size of each vocabulary
print(f'Countries: {len(countries)}')
print(f'Genres: {len(genres)}')
print(f'Languages: {len(languages)}')

Countries: 148
Genres: 364
Languages: 208


In [80]:
def augment_movies(movies, countries, genres, languages):
    movies_augmented = movies.copy()
    for country in countries:
        movies_augmented[f'country:{country}'] = movies_augmented['countries'].apply(lambda x: 1 if country in x else 0)

    for genre in genres:
        movies_augmented[f'genre:{genre}'] = movies_augmented['genres'].apply(lambda x: 1 if genre in x else 0)

    for language in languages:
        movies_augmented[f'language:{language}'] = movies_augmented['languages'].apply(lambda x: 1 if language in x else 0)

    movies_augmented['release_date'] = movies_augmented['release_date'].apply(lambda x: x.toordinal() if not pd.isna(x) else 0)
    movies_augmented['runtime'] = movies_augmented['runtime'].apply(lambda x: x if not pd.isna(x) else 0)

    movies_augmented = movies_augmented.drop(['wiki_movie_id', 'freebase_movie_id', 'name', 'countries', 'genres', 'languages'], axis=1)

    return movies_augmented

def normalize(movies):
    """Min-max scale the three continuous columns of the feature table.

    ``box_office_revenue``, ``release_date`` and ``runtime`` are each mapped
    to ``(value - min) / (max - min)``. All other columns are returned
    unchanged, and the input frame is not modified.

    A column whose values are all identical has a zero range; it is mapped to
    0.0 instead of being turned into NaN by a 0/0 division.
    """
    def normalize_column(column):
        low = column.min()
        span = column.max() - low
        # Constant column: divide by 1 so the result is 0.0 rather than NaN.
        # (A NaN span, e.g. an all-missing column, is left alone and still yields NaN.)
        if span == 0:
            span = 1
        return (column - low) / span

    movies_norm = movies.copy()
    for name in ('box_office_revenue', 'release_date', 'runtime'):
        movies_norm[name] = normalize_column(movies_norm[name])

    return movies_norm

def norm_values(feature):
    """Return ``(minimum, spread)`` of ``feature``, where spread is max - min.

    The pair is what ``denormalize`` needs to undo a min-max scaling.
    """
    lowest = feature.min()
    spread = feature.max() - lowest
    return lowest, spread

def denormalize(feature, min, range):
    """Undo a min-max scaling: stretch ``feature`` by ``range``, then shift by ``min``."""
    stretched = feature * range
    return stretched + min

# Encode the movies that have a box-office revenue as numeric features,
# then min-max scale the continuous columns.
dataset = normalize(augment_movies(movies_bo, countries, genres, languages))
dataset

Unnamed: 0,release_date,box_office_revenue,runtime,country:United States of America,country:Norway,country:United Kingdom,country:Germany,country:South Africa,country:Argentina,country:Japan,...,language:Hmong language,language:Osetin Language,language:Deutsch,language:Nahuatl languages,language:Hainanese,language:Chewa language,language:Haryanvi Language,language:Assyrian language,language:Papiamento language,language:Kuna language
0,0.994447,0.005032,0.097707,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0.963121,0.001290,0.105683,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0.988586,0.003649,0.136590,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0.976060,0.036755,0.138584,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,0.993504,0.000004,0.091725,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81695,0.987910,0.103780,0.116650,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81720,0.988660,0.005521,0.099701,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81725,0.993953,0.002492,0.105683,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81726,0.990361,0.008222,0.106680,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
# A random 80% of the rows (fixed seed) form the training set ...
train = dataset.sample(frac=0.8, random_state=0)
# ... and every row whose index label was not sampled forms the test set.
test = dataset.loc[~dataset.index.isin(train.index)]

In [51]:
# Targets: the box-office revenue column
y_train = train['box_office_revenue'].copy()
y_test = test['box_office_revenue'].copy()

# Features: every other column
x_train = train.drop(columns='box_office_revenue')
x_test = test.drop(columns='box_office_revenue')

In [67]:
# Two hidden ReLU layers, each followed by dropout, and a single output unit
# with no activation specified.
model = Sequential([
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.25),
    Dense(1),
])

# Train on mean squared error; additionally track MAE and MSE as metrics
model.compile(
    loss='mse',
    optimizer='Adam',
    metrics=['mae', 'mse'],
)

In [71]:
# Fit for 10 epochs in batches of 128 rows; the test split is passed as
# validation data, so it is evaluated after every epoch.
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=10,
    batch_size=128,
    verbose=1,
)

# Print the layer-by-layer summary of the trained network
model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             (None, 512)               370176    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 257       
Total params: 501,761
Trainable params: 501,761
Non-trainable params: 0
_________________________________________________________________


In [72]:
# The model is compiled with loss='mse' and metrics=['mae', 'mse'], so
# evaluate() returns [loss, mae, mse]. The second entry is therefore the mean
# absolute error, not an accuracy (no accuracy metric is tracked for this
# regression model).
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test MAE:', score[1])

Test loss: 0.0009300319361500442
Test accuracy: 0.013888183049857616


In [83]:
# Predict on the test features. The model was trained on the min-max scaled
# box_office_revenue column, so these predictions are on that scaled range.
y_pred = model.predict(x=x_test)
print(y_pred)

[[0.02752409]
 [0.00187077]
 [0.00921796]
 ...
 [0.00501775]
 [0.00589338]
 [0.00700867]]
