In [1]:
# Libraries used to train models & manipulate data
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import tensorflow as tf

### Kaggle import: https://github.com/Kaggle/kaggle-api
# kaggle datasets download -f movies_metadata.csv --unzip rounakbanik/the-movies-dataset
# kaggle datasets download -f ratings_small.csv --unzip rounakbanik/the-movies-dataset
# Read 'id' and 'budget' as text explicitly: later cells compare budget against
# the string '0' and merge on a string id, so these columns must not depend on
# pandas' per-chunk dtype inference.
metadata = pd.read_csv(r'./movies_metadata.csv', dtype={'id': str, 'budget': str})
ratings = pd.read_csv(r'./ratings_small.csv')

In [2]:
# Metadata: keep only the columns used in this notebook, then drop every row
# whose budget is void (stored as the string '0')
budget_filter = metadata['budget'].ne('0')
metadata = (
    metadata
    .filter(items=['id', 'budget', 'original_title', 'revenue'])
    .loc[budget_filter]
)

metadata.head(3)

Unnamed: 0,id,budget,original_title,revenue
0,862,30000000,Toy Story,373554033.0
1,8844,65000000,Jumanji,262797249.0
3,31357,16000000,Waiting to Exhale,81452156.0


In [3]:
# Ratings: collapse the individual ratings into one mean rating per movie
ratings = (
    ratings
    .filter(items=['movieId', 'rating'])
    # Cast the key to str first so it is ready for the merge on movie id
    .assign(movieId=lambda frame: frame['movieId'].astype(str))
    .groupby('movieId', as_index=False)
    .mean()
)

ratings.head(3)

Unnamed: 0,movieId,rating
0,1,3.87247
1,10,3.45082
2,100,3.428571


In [4]:
# Training data preparation: one row per movie with its budget and mean rating
# NOTE(review): 'movieId' in ratings_small.csv looks like a MovieLens id while
# 'id' in movies_metadata.csv looks like a TMDB id; if so, this join pairs
# unrelated movies and should go through the dataset's links file -- confirm.
train = (
    metadata
    .filter(items=['id', 'budget'])
    .merge(ratings, left_on='id', right_on='movieId')
    .drop(columns='movieId')  # Redundant: equal to 'id' after the join
    .astype(float)  # Cast every remaining column (id, budget, rating) to float
)
# Scale budget by its maximum so the largest budget maps to 1
train['budget'] = train['budget'] / train['budget'].max()

print(train.shape)
train.head()

(1263, 3)


Unnamed: 0,id,budget,rating
0,949.0,0.157895,3.59375
1,710.0,0.152632,1.5
2,1408.0,0.257895,3.616279
3,524.0,0.136842,3.555556
4,4584.0,0.043421,5.0


In [5]:
# Build the model that predicts a movie's rating from its budget:
# two ReLU hidden layers followed by a single linear output unit (regression).
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(1000, activation='relu'),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(1)
])
# Training configuration.
# The target is a continuous rating, so track mean absolute error; 'accuracy'
# is a classification metric and carries no meaning for a regression output.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error']
)

In [13]:
# Train on the single scaled-budget feature against the mean rating.
# The data is passed as NumPy arrays, with the feature shaped (n_samples, 1):
# Keras documents a Python list given to fit() as a list of separate model
# inputs, so plain lists of floats are not a reliable way to pass one feature.
history = model.fit(
    train[['budget']].to_numpy(),
    train['rating'].to_numpy(),
    epochs=3
)

Train on 1263 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [19]:
# Predict a rating for every training row. The feature is passed as an
# (n_samples, 1) array so the feature axis is explicit instead of relying on
# Keras to expand a 1-D input. `outputs` stays aligned with the rows of `train`.
budgets = train['budget'].to_numpy()
outputs = model.predict(budgets.reshape(-1, 1))[:, 0]

# A 'lines' trace connects points in the order given, so sort by budget;
# otherwise the prediction curve zig-zags across the plot.
order = np.argsort(budgets, kind='stable')

fig = go.Figure()
# Training data
fig.add_trace(go.Scatter(
    x=train['budget'], y=train['rating'],
    name='Real',
    mode='markers',
    marker_color='rgba(255, 182, 193, .8)'
))
# Prediction data, drawn left to right along the budget axis
fig.add_trace(go.Scatter(
    x=budgets[order], y=outputs[order],
    name='Pred',
    mode='lines',
    line_color='rgba(255, 0, 100, 1)'
))

fig.update_layout(title='Model Prediction with Budget',
                  xaxis_title='Budget (scaled by max budget)',
                  yaxis_title='Mean rating',
                  yaxis_zeroline=False, xaxis_zeroline=False)
fig.show()