In [85]:
# Libraries used to train models & manipulate data
import tensorflow as tf
import numpy as np
import pandas as pd
from ast import literal_eval

### Kaggle import: https://github.com/Kaggle/kaggle-api
# kaggle datasets download -f movies_metadata.csv --unzip rounakbanik/the-movies-dataset
# kaggle datasets download -f ratings_small.csv --unzip rounakbanik/the-movies-dataset
metadata = pd.read_csv(r'./movies_metadata.csv')
ratings = pd.read_csv(r'./ratings_small.csv')
metadata.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [86]:
# Make a list of all unique genre-ID pairs
genres = []

def genreGatherer(genreList):
    genreList = literal_eval(genreList)
    for g in genreList:
        if g['name'] not in genres:
            genres.append(g['name'])

metadata['genres'].apply(genreGatherer)
print(genres)

['Animation', 'Comedy', 'Family', 'Adventure', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'History', 'Science Fiction', 'Mystery', 'War', 'Foreign', 'Music', 'Documentary', 'Western', 'TV Movie', 'Carousel Productions', 'Vision View Entertainment', 'Telescene Film Group Productions', 'Aniplex', 'GoHands', 'BROSTA TV', 'Mardock Scramble Production Committee', 'Sentai Filmworks', 'Odyssey Media', 'Pulser Productions', 'Rogue State', 'The Cartel']


In [87]:
# Convert genres list-dicts into a list of IDs
def genreIdParser(genreList):
    genreList = literal_eval(genreList)
    IDs = []
    for g in genreList:
        newID = genres.index(g['name'])
        IDs.append(newID)
    
    return IDs

# Metadata: Use only relevant columns
metadata = metadata.filter(items=['id', 'genres'])

metadata['genres'] = metadata['genres'].apply(genreIdParser)
metadata.head(3)

Unnamed: 0,id,genres
0,862,"[0, 1, 2]"
1,8844,"[3, 4, 2]"
2,15602,"[5, 1]"


In [88]:
# Ratings: Group and filter ratings 
ratings = ratings.filter(items=['movieId', 'rating'])
# Convert type to prep for merge
ratings['movieId'] = ratings['movieId'].astype(str)
# Average ratings by movie
ratings = ratings.groupby(['movieId']).mean().reset_index()

ratings.head(3)

Unnamed: 0,movieId,rating
0,1,3.87247
1,10,3.45082
2,100,3.428571


In [89]:
# Traing Data Merging
train = pd.merge(metadata, ratings, left_on='id', right_on='movieId')
del train['movieId']
train.head(3)

Unnamed: 0,id,genres,rating
0,949,"[7, 8, 6, 9]",3.59375
1,710,"[3, 7, 9]",1.5
2,1408,"[7, 3]",3.616279


In [90]:
# One hot encode genres
num_genres = len(genres)
def multiHotEncoder(genreList):
    IDs = [0]*num_genres

    for g in genreList:
        IDs[g] = 1
    return IDs

train['genres'] = train['genres'].apply(multiHotEncoder)
train.head(3)

Unnamed: 0,id,genres,rating
0,949,"[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...",3.59375
1,710,"[0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...",1.5
2,1408,"[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",3.616279


In [91]:
# Make the model to predict ratings based of genres
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(num_genres,)),
    tf.keras.layers.Dense(1000, activation='relu'),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(1)
])
# Hyper parameters
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['accuracy']
)
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_18 (Dense)             (None, 1000)              33000     
_________________________________________________________________
dense_19 (Dense)             (None, 100)               100100    
_________________________________________________________________
dense_20 (Dense)             (None, 1)                 101       
Total params: 133,201
Trainable params: 133,201
Non-trainable params: 0
_________________________________________________________________


In [92]:
history = model.fit(
    train['genres'].tolist(),
    train['rating'].tolist(),
    epochs=5,
    verbose=0
)
print("Final accuracy:")
history.history['accuracy'][-1]

Final accuracy:


0.021193925

In [105]:
import plotly.figure_factory as ff
# Predict and clean output
o = model.predict(train['genres'].tolist())
outputs = []
for i in range(len(o)):
    outputs.append(o[i][0])

# Determine the error of each prediction
error = [0] * len(outputs)
for i in range(len(error)):
    error[i] = abs(outputs[i] - train['rating'][i])

# Create distribution of error plot
fig = ff.create_distplot([error], ["Error"], bin_size=.1)
fig.update_layout(title='Error of Rating Distrbution when Predicted with Genres')
fig.show()