In [1]:
import json

import numpy as np
import pandas as pd

In [2]:
# Directory (relative to the working directory) that holds the CSV files read below.
DATASET_PATH = "datasets/"

In [3]:
import os

In [4]:
# Read the movies table and the credits table from DATASET_PATH.
movies_csv = os.path.join(DATASET_PATH, "tmdb_5000_movies.csv")
credits_csv = os.path.join(DATASET_PATH, "tmdb_5000_credits.csv")

movies = pd.read_csv(movies_csv)
credit = pd.read_csv(credits_csv)

In [5]:
# Join credits onto movies by id instead of by title. A title is not a unique
# key (different films can share a name), so a title join pairs every
# same-named movie with every same-named credit row and yields duplicate rows.
# `title` is dropped from `credit` first so the result keeps a single `title`
# column followed by `movie_id`, `cast` and `crew`, exactly as before.
# NOTE(review): assumes credit['movie_id'] holds the same ids as movies['id'].
movies = movies.merge(
    credit.drop(columns="title"),
    left_on="id",
    right_on="movie_id",
    how="inner",
)

In [7]:
# Inspect the column names available after the merge.
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [37]:
# Keep only the columns used for recommendation and ranking. `.copy()` detaches
# the result from the wider frame so the column assignments in later cells
# operate on an independent DataFrame (no SettingWithCopyWarning ambiguity).
movies = movies[['id', 'title', 'overview', 'genres', 'keywords', 'vote_average', 'vote_count', 'popularity', 'release_date', 'cast']].copy()

# Normalise the column names to lower case.
movies.columns = movies.columns.str.lower()

Unnamed: 0,id,title,overview,genres,keywords,vote_average,vote_count,popularity,release_date,cast
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",7.2,11800,150.437577,2009-12-10,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",6.9,4500,139.082615,2007-05-19,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",6.3,4466,107.376788,2015-10-26,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",7.6,9106,112.31295,2012-07-16,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",6.1,2124,43.926995,2012-03-07,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c..."


In [None]:
# `genres` holds JSON text: a list of {"id": ..., "name": ...} objects.
# Parse it with json.loads rather than eval: eval would execute any Python
# expression found in the file and fails on JSON literals (null/true/false).
# Only the `name` of each entry is kept.
movies['genres'] = movies['genres'].apply(
    lambda raw: [entry['name'] for entry in json.loads(raw)]
)

In [None]:
# `keywords` holds JSON text: a list of {"id": ..., "name": ...} objects.
# Parse it with json.loads rather than eval: eval would execute any Python
# expression found in the file and fails on JSON literals (null/true/false).
# Only the `name` of each entry is kept.
movies['keywords'] = movies['keywords'].apply(
    lambda raw: [entry['name'] for entry in json.loads(raw)]
)

In [None]:
# Preview the first rows of the frame.
movies.head()

In [None]:
# Convert release dates to datetimes; errors='coerce' turns values that cannot be parsed into NaT instead of raising.
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')

In [None]:
# `cast` holds JSON text: a list of objects that each carry a "name" field.
# Parse it with json.loads rather than eval: eval would execute any Python
# expression found in the file and fails on JSON literals (null/true/false).
# Only the `name` of each cast entry is kept.
movies['cast'] = movies['cast'].apply(
    lambda raw: [entry['name'] for entry in json.loads(raw)]
)

In [None]:
# Drop rows with any missing value, then renumber the index 0..n-1.
# recommend() uses a row's index label as a row position into the similarity
# matrix, so labels must stay aligned with positions once rows are removed.
movies = movies.dropna().reset_index(drop=True)

In [None]:
# Break every overview into a list of whitespace-separated words.
movies['overview'] = movies['overview'].map(str.split)

In [None]:
# Spot-check five randomly chosen rows (no seed is set, so the rows differ between runs).
movies.sample(5)

In [None]:
# Turn each list-valued column back into one space-separated string per row.
for list_column in ('genres', 'keywords', 'cast', 'overview'):
    movies[list_column] = movies[list_column].apply(' '.join)

In [None]:
# Build the text column used for TF-IDF. Fields are joined with a single
# space: plain `+` concatenation glues the last word of one field to the first
# word of the next, producing fused tokens at every field boundary.
release_tokens = movies['release_date'].apply(
    lambda d: f"{d.year} {d.month} {d.day}"
)
movies['combined'] = movies['overview'].str.cat(
    [movies[col] for col in ('genres', 'keywords', 'cast', 'title')] + [release_tokens],
    sep=' ',
)

# Lower-cased title, used for case-insensitive lookup in recommend().
movies['title_lower'] = movies['title'].str.lower()

Machine Learning part

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

In [None]:
# TF-IDF vectoriser with the vocabulary capped at 5000 features and the built-in English stop-word list.
tf = TfidfVectorizer(max_features=5000, stop_words='english')

In [None]:
# Fit the vectoriser on the combined text and convert the result to a dense array (one row per movie).
tf_vector = tf.fit_transform(movies['combined']).toarray()

In [None]:
# Pairwise sigmoid-kernel scores between all movies; sig[i] holds the scores of row i against every row.
sig = sigmoid_kernel(tf_vector, tf_vector)

In [None]:
def recommend(movie, top_n=10):
    """Return the movies most similar to ``movie`` according to ``sig``.

    Parameters
    ----------
    movie : str
        Title to look up; matched case-insensitively against
        ``movies['title_lower']``. If several rows share the title, the first
        one is used.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of dict
        One record per recommended movie with the keys ``title``,
        ``vote_count``, ``vote_average``, ``release_date`` and ``genres``,
        ordered from most to least similar.

    Raises
    ------
    IndexError
        If no row of ``movies`` has that title.
    """
    # Work with row *positions*, not index labels: `sig` is indexed by position
    # and the labels of `movies` stop matching positions once rows are dropped.
    matches = np.flatnonzero((movies['title_lower'] == movie.lower()).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie not found: {movie!r}")
    position = int(matches[0])

    # Stable descending sort keeps ties in row order. The query movie is then
    # removed explicitly instead of assuming it is always ranked first.
    ranked = np.argsort(-np.asarray(sig[position]), kind='stable')
    neighbours = [int(i) for i in ranked if i != position][:top_n]

    return movies[
        ['title', 'vote_count', 'vote_average', 'release_date', 'genres']
    ].iloc[neighbours].to_dict('records')

In [None]:
# Example: movies most similar to "Avatar".
recommend('Avatar')

In [None]:
# Bayesian average of the ratings:
#   weighted = v / (v + m) * R + m / (v + m) * C
# R and v are a movie's vote average and vote count, C is the mean vote average
# over all movies, and m is the 90th-percentile vote count.
C = movies['vote_average'].mean()
m = movies['vote_count'].quantile(0.9)
v, R = movies['vote_count'], movies['vote_average']

denominator = v + m
movies['weighted_average'] = R * (v / denominator) + C * (m / denominator)

In [None]:
def get_top_x_movies(x):
    """Return the ``x`` highest-ranked movies by ``weighted_average``.

    The result is a list of dicts, best first, each holding ``id``, ``title``,
    ``vote_count``, ``vote_average``, ``weighted_average`` and ``popularity``.
    """
    wanted = ['id', 'title', 'vote_count', 'vote_average', 'weighted_average', 'popularity']
    ranked = movies[wanted].sort_values(by='weighted_average', ascending=False)
    return ranked.head(x).to_dict('records')

In [38]:
def get_top_x_movies_by_genre(genre, x=10):
    """Return the top ``x`` movies of a genre, ranked by ``weighted_average``.

    ``genre`` is matched case-insensitively against the ``genres`` column, so
    ``'action'`` finds rows whose genres contain ``'Action'``. As before, the
    pattern is passed to ``Series.str.contains`` and is therefore treated as a
    regular expression. Rows with a missing ``genres`` value never match.

    Returns a list of dicts (one per movie, all columns), best first; the list
    is empty when nothing matches. Requires the ``weighted_average`` column to
    exist on ``movies``.
    """
    in_genre = movies['genres'].str.contains(genre, case=False, na=False)
    ranked = movies[in_genre].sort_values('weighted_average', ascending=False)
    return ranked.head(x).to_dict('records')


In [40]:
# Needs the 'weighted_average' column created by the Bayesian-average cell; run that cell first if this raises KeyError.
get_top_x_movies_by_genre('action', 10)

KeyError: 'weighted_average'

In [None]:
# Example: the ten highest-ranked movies by weighted_average.
get_top_x_movies(10)

In [None]:
# Ten highest-ranked movies by weighted_average, limited to the exported columns.
export_columns = ['id', 'title', 'vote_count', 'vote_average', 'weighted_average', 'popularity']
top_ten = movies[export_columns].sort_values('weighted_average', ascending=False).head(10)

# Record form (list of dicts), for loading into another app without going through CSV.
top_ten.to_dict('records')

# The same ten rows written to movies.csv without the index column.
top_ten.to_csv('movies.csv', index=False)