In [168]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from ast import literal_eval
import matplotlib.pyplot as plt

# Simple Recommender (Top Movies)

In [148]:
# Load the movies metadata table; the cells below read from and add columns to `md`.
# NOTE(review): the recorded output of this cell echoes the read_csv line, which looks like
# a pandas warning raised during parsing (possibly mixed column dtypes) — confirm and
# consider passing explicit dtypes.
md = pd.read_csv(filepath_or_buffer='data/input/movies/movies_metadata.csv')

  md = pd.read_csv('data/input/movies/movies_metadata.csv')


In [149]:
# Normalises the genres column into plain lists of genre names:
# 1. Missing values are replaced with the string '[]'
# 2. Stringified lists are parsed with literal_eval; values that are not strings
#    (e.g. lists left behind by an earlier run of this cell) are passed through as-is
# 3. The 'name' of each genre dict is extracted; entries that are not dicts
#    (already-extracted names) are kept unchanged
# 4. Anything that is not a list becomes an empty list
# Steps 2 and 3 make the cell safe to re-run: previously a second run raised because
# literal_eval received lists and the name lookup received plain strings.
md['genres'] = (
    md['genres']
    .fillna('[]')
    .apply(lambda raw: literal_eval(raw) if isinstance(raw, str) else raw)
    .apply(
        lambda parsed: [entry['name'] if isinstance(entry, dict) else entry for entry in parsed]
        if isinstance(parsed, list)
        else []
    )
)

I use the TMDB ratings to build the Top Movies chart, ranking movies with IMDB's weighted rating formula. Mathematically, it is represented as follows:

**Weighted Rating (WR)** $= \dfrac{v}{v+m} \cdot R + \dfrac{m}{v+m} \cdot C$

**v:** is the number of votes for the movie

**m:** is the minimum votes required to be listed in the chart

**R:** is the average rating of the movie

**C:** is the mean vote across all movies in the dataset

The cutoff **m** is set to the 95th percentile of the vote counts. In other words, for a movie to feature in the charts, it must have at least as many votes as 95% of the movies in the list.

In [150]:
# Non-null vote counts and ratings, both cast to integers.
# NOTE(review): the int cast truncates fractional ratings (e.g. 7.7 -> 7) before the
# mean is taken, which lowers C — confirm this is intended.
vote_counts = md['vote_count'].dropna().astype('int')
vote_averages = md['vote_average'].dropna().astype('int')
# C: mean rating over all movies that have a rating
C = vote_averages.mean()
C

np.float64(5.244896612406511)

In [151]:
# m: minimum number of votes needed to qualify for the chart,
# taken as the 95th percentile of the vote counts
m = vote_counts.quantile(q=0.95)
m

np.float64(434.0)

In [152]:
# Creates a new column 'year' holding the release year as a string:
# 1. Converts the release_date column to datetime (unparseable dates become NaT)
# 2. Extracts the year of each valid date as a string, e.g. '1995'
# 3. If the date is missing, stores NaN
# The missing-value test must use pd.notnull: comparing with `!= np.nan` is always True,
# which previously turned missing dates into the literal string 'NaT'.
md['year'] = (
    pd.to_datetime(md['release_date'], errors='coerce')
    .apply(lambda released: str(released.year) if pd.notnull(released) else np.nan)
)

In [153]:
# Keeps only the movies that have both a vote_count and a vote_average and whose
# vote_count reaches the cutoff m (the 95th percentile of vote counts).
# Retains the title, year, vote_count, vote_average, popularity and genres columns,
# and casts vote_count and vote_average to integers.
qualified = (
    md.loc[
        md['vote_count'].notnull() & md['vote_average'].notnull() & (md['vote_count'] >= m),
        ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres'],
    ]
    .astype({'vote_count': 'int', 'vote_average': 'int'})
)
qualified.shape

(2274, 6)

In [154]:
# Chart parameters computed in the cells above:
#   - m: a movie needs at least 434 votes on TMDB to qualify
#   - C: the average TMDB rating is 5.244 on a scale of 10
# With this cutoff, 2274 movies qualify for the chart.
def weighted_rating(x):
    """
    Computes the IMDB-style weighted rating of a single movie
    :param x: row-like object exposing 'vote_count' and 'vote_average'
    :return: (v/(v+m))*R + (m/(v+m))*C, using the module-level m and C
    """
    votes = x['vote_count']
    rating = x['vote_average']
    denominator = votes + m
    return (votes / denominator * rating) + (m / denominator * C)

In [155]:
# Score every qualifying movie, one row at a time, with the weighted rating
qualified['wr'] = qualified.apply(weighted_rating, axis='columns')

In [156]:
# Rank by weighted rating (best first) and keep the first 250 rows
qualified = qualified.sort_values(by='wr', ascending=False).iloc[:250]

**Top Movies**

In [157]:
# Show the ten highest-ranked movies
qualified.iloc[:10]

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.869599,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.950236,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.645403,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.307194,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,"[Adventure, Fantasy, Action]",7.851924


In [158]:
# Builds gen_md: one row per (movie, genre) pair.
# explode() turns each entry of the 'genres' lists into its own row while repeating the
# movie's index label; dropna() removes the placeholder rows produced by empty lists.
# This replaces building one pd.Series per movie and stacking them, which yields the
# same Series but is far slower.
s = md['genres'].explode().dropna()
# The Series is named 'genre' so the join below adds it as a column of that name.
s.name = 'genre'
# Left join on the index: movies with several genres are repeated once per genre,
# and movies without any genre keep a single row with a missing 'genre'.
gen_md = md.drop('genres', axis=1).join(s)

In [162]:
# Preview the first five rows of the per-genre table
gen_md.iloc[:5]

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Animation
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Comedy
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Family
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Adventure
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Fantasy


In [166]:
def build_chart(genre, percentile=0.85, top_n=250):
    """
    Builds a chart of top movies for a particular genre, ranked by weighted rating

    The weighted rating is WR = (v/(v+m))*R + (m/(v+m))*C, where v and R are a movie's
    vote count and rating, m is the `percentile` quantile of the vote counts within the
    genre, and C is the mean rating within the genre.

    :param genre: str, genre of the movie (compared for equality with gen_md['genre'])
    :param percentile: float, quantile of the genre's vote_count used as the cutoff m
    :param top_n: int, maximum number of movies to return
    :return: pd.DataFrame with the columns title, year, vote_count, vote_average,
             popularity and wr, sorted by wr in descending order; the frame is empty
             when no movie matches the genre
    """
    # Filters for movies in the given genre
    df = gen_md[gen_md['genre'] == genre]

    # Genre-level statistics computed from the non-null values.
    # NOTE(review): the int cast truncates fractional ratings (e.g. 7.7 -> 7) — confirm intended.
    vote_counts = df['vote_count'].dropna().astype('int')
    vote_averages = df['vote_average'].dropna().astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    # Keeps the movies whose vote_count reaches the cutoff m and that have a rating.
    # The explicit copy makes the column assignments below operate on an independent frame.
    has_votes_and_rating = df['vote_count'].notnull() & df['vote_average'].notnull()
    qualified = df.loc[
        has_votes_and_rating & (df['vote_count'] >= m),
        ['title', 'year', 'vote_count', 'vote_average', 'popularity'],
    ].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    # Column-wise arithmetic instead of a row-wise apply: same values, but it also works
    # when nothing qualifies (a row-wise apply on an empty frame returns a DataFrame,
    # which cannot be assigned to the single 'wr' column).
    v = qualified['vote_count']
    R = qualified['vote_average']
    qualified['wr'] = (v / (v + m) * R) + (m / (m + v) * C)

    return qualified.sort_values('wr', ascending=False).head(top_n)

In [167]:
# Top Romance movies using the default percentile cutoff
build_chart(genre='Romance')

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457024,8.565285
351,Forrest Gump,1994,8147,8,48.307194,7.971357
876,Vertigo,1958,1162,8,18.20822,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.845107,7.745154
...,...,...,...,...,...,...
7582,How to Steal a Million,1966,133,7,12.389626,6.352112
6912,Beauty and the Beast,1946,133,7,4.798852,6.352112
24888,Remember Sunday,2013,131,7,11.190606,6.346273
2801,Body Heat,1981,128,7,5.991482,6.337313
