In [113]:
import pandas as pd
import numpy as np

# Load the dataset into pandas df
df = pd.read_csv('data/movies_metadata.csv', low_memory=False)

df.head(15)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
5,False,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,1995-12-15,187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0
6,False,,58000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,11860,tt0114319,en,Sabrina,An ugly duckling having undergone a remarkable...,...,1995-12-15,0.0,127.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,You are cordially invited to the most surprisi...,Sabrina,False,6.2,141.0
7,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,45325,tt0112302,en,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses...",...,1995-12-22,0.0,97.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The Original Bad Boys.,Tom and Huck,False,5.4,45.0
8,False,,35000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,9091,tt0114576,en,Sudden Death,International action superstar Jean Claude Van...,...,1995-12-22,64350171.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Terror goes into overtime.,Sudden Death,False,5.5,174.0
9,False,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.mgm.com/view/movie/757/Goldeneye/,710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,...,1995-11-16,352194034.0,130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,No limits. No fears. No substitutes.,GoldenEye,False,6.6,1194.0


## Classifying movies by rating acording to IMDB Weighted Rating (WR)

WR = (v/ (v+m) * R) + ( m / (v+m) * C) 


- v is the number of votes garnered by the movie
- m is the minimum number of votes required for the movie to be in the chart. 
- R is the mean rating of the movie
- C is the mean rating of all the movies in the dataset

For our recommender, we will use the number of votes garnered by the 80th percentile movie as our value for m. In other words, for a movie to be considered in the rankings, it must have garnered move votes than at least 80% of the movies peresent in our dataset.

In [114]:
# Calculate the number of votes garnered by the 80th percentile movie
m = df['vote_count'].quantile(0.80)

In [115]:
m

50.0

In [116]:
# Only consider movies that have runtime greater than 45 minutes and less than 300 minutes. 
q_movies = df[(df['runtime'] >= 45) & (df['runtime'] <= 300)]

In [117]:
# Only consider movies that have garnered more than m votes
q_movies = q_movies[q_movies['vote_count'] >= m]

In [118]:
q_movies.shape

(8963, 24)

In [119]:
# Calculate C, the mean rating for all the movies in the dataset
C = df['vote_average'].mean()
C 

5.618207215133889

In [120]:
# Function to compute the IMDB weighted rating for each movie
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    return (v/ (v+m) * R) + ( m / (v+m) * C) 



In [121]:
# Computer the score using the weighted_rating function defined above
q_movies['score'] =  q_movies.apply(weighted_rating, axis=1)

In [122]:
q_movies[['title', 'release_date', 'vote_count', 'score']].sort_values('score', ascending=False).head(100)

Unnamed: 0,title,release_date,vote_count,score
10309,Dilwale Dulhania Le Jayenge,1995-10-20,661.0,8.855148
314,The Shawshank Redemption,1994-09-23,8358.0,8.482863
834,The Godfather,1972-03-14,6024.0,8.476278
40251,Your Name.,2016-08-26,1030.0,8.366584
12481,The Dark Knight,2008-07-16,12269.0,8.289115
2843,Fight Club,1999-10-15,9678.0,8.286216
292,Pulp Fiction,1994-09-10,8670.0,8.284623
522,Schindler's List,1993-11-29,4436.0,8.270109
23673,Whiplash,2014-10-10,4376.0,8.269704
5481,Spirited Away,2001-07-20,3968.0,8.266628


In [123]:
df = q_movies[['title', 'genres', 'release_date', 'runtime', 'vote_average', 'vote_count']]

In [124]:
df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0
5,Heat,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",1995-12-15,170.0,7.7,1886.0


In [125]:
# Convert release_date into pandas datetime format
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Extract year from the datetime
df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [126]:
# Helper function to convert NAT to 0 and all other years to integers
def convert_int(x): 
    try: 
        return int(x)
    except: 
        return 0 
    

In [127]:
# Apply convert_int to the year feature
df['year'] = df['year'].apply(convert_int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [128]:
# Drop the release_date column
df = df.drop('release_date', axis=1)

# Display the dataframe
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",81.0,7.7,5415.0,1995
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",101.0,6.5,92.0,1995
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",106.0,5.7,173.0,1995
5,Heat,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",170.0,7.7,1886.0,1995


In [129]:
# Print genres of the first movie
df.iloc[0]['genres']

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [130]:
# Destringify genres dictionary

# Import the literal_eval funtion from ast 
from ast import literal_eval

# Apply a stringified list and output its type
a = "[1,2,3]"
print(type(a))

# Define a stringified list and output type
b = literal_eval(a)
print(type(b))

<class 'str'>
<class 'list'>


In [131]:
# Convert all NaN stringified empty lists
df['genres'] = df['genres'].fillna('[]')

In [132]:
# Apply literal_eval to convert to the list object
df['genres'] = df['genres'].apply(literal_eval)

In [133]:
#Convert list of dictionaries to a list of strings
df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [134]:
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[Animation, Comedy, Family]",81.0,7.7,5415.0,1995
1,Jumanji,"[Adventure, Fantasy, Family]",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[Romance, Comedy]",101.0,6.5,92.0,1995
4,Father of the Bride Part II,[Comedy],106.0,5.7,173.0,1995
5,Heat,"[Action, Crime, Drama, Thriller]",170.0,7.7,1886.0,1995


In [135]:
# Create title row for each of the genres
s = df.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)

In [136]:
# Name the new feature as 'genre'
s.name = 'genre'

# Create a new dataframe gen_df which by dropping th eold 'genres' feature an adding the new 'genre'
gen_df = df.drop('genres', axis=1).join(s)

# Print the head of the new gen_df
gen_df.head()


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre
0,Toy Story,81.0,7.7,5415.0,1995,Animation
0,Toy Story,81.0,7.7,5415.0,1995,Comedy
0,Toy Story,81.0,7.7,5415.0,1995,Family
1,Jumanji,104.0,6.9,2413.0,1995,Adventure
1,Jumanji,104.0,6.9,2413.0,1995,Fantasy


In [137]:
def build_chart(gen_df, percentile=0.8): 
    #Ask for preferred genres
    print("Input preferred genre")
    genre = input()
    
    #Ask for lower limit of duration
    print("Input shortest duration")
    low_time = int(input())
    
    #Ask for upper limit of duration
    print("Input longest duration")
    high_time = int(input())
    
    #Ask for lower limit of timeline
    print("Input earliest year")
    low_year = int(input())
    
    #Ask for upper limit of timeline
    print("Input latest year")
    high_year = int(input())
    
    #Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies
    movies = gen_df.copy()
    
    #Filter based on the condition
    movies = movies[(movies['genre'] == genre) & 
                    (movies['runtime'] >= low_time) & 
                    (movies['runtime'] <= high_time) & 
                    (movies['year'] >= low_year) & 
                    (movies['year'] <= high_year)]
    
    #Compute the values of C and m for the filtered movies
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)
    
    #Only consider movies that have higher than m votes. Save this in a new dataframe q_movies
    q_movies = movies.copy().loc[movies['vote_count'] >= m]
    
    #Calculate score using the IMDB formula
    q_movies['score'] = q_movies.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) 
                                       + (m/(m+x['vote_count']) * C)
                                       ,axis=1)

    #Sort movies in descending order of their scores
    q_movies = q_movies.sort_values('score', ascending=False)
    
    return q_movies


    
    
    
    
    
    
    
    

In [138]:
build_chart(gen_df).head()

Input preferred genre
Animation
Input shortest duration
30
Input longest duration
120
Input earliest year
1993
Input latest year
2010


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre,score
359,The Lion King,89.0,8.0,5520.0,1994,Animation,7.780574
9698,Howl's Moving Castle,119.0,8.2,2049.0,2004,Animation,7.662559
13724,Up,96.0,7.8,7048.0,2009,Animation,7.645951
12704,WALL·E,98.0,7.8,6439.0,2008,Animation,7.633224
0,Toy Story,81.0,7.7,5415.0,1995,Animation,7.521005


In [140]:
# Convert the cleaned (non-exploded) dataframe df into a CSV file and save it in the data folder
# Set parameter index to False as the index of the DataFrame has no inherent meaning 

df.to_csv('data/metadata_clean.csv', index=False)