In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from datetime import datetime

from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate, KFold, GridSearchCV

import warnings; warnings.simplefilter('ignore')



In [3]:
# Load all the dataset csv files
creds = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')
links = pd.read_csv('data/links.csv')
links_small = pd.read_csv('data/links_small.csv')
metadata = pd.read_csv('data/movies_metadata.csv')
ratings = pd.read_csv('data/ratings.csv')
ratings_small = pd.read_csv('data/ratings_small.csv')

## Exploratory Data Analysis

In [4]:
metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
# All the feature columns from the metadata csv
print(f'\nColumns in metadata: \n{list(metadata.columns)}')

# We have 45,466 movies' metadata with 24 feature columns for each movie
print(f'\nData Shape: {metadata.shape}\n')


Columns in metadata: 
['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count']

Data Shape: (45466, 24)



In [6]:
# Non-English Films - total of 11,786 movies are non-English
dubbed_movies = metadata[metadata['original_title'] != metadata['title']][['title', 'original_title', 'original_language', 'spoken_languages', 'production_countries']]
print(f'{dubbed_movies.shape[0]} non-English movies')
dubbed_movies.head(15)

11402 non-English movies


Unnamed: 0,title,original_title,original_language,spoken_languages,production_countries
28,The City of Lost Children,La Cité des Enfants Perdus,fr,"[{'iso_639_1': 'cn', 'name': '广州话 / 廣州話'}, {'i...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso..."
29,Shanghai Triad,摇啊摇，摇到外婆桥,zh,"[{'iso_639_1': 'zh', 'name': '普通话'}]","[{'iso_3166_1': 'CN', 'name': 'China'}, {'iso_..."
32,Wings of Courage,"Guillaumet, les ailes du courage",fr,"[{'iso_639_1': 'en', 'name': 'English'}]","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso..."
57,The Postman,Il postino,it,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...","[{'iso_3166_1': 'BE', 'name': 'Belgium'}, {'is..."
58,The Confessional,Le confessionnal,fr,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]"
67,French Twist,Gazon maudit,fr,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...","[{'iso_3166_1': 'FR', 'name': 'France'}]"
72,Les Miserables,Les misérables,fr,"[{'iso_639_1': 'fr', 'name': 'Français'}]","[{'iso_3166_1': 'FR', 'name': 'France'}]"
79,The White Balloon,بادکنک سفید,fa,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]","[{'iso_3166_1': 'IR', 'name': 'Iran'}]"
81,Antonia's Line,Antonia,nl,"[{'iso_639_1': 'nl', 'name': 'Nederlands'}]","[{'iso_3166_1': 'NL', 'name': 'Netherlands'}, ..."
104,Nobody Loves Me,Keiner liebt mich,de,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]","[{'iso_3166_1': 'DE', 'name': 'Germany'}]"


In [7]:
# Check the count of adult movies - only 9 movies belong to the adult genre
metadata['adult'].value_counts()

False                                                                                                                             45454
True                                                                                                                                  9
 - Written by Ørnås                                                                                                                   1
 Rune Balot goes to a casino connected to the October corporation to try to wrap up her case once and for all.                        1
 Avalanche Sharks tells the story of a bikini contest that turns into a horrifying affair when it is hit by a shark avalanche.        1
Name: adult, dtype: int64

## Simple Recommender

Sorting popular movies for each genres based on popularity. We clean up the genres data. Also, we use the vote count and average to sort the popular movies.

#### Find Most Popular Movies for All Genres

In [8]:
# Fill empty genres rows with a []
metadata['genres'] = metadata['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [9]:
# We get the minimum votes required to make it into the 'Popularity' chart for each genre

# Only select movies that have an non-null vote cound and averages, since we will be using these to generate the 'Movie Popularity Chart' for each genre
vote_counts = metadata[metadata['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = metadata[metadata['vote_average'].notnull()]['vote_average'].astype('int')

m = vote_counts.quantile(0.96)
C = vote_averages.mean()

print('Criteria for Vote Counts and Vote Averages - ')
print(f'Vote Counts criteria: {m}')
print(f'Minimum vote average required: {C}')

Criteria for Vote Counts and Vote Averages - 
Vote Counts criteria: 576.6399999999994
Minimum vote average required: 5.244896612406511


In [10]:
# Change to datetime format for each movie
metadata['year'] = pd.to_datetime(metadata['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [11]:
# Filter out the metadata to only use movies that have at least 566 votes and have a vote average of 5.23
most_popular = metadata[(metadata['vote_count'] >= m) & (metadata['vote_count'].notnull()) & (metadata['vote_average'].notnull())]
most_popular = most_popular[['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
most_popular['vote_count'] = most_popular['vote_count'].astype('int')
most_popular['vote_average'] = most_popular['vote_average'].astype('int')

print(f'Filtered out {most_popular.shape[0]} movies for Genre Popularity Chart!')

Filtered out 1819 movies for Genre Popularity Chart!


In [12]:
# Function to calulate the weighted ratings for a given dataframe,based on vote_count and vote_average
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [13]:
# Calculate the weighted rating function for the filtered out list of movies
most_popular['weighted_rating'] = most_popular.apply(weighted_rating, axis=1)
most_popular = most_popular.sort_values('weighted_rating', ascending=False)
print(f'A total of {most_popular.shape[0]} movies are in the top chart.')
most_popular.head(15)

A total of 1819 movies are in the top chart.


Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating
15480,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.891568
12481,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.876324
22879,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.864948
2843,Fight Club,1999,9678,8,63.869599,[Drama],7.845075
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.832214
292,Pulp Fiction,1994,8670,8,140.950236,"[Thriller, Crime]",7.828186
314,The Shawshank Redemption,1994,8358,8,51.645403,"[Drama, Crime]",7.822186
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.81952
351,Forrest Gump,1994,8147,8,48.307194,"[Comedy, Drama, Romance]",7.817885
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,"[Adventure, Fantasy, Action]",7.806672


#### Find Most Popular movies for each genre

In [14]:
# Since movies have multiple genres, we drop to assign just one genre per movie
s = metadata.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = metadata.drop('genres', axis=1).join(s)

In [15]:
# Finds the most popular movies for a given genre
def genre_popular(genre, percentile=0.85):

    # Find all movies which belong to the given genre
    df = gen_md[gen_md['genre'] == genre]

    # 
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    # Filter out the movies that do not meet our filtering criteria
    most_popular = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    most_popular['vote_count'] = most_popular['vote_count'].astype('int')
    most_popular['vote_average'] = most_popular['vote_average'].astype('int')

    # Apply the weighted rating criteria for movies in the given genre
    most_popular['weighted_rating'] = most_popular.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    most_popular = most_popular.sort_values('weighted_rating', ascending=False).head(250)

    return most_popular

##### Top Mystery Movies

In [16]:
print(f'\t\t\t\tTop 15 Mystery movies\n\n')
genre_popular('Mystery').head(10)

				Top 15 Mystery movies




Unnamed: 0,title,year,vote_count,vote_average,popularity,weighted_rating
15480,Inception,2010,14075,8,29.108149,7.966674
46,Se7en,1995,5915,8,18.45743,7.922088
11354,The Prestige,2006,4510,8,16.94556,7.89877
4099,Memento,2000,4168,8,15.450789,7.890815
9430,Oldboy,2003,2000,8,10.616859,7.782445
877,Rear Window,1954,1531,8,17.911314,7.722961
896,Citizen Kane,1941,1244,8,15.811921,7.667293
876,Vertigo,1958,1162,8,18.20822,7.647028
5157,Rashomon,1950,471,8,9.887355,7.274827
3315,Double Indemnity,1944,425,8,6.49432,7.220079


##### Top Action Movies

In [17]:
print(f'\t\t\t\t\t\tTop 15 Action movies\n\n')
genre_popular('Action').head(10)

						Top 15 Action movies




Unnamed: 0,title,year,vote_count,vote_average,popularity,weighted_rating
15480,Inception,2010,14075,8,29.108149,7.955099
12481,The Dark Knight,2008,12269,8,123.167259,7.94861
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,7.929579
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,7.924031
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,7.918382
256,Star Wars,1977,6778,8,42.149697,7.908327
1154,The Empire Strikes Back,1980,5998,8,19.470959,7.896841
4135,Scarface,1983,3017,8,11.299673,7.802046
9430,Oldboy,2003,2000,8,10.616859,7.711649
1910,Seven Samurai,1954,892,8,15.01777,7.426145


## Content-Based Filtering Systems

#### Taglines and Descriptions based

In [18]:
# Use the links_small to get TMDB IDs and IDs for 9000 movies
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [19]:
# Drop the movies with IDs that have 'NaN' data
metadata = metadata.drop([19730, 29503, 35587])
metadata['id'] = metadata['id'].astype('int')

In [20]:
# Only use movies that have IDs stores in the `links_small` csv file
smd = metadata[metadata['id'].isin(links_small)]
print(f'We have a total of {smd.shape[0]} movies for content-based filtering using Movie Descriptions.')
smd.shape

We have a total of 9099 movies for content-based filtering using Movie Descriptions.


(9099, 25)

In [21]:
# We use the tagline and description to use as content
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [22]:
# We use TF-IDF to generate embeddings for the movie descriptions
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')

# Generate embeddings for each movie's description
tfidf = tf.fit_transform(smd['description'])
print(f'Generated {tfidf.shape[1]} dimensional embeddings')
tfidf.shape

Generated 268124 dimensional embeddings


(9099, 268124)

In [23]:
# Generate a pairwise similarity of all movies with each other
pairwise_mat = linear_kernel(tfidf, tfidf)
print(f'Generated a similarity matrix of size {pairwise_mat.shape}\n')
pairwise_mat[0]

Generated a similarity matrix of size (9099, 9099)



array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [24]:
# Create a mapping of the movie title and it's corresponding IDs
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [25]:
# Function to find similar movies to a given title
def get_recommendations(title):
    # Find the index of the given movie
    idx = indices[title]

    # Get similarity scores of the given movie 
    sim_scores = list(enumerate(pairwise_mat[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return pd.DataFrame(titles.iloc[movie_indices])

##### Results

In [26]:
print(f'\nTop 10 similar movies to The Dark Knight')
get_recommendations('The Dark Knight')


Top 10 similar movies to The Dark Knight


Unnamed: 0,title
7931,The Dark Knight Rises
132,Batman Forever
1113,Batman Returns
8227,"Batman: The Dark Knight Returns, Part 2"
7565,Batman: Under the Red Hood
524,Batman
7901,Batman: Year One
2579,Batman: Mask of the Phantasm
2696,JFK


In [27]:
print(f'\nTop 10 similar movies to The Conjuring')
get_recommendations('The Conjuring')


Top 10 similar movies to The Conjuring


Unnamed: 0,title
9071,The Conjuring 2
5578,The Boston Strangler
6110,The Amityville Horror
3780,Things Behind the Sun
6556,Shooting Dogs
5781,The Turning Point
7040,Magicians
7786,Midnight in Paris
6092,Electra Glide in Blue


#### Metadata based

In [28]:
# Use movies from keywords, credits and metadata csv files
keywords['id'] = keywords['id'].astype('int')
creds['id'] = creds['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

print(f'Using {metadata.shape[0]} movies with metadata, credits and keywords')

Using 45463 movies with metadata, credits and keywords


In [29]:
# We merge the credits and keywords with the metadata dataframe
metadata = metadata.merge(creds, on='id')
metadata = metadata.merge(keywords, on='id')

In [30]:
# We again filter out that are a part of the links_small csv file
smd = metadata[metadata['id'].isin(links_small)]
print(f'We have a total of {smd.shape[0]} movies for content-based filtering using metadata, keywords and credits.')
smd.shape

We have a total of 9219 movies for content-based filtering using metadata, keywords and credits.


(9219, 28)

In [31]:
# Use also use cast and crew info 
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [32]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

# Since movies made by the same director are more likely to be liked by the user, we also leverage that information
smd['director'] = smd['crew'].apply(get_director)

In [33]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [34]:
# Pick the director name and top 3 characters from the cast
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x, x])

s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [35]:
# Perform stemming on the metadata
s = s[s > 1]
stemmer = SnowballStemmer('english')

In [36]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [37]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [38]:
# Combine all the keywords, cast, directors and genres into a single column to be used as context
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))
smd['soup'][:11]

0     jealousi toy boy friendship friend rivalri boy...
1     boardgam disappear basedonchildren'sbook newho...
2     fish bestfriend duringcreditssting waltermatth...
3     basedonnovel interracialrelationship singlemot...
4     babi midlifecrisi confid age daughter motherda...
5     robberi detect bank obsess chase shoot thief h...
6     pari brotherbrotherrelationship chauffeur long...
7     jonathantaylorthomas bradrenfro rachaelleighco...
8     terrorist hostag explos vicepresid jean-claude...
9     cuba falselyaccus secretident computervirus se...
10    whitehous usapresid newlov widow michaeldougla...
Name: soup, dtype: object

In [39]:
# We now create the count matrix that gives the frequency of token in the metadta for each movie
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])
count_matrix.shape

(9219, 107377)

In [40]:
# We again calculate the pairwise matrix, this time for the count matrix
pairwise = cosine_similarity(count_matrix, count_matrix)
pairwise.shape

(9219, 9219)

In [41]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [42]:
# Function to get recos using the pairwise matrix, constructed using count vectorizer. We perform a similar 
# process here as the previous content-based filtering method, but using the pairwise matrix generated 
# using the Count Vectorizer

def improved_recommendations(title):


    idx = indices[title]
    sim_scores = list(enumerate(pairwise[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:30]
    movie_indices = [i[0] for i in sim_scores]

    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.65)

    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['weighted_rating'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('weighted_rating', ascending=False).head(10)
    return qualified

##### Results

In [43]:
print(f'\t\t\tImproved Recommendations for The Dark Knight\n')
improved_recommendations('The Dark Knight')

			Improved Recommendations for The Dark Knight



Unnamed: 0,title,vote_count,vote_average,year,weighted_rating
7648,Inception,14075,8,2010,7.891568
8613,Interstellar,11187,8,2014,7.864948
6623,The Prestige,4510,8,2006,7.687671
3381,Memento,4168,8,2000,7.665158
8031,The Dark Knight Rises,9263,7,2012,6.897144
6218,Batman Begins,7511,7,2005,6.874863
1134,Batman Returns,1706,6,1992,5.809246
132,Batman Forever,1529,5,1995,5.067066
9024,Batman v Superman: Dawn of Justice,7189,5,2016,5.018185
1260,Batman & Robin,1447,4,1997,4.354736


In [44]:
print(f'\t\t\tImproved Recommendations for Ice Age\n')
improved_recommendations('Ice Age')

			Improved Recommendations for Ice Age



Unnamed: 0,title,vote_count,vote_average,year,weighted_rating
7307,Up,7048,7,2009,6.867264
3233,The Emperor's New Groove,1544,7,2000,6.522756
543,The Aristocats,1287,7,1970,6.456943
7454,Fantastic Mr. Fox,1206,7,2009,6.432267
6441,Ice Age: The Meltdown,3034,6,2006,5.879406
8167,Ice Age: Continental Drift,2731,6,2012,5.868358
8371,The Croods,2447,6,2013,5.855994
7334,Ice Age: Dawn of the Dinosaurs,2330,6,2009,5.850197
1662,One Hundred and One Dalmatians,1643,6,1961,5.803832
6130,Robots,1383,6,2005,5.777805


## Collaborative Filtering

In [45]:
# Extract information about users and the ratings they have provided
reader = Reader()
data= Dataset.load_from_df(ratings_small[['userId', 'movieId', 'rating']], reader)
ratings_small.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [46]:
def get_title(id):
  return metadata.loc[metadata['id'] == id, 'original_title'].iloc[0]

#### Cross-Validation

In [47]:
# Cross-validate the data
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True);

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8987  0.8921  0.8906  0.9022  0.9002  0.8968  0.0046  
MAE (testset)     0.6937  0.6847  0.6874  0.6956  0.6920  0.6907  0.0041  
Fit time          5.13    5.09    5.10    5.07    5.07    5.09    0.02    
Test time         0.31    0.16    0.15    0.14    0.21    0.19    0.06    


#### Hyperparameter Tuning for the SVD model

In [48]:
param_grid = {
    "n_epochs": [10, 20],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.02]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], refit=True, cv=5)

gs.fit(data)

training_parameters = gs.best_params["rmse"]

print("BEST RMSE: \t", gs.best_score["rmse"])
print("BEST MAE: \t", gs.best_score["mae"])
print("BEST params: \t", gs.best_params["rmse"])

BEST RMSE: 	 0.8962594015739148
BEST MAE: 	 0.6902581317829999
BEST params: 	 {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}


#### Train Best model

In [49]:
trainset = data.build_full_trainset()

best_model = SVD(n_epochs=20, lr_all=0.005, reg_all=0.02)

tick = datetime.now()

best_model.fit(trainset)

tock = datetime.now()

print (f'Training done in {(tock-tick).seconds} seconds')

Training done in 5 seconds


In [50]:
# Find all the movies rated by User 1
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
5,1,1968,4.0,1425942148
6,1,2762,4.5,1425941300
7,1,2918,5.0,1425941593
8,1,2959,4.0,1425941601
9,1,4226,4.0,1425942228


In [51]:
print(f"Predicting User 1's rating for:\n")
print(f"{get_title(310)} = {best_model.predict(1, 310).est:.3f}")
print(f"{get_title(41609)} = {best_model.predict(1, 41609).est:.3f}")
print(f"{get_title(687)} = {best_model.predict(1, 687).est:.3f}")
print(f"{get_title(671)} = {best_model.predict(1, 671).est:.3f}")
print(f"{get_title(41248)} = {best_model.predict(1, 41248).est:.3f}")
print(f"{get_title(207932)} = {best_model.predict(1, 207932).est:.3f}")
print(f"{get_title(285783)} = {best_model.predict(1, 285783).est:.3f}")
print(f"{get_title(14652)} = {best_model.predict(1, 14652).est:.3f}")
print(f"{get_title(259693)} = {best_model.predict(1, 259693).est:.3f}")

Predicting User 1's rating for:

Bruce Almighty = 2.683
The Three Musketeers = 2.683
Dead Man Walking = 2.552
Harry Potter and the Philosopher's Stone = 3.237
La Fille du RER = 2.683
Inferno = 2.683
The Walk = 2.683
Bon voyage = 2.683
The Conjuring 2 = 2.683


## Hybrid Recommender

In [52]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [53]:
id_map = pd.read_csv('/content/drive/My Drive/EE 541/data/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

In [54]:
indices_map = id_map.set_index('id')

In [55]:
def hybrid(userId, title):

    # Indices 
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(pairwise[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

##### Results for Hybrid

In [56]:
print('\t\t\tHybrid Recommendations for The Dark Knight\n')
hybrid(1, 'The Dark Knight')

			Hybrid Recommendations for The Dark Knight



Unnamed: 0,title,vote_count,vote_average,year,id,est
3381,Memento,4168.0,8.1,2000,77,3.352599
6623,The Prestige,4510.0,8.0,2006,1124,3.349558
7648,Inception,14075.0,8.1,2010,27205,3.143566
8613,Interstellar,11187.0,8.1,2014,157336,3.009795
4145,Insomnia,1181.0,6.8,2002,320,2.970899
2448,Nighthawks,87.0,6.4,1981,21610,2.906174
5943,Thursday,84.0,7.0,1998,9812,2.905126
6218,Batman Begins,7511.0,7.5,2005,272,2.848029
8031,The Dark Knight Rises,9263.0,7.6,2012,49026,2.813446
7561,Harry Brown,351.0,6.7,2009,25941,2.804722


In [57]:
print('\t\t\tHybrid Recommendations for Avatar\n')
hybrid(1, 'Avatar')

			Hybrid Recommendations for Avatar



Unnamed: 0,title,vote_count,vote_average,year,id,est
1011,The Terminator,4208.0,7.4,1984,218,3.07494
2014,Fantastic Planet,140.0,7.6,1973,16306,3.057868
522,Terminator 2: Judgment Day,4274.0,7.7,1991,280,2.989671
974,Aliens,3282.0,7.7,1986,679,2.936557
8401,Star Trek Into Darkness,4479.0,7.4,2013,54138,2.869592
8658,X-Men: Days of Future Past,6155.0,7.5,2014,127585,2.854565
1621,Darby O'Gill and the Little People,35.0,6.7,1959,18887,2.729295
831,Escape to Witch Mountain,60.0,6.5,1975,14821,2.689719
7088,Star Wars: The Clone Wars,434.0,5.8,2008,12180,2.655283
4017,Hawk the Slayer,13.0,4.5,1980,25628,2.620397
