Movie Recommendation System: Collaborative Filtering

In [1]:
from ast import literal_eval
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD, evaluate
import warnings; warnings.simplefilter('ignore')

In [2]:
# Load the movie metadata table and preview its first rows.
md = pd.read_csv(filepath_or_buffer='../input/movies_metadata.csv')
md.head(n=5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
def _genre_names(raw):
    """Return the list of genre names held in one `genres` cell.

    `raw` may be the CSV text (a Python-literal list of dicts carrying a
    'name' key), an already-parsed list, or a missing value. Anything that
    is not a list after parsing yields []. Items that are not dicts are kept
    unchanged, so re-running this cell on an already-converted column is a
    no-op instead of an error.
    """
    if isinstance(raw, str):
        raw = literal_eval(raw)
    if not isinstance(raw, list):
        return []
    return [g['name'] if isinstance(g, dict) else g for g in raw]
# Replace the stringified genre records with plain lists of genre names.
md['genres'] = md['genres'].apply(_genre_names)

In [5]:
# Drop missing values, then cast to whole numbers.
# Note: astype('int') truncates, so vote_average loses its decimal part here.
vote_counts = md['vote_count'].dropna().astype('int')
vote_averages = md['vote_average'].dropna().astype('int')
# C in the weighted-rating formula: the mean of the (truncated) vote averages.
vote_mean = vote_averages.mean()
vote_mean

5.244896612406511

In [6]:
# m in the weighted-rating formula: the 95th-percentile vote count.
vote_95 = vote_counts.quantile(q=0.95)
vote_95

434.0

In [7]:
# Parse release dates (unparseable values become NaT) and keep the year as a string.
# The previous guard `x != np.nan` was always True because NaN never compares equal
# to anything, so missing dates were stored as the string 'NaT'. pd.notnull()
# detects NaT correctly and leaves those rows as NaN.
release_dates = pd.to_datetime(md['release_date'], errors='coerce')
md['year'] = release_dates.apply(lambda d: str(d.year) if pd.notnull(d) else np.nan)

In [10]:
# Qualified movies: vote_count at or above vote_95, with both vote fields present.
# The vote columns are then cast to whole numbers (astype('int') truncates).
qualified = md.loc[
    (md['vote_count'] >= vote_95) & md['vote_count'].notnull() & md['vote_average'].notnull(),
    ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']
].astype({'vote_count': 'int', 'vote_average': 'int'})
qualified.head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres
0,Toy Story,1995,5415,7,21.9469,"[Animation, Comedy, Family]"
1,Jumanji,1995,2413,6,17.0155,"[Adventure, Fantasy, Family]"
5,Heat,1995,1886,7,17.9249,"[Action, Crime, Drama, Thriller]"
9,GoldenEye,1995,1194,6,14.686,"[Adventure, Action, Thriller]"
15,Casino,1995,1343,7,10.1374,"[Drama, Crime]"


In [11]:
# calculate weighted rating
def weighted_rating(x, m=None, C=None):
    """Return the weighted rating of one movie.

    Computes v/(v+m) * R + m/(m+v) * C, where v is x['vote_count'] and
    R is x['vote_average'].

    Parameters
    ----------
    x : mapping or row
        Must provide 'vote_count' and 'vote_average'.
    m : number, optional
        Vote-count threshold. Defaults to the notebook-level `vote_95`.
    C : number, optional
        Prior mean rating. Defaults to the notebook-level `vote_mean`.

    Returns
    -------
    The weighted rating as a number.
    """
    # Fall back to the notebook globals so existing one-argument calls
    # (e.g. DataFrame.apply(weighted_rating, axis=1)) behave as before.
    if m is None:
        m = vote_95
    if C is None:
        C = vote_mean
    v_c = x['vote_count']
    v_r = x['vote_average']
    return (v_c / (v_c + m) * v_r) + (m / (m + v_c) * C)

In [12]:
# Score every qualified movie, one row at a time.
qualified['wr'] = qualified.apply(weighted_rating, axis='columns')

In [14]:
# Keep the 250 movies with the highest weighted rating, best first.
qualified = qualified.sort_values(by='wr', ascending=False).iloc[:250]

In [15]:
# Top 10 movies by weighted rating.
# Observations: the top 3 movies are Nolan's movies;
# adventure and science fiction are the most popular genres.
qualified.iloc[:10]

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.95,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.6454,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.3072,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,"[Adventure, Fantasy, Action]",7.851924


In [16]:
# Build one row per (movie, genre) pair so movies can be filtered by genre.
# Each genres list is spread across columns, stacked into a single long Series
# indexed by the movie's row label, then joined back onto the metadata.
genres_wide = md.apply(lambda row: pd.Series(row['genres']), axis=1)
s = genres_wide.stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop(labels='genres', axis=1).join(s)

In [17]:
# new percentile of 85%
def build_chart(genre, percentile=0.85, top_n=10):
    """Rank the movies of one genre by weighted rating.

    Reads the notebook-level `gen_md` frame (one row per movie/genre pair).

    Parameters
    ----------
    genre : str
        Value matched against gen_md['genre'].
    percentile : float, default 0.85
        Quantile of the genre's vote counts used as the qualification
        threshold (m in the weighted-rating formula).
    top_n : int, default 10
        Number of rows to return.

    Returns
    -------
    DataFrame with title, year, vote_count, vote_average, popularity and
    'wr', sorted by 'wr' descending. Empty (but with the same columns) when
    no movie matches the genre.
    """
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df['vote_count'].dropna().astype('int')
    # astype('int') truncates the averages; kept to match the global chart.
    vote_averages = df['vote_average'].dropna().astype('int')
    v_mean = vote_averages.mean()
    v_quant = vote_counts.quantile(percentile)
    # NaN >= v_quant is False, so rows lacking a vote_count are already
    # excluded by the comparison; only vote_average needs an explicit check.
    keep = (df['vote_count'] >= v_quant) & df['vote_average'].notnull()
    # .copy() so the column assignments below never write into a view of gen_md.
    qualified = df.loc[keep, ['title', 'year', 'vote_count', 'vote_average', 'popularity']].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    # Column arithmetic instead of a row-wise apply: same formula, and it
    # yields an empty 'wr' column (rather than failing) when nothing qualifies.
    votes = qualified['vote_count']
    qualified['wr'] = (votes / (votes + v_quant) * qualified['vote_average']) + (v_quant / (v_quant + votes) * v_mean)
    return qualified.sort_values('wr', ascending=False).head(top_n)

In [18]:
# Top 10 romance movies
build_chart(genre='Romance').head(n=10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.565285
351,Forrest Gump,1994,8147,8,48.3072,7.971357
876,Vertigo,1958,1162,8,18.2082,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.8451,7.745154
1132,Cinema Paradiso,1988,834,8,14.177,7.744878
19901,Paperman,2012,734,8,7.19863,7.713951
37863,Sing Street,2016,669,8,10.672862,7.689483
882,The Apartment,1960,498,8,11.9943,7.599317
38718,The Handmaiden,2016,453,8,16.727405,7.566166


In [19]:
# Collaborative Filtering
# Reader is created with no arguments, so the library's default settings apply.
# NOTE(review): confirm the default rating scale matches the range of ratings['rating'].
reader = Reader()

In [20]:
# Load the user/movie ratings table and preview its first rows.
ratings = pd.read_csv(filepath_or_buffer='../input/ratings_small.csv')
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [21]:
# Hand Surprise only the user id, item id and rating columns, in that order.
user_item_ratings = ratings[['userId', 'movieId', 'rating']]
data = Dataset.load_from_df(user_item_ratings, reader)
# Partition the dataset into 5 folds for the evaluation below.
data.split(n_folds=5)

In [22]:
# Evaluate an SVD model with default settings, reporting RMSE and MAE per fold.
# Author's reading of the result: small RMSE, good to use.
svd = SVD()
evaluate(algo=svd, data=data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.9013
MAE:  0.6940
------------
Fold 2
RMSE: 0.9010
MAE:  0.6909
------------
Fold 3
RMSE: 0.8948
MAE:  0.6918
------------
Fold 4
RMSE: 0.8956
MAE:  0.6880
------------
Fold 5
RMSE: 0.8920
MAE:  0.6879
------------
------------
Mean RMSE: 0.8969
Mean MAE : 0.6905
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.69399586120736068,
                             0.69089174901306627,
                             0.69175554282933915,
                             0.6880047145913073,
                             0.68787473009998401],
                            'rmse': [0.90132681373440604,
                             0.90095789758267808,
                             0.89478058361166779,
                             0.89563762461501817,
                             0.89198962608545518]})

In [24]:
# Build a training set via build_full_trainset() and fit the SVD model on it.
# NOTE(review): presumably this uses every rating with no held-out fold; confirm.
# NOTE(review): train() is the legacy Surprise entry point; later releases use
# fit() instead. Confirm against the installed version.
train=data.build_full_trainset()
svd.train(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f039f59fa90>

In [25]:
# Show the ratings given by user 1.
ratings.loc[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [26]:
# Estimate user 1's rating for movie 302; r_ui=3 is passed as the reference rating.
svd.predict(uid=1, iid=302, r_ui=3)

Prediction(uid=1, iid=302, r_ui=3, est=2.7421172046299596, details={'was_impossible': False})