In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate


import warnings; warnings.simplefilter('ignore')

# Read & Clean + Prepare Data

In [2]:
movies = pd.read_csv('movies_metadata.csv')
print(movies.shape)
movies.head()

(45466, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
movies['genres'] = movies['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [4]:
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [5]:
links_small = pd.read_csv('links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]
links_small['tmdbId'] = links_small['tmdbId'].astype('int')

In [6]:
ratings = pd.read_csv('ratings_small.csv')

In [7]:
links_small.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [8]:
movies = movies.drop([19730, 29503, 35587])

In [9]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

In [10]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
movies['id'] = movies['id'].astype('int')

In [11]:
movies.shape

(45463, 25)

In [12]:
movies = movies.merge(credits, on='id')
movies = movies.merge(keywords, on='id')

In [13]:
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,year,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [14]:
smd = movies[movies['id'].isin(links_small['tmdbId'])]
smd.shape

(9219, 28)

In [15]:
smd = smd.merge(links_small, left_on='id', right_on='tmdbId')

In [16]:
smd

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,video,vote_average,vote_count,year,cast,crew,keywords,movieId,imdbId,tmdbId
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,False,7.7,5415.0,1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",1,114709,862
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,False,6.9,2413.0,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",2,113497,8844
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,False,6.5,92.0,1995,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",3,113228,15602
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,False,6.1,34.0,1995,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...",4,114885,31357
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,False,5.7,173.0,1995,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",5,113041,11862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9214,False,,8000000,[Drama],,159550,tt0255313,en,The Last Brickmaker in America,A man must cope with the loss of his wife and ...,...,False,7.0,1.0,2001,"[{'cast_id': 1, 'character': 'Henry Cobb', 'cr...","[{'credit_id': '544475aac3a36819fb000578', 'de...","[{'id': 6054, 'name': 'friendship'}, {'id': 20...",161944,255313,159550
9215,False,,1000000,"[Thriller, Romance]",,392572,tt5165344,hi,रुस्तम,"Rustom Pavri, an honourable officer of the Ind...",...,False,7.3,25.0,2016,"[{'cast_id': 0, 'character': 'Rustom Pavri', '...","[{'credit_id': '5951baf692514129c4016600', 'de...","[{'id': 10540, 'name': 'bollywood'}]",162542,5165344,392572
9216,False,,15050000,"[Adventure, Drama, History, Romance]",,402672,tt3859980,hi,Mohenjo Daro,"Village lad Sarman is drawn to big, bad Mohenj...",...,False,6.7,26.0,2016,"[{'cast_id': 0, 'character': 'Sarman', 'credit...","[{'credit_id': '57cd5d3592514179d50018e8', 'de...","[{'id': 10540, 'name': 'bollywood'}]",162672,3859980,402672
9217,False,,15000000,"[Action, Adventure, Drama, Horror, Science Fic...",,315011,tt4262980,ja,シン・ゴジラ,From the mind behind Evangelion comes a hit la...,...,False,6.6,152.0,2016,"[{'cast_id': 4, 'character': 'Rando Yaguchi : ...","[{'credit_id': '560892fa92514177550018b2', 'de...","[{'id': 1299, 'name': 'monster'}, {'id': 7671,...",163056,4262980,315011


In [17]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [18]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [19]:
smd['director'] = smd['crew'].apply(get_director)

In [20]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [21]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

What we plan on doing is creating a metadata dump for every movie which consists of genres, director, main actors and keywords.

We then use a Count Vectorizer to create our count matrix as we did in the Description Recommender. 

The remaining steps are similar to what we did earlier: we calculate the cosine similarities and return movies that are most similar.

In [22]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [23]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x, x])

In [24]:
keys = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
keys.name = 'keyword'

In [25]:
keys = keys.value_counts()
keys[:10]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
violence                264
love                    222
musical                 219
sex                     219
suspense                212
Name: keyword, dtype: int64

Next, we remove all keywords that appear only once and then we will have to deal with the similars one by merge them into one like 'dog' and 'dogs'

In [26]:
keys = keys[keys > 1]
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [27]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in keys:
            words.append(i)
    return words

In [28]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [29]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [30]:
smd_new = smd.merge(ratings, on = 'movieId')

In [157]:
movie = smd_new[['userId','movieId','title','soup','popularity','vote_average','vote_count','year']]

In [158]:
movie

Unnamed: 0,userId,movieId,title,soup,popularity,vote_average,vote_count,year
0,7,1,Toy Story,jealousi toy boy friendship friend rivalri boy...,21.946943,7.7,5415.0,1995
1,9,1,Toy Story,jealousi toy boy friendship friend rivalri boy...,21.946943,7.7,5415.0,1995
2,13,1,Toy Story,jealousi toy boy friendship friend rivalri boy...,21.946943,7.7,5415.0,1995
3,15,1,Toy Story,jealousi toy boy friendship friend rivalri boy...,21.946943,7.7,5415.0,1995
4,19,1,Toy Story,jealousi toy boy friendship friend rivalri boy...,21.946943,7.7,5415.0,1995
...,...,...,...,...,...,...,...,...
100117,624,161918,Sharknado 4: The 4th Awakens,sharkattack sequel farc lasvega creaturefeatur...,4.574494,4.3,88.0,2016
100118,287,161944,The Last Brickmaker in America,friendship sidneypoitier wendycrewson jayo.san...,0.038998,7.0,1.0,2001
100119,611,162542,Rustom,bollywood akshaykumar ileanad'cruz eshagupta t...,7.333139,7.3,25.0,2016
100120,611,162672,Mohenjo Daro,bollywood hrithikroshan poojahegde kabirbedi a...,1.423358,6.7,26.0,2016


# CountVectorizer Model

In [32]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [33]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [34]:
cosine_sim[0]

array([1.        , 0.02441931, 0.02738955, ..., 0.        , 0.        ,
       0.        ])

In [63]:
movie = movie.reset_index()
titles = movie['title']
indices = pd.Series(smd.index, index=smd['title'])
indices

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
                                                      ... 
The Last Brickmaker in America                        9214
Rustom                                                9215
Mohenjo Daro                                          9216
Shin Godzilla                                         9217
The Beatles: Eight Days a Week - The Touring Years    9218
Length: 9219, dtype: int64

Some test before we write functuon 

In [64]:
user_movies = movie[movie['userId'] == 5]
movie_list = user_movies['movieId'].values.tolist()
for movie_id in movie_list:
    # Call content based 
    title = movie['title'][movie['movieId'] == movie_id].values[0]

In [65]:
title

'Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan'

But we can notice that the recommender is not good yet, there are some problems with it and the most major one is that it recommend any movies that most suited the conditions without filtering, thats why some movies like "Death Become Her' and "What Lies Beneath" got recommend even with their low rating score

Thats why, we will continue to improve it by running through a filtering process of rating score

In [191]:
def top_10_recommendations(userId):
    user_movies = movie[movie['userId'] == userId]
    recommend_list = user_movies[['userId','movieId','popularity']]
    movie_list = recommend_list['movieId'].values.tolist()
    
    for movie_id in movie_list: 
        movie_title = movie['title'][movie['movieId'] == movie_id].values[0]
    idx = indices[movie_title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = movie.iloc[movie_indices][['userId','movieId', 'title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    mean_vote = vote_averages.mean()
    cut_point = vote_counts.quantile(0.60)
    
    def weighted_rating(x):
        v = x['vote_count']
        R = x['vote_average']
        return (v/(v+cut_point) * R) + (cut_point/(cut_point+v) * mean_vote)
    
    qualified = movies[(movies['vote_count'] >= cut_point) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False)
    return qualified

In [192]:
movie_ids = top_10_recommendations(70)

In [193]:
movie_ids

Unnamed: 0,userId,movieId,title,vote_count,vote_average,year,wr
8997,668,296,Pulp Fiction,8670,8,1994,7.758667
4105,475,111,Taxi Driver,2632,8,1976,7.421453
4098,447,111,Taxi Driver,2632,8,1976,7.421453
58,154,1,Toy Story,5415,7,1995,6.885868
1659,21,32,Twelve Monkeys,2470,7,1995,6.805055
1842,655,32,Twelve Monkeys,2470,7,1995,6.805055
1662,25,32,Twelve Monkeys,2470,7,1995,6.805055
1672,70,32,Twelve Monkeys,2470,7,1995,6.805055
1727,261,32,Twelve Monkeys,2470,7,1995,6.805055
495,73,6,Heat,1886,7,1995,6.773211


There we go, a better recommender using cosine similarity on crews, taglines, genres, etc,... with the improvement of vote filtering

But there are still some problems here, most of the recommendations here still base on the vote of other users, thats why  it not personalize enough and still recommend the past movies of others but we cant do much about it, although this can be fix by using hybrid method filtering but that is for other part of this project. This is the conclusion of content base filtering using cosine similarity

# Lastly, we will run it through evaluate model for checking, the validate data is the test data we split from the main one

In [194]:
def check_movieId(pred_df, val_df):
    result = pred_df['movieId'].isin(val_df[val_df['userId'] == \
                                   pred_df['userId'].iloc[0]]['movieId'])
    return result.reset_index(drop=True)

In [198]:
def evaluate(pred_df, val_df):
    """ Proportion of movies recommended that were actually watched by users

    Args:
    pred_df: dataframe, 2 columns: userId, movieId
    val_df: dataframe, 2 columns: userId, movieId

    Returns:
    
    """
    result = pred_df.groupby('userId').apply(lambda x: check_movieId(x, val_df))
    n_user = val_df.userId.nunique()
    top_k = pred_df.groupby('userId').count().iloc[0]
    top_k = int(top_k.iloc[0])

    return result.sum().sum() / (n_user*top_k)

In [199]:
val_df = pd.read_csv('test_set.csv')

In [200]:
evaluate(movie_ids, val_df)

0.0029806259314456036

as you can see, the accuracy of the model is really low, mostly because of how unpersonelize the content base model is, we only recommend the user base on their most watched genre, director, etc, ... and filter it through vote average and doesnt pay attention to other matters, this result has been foreseen and we still trying to improve the model by using other means