In [28]:
import pandas as pd
import numpy as np
import ast
import warnings
import pickle

warnings.filterwarnings('ignore')
# pd.set_option('display.max_colwidth', None)

In [29]:
#Imports
links_small = pd.read_csv('dataset/archive/links_small.csv')
all_movies = pd.read_csv('dataset/archive/movies_metadata.csv')
keywords = pd.read_csv('dataset/archive/keywords.csv')
credits = pd.read_csv('dataset/archive/credits.csv')

In [30]:
#Get the ID mappings
tmdbId = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
movieId = links_small[links_small['tmdbId'].notnull()]['movieId'].astype('int')

In [31]:
all_movies['id'] = pd.to_numeric(all_movies['id'], errors='coerce')

In [32]:
all_movies.shape

(45466, 24)

In [33]:
all_movies.dropna(subset='id', inplace=True)

In [34]:
all_movies.duplicated().sum()

13

In [35]:
all_movies.drop_duplicates(inplace=True)

In [36]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
all_movies['id'] = all_movies['id'].astype('int')

In [37]:
all_movies = all_movies.merge(credits, on='id')
all_movies = all_movies.merge(keywords, on='id')

In [38]:
smd = all_movies[all_movies['id'].isin(tmdbId)]
smd.shape

(9191, 27)

In [39]:
smd.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords'],
      dtype='object')

In [40]:
smd = smd[['id', 'title', 'keywords', 'genres', 'overview', 'cast', 'crew', 'poster_path']]

In [41]:
#To Get the Genres and the keywords in List from Json Object
def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L

In [42]:
smd['genres'] = smd['genres'].apply(convert)
smd['keywords'] = smd['keywords'].apply(convert)

In [43]:
#To get the top 3 Cast Members
def getCast(text):
    L = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 3:
            L.append(i['name'])
        counter+=1
    return L 

In [44]:
smd['cast'] = smd['cast'].apply(getCast)

In [45]:
#To get the Director's name
def getDirector(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L

In [46]:
smd['director'] = smd['crew'].apply(getDirector)

In [47]:
smd.drop(['crew'], axis=1, inplace=True)

In [48]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
smd['genres'] = smd['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
smd['cast'] = smd['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
smd['director'] = smd['director'].apply(lambda x: [i.replace(" ", "") for i in x])

## Popularity Based Recommender

This Recommender offers generalized recommnendations to every user based on movie popularity and (sometimes) genre. The basic idea behind this recommender is that movies that are more popular and more critically acclaimed will have a higher probability of being liked by the average audience. This model does not give personalized recommendations based on the user.

The implementation of this model is extremely trivial. All we have to do is sort our movies based on ratings and popularity and display the top movies of our list. As an added step, we can pass in a genre argument to get the top movies of a particular genre.

I use the TMDB Ratings to come up with our **Top Movies Chart.** I will use IMDB's *weighted rating* formula to construct my chart. Mathematically, it is represented as follows:

Weighted Rating (WR) = $(\frac{v}{v + m} . R) + (\frac{m}{v + m} . C)$

where,
* *v* is the number of votes for the movie
* *m* is the minimum votes required to be listed in the chart
* *R* is the average rating of the movie
* *C* is the mean vote across the whole report

The next step is to determine an appropriate value for *m*, the minimum votes required to be listed in the chart. We will use **95th percentile** as our cutoff. In other words, for a movie to feature in the charts, it must have more votes than at least 95% of the movies in the list.

I will build our overall Top 250 Chart and will define a function to build charts for a particular genre. Let's begin!

In [60]:
mean_vote = all_movies[all_movies.vote_average.notnull()]['vote_average'].mean()

minimum_votes = all_movies['vote_count'].quantile(0.95)

In [61]:
all_movies['year'] = pd.to_datetime(all_movies['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [62]:
qualified = all_movies[(all_movies['vote_count'] >= minimum_votes) & (all_movies['vote_count'].notnull()) & (all_movies['vote_average'].notnull())][['id','title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified['id'] = qualified['id'].astype('int')
qualified.shape

(2331, 7)

In [69]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+minimum_votes) * R) + (minimum_votes/(minimum_votes+v) * mean_vote)

qualified['wr'] = qualified.apply(weighted_rating, axis=1)

qualified = qualified.sort_values('wr', ascending=False).head(250)

In [70]:
top_movies=qualified.head(200)

In [72]:
top_movies.head()

Unnamed: 0,id,title,year,vote_count,vote_average,popularity,genres,wr
15555,27205,Inception,2010,14075,8,29.108149,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",7.929831
12529,155,The Dark Knight,2008,12269,8,123.167259,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",7.919849
22976,157336,Interstellar,2014,11187,8,32.213481,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",7.912381
2854,550,Fight Club,1999,9678,8,63.869599,"[{'id': 18, 'name': 'Drama'}]",7.899295
4880,120,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",7.890801


In [93]:
#Saving the Top movies data
pickle.dump(top_movies,open('saved_models/top_movies.pkl','wb'))

## Content Based Recommender

The recommender we built in the previous section suffers some severe limitations. For one, it gives the same recommendation to everyone, regardless of the user's personal taste. If a person who loves romantic movies (and hates action) were to look at our Top 15 Chart, s/he wouldn't probably like most of the movies. If s/he were to go one step further and look at our charts by genre, s/he wouldn't still be getting the best recommendations.

For instance, consider a person who loves *Dilwale Dulhania Le Jayenge*, *My Name is Khan* and *Kabhi Khushi Kabhi Gham*. One inference we can obtain is that the person loves the actor Shahrukh Khan and the director Karan Johar. Even if s/he were to access the romance chart, s/he wouldn't find these as the top recommendations.

To personalise our recommendations more, I am going to build an engine that computes similarity between movies based on certain metrics and suggests movies that are most similar to a particular movie that a user liked. Since we will be using movie metadata (or content) to build this engine, this also known as **Content Based Filtering.**

I will build two Content Based Recommenders based on:
* Movie Overviews and Taglines
* Movie Cast, Crew, Keywords and Genre

Also, as mentioned in the introduction, I will be using a subset of all the movies available to us due to limiting computing power available to me. 

In [74]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [75]:
stop_words = set(stopwords.words('english'))
stop_words.update(["''", "``", "\'s", ','])

In [76]:
#Tokenization and removing stopwords
smd['overview'] = smd['overview'].apply(lambda tokens: [w for w in word_tokenize(str(tokens).lower()) if w not in stop_words])

In [77]:
smd['tags'] = smd['overview'] + smd['genres'] + smd['keywords'] + smd['cast'] + smd['director']
smd = smd.drop(columns=['overview','genres','keywords','cast','director'])

In [78]:
smd['tags'] = smd['tags'].apply(lambda x: ' '.join(x))

In [79]:
smd.reset_index(inplace=True)

In [80]:
smd.head(10)

Unnamed: 0,index,id,title,poster_path,tags
0,0,862,Toy Story,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,led woody andy toys live happily room andy bir...
1,1,8844,Jumanji,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,siblings judy peter discover enchanted board g...
2,2,15602,Grumpier Old Men,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,family wedding reignites ancient feud next-doo...
3,3,31357,Waiting to Exhale,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,cheated mistreated stepped women holding breat...
4,4,11862,Father of the Bride Part II,/e64sOI48hQXyru7naBFyssKFxVd.jpg,george banks recovered daughter wedding receiv...
5,5,949,Heat,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,obsessive master thief neil mccauley leads top...
6,6,11860,Sabrina,/jQh15y5YB7bWz1NtffNZmRw0s9D.jpg,ugly duckling undergone remarkable change stil...
7,7,45325,Tom and Huck,/sGO5Qa55p7wTu7FJcX4H4xIVKvS.jpg,mischievous young boy tom sawyer witnesses mur...
8,8,9091,Sudden Death,/eoWvKD60lT95Ss1MYNgVExpo5iU.jpg,international action superstar jean claude van...
9,9,710,GoldenEye,/5c0ovjT41KnYIHYuF4AWsTe3sKh.jpg,james bond must unmask mysterious head janus s...


In [81]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

cv = CountVectorizer(max_features=3000)

vector = cv.fit_transform(smd['tags']).toarray()

similarity = cosine_similarity(vector)

In [82]:
def recommend(movie):
    index = smd[smd['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(smd.iloc[i[0]].title)

In [83]:
recommend('Toy Story')

Toy Story 2
Toy Story 3
The 40 Year Old Virgin
Creature Comforts
Over the Hedge


In [94]:
#Saving the model for later use
pickle.dump(smd,open('saved_models/movie_info.pkl','wb'))
pickle.dump(similarity,open('saved_models/similarity_matrix.pkl','wb'))

## Collaborative Filtering

Our content based engine suffers from some severe limitations. It is only capable of suggesting movies which are *close* to a certain movie. That is, it is not capable of capturing tastes and providing recommendations across genres.

Also, the engine that we built is not really personal in that it doesn't capture the personal tastes and biases of a user. Anyone querying our engine for recommendations based on a movie will receive the same recommendations for that movie, regardless of who s/he is.

Therefore, in this section, we will use a technique called **Collaborative Filtering** to make recommendations to Movie Watchers. Collaborative Filtering is based on the idea that users similar to a me can be used to predict how much I will like a particular product or service those users have used/experienced but I have not.

I will not be implementing Collaborative Filtering from scratch. Instead, I will use the **Surprise** library that used extremely powerful algorithms like **Singular Value Decomposition (SVD)** to minimise RMSE (Root Mean Square Error) and give great recommendations.

In [85]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

In [87]:
# pickle.dump(ratings,open('ratings_info.pkl','wb'))

In [88]:
reader = Reader()
ratings = pd.read_csv('dataset/archive/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [89]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8933  0.9028  0.8885  0.8950  0.9002  0.8960  0.0051  
MAE (testset)     0.6889  0.6950  0.6847  0.6890  0.6935  0.6902  0.0037  
Fit time          0.64    0.73    0.65    0.75    0.73    0.70    0.05    
Test time         0.06    0.06    0.17    0.06    0.07    0.08    0.04    


{'test_rmse': array([0.89333713, 0.90279359, 0.88845585, 0.89502233, 0.90023404]),
 'test_mae': array([0.68887353, 0.69497974, 0.68465166, 0.68898151, 0.69346978]),
 'fit_time': (0.6442360877990723,
  0.7333040237426758,
  0.6465420722961426,
  0.7529840469360352,
  0.7263176441192627),
 'test_time': (0.05719494819641113,
  0.057012081146240234,
  0.17303848266601562,
  0.061469316482543945,
  0.07029104232788086)}

In [90]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x70cf794031d0>

In [95]:
pickle.dump(svd,open('saved_models/svd_model.pkl','wb'))