In [1]:
# https://grouplens.org/datasets/movielens/latest/ 
#our data
#https://www.kaggle.com/rounakbanik/the-movies-dataset?select=movies_metadata.csv
import pandas as pd
metadata = pd.read_csv('D:/movies_metadata.csv',low_memory=False)
metadata.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


# We Will Create Simple Recommendation system/engine

In [2]:
C = metadata['vote_average'].mean()

In [6]:
C


5.618207215133889

In [7]:
m = metadata['vote_count'].quantile(0.90)
print(m)

160.0


In [8]:
# for safety of our original dataset we are copying new data set
movies_new  = metadata.copy().loc[metadata['vote_count']>=m] 
movies_new.shape

(4555, 24)

In [9]:
# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [10]:
movies_new['score'] = movies_new.apply(weighted_rating,axis = 1)

In [11]:
movies_new = movies_new.sort_values('score',ascending = False)

In [12]:
# print top 7 movies to recommend.
movies_new[['title', 'vote_count', 'vote_average', 'score']].head(7)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639


In [13]:
# we can see some are very great movies.


# Content based Recommender

In [75]:
# movies that are similar to perticular movie
# based on similarity 
# based on their siilarity score threshold.
# plot
# Description Based

In [76]:
import pandas as pd
metadata = pd.read_csv('D:/movies_metadata.csv',low_memory=False)
metadata.head(1)

# My computer is unable to process large data so just for practice i will create a very small dataset.
import random
dataForProcess = metadata.sample(frac = .10)

In [77]:
dataForProcess['overview'].head()

20656    Working at an underground city storage facilit...
37910    The silent routine of 5 Nuns living in the Wes...
34394    Swedish tv movie directed by Lars Lennart Fors...
12230    A young girl named Juno gets herself pregnant ...
38124    Don Fabijan is a young priest who comes to ser...
Name: overview, dtype: object

In [78]:
# here comes natural language processing problem
# we need to extract feature from the above text data before 
# we extract similarity 
# we need to compute word vector of each overview or documents
# word vector are vectorized representation of words in a doc
# The vecot carry semantic meaning
# example : man and king will have vector representation close to each other
# while man and women would have representation far from each other.

# will compute Term Frequency - Inverse Document Frequency (TF-IFD) vectors 
# for each document. 
# This will give you a matrix where each column represents 
# a word in the overview vocabulary and each column represents a movie



In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [80]:
#TF IDF Vectorizer object  
tfidf = TfidfVectorizer(stop_words = 'english')

In [81]:
# replace NaN with an empty string
dataForProcess['overview'] = dataForProcess['overview'].fillna('')


In [82]:
# construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(dataForProcess['overview'])

In [83]:
# Output the shape of tfidf_matrix
tfidf_matrix.shape

(4547, 22982)

In [84]:
#Array mapping from feature integer indices to feature name
tfidf.get_feature_names()[5000:5010]

['curricular',
 'curry',
 'curse',
 'cursed',
 'cursing',
 'curt',
 'curtail',
 'curtain',
 'curtis',
 'curumim']

In [85]:
# https://www.youtube.com/watch?v=xY3jrJdpuQg

In [86]:

from sklearn.metrics.pairwise import linear_kernel

In [87]:
# Compute the cosine similarity matrix
# don't run suffer system hang here....
# just to run anyhow i am giving only 0:100 cross matrix or my system hang
# original
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

#cosine_sim = linear_kernel(tfidf_matrix[0:100], tfidf_matrix[0:100])

In [88]:
cosine_sim.shape

(4547, 4547)

In [89]:
cosine_sim[1]

array([0.        , 1.        , 0.        , ..., 0.        , 0.        ,
       0.05281595])

In [100]:
# mapping of movie title and dataframe indices :
#we need identify index of a movie in matadata DataFrame.


In [101]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(dataForProcess.index, index=dataForProcess['title']).drop_duplicates()

In [106]:
indices[indices<4500][:10] 

title
Big Shots                    3961
Magnolia                     3043
What Planet Are You From?    3208
Nowhere                      1462
Jamaica Inn                  2098
Volunteers                   3266
Mr. Holland's Opus             61
Road Trip                    3495
Penitentiary                 2838
Moonlight and Valentino       179
dtype: int64

In [98]:
# Now recommendation function
# Function that recommends movies as per its title.

def get_recommendations(title, cosine_sim = cosine_sim):
    #index of movie
    idx = indices[title]
    
    #pairwise similarity with given movie with other movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    #sort movie based on similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    #score of 7 most similar movie
    sim_scores = sim_scores[1:11]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    return dataForProcess['title'].iloc[movie_indices]

In [107]:
get_recommendations('Big Shots')
#dont give random name or it will throw error as we have created 
#small dataset which do not contain many movie name
#neither give index out of range our data is 4500 around


11923                     Evening
4313                Game of Death
26839                   Juke Girl
35764           Lady of Vengeance
11386              Four Daughters
11096           Young Mr. Lincoln
25281    Bus Riley's Back In Town
13691                  Love Songs
26298               Night and Day
17671                Freakonomics
Name: title, dtype: object

# Credit, Genre and Keywords Based Recommender

In [108]:
# Based On 
# actors 
# directors 
# movie 
# plot poetry drama action etc

In [111]:
import pandas as pd
metadata = pd.read_csv('D:/movies_metadata.csv',low_memory=False)
metadata.head(1)

# remove rows with bad ID
metadata = metadata.drop([19730, 29503, 35587])