In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Folder holding the two TMDB 5000 CSV files. Set the TMDB_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original location is used, so existing behaviour is unchanged.
DATA_DIR = Path(os.environ.get(
    'TMDB_DATA_DIR',
    '/Users/merrillsequeira/Google Drive/UPGRAD_DATASETS/MOVIE_RECC',
))

# df1: credits table, df2: movies table
df1 = pd.read_csv(DATA_DIR / 'tmdb_5000_credits.csv')
df2 = pd.read_csv(DATA_DIR / 'tmdb_5000_movies.csv')

In [3]:
# Give the credits table fixed column names. 'tittle' (sic) differs from the
# movies table's own 'title' column, so both columns survive the join without
# _x/_y suffixes.
df1.columns = ['id', 'tittle', 'cast', 'crew']

# Attach the credits columns to each movie by matching on the shared 'id' key
# (inner join, which is also the pandas default).
df2 = pd.merge(df2, df1, how='inner', on='id')

In [5]:
# Preview the first merged row to check that the credits columns were attached
df2.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,tittle,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [6]:
# (rows, columns) of the merged frame
df2.shape

(4803, 23)

In [7]:
# List every column available after the merge
df2.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'tittle', 'cast', 'crew'],
      dtype='object')

#### RECOMMENDER SYSTEM 1 - a score-based system that recommends movies by their vote-weighted rating

In [9]:
# C: mean vote_average across every movie in df2
C = df2.vote_average.mean()

# m: 90th-percentile vote_count, used below as the minimum-votes threshold
m = df2.vote_count.quantile(q=0.9)


In [10]:
# Keep only the movies with at least m votes. Filtering first and copying the
# (much smaller) result avoids duplicating the whole of df2, and the explicit
# copy makes q_movies an independent frame that is safe to add columns to.
q_movies = df2.loc[df2['vote_count'] >= m].copy()
q_movies.shape


(481, 23)

In [11]:
# creating a weighted rating function  
def weighted_rating(x, m=m, C=C):
    """Return the IMDB-style weighted rating of a movie.

    Parameters
    ----------
    x : mapping-like (for example a DataFrame row)
        Must provide 'vote_count' and 'vote_average'.
    m : number
        Weight, in votes, given to the prior mean ``C``. Defaults to the
        notebook-level ``m`` as it was when this function was defined.
    C : number
        Prior mean rating. Defaults to the notebook-level ``C`` as it was
        when this function was defined.

    Returns
    -------
    ``votes/(votes+m) * rating + m/(votes+m) * C``
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m
    # Blend the movie's own rating with the prior mean, weighted by vote share
    own_part = votes / total * rating
    prior_part = m / total * C
    return own_part + prior_part

In [12]:
# Add a 'score' column holding each movie's weighted rating. weighted_rating()
# only does column arithmetic, so it is applied to the whole frame at once
# instead of row by row through DataFrame.apply(axis=1).
q_movies['score'] = weighted_rating(q_movies)

In [13]:
# Order the qualifying movies from highest to lowest weighted score
q_movies = q_movies.sort_values(by='score', ascending=False)

# Show the 10 best-scoring movies as the recommendation
print('Recomended popular movies are ...')
q_movies.loc[:, ['title', 'vote_count', 'vote_average', 'score']].head(10)

Recomended popular movies are ...


Unnamed: 0,title,vote_count,vote_average,score
1881,The Shawshank Redemption,8205,8.5,8.059258
662,Fight Club,9413,8.3,7.939256
65,The Dark Knight,12002,8.2,7.92002
3232,Pulp Fiction,8428,8.3,7.904645
96,Inception,13752,8.1,7.863239
3337,The Godfather,5893,8.4,7.851236
95,Interstellar,10867,8.1,7.809479
809,Forrest Gump,7927,8.2,7.803188
329,The Lord of the Rings: The Return of the King,8064,8.1,7.727243
1990,The Empire Strikes Back,5879,8.2,7.697884


#### RECOMMENDER SYSTEM 2 - plot based

In [14]:
# Look at the first two plot overviews, the text column vectorized below
df2.overview.head(2)

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
Name: overview, dtype: object

##### This recommender uses term frequency-inverse document frequency (TF-IDF), which creates a matrix with the movies as rows and the relevant frequent words as columns. I'll be using scikit-learn to produce the matrix.

In [15]:
# TfidfVectorizer turns each overview into a row of TF-IDF term weights
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so the vectorizer receives only text
df2['overview'] = df2['overview'].fillna('')

# Build the vectorizer, dropping common English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and produce the movies-by-terms TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(df2['overview'])

tfidf_matrix.shape

(4803, 20978)

In [17]:
# The TF-IDF result is a sparse matrix: one row per movie (4803) and one column per vocabulary word (20978)
tfidf_matrix

<4803x20978 sparse matrix of type '<class 'numpy.float64'>'
	with 125840 stored elements in Compressed Sparse Row format>

##### Now we can compute the cosine similarity between the rows of the matrix to give us the similarity score. We are going to define a function that takes a movie title as input and outputs a list of the 20 most similar movies.

In [19]:
# linear_kernel computes plain dot products between rows
from sklearn.metrics.pairwise import linear_kernel

# Similarity of every movie with every other movie. Called with one argument,
# linear_kernel compares the matrix against itself. The vectorizer above uses
# its default settings, which (per the scikit-learn docs) L2-normalise each
# row, so these dot products are cosine similarities.
cosine_sim = linear_kernel(tfidf_matrix)

In [20]:
# Reverse lookup: movie title -> row label in df2.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already distinct), so it never removed repeated titles; a repeated title then
# made indices[title] return several rows instead of one. De-duplicate on the
# index (the titles) instead, keeping the first movie listed under each title.
indices = pd.Series(df2.index, index=df2['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [22]:
# Spot-check the first few title -> row label pairs
indices.head()

title
Avatar                                      0
Pirates of the Caribbean: At World's End    1
Spectre                                     2
The Dark Knight Rises                       3
John Carter                                 4
dtype: int64

In [23]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=20):
    """Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title, looked up in the notebook-level ``indices`` Series.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to the notebook-level ``cosine_sim``.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``df2['title']``, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative')

    try:
        idx = indices[title]
    except KeyError:
        raise KeyError('Movie title not found: {!r}'.format(title)) from None

    # A title shared by several movies makes the lookup return a Series;
    # use the first movie listed under that title.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie's row position with its similarity to the query.
    # The query itself is excluded by position rather than by assuming it
    # sorts first: a movie with an empty overview scores 0 against itself, and
    # an identical overview on an earlier row ties with it at the top.
    # NOTE(review): idx is a df2 row label used as a row position; this assumes
    # df2 still has its default 0..n-1 index - confirm if df2 is re-indexed.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]

    # Most similar first; sorted() is stable, so tied movies keep row order
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

    # Row positions of the top_n closest movies
    movie_indices = [pos for pos, _ in sim_scores[:top_n]]

    return df2['title'].iloc[movie_indices]

In [24]:
 # Show the recommendations for a sample title
get_recommendations('John Carter')

1254                             Get Carter
4161            The Marine 4: Moving Target
2932                           Raising Cain
3349                              Desperado
1307                          The Hurricane
3068                            Rescue Dawn
345                             Rush Hour 2
581                 Star Trek: Insurrection
2998                                  Devil
4274       Eddie: The Sleepwalking Cannibal
46               X-Men: Days of Future Past
1735                         Ghosts of Mars
2755                          Earth to Echo
2968                   In the Land of Women
1057                           Coach Carter
85      Captain America: The Winter Soldier
322                       The Fifth Element
1300                         The Ugly Truth
2300           The Fall of the Roman Empire
2155                        The World's End
Name: title, dtype: object