<a href="https://colab.research.google.com/github/codekennML/Recommendation-Engine---Movies/blob/main/Content_Based_Movie_Recommender_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Steps in Building Model 

-  Data Sourcing 
- Data Preprocessing 
- Model Building 


Data Sourcing 

- The data for the project is the TMDB 5000 Movie Dataset, available at 
https://www.kaggle.com/tmdb/tmdb-movie-metadata?select=tmdb_5000_movies.csv

Data Preprocessing 

In [None]:
# Load in required libraries & dependencies 

import numpy as np   # Array manipulation 
import pandas as pd  # DataFrame manipulation 


In [None]:
# Read the two TMDB CSV files (movie metadata and credits) into DataFrames.
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/tmdb_5000_movies.csv', '/content/tmdb_5000_credits.csv')
)


In [None]:
# Lets examine the content of the dataset
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [None]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [None]:
# Attach the credits (cast / crew) to the movie metadata.
# The join key is the TMDB id rather than the title: titles are not guaranteed
# to be unique, so a title join can pair different films that share a name and
# duplicate rows. `title` is dropped from `credits` first so the merged frame
# keeps a single, unsuffixed `title` column alongside `movie_id`.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
    how='right',
)

In [None]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [None]:
# Decide which columns feed the recommender and ignore the rest.
#
# Budget   - not necessary
# Genres   - very important
# Homepage - unnecessary
#
# In summary, the columns we need are listed below.
# `.copy()` makes `movies` an independent DataFrame, so the later in-place and
# column assignments on it do not raise SettingWithCopyWarning.
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [None]:
#Lets perform some exploratory checks on our dataset

movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   title     4809 non-null   object
 2   overview  4806 non-null   object
 3   genres    4809 non-null   object
 4   keywords  4809 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.6+ KB


In [None]:
# Missing values can be inspected with: movies.isnull().sum()
#
# Only the overview column has missing values (three rows), few enough to drop.
# Reassigning instead of using `inplace=True` sidesteps SettingWithCopyWarning,
# and resetting the index keeps row labels equal to row positions, which the
# position-indexed similarity matrix built later relies on.
movies = movies.dropna().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [None]:
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [None]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [None]:
#Lets handle the json type columns 
#We have to create a helper function to extract the contents 
movies['genres'].head(1).values

array(['[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'],
      dtype=object)

In [None]:
import ast 
def extractor(genres):
  """Return the 'name' value of every entry in a stringified list of dicts."""
  return [item['name'] for item in ast.literal_eval(genres)]



In [None]:
# Parse the JSON-like genres and keywords columns into lists of names.
for json_col in ('genres', 'keywords'):
  movies[json_col] = movies[json_col].apply(extractor)


In [None]:

def extractcast(cast):
   """Return the names of the first three entries of a stringified cast list."""
   # zip against range(3) stops after three entries without touching a fourth.
   return [member['name'] for _, member in zip(range(3), ast.literal_eval(cast))]


In [None]:
# Replace each raw cast string with the list of its leading cast names.
movies['cast'] = movies['cast'].map(extractcast)

In [None]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [None]:
#Lets create a helper function to extract the director name from the crew columns 

def extractdirector(crew):
  """Return a one-element list holding the first 'Director' name in a stringified crew list, or [] if there is none."""
  for member in ast.literal_eval(crew):
    if member['job'] == 'Director':
      # Only the first director is kept; later entries are not inspected.
      return [member['name']]
  return []


In [None]:
# Replace each raw crew string with a list holding the director's name (if any).
movies['crew'] = movies['crew'].map(extractdirector)

In [None]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [None]:
# Tokenise the overview text on whitespace so it becomes a list like the other columns.
movies['overview'] = movies['overview'].str.split()


In [None]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [None]:
# Collapse multi-word entries into single tokens (e.g. 'Sam Worthington' -> 'SamWorthington')
# so that people who merely share a first name are not treated as the same tag.
def _strip_spaces(values):
  """Return a copy of `values` with every space removed from each string."""
  return [value.replace(' ', '') for value in values]

for list_col in ('cast', 'crew', 'genres', 'keywords'):
  movies[list_col] = movies[list_col].apply(_strip_spaces)

In [None]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [None]:
# Combine the overview, keywords, cast, crew and genres token lists into a single 'tags' list per movie.
# Each column holds Python lists, so `+` concatenates the lists row by row.
movies['tags'] =  movies['overview'] + movies['keywords'] + movies['cast'] + movies['crew'] + movies['genres']


In [None]:
# Keep only the identifier, the title and the combined tags for modelling.
# `.copy()` makes `movies_df` an independent DataFrame, so the later
# `movies_df['tags'] = ...` assignments do not raise SettingWithCopyWarning.
movies_df = movies[['movie_id', 'title', 'tags']].copy()

movies_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [None]:
# Turn each list of tag tokens into one space-separated string.
movies_df['tags'] = movies_df['tags'].map(' '.join)

movies_df['tags']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0       In the 22nd century, a paraplegic Marine is di...
1       Captain Barbossa, long believed to be dead, ha...
2       A cryptic message from Bond’s past sends him o...
3       Following the death of District Attorney Harve...
4       John Carter is a war-weary, former military ca...
                              ...                        
4804    El Mariachi just wants to play his guitar and ...
4805    A newlywed couple's honeymoon is upended by th...
4806    "Signed, Sealed, Delivered" introduces a dedic...
4807    When ambitious New York attorney Sam is sent t...
4808    Ever since the second grade when he first saw ...
Name: tags, Length: 4806, dtype: object

In [None]:
movies_df.head(5)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [None]:
#Lets examine the contents of the tags column now after joining the strings 
movies_df['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron Action Adventure Fantasy ScienceFiction'

In [None]:
# Lowercase the tags so that matching is case-insensitive.
movies_df['tags'] = movies_df['tags'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
movies_df.head(2)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."


In [None]:
# The tags are text, so they must be converted to numbers before similarity can be computed.
# CountVectorizer from sklearn builds a bag-of-words matrix: one row per movie, one column per
# vocabulary term, each cell holding how often that term occurs in the movie's tags.

from sklearn.feature_extraction.text import CountVectorizer
# max_features caps the vocabulary at the 5000 most frequent terms;
# stop_words='english' drops common English words using sklearn's built-in list.
cv  =  CountVectorizer(max_features = 5000, stop_words='english')

# fit_transform returns a sparse matrix; .toarray() converts it to a dense numpy array.
vectors  = cv.fit_transform(movies_df['tags']).toarray()

In [None]:
import nltk 
from nltk.stem.porter import PorterStemmer
#PorterStemmer strips a word into its most basic form 
#e.g loving - love , dancer - dance etc 

In [None]:
ps =  PorterStemmer()

In [None]:
#Lets define a helper function to stem the words in the tags columns

def stem(text):
  """Stem each whitespace-separated token of `text` with `ps` and re-join the results with single spaces."""
  return ' '.join(ps.stem(token) for token in text.split())

In [None]:
# Replace every tags string with its stemmed form, then show the result.
movies_df['tags'] = movies_df['tags'].map(stem)
movies_df['tags']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


0       in the 22nd century, a parapleg marin is dispa...
1       captain barbossa, long believ to be dead, ha c...
2       a cryptic messag from bond’ past send him on a...
3       follow the death of district attorney harvey d...
4       john carter is a war-weary, former militari ca...
                              ...                        
4804    el mariachi just want to play hi guitar and ca...
4805    a newlyw couple' honeymoon is upend by the arr...
4806    "signed, sealed, delivered" introduc a dedic q...
4807    when ambiti new york attorney sam is sent to s...
4808    ever sinc the second grade when he first saw h...
Name: tags, Length: 4806, dtype: object

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
#Import the cosine similarity method from metrics 

In [None]:
# Compute the pairwise cosine similarity between all movies.
# `vectors` was first built before the tags were stemmed, so it is rebuilt here
# from the stemmed tags; otherwise the stemming step would have no effect on
# the similarity scores.
vectors = cv.fit_transform(movies_df['tags']).toarray()
similarity = cosine_similarity(vectors)

In [None]:
#It's now time to build our recommender based on the vectors and the similarity 

#The idea is simple :
#We need to be able to look up the movie we are interested in, say Avatar for example, and then find its index 
#The movie 'Avatar' is at index zero and Batman Begins at index 119 

#movies_df[movies_df['title'] == 'Avatar'].index[0]
movies_df[movies_df['title'] == 'Batman Begins'].index[0]

119

In [None]:
#Now that we have established the index, we will need to go to the list of our similarity
#E.g the similarity array of batman is this 
similarity[119]


array([0.04829453, 0.05050763, 0.07576144, ..., 0.02129589, 0.02380952,
       0.02317449])

In [None]:
sorted(list(enumerate(similarity[0])), reverse = True , key =  lambda x :x[1])[1:6]

[(539, 0.26089696604360174),
 (1192, 0.2581988897471611),
 (507, 0.25302403842552984),
 (260, 0.25110592822973776),
 (1214, 0.24944382578492943)]

In [None]:
#Each row of `similarity` (e.g. similarity[119] for Batman Begins) holds the cosine similarity between one movie and every other movie in the dataset
#So we first need to fetch the similarity row of the movie we are looking for 

#Next we need to sort those similarity scores without losing track of which movie each score belongs to 
#And for that we need the enumerate function, which pairs every score with its position
#Let's create our recommender function 

def recommend(movie, top_n=5):
  """Print the titles of the movies most similar to `movie`.

  Parameters
  ----------
  movie : str
      Exact title to look up in `movies_df['title']`.
  top_n : int, default 5
      Number of recommendations to print.

  Raises
  ------
  IndexError
      If no row of `movies_df` has the requested title.
  """
  # Step 1 : Find the row *position* of the requested movie. `similarity` is a
  # plain array indexed by position, whereas `movies_df.index` holds labels that
  # stop matching positions once rows have been dropped, so the label must not
  # be used to index it.
  matches = np.flatnonzero((movies_df['title'] == movie).to_numpy())
  if matches.size == 0:
    raise IndexError('Movie title not found: {!r}'.format(movie))
  movie_pos = int(matches[0])
  # Step 2 : Get the similarity row of the requested movie
  distances = similarity[movie_pos]
  # Step 3 : Rank every movie by similarity, highest first, keeping its position
  ranked = sorted(enumerate(distances), reverse=True, key=lambda pair: pair[1])
  # Exclude the requested movie explicitly instead of assuming it sorts first.
  similar_positions = [pos for pos, _ in ranked if pos != movie_pos][:top_n]

  for pos in similar_positions:
    print(movies_df.iloc[pos].title)

In [None]:
# Boolean mask selecting the row(s) for one title. It is named `title_mask`
# so that Python's builtin `filter` is not shadowed for the rest of the session.
title_mask = movies_df['title'] == 'Falcon Rising'
movies_df.loc[title_mask]

Unnamed: 0,movie_id,title,tags
3729,270938,Falcon Rising,"chapman is an ex-marin in brazil' slums, battl..."


In [None]:
recommend('Falcon Rising')

Amidst the Devil's Wings
Street Kings
Swelter
The Glimmer Man
Tracker
