# Introduction

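This notebook builds a content-based movie recommender from the TMDB 5000 movies and credits CSV files. Each movie is described by its cast, director, keywords and genres; these are merged into a single metadata string, vectorized (with both a count vectorizer and TF-IDF), and compared using cosine similarity to suggest similar titles.

**TODO:** expand this introduction.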

# Content Based Filtering

Import libraries and dependencies

In [139]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Read the data from the two CSV files

In [140]:
# Load the movie table and the credits table.
movie1 = pd.read_csv('tmdb_5000_movies.csv')
movie2 = pd.read_csv('tmdb_5000_credits.csv')

# Rename the four credits columns positionally so the join key is called 'id'
# (it was 'movie_id'). The credits title column becomes 'tittle', which keeps
# it distinct from the 'title' column of the movie table after the merge.
movie2.columns = ['id', 'tittle', 'cast', 'crew']

# Join the two tables on the shared 'id' column.
movie = pd.merge(movie1, movie2, on='id')

# Preview the first three rows of the merged dataframe.
movie.head(3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,tittle,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


In [141]:
# Show the shape of the merged dataframe as (rows, columns)
print(movie.shape)

(4803, 23)


## Data Pre-Processing

In [142]:
# The list-like columns are stored as strings in the CSV files; turn each of
# them back into the Python object it spells out.
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    # literal_eval only evaluates Python literals, never arbitrary code.
    movie[feature] = movie[feature].map(literal_eval)

In [143]:
# Look up the director in a parsed crew list.
def get_director(x):
    """Return the 'name' of the first entry of x whose 'job' is 'Director'.

    x is an iterable of dicts that each carry 'job' and 'name' keys.
    When no entry has the job 'Director', NaN is returned instead.
    """
    return next(
        (member['name'] for member in x if member['job'] == 'Director'),
        np.nan,
    )

In [144]:
# Collect the names from a parsed list of {'name': ...} entries.
def get_list(x):
    """Return the 'name' value of every entry in x.

    All entries are kept; the list is not truncated.
    An empty list is returned when x is not a list (missing/malformed data).
    """
    if not isinstance(x, list):
        return []
    return [entry['name'] for entry in x]

In [145]:
# Derive a 'director' column from the parsed crew list.
movie['director'] = movie['crew'].map(get_director)

# Replace the parsed cast, keywords and genres entries with plain lists of names.
features = ['cast', 'keywords', 'genres']
for feature in features:
    movie[feature] = movie[feature].map(get_list)

In [146]:
# Show the title and the newly extracted features for the first 5 films
movie[['title', 'cast', 'director', 'keywords', 'genres']].head(5)

Unnamed: 0,title,cast,director,keywords,genres
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",James Cameron,"[culture clash, future, space war, space colon...","[Action, Adventure, Fantasy, Science Fiction]"
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",Gore Verbinski,"[ocean, drug abuse, exotic island, east india ...","[Adventure, Fantasy, Action]"
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",Sam Mendes,"[spy, based on novel, secret agent, sequel, mi...","[Action, Adventure, Crime]"
3,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman, A...",Christopher Nolan,"[dc comics, crime fighter, terrorist, secret i...","[Action, Crime, Drama, Thriller]"
4,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",Andrew Stanton,"[based on novel, mars, medallion, space travel...","[Action, Adventure, Science Fiction]"


In [147]:
# Normalise names so that each one becomes a single lower-case token.
def clean_data(x):
    """Lower-case x and remove the spaces from it.

    A list is cleaned element by element and returned as a new list.
    A single string is cleaned and returned as a string.
    Any other value (e.g. NaN for a missing director) yields ''.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''

In [148]:
# Normalise every feature column with clean_data (lower case, no spaces).
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    movie[feature] = movie[feature].map(clean_data)

In [149]:
# Show the same features for the first 5 films after clean_data has been applied
movie[['title', 'cast', 'director', 'keywords', 'genres']].head(5)

Unnamed: 0,title,cast,director,keywords,genres
0,Avatar,"[samworthington, zoesaldana, sigourneyweaver, ...",jamescameron,"[cultureclash, future, spacewar, spacecolony, ...","[action, adventure, fantasy, sciencefiction]"
1,Pirates of the Caribbean: At World's End,"[johnnydepp, orlandobloom, keiraknightley, ste...",goreverbinski,"[ocean, drugabuse, exoticisland, eastindiatrad...","[adventure, fantasy, action]"
2,Spectre,"[danielcraig, christophwaltz, léaseydoux, ralp...",sammendes,"[spy, basedonnovel, secretagent, sequel, mi6, ...","[action, adventure, crime]"
3,The Dark Knight Rises,"[christianbale, michaelcaine, garyoldman, anne...",christophernolan,"[dccomics, crimefighter, terrorist, secretiden...","[action, crime, drama, thriller]"
4,John Carter,"[taylorkitsch, lynncollins, samanthamorton, wi...",andrewstanton,"[basedonnovel, mars, medallion, spacetravel, p...","[action, adventure, sciencefiction]"


## Main algorithm

Create metadata: a string to vectorize

In [153]:
def merge_string(x):
    """Build the space-separated metadata string for one movie row.

    x is indexed by 'keywords', 'cast' and 'genres' (lists of strings) and
    'director' (a string). The result is keywords, cast, director and genres,
    in that order, joined by single spaces.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)
# Build one metadata string per movie with merge_string (defined above).
# The previous call targeted `create_soup`, which is not defined in this notebook.
movie['metadata'] = movie.apply(merge_string, axis=1)
# `s` holds the same strings for the TF-IDF cell; reuse the column rather than
# running merge_string over every row a second time.
s = movie['metadata']

In [132]:
# Import CountVectorizer and create the token-count matrix from the metadata strings
from sklearn.feature_extraction.text import CountVectorizer

# stop_words='english' drops scikit-learn's built-in English stop words from the vocabulary
count = CountVectorizer(stop_words='english')
# One row per entry of movie['metadata'], one column per vocabulary token
count_matrix = count.fit_transform(movie['metadata'])

In [133]:
# Compute the cosine similarity matrix based on the count_matrix
# (cosine_similarity is already imported in the first cell; this re-import is redundant but harmless)
from sklearn.metrics.pairwise import cosine_similarity

# Called with a single matrix, cosine_similarity compares every row with every other row
cosine_sim = cosine_similarity(count_matrix)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype = np.float


In [134]:
# Reset the index of the main DataFrame: without drop=True, reset_index() moves
# the current index into a column and installs a fresh 0..n-1 index.
movie = movie.reset_index()
# Reverse mapping: movie title -> row position.
# NOTE(review): titles are not guaranteed to be unique; for a duplicated title
# indices[title] returns several positions rather than one.
indices = pd.Series(movie.index, index=movie['title'])

In [135]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 movies most similar to `title`.

    Parameters
    ----------
    title : str
        Exact movie title, looked up in the notebook-level `indices` Series.
    cosine_sim : 2-D array
        Pairwise similarity matrix, expected to follow the row order of
        `movie`. Defaults to the `cosine_sim` matrix that existed when this
        cell was executed.

    Returns
    -------
    pandas.Series
        The `title` values of up to 10 rows of `movie`, most similar first.
        The queried movie itself is never part of the result.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    # Get the row position of the movie that matches the title
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # positions instead of a single one; use the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every *other* movie with its similarity score. The queried movie is
    # excluded by position rather than by assuming it sorts first, which does
    # not hold when another movie ties with it or when its own row is all zeros.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Sort the movies by similarity, highest first, and keep the top 10
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[:10]

    # Get the movie row positions
    movie_indices = [i[0] for i in sim_scores]

    # Return the titles of the most similar movies
    return movie['title'].iloc[movie_indices]

In [136]:
# Recommend from the count-based similarity matrix. `cosine_sim2` is never
# defined in this notebook; the matrix computed above is named `cosine_sim`.
get_recommendations('The Dark Knight Rises', cosine_sim)

4638    Amidst the Devil's Wings
65               The Dark Knight
96                     Inception
119                Batman Begins
4099                 Harsh Times
2060          Out of the Furnace
210               Batman & Robin
4408              Jimmy and Judy
1431                Premium Rush
95                  Interstellar
Name: title, dtype: object

In [154]:
# Build a TF-IDF representation of the per-movie metadata strings held in `s`
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(s)
# Printing the sparse matrix lists its stored entries as (row, column) value
print(feature_vectors)

  (0, 13386)	0.07945601965510801
  (0, 21704)	0.065803008680591
  (0, 58525)	0.09834752821618405
  (0, 58509)	0.08840198470709781
  (0, 58211)	0.08840198470709781
  (0, 58524)	0.07443870137081883
  (0, 21708)	0.08421535053818528
  (0, 54020)	0.0312799940310771
  (0, 58504)	0.0721297249393013
  (0, 1762)	0.06025559973340037
  (0, 63382)	0.09235148042208353
  (0, 1775)	0.08683459933146478
  (0, 9999)	0.09021139833228578
  (0, 40584)	0.08421535053818528
  (0, 58252)	0.0668243442416656
  (0, 5552)	0.07056233956366825
  (0, 38871)	0.09497072921536304
  (0, 3933)	0.09834752821618405
  (0, 50515)	0.09497072921536304
  (0, 44456)	0.09021139833228578
  (0, 22)	0.057305682475511086
  (0, 55521)	0.08309660400946212
  (0, 66819)	0.07550652394012229
  (0, 57751)	0.06985073052724072
  (0, 59167)	0.08207526844838751
  :	:
  (4800, 5876)	0.2786526700522405
  (4800, 50489)	0.29213750573938013
  (4800, 36281)	0.29213750573938013
  (4800, 56280)	0.29213750573938013
  (4801, 6490)	0.3216452438551558
  (48

In [155]:
# Cosine similarity between every pair of rows of the TF-IDF matrix
similarity = cosine_similarity(feature_vectors)
# Square matrix: one row and one column per movie
print(similarity.shape)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype = np.float


(4803, 4803)


In [None]:
# Ask for a title, fuzzy-match it against the known titles and print the
# movies whose TF-IDF metadata vectors are most similar to it.
movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movie['title'].tolist()

# get_close_matches returns the best matches first, or [] when nothing is
# similar enough to the typed name.
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

if not find_close_match:
    # Previously this case crashed with IndexError on find_close_match[0].
    print('No close match found for :', movie_name)
else:
    close_match = find_close_match[0]

    # Rows of `similarity` follow the row order of `movie`, so the position of
    # the title in the list is also its row in the matrix. (The original used
    # `movies_data`, which is not defined in this notebook.)
    index_of_the_movie = list_of_all_titles.index(close_match)

    similarity_score = list(enumerate(similarity[index_of_the_movie]))

    sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

    print('Movies suggested for you : \n')

    # Print the 29 best-scoring titles, numbered from 1, as before.
    # The loop variable is deliberately not called `movie`: that name would
    # overwrite the DataFrame and break every cell that is run afterwards.
    for i, (index, _score) in enumerate(sorted_similar_movies[:29], start=1):
        title_from_index = list_of_all_titles[index]
        print(i, '.',title_from_index)