In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

<p style="background-image: linear-gradient(to right, #0aa98f, #68dab2)"> &nbsp; </p>

In [2]:
# Cap on how many movies are loaded. The original cell flagged a memory issue
# here; presumably the dense movie-by-movie similarity matrix computed later,
# whose size grows quadratically with the row count.
MAX_MOVIES = 30_000

# `nrows` stops parsing after MAX_MOVIES rows, so the rest of the CSV is never
# loaded into memory only to be discarded by a follow-up `.head()`.
# NOTE(review): dtypes are now inferred from these rows only, not the whole file.
data = pd.read_csv('data/02_movies_metadata.csv', low_memory=False, nrows=MAX_MOVIES)
data.sample()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
2351,False,,0,"[{'id': 27, 'name': 'Horror'}, {'id': 10402, '...",,25438,tt0092112,en,Trick or Treat,Eddie is your average 80's metal head teen. No...,...,1986-10-24,6797218.0,98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,What are you afraid of? It's only rock and rol...,Trick or Treat,False,5.6,40.0


In [3]:
# Keep only the columns the recommender uses and replace missing overviews with
# an empty string so the text vectorizer is never handed NaN.
# Written as one chained expression instead of `fillna(..., inplace=True)` on a
# column subset: the in-place form can trigger SettingWithCopyWarning and has
# no benefit over reassigning the result.
data = data[['id', 'title', 'overview']].fillna({'overview': ''})
data.head(3)

Unnamed: 0,id,title,overview
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...


<p style="background-image: linear-gradient(#0aa98f, #FFFFFF 10%)"> &nbsp; </p>

In [4]:
# Turn every overview into a TF-IDF weighted bag of words, ignoring common
# English stop words. The result has one row per movie and one column per
# vocabulary term.
td_idf = TfidfVectorizer(stop_words='english')
matrix = td_idf.fit_transform(data.loc[:, 'overview'])
matrix.shape

(30000, 58562)

In [5]:
# Pairwise dot products between all overview vectors; with a single argument
# linear_kernel compares the matrix against itself.
similarity_matrix = linear_kernel(matrix)

# Preview the top-left 11 x 11 corner rather than the full matrix.
pd.DataFrame(similarity_matrix[:11, :11])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,0.015614,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.015614,1.0,0.048675,0.0,0.0,0.051311,0.0,0.0,0.104723,0.0,0.007406
2,0.0,0.048675,1.0,0.0,0.025491,0.0,0.0,0.00645,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.007126,0.0,0.009079,0.0,0.0,0.0
4,0.0,0.0,0.025491,0.0,1.0,0.0,0.031646,0.0,0.032802,0.0,0.0
5,0.0,0.051311,0.0,0.007126,0.0,1.0,0.0,0.0,0.048521,0.0,0.0
6,0.0,0.0,0.0,0.0,0.031646,0.0,1.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.00645,0.009079,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,0.0,0.104723,0.0,0.0,0.032802,0.048521,0.0,0.0,1.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
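# Equivalent alternative, kept for reference: TfidfVectorizer L2-normalises each
# row by default, so the plain dot product computed with linear_kernel above
# already equals the cosine similarity.
# cos_similarity_matrix = cosine_similarity(matrix, matrix)
# pd.DataFrame(cos_similarity_matrix[:11, :11])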

<p style="background-image: linear-gradient(#0aa98f, #FFFFFF 10%)"> &nbsp; </p>

In [7]:
def recommend_like(title: str, top_n: int = 5, *, movies=None, similarities=None) -> pd.DataFrame:
    """Return the titles of the movies whose overviews are most similar to `title`.

    Parameters
    ----------
    title : str
        Exact title to look up; if several rows share it, the first one is used.
    top_n : int, default 5
        Maximum number of recommendations to return.
    movies : pandas.DataFrame, optional
        Frame with a 'title' column. Defaults to the notebook-level `data`.
    similarities : 2-D array-like, optional
        Square matrix in which entry [i][j] scores row i of `movies` against
        row j (by position). Defaults to the notebook-level `similarity_matrix`.

    Returns
    -------
    pandas.DataFrame
        A single 'title' column, most similar first, indexed by the original
        row labels of `movies`. The queried movie itself is never included.

    Raises
    ------
    IndexError
        If no row has the requested title.
    ValueError
        If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Fall back to the notebook globals so `recommend_like('Toy Story')` still works.
    movies = data if movies is None else movies
    similarities = similarity_matrix if similarities is None else similarities

    # Look the title up by *position*: the similarity matrix is aligned with
    # the row order of `movies`, not with its index labels.
    hits = (movies['title'] == title).to_numpy().nonzero()[0]
    if len(hits) == 0:
        raise IndexError(f"no movie titled {title!r} found")
    position = int(hits[0])

    scores = similarities[position]
    # Stable descending sort: equally scored movies keep their original order.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    # Remove the query movie explicitly instead of assuming it ranks first;
    # it does not when its overview is empty (self-similarity 0) or when
    # another movie has an identical overview (tie at the top).
    picks = [row for row in ranked if row != position][:top_n]

    return pd.DataFrame(movies['title'].iloc[picks])

# Demo: the five movies whose overviews are closest to Toy Story's.
recommend_like(title='Toy Story')

Unnamed: 0,title
15348,Toy Story 3
2997,Toy Story 2
10301,The 40 Year Old Virgin
24523,Small Fry
23843,Andy Hardy's Blonde Trouble


<p style="background-image: linear-gradient(to right, #0aa98f, #68dab2)"> &nbsp; </p>