<a href="https://colab.research.google.com/github/ethanpnguyen/ds4e/blob/main/notebooks/task8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preference Recommendations

Can you recommend movies that users may like?

In [3]:
import pandas as pd

In [4]:
dfMvs = pd.read_csv('/content/movies_metadata_clean.csv')
dfRate = pd.read_csv('/content/movies_ratings_small.csv')

In [5]:
dfMvs.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

# Content Based Filtering

## Prepare Data

In [6]:
dfMvs.shape

(45463, 24)

In [7]:
# Drop duplicate movie titles
dfMvs.drop_duplicates(subset='title', keep='first', inplace=True)
dfMvs.shape

(42277, 24)

In [8]:
# Prepare description column

dfMvs['tagline'] = dfMvs['tagline'].fillna('')
dfMvs['description'] = dfMvs['overview'] + dfMvs['tagline']
dfMvs['description'] = dfMvs['description'].fillna('')

In [9]:
dfMvs.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,description
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415,"Led by Woody, Andy's toys live happily in his ..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413,When siblings Judy and Peter discover an encha...
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92,A family wedding reignites the ancient feud be...
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34,"Cheated on, mistreated and stepped on, the wom..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173,Just when George Banks has recovered from his ...


## Build Model

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidvectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
tfid_wm = tfidvectorizer.fit_transform(dfMvs['description'])
tfid_wm.shape

(42277, 75440)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfid_wm, tfid_wm, dense_output=True)
cosine_sim.shape

## Predict Similar Movies

In [11]:
new_title = 'Kidnap'

In [16]:
titles = dfMvs['title']

In [12]:
indices = pd.Series(dfMvs.index, index = dfMvs['title'])
indices

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Caged Heat 3000                45456
Subdue                         45458
Century of Birthing            45459
Satan Triumphant               45461
Queerama                       45462
Length: 42277, dtype: int64

In [13]:
idx = indices[new_title]

In [14]:
idx

44279

In [18]:
def get_move_recommendations(new_title):
  idx = indices[new_title]
  sim_scores = list(enumerate(cosine_sim[idx]))
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  sim_scores = sim_scores[1:11]
  movie_indices = [i[0] for i in sim_scores]
  return titles.iloc[movie_indices]

In [None]:
# get_move_recommendations('Bad Boys')