---
## Import Libraries
---

In [1]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.wordnet import WordNetLemmatizer
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

---
## Load Dataset
---

In [2]:
credits = pd.read_csv('./Data/credits.csv')
keywords = pd.read_csv('./Data/keywords.csv')
links = pd.read_csv('./Data/links_small.csv')
md = pd.read_csv('./Data/movies_metadata.csv')
ratings = pd.read_csv('./Data/ratings_small.csv')

---
## Understand Dataset
---

### **Credit DataFrame**

In [3]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [4]:
credits.columns

Index(['cast', 'crew', 'id'], dtype='object')

* **cast:** Thông tin về dàn cast: Diễn viên, Giới tính, Nhân vật thủ vai
* **crew:** Thông tin đoàn phim như: Đạo diễn, Editor...
* **id:** ID TMDb

In [5]:
credits.shape

(45476, 3)

In [6]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


### **Keywords DataFrame**

In [7]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [8]:
keywords = keywords.dropna(ignore_index=True)
keywords.columns

Index(['id', 'keywords'], dtype='object')

* **id:** ID TMDb
* **Keywords:** Tags/Từ khóa của bộ phim

In [9]:
keywords.shape

(46419, 2)

In [10]:
keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


### **Link DataFrame**

In [11]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [12]:
links = links.drop(columns=['imdbId'])
links = links.dropna(ignore_index=True)
links.columns = ['Movie_Index', 'id']
links.columns

Index(['Movie_Index', 'id'], dtype='object')

* **movieId:** STT của phim
* **tmdbId:** ID TMDb
* **imdbId:** ID IMDb (Drop)

In [13]:
links.shape

(9112, 2)

In [14]:
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9112 entries, 0 to 9111
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Movie_Index  9112 non-null   int64  
 1   id           9112 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 142.5 KB


### **Metadata DataFrame**

In [15]:
md.iloc[0:3].transpose()

Unnamed: 0,0,1,2
adult,False,False,False
belongs_to_collection,"{'id': 10194, 'name': 'Toy Story Collection', ...",,"{'id': 119050, 'name': 'Grumpy Old Men Collect..."
budget,30000000,65000000,0
genres,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
homepage,http://toystory.disney.com/toy-story,,
id,862,8844,15602
imdb_id,tt0114709,tt0113497,tt0113228
original_language,en,en,en
original_title,Toy Story,Jumanji,Grumpier Old Men
overview,"Led by Woody, Andy's toys live happily in his ...",When siblings Judy and Peter discover an encha...,A family wedding reignites the ancient feud be...


In [16]:
md = md.drop(columns=['adult', 'belongs_to_collection', 'budget', 'homepage', 'imdb_id', 'original_language', 'original_title', 'popularity', 'poster_path',
                      'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'video'])
md = md.dropna(subset=['title', 'id'], ignore_index=True)
md.columns

Index(['genres', 'id', 'overview', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

* **genres:** Danh sách các thể loại liên quan đến bộ phim
* **id:** ID TMDb
* **overview:** Mô tả phim
* **tagline:** Câu dẫn đầu
* **title:** Tiêu đề phim (tiếng anh)
* **vote_average:** Điểm số vote trung bình
* **vote_count:** Tổng số lượt vote
* **adult, belongs_to_collection, budget, homepage, imdb_id, original_language, original_title, popularity, status** (Drop)
* **poster_path, production_companies, production_countries, release_date, revenue, runtime, spoken_languages, video** (Drop)

In [17]:
md.shape

(45460, 7)

In [18]:
md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45460 entries, 0 to 45459
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   genres        45460 non-null  object 
 1   id            45460 non-null  object 
 2   overview      44506 non-null  object 
 3   tagline       20412 non-null  object 
 4   title         45460 non-null  object 
 5   vote_average  45460 non-null  float64
 6   vote_count    45460 non-null  float64
dtypes: float64(2), object(5)
memory usage: 2.4+ MB


### **Ratings DataFrame**

In [19]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [20]:
ratings = ratings.drop(columns=['timestamp'])
ratings = ratings.dropna(ignore_index=True)
ratings.columns

Index(['userId', 'movieId', 'rating'], dtype='object')

* **userId:** User ID
* **movieID:** ID TMDb
* **rating:** Rating của user cho movie
* **timestamp:** Thời gian người dùng đánh giá (Drop)

In [21]:
ratings.shape

(100004, 3)

In [22]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100004 non-null  int64  
 1   movieId  100004 non-null  int64  
 2   rating   100004 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB


---
## Preprocessing
---

**Xử lý Missing Value**

In [23]:
# Chuyển dữ liệu sang kiểu int
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [24]:
md['id'] = md['id'].apply(convert_int)
md[md['id'].isnull()]

Unnamed: 0,genres,id,overview,tagline,title,vote_average,vote_count


In [25]:
links = links.astype('int')

**Xử lý kiểu dữ liệu**

In [26]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

**Xử lý bảng dữ liệu**

In [27]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [28]:
smd = md[md['id'].isin(links.id)]
smd.shape

(9219, 10)

In [29]:
smd = smd.drop_duplicates(['id'], keep="first", inplace=False)
smd = smd.drop_duplicates(['title'], keep="first", inplace=False)
smd.shape

(8809, 10)

In [30]:
smd.head()

Unnamed: 0,genres,id,overview,tagline,title,vote_average,vote_count,cast,crew,keywords
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,"Led by Woody, Andy's toys live happily in his ...",,Toy Story,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


**Xử lý Small Movie Data**

In [31]:
# Xử lý text
lemmatizer = WordNetLemmatizer()

def text_processing(text):
    a = re.sub('^a-zA-Z0-9',' ', str(text))
    a = re.sub(r'[^\w\s]','', a)
    a = a.lower()
    a = a.split()
    a = [lemmatizer.lemmatize(i) for i in a]
    a = " ".join(a)
    return a

In [32]:
smd['genres'] = smd['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [33]:
smd['tagline'] = smd['tagline'].fillna('').apply(text_processing)
smd['overview'] = smd['overview'].apply(text_processing)
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('').apply(lambda x: [x])

In [34]:
smd['cast'] = smd['cast'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else []).apply(lambda x: x[:5] if len(x) >=5 else x)
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "_")) for i in x])

In [35]:
smd['crew'] = smd['crew'].apply(literal_eval)

In [36]:
smd['keywords'] = smd['keywords'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
keywords_uniq = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
keywords_uniq.name = 'keyword'
keywords_uniq = keywords_uniq.value_counts()
keywords_uniq = keywords_uniq[keywords_uniq > 1]

In [37]:
keywords_uniq

keyword
independent film            598
woman director              532
murder                      379
duringcreditsstinger        314
based on novel              296
                           ... 
cartoon mouse                 2
black and white to color      2
gender roles                  2
croupier                      2
protestant                    2
Name: count, Length: 6563, dtype: int64

In [38]:
# Lọc keyword
def filter_keywords(x):
    words = []
    for i in x:
        if i in keywords_uniq:
            words.append(i)
    return words

In [39]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [lemmatizer.lemmatize(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "_")) for i in x])

In [40]:
# Lấy tên đạo diễn
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

smd['director'] = smd['crew'].apply(get_director)

In [41]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])

In [42]:
smd['cast']

0        [tom_hanks, tim_allen, don_rickles, jim_varney...
1        [robin_williams, jonathan_hyde, kirsten_dunst,...
2        [walter_matthau, jack_lemmon, ann-margret, sop...
3        [whitney_houston, angela_bassett, loretta_devi...
4        [steve_martin, diane_keaton, martin_short, kim...
                               ...                        
40928    [sidney_poitier, wendy_crewson, jay_o._sanders...
41148    [akshay_kumar, ileana_d'cruz, esha_gupta, arja...
41205    [hrithik_roshan, pooja_hegde, kabir_bedi, arun...
41371    [hiroki_hasegawa, yutaka_takenouchi, satomi_is...
41653    [paul_mccartney, ringo_starr, john_lennon, geo...
Name: cast, Length: 8809, dtype: object

In [43]:
smd = smd.reset_index(drop=True)
smd

Unnamed: 0,genres,id,overview,tagline,title,vote_average,vote_count,cast,crew,keywords,description,director
0,"[Animation, Comedy, Family]",862,led by woody andys toy live happily in his roo...,,Toy Story,7.7,5415.0,"[tom_hanks, tim_allen, don_rickles, jim_varney...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy, friendship, friend, rival...",[led by woody andys toy live happily in his ro...,"[johnlasseter, johnlasseter, johnlasseter]"
1,"[Adventure, Fantasy, Family]",8844,when sibling judy and peter discover an enchan...,roll the dice and unleash the excitement,Jumanji,6.9,2413.0,"[robin_williams, jonathan_hyde, kirsten_dunst,...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[board_game, disappearance, based_on_children'...",[when sibling judy and peter discover an encha...,"[joejohnston, joejohnston, joejohnston]"
2,"[Romance, Comedy]",15602,a family wedding reignites the ancient feud be...,still yelling still fighting still ready for love,Grumpier Old Men,6.5,92.0,"[walter_matthau, jack_lemmon, ann-margret, sop...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fishing, best_friend, duringcreditsstinger]",[a family wedding reignites the ancient feud b...,"[howarddeutch, howarddeutch, howarddeutch]"
3,"[Comedy, Drama, Romance]",31357,cheated on mistreated and stepped on the woman...,friend are the people who let you be yourself ...,Waiting to Exhale,6.1,34.0,"[whitney_houston, angela_bassett, loretta_devi...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[based_on_novel, interracial_relationship, sin...",[cheated on mistreated and stepped on the woma...,"[forestwhitaker, forestwhitaker, forestwhitaker]"
4,[Comedy],11862,just when george bank ha recovered from his da...,just when his world is back to normal he in fo...,Father of the Bride Part II,5.7,173.0,"[steve_martin, diane_keaton, martin_short, kim...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlife_crisis, confidence, aging, daug...",[just when george bank ha recovered from his d...,"[charlesshyer, charlesshyer, charlesshyer]"
...,...,...,...,...,...,...,...,...,...,...,...,...
8804,[Drama],159550,a man must cope with the loss of his wife and ...,,The Last Brickmaker in America,7.0,1.0,"[sidney_poitier, wendy_crewson, jay_o._sanders...","[{'credit_id': '544475aac3a36819fb000578', 'de...",[friendship],[a man must cope with the loss of his wife and...,"[greggchampion, greggchampion, greggchampion]"
8805,"[Thriller, Romance]",392572,rustom pavri an honourable officer of the indi...,decorated officer devoted family man defending...,Rustom,7.3,25.0,"[akshay_kumar, ileana_d'cruz, esha_gupta, arja...","[{'credit_id': '5951baf692514129c4016600', 'de...",[bollywood],[rustom pavri an honourable officer of the ind...,"[tinusureshdesai, tinusureshdesai, tinusureshd..."
8806,"[Adventure, Drama, History, Romance]",402672,village lad sarman is drawn to big bad mohenjo...,,Mohenjo Daro,6.7,26.0,"[hrithik_roshan, pooja_hegde, kabir_bedi, arun...","[{'credit_id': '57cd5d3592514179d50018e8', 'de...",[bollywood],[village lad sarman is drawn to big bad mohenj...,"[ashutoshgowariker, ashutoshgowariker, ashutos..."
8807,"[Action, Adventure, Drama, Horror, Science Fic...",315011,from the mind behind evangelion come a hit lar...,a god incarnate a city doomed,Shin Godzilla,6.6,152.0,"[hiroki_hasegawa, yutaka_takenouchi, satomi_is...","[{'credit_id': '560892fa92514177550018b2', 'de...","[monster, godzilla, giant_monster, destruction...",[from the mind behind evangelion come a hit la...,"[hideakianno, hideakianno, hideakianno]"


In [44]:
import joblib

joblib.dump(smd, "recommender_dict/movie_dict.pkl")
joblib.dump(smd.title, "recommender_dict/movie_title_dict.pkl")

['recommender/movie_title_dict.pkl']

## 4. Build Recommendation System

### 4.1. Content based recommendation system

**Content-based RS : Sử dụng movie description, taglines, keywords, cast, director and genres**

In [45]:
smd['soup'] = smd['description'] + smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [46]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [47]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [48]:
import joblib

joblib.dump(cosine_sim, "recommender_dict/similarity.pkl") 

['recommender/similarity.pkl']

In [49]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

**Content-based RS cải tiến: Bổ sung thêm trọng số lượt vote và điểm vote**

In [50]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
m = vote_counts.quantile(0.95)

def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [51]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average']]
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & 
                       (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [52]:
improved_recommendations('Toy Story')

Unnamed: 0,title,vote_count,vote_average,wr
3755,"Monsters, Inc.",6150,7,6.886152
7328,Toy Story 3,4710,7,6.854225
2471,Toy Story 2,3914,7,6.827482
8242,The Lego Movie,3127,7,6.789258
8773,Finding Dory,4333,6,5.931998
6268,Cars,3991,6,5.926731
8026,The Croods,2447,6,5.887342
1852,A Bug's Life,2379,6,5.88461
1634,One Hundred and One Dalmatians,1643,6,5.843543
5922,Robots,1383,6,5.821043


### 4.2. Collaborative Filtering based Recommendation System

In [53]:
# surprise reader API to read the dataset
reader = Reader()

In [54]:
data = Dataset.load_from_df(ratings, reader)

In [55]:
svd = SVD()
cross_validate(svd, data, measures=['rmse', 'mae'], cv=5)

{'test_rmse': array([0.88948418, 0.89966251, 0.89749256, 0.90021967, 0.89966194]),
 'test_mae': array([0.68835211, 0.69367732, 0.68921536, 0.69183314, 0.69094585]),
 'fit_time': (1.516711711883545,
  1.1317150592803955,
  1.0807480812072754,
  1.0617411136627197,
  1.06691312789917),
 'test_time': (0.11483502388000488,
  0.10390806198120117,
  0.10243630409240723,
  0.10540509223937988,
  0.1012728214263916)}

In [56]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x26dbe0a2fe0>

In [57]:
svd.predict(1, 6)

Prediction(uid=1, iid=6, r_ui=None, est=3.001013352218755, details={'was_impossible': False})

In [58]:
import joblib

joblib.dump(svd, "recommender_dict/svd_recommender.pkl") 

['recommender/svd_recommender.pkl']

### 4.3. Hybird Recommendation System

In [59]:
id_map = links
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

In [60]:
id_map

Unnamed: 0_level_0,Movie_Index,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story,1,862
Jumanji,2,8844
Grumpier Old Men,3,15602
Waiting to Exhale,4,31357
Father of the Bride Part II,5,11862
...,...,...
The Last Brickmaker in America,161944,159550
Rustom,162542,392572
Mohenjo Daro,162672,402672
Shin Godzilla,163056,315011


In [61]:
indices_map = id_map.set_index('id')
indices_map

Unnamed: 0_level_0,Movie_Index
id,Unnamed: 1_level_1
862,1
8844,2
15602,3
31357,4
11862,5
...,...
159550,161944
392572,162542
402672,162672
315011,163056


In [62]:
indices_map.loc[862]

Movie_Index    1
Name: 862, dtype: int32

In [63]:
def hybrid(userId, title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]["Movie_Index"]).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [66]:
hybrid(4, 'Toy Story')

Unnamed: 0,title,vote_count,vote_average,id,est
2471,Toy Story 2,3914.0,7.3,863,4.880684
7328,Toy Story 3,4710.0,7.6,10193,4.860422
3755,"Monsters, Inc.",6150.0,7.5,585,4.744068
8242,The Lego Movie,3127.0,7.5,137106,4.711277
2699,Creature Comforts,29.0,7.3,54825,4.578328
1852,A Bug's Life,2379.0,6.8,9487,4.474797
6162,Luxo Jr.,148.0,7.1,13925,4.449329
887,Rebel Without a Cause,351.0,7.6,221,4.412029
1634,One Hundred and One Dalmatians,1643.0,6.8,12230,4.40524
6490,Meet the Robinsons,787.0,6.7,1267,4.303986
