In [1]:
import pandas as pd
import numpy as np

In [2]:
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from sklearn.model_selection import train_test_split
from surprise import dump

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# **Read file, chia train test, convert**

In [4]:
movie = pd.read_csv('final_data')
movie.drop('Unnamed: 0', axis = 1, inplace = True)
movie = movie[['movieId', 'title', 'genres', 'description', 'popularity']]

In [5]:
movie.head()

Unnamed: 0,movieId,title,genres,description,popularity
0,1,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...",21.946943
1,2,Jumanji,"['Adventure', 'Fantasy', 'Family']",When siblings Judy and Peter discover an encha...,17.015539
2,3,Grumpier Old Men,"['Romance', 'Comedy']",A family wedding reignites the ancient feud be...,11.7129
3,4,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']","Cheated on, mistreated and stepped on, the wom...",3.859495
4,5,Father of the Bride Part II,['Comedy'],Just when George Banks has recovered from his ...,8.387519


In [6]:
rating = pd.read_csv('ratings_small.csv')
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


# **Merge data để chia train, test**

In [7]:
data = rating.merge(movie, on = 'movieId')

In [8]:
data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,description,popularity
0,1,31,2.5,1260759144,Dangerous Minds,"['Drama', 'Crime']",Former Marine Louanne Johnson lands a gig teac...,9.481338
1,7,31,3.0,851868750,Dangerous Minds,"['Drama', 'Crime']",Former Marine Louanne Johnson lands a gig teac...,9.481338
2,31,31,4.0,1273541953,Dangerous Minds,"['Drama', 'Crime']",Former Marine Louanne Johnson lands a gig teac...,9.481338
3,32,31,4.0,834828440,Dangerous Minds,"['Drama', 'Crime']",Former Marine Louanne Johnson lands a gig teac...,9.481338
4,36,31,3.0,847057202,Dangerous Minds,"['Drama', 'Crime']",Former Marine Louanne Johnson lands a gig teac...,9.481338
...,...,...,...,...,...,...,...,...
100117,664,64997,2.5,1343761859,H.G. Wells' War of the Worlds,['Science Fiction'],In this modern retelling of H.G. Wells' classi...,1.314076
100118,664,72380,3.5,1344435977,The Box,"['Thriller', 'Science Fiction']","Norma and Arthur Lewis, a suburban couple with...",10.424946
100119,665,129,3.0,995232528,Pie in the Sky,"['Comedy', 'Romance']",Pie in the Sky is a 1996 American romantic com...,0.699066
100120,665,4736,1.0,1010197684,Summer Catch,"['Drama', 'Comedy', 'Romance']",A coming-of-age romantic comedy set against th...,5.499867


In [9]:
train_df, test_df = train_test_split(data, test_size = 0.3, random_state = 42, stratify=data['userId'])#chia train, test

In [10]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):#convert để dùng cho thư viẹn surprise
    reader = Reader(rating_scale=(0, 5))
    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    testset = testset.construct_testset(testset.raw_ratings)
    return trainset, testset

In [11]:
train_set, test_set = convert_traintest_dataframe_forsurprise(train_df, test_df)

# CF and Laten Factor model 

In [12]:
sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }
knnbaseline_algo = KNNBaseline(sim_options=sim_options)

knnbaseline_algo.fit(train_set)
knnbaseline_predictions = knnbaseline_algo.test(test_set)

file_name = 'KnnBaseline_model'
dump.dump(file_name, algo=knnbaseline_predictions)
# _, loaded_algo = dump.load(file_name)

accuracy.rmse(knnbaseline_predictions)
accuracy.mae(knnbaseline_predictions)
print("Done!")

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9024
MAE:  0.6960
Done!


In [13]:
svd_algo = SVD()

svd_algo.fit(train_set)
svd_predictions = svd_algo.test(test_set)

file_name = 'svd_model'
dump.dump(file_name, algo=svd_algo)
# _, loaded_algo = dump.load(file_name)

accuracy.rmse(svd_predictions)
accuracy.mae(svd_predictions)
print("Done!")

RMSE: 0.8936
MAE:  0.6875
Done!


In [14]:
svdpp_algo = SVDpp()

svdpp_algo.fit(train_set)
svdpp_predictions = svdpp_algo.test(test_set)

file_name = 'svdpp_model'
dump.dump(file_name, algo=svdpp_algo)
# _, loaded_algo = dump.load(file_name)

accuracy.rmse(svdpp_predictions)
accuracy.mae(svdpp_predictions)
print("Done!")

RMSE: 0.8848
MAE:  0.6796
Done!


# **Encode Genres**

In [15]:
genre_to_idx = {'Animation': 0,
 'Comedy': 1,
 'Family': 2,
 'Adventure': 3,
 'Fantasy': 4,
 'Romance': 5,
 'Drama': 6,
 'Action': 7,
 'Crime': 8,
 'Thriller': 9,
 'Horror': 10,
 'History': 11,
 'Science Fiction': 12,
 'Mystery': 13,
 'War': 14,
 'Foreign': 15,
 'Music': 16,
 'Documentary': 17,
 None: 18,
 'Western': 19,
 'TV Movie': 20}

In [16]:
idx_to_genre = {0: 'Animation',
 1: 'Comedy',
 2: 'Family',
 3: 'Adventure',
 4: 'Fantasy',
 5: 'Romance',
 6: 'Drama',
 7: 'Action',
 8: 'Crime',
 9: 'Thriller',
 10: 'Horror',
 11: 'History',
 12: 'Science Fiction',
 13: 'Mystery',
 14: 'War',
 15: 'Foreign',
 16: 'Music',
 17: 'Documentary',
 18: None,
 19: 'Western',
 20: 'TV Movie'}

# **TFIDF Matrix**

In [17]:
movie['description_genre'] = movie['description']+ 2*movie['genres']
movie['description_genre'] = movie['description_genre'].fillna('')

In [18]:
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie['description_genre'])

In [19]:
cosine_sim= linear_kernel(tfidf_matrix, tfidf_matrix)

In [20]:
# map title với index của table movie, index của title = index của bảng, value = title
titles = movie['title']
indices = pd.Series(movie.index, index=movie['title'])
indices

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
                                                      ... 
The Last Brickmaker in America                        9214
Rustom                                                9215
Mohenjo Daro                                          9216
Shin Godzilla                                         9217
The Beatles: Eight Days a Week - The Touring Years    9218
Length: 9219, dtype: int64

# **Movie Similarity model**

In [21]:
def get_recommendations_new(title):
    idx = indices[title] #lấy ra index của title
    if type(idx) != np.int64:
        if len(idx)>1:
            print("ALERT: Multiple values")
            idx = idx[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movie['movieId'].iloc[movie_indices]

In [22]:
get_recommendations_new("Toy Story")

2522      3114
7629     78499
6267     35836
8432    103335
2751      3429
2567      3174
1432      1822
6809     54272
3833      4886
7254     67295
Name: movieId, dtype: int64

# **Popularity Model**

In [23]:
def genre_based_popularity(genre):
    mask = movie.genres.apply(lambda x: genre in x) # trả về dạng bool, check xem genre có trong cái list genres đó k
    filtered_movie = movie[mask]# trả về dataframe các film match với genre
    filtered_movie = filtered_movie.sort_values(by='popularity', ascending=False) #xếp theo độ phổ biến nhất
    return filtered_movie['movieId'].head(10).values.tolist() #trả về list top 10 movie similar

# genre_based_popularity('Animation')[['title', 'popularity']].head(25)

In [24]:
mask = movie.genres.apply(lambda x: "Animation" in x)

In [25]:
mask

0        True
1       False
2       False
3       False
4       False
        ...  
9214    False
9215    False
9216    False
9217    False
9218    False
Name: genres, Length: 9219, dtype: bool

In [26]:
filtered_movie = movie[mask]

In [27]:
filtered_movie

Unnamed: 0,movieId,title,genres,description,popularity,description_genre
0,1,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...",21.946943,"Led by Woody, Andy's toys live happily in his ..."
12,13,Balto,"['Family', 'Animation', 'Adventure']",An outcast half-wolf risks his life to prevent...,12.140733,An outcast half-wolf risks his life to prevent...
46,48,Pocahontas,"['Adventure', 'Animation', 'Drama', 'Family']",History comes gloriously to life in Disney's e...,13.280069,History comes gloriously to life in Disney's e...
211,239,A Goofy Movie,"['Romance', 'Animation', 'Family', 'Comedy', '...","Though Goofy always means well, his amiable cl...",10.177977,"Though Goofy always means well, his amiable cl..."
216,244,Gumby: The Movie,"['Fantasy', 'Animation', 'Science Fiction', 'F...",The band is back together! Gumby reunites with...,0.090452,The band is back together! Gumby reunites with...
...,...,...,...,...,...,...
9165,152081,Zootopia,"['Animation', 'Adventure', 'Family', 'Comedy']","Determined to prove herself, Officer Judy Hopp...",26.024868,"Determined to prove herself, Officer Judy Hopp..."
9176,156025,Ice Age: The Great Egg-Scapade,"['Adventure', 'Animation', 'Comedy', 'Family',...",A harried prehistoric bird mom Ethel entrusts ...,6.263676,A harried prehistoric bird mom Ethel entrusts ...
9182,157296,Finding Dory,"['Adventure', 'Animation', 'Comedy', 'Family']",Dory is reunited with her friends Nemo and Mar...,14.477677,Dory is reunited with her friends Nemo and Mar...
9205,160718,Piper,"['Family', 'Animation']",A mother bird tries to teach her little one ho...,11.243161,A mother bird tries to teach her little one ho...


In [28]:
filtered_movie['movieId'].head(10).values.tolist() 

[1, 13, 48, 239, 244, 313, 364, 392, 551, 558]

In [29]:
genre_based_popularity('Animation')

[135887, 115617, 5618, 4886, 152081, 152081, 6377, 103335, 106696, 134853]

In [30]:
user_info = pd.read_csv('user_info.csv')

In [31]:
user_info.head()

Unnamed: 0.1,Unnamed: 0,userId,user_vector,avg_rating,num_movies_rated
0,0,95,[4. 3.93670886 3.77777778 3.96969697 3...,3.865385,208
1,1,60,[4.16666667 4. 3.875 3.625 3...,4.214286,42
2,2,616,[4. 3.5 3.33333333 4. 3...,3.681818,22
3,3,111,[3. 3.43129771 3.5 3.57317073 3...,3.476793,237
4,4,48,[3.54651163 3.47395833 3.45238095 3.53370787 3...,3.543956,364


In [32]:
user_info['user_vector'] = user_info['user_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())
user_info['user_vector'] = user_info['user_vector'].apply(lambda x: np.asarray(x).astype(float))

In [33]:
def user_top_genre(userId):
    user_vec = user_info['user_vector'][user_info['userId'] == userId].values[0].copy()
    print("User Vector: ", user_vec)
    top_genre_indices = np.flip(np.argsort(user_vec))
    genre_list = []
    for i in top_genre_indices[:3]:
        genre_list.append(idx_to_genre[i])
    return genre_list

In [34]:
user_top_genre(1)

User Vector:  [2.    2.25  1.5   2.25  1.    3.5   2.3   3.    2.875 3.    3.5   2.
 2.5   2.5   2.    0.    0.    0.    0.    3.    0.   ]


['Horror', 'Romance', 'Thriller']

# **Hybrid model**

In [35]:
from surprise.model_selection import train_test_split

In [36]:
knn_baseline = dump.load('KnnBaseline_model')
svdpp = dump.load('svdpp_model') 

In [37]:
user_list = test_df['userId'].unique()

In [38]:
user_list[:10]

array([481, 217, 547, 505, 271, 659,  56, 157, 471, 282], dtype=int64)

In [39]:
def hybrid(userId):
    user_movies = test_df[test_df['userId'] == userId]
    user_movies['est'] = user_movies['movieId'].apply(lambda x: 0.6*knnbaseline_algo.predict(userId,x).est + 0.4*svdpp_algo.predict(userId, x).est)    
    user_movies = user_movies.sort_values(by ='est', ascending=False).head(4)
    user_movies['Model'] = 'SVD + CF'
#     user_movies = user_movies['movieId'].values.tolist()
#     print("User liked movies list: ", user_movies)
    
    recommend_list = user_movies[['movieId', 'est', 'Model']]
    print(recommend_list.head())

#     top_movie = user_movies['movieId'].iloc[0]
#     print("Top movie id", top_movie)
#     top_movie_title = movies['title'][movies['movieId'] == top_movie].values[0]
#     print("Top movie title", top_movie_title)

    
    movie_list = recommend_list['movieId'].values.tolist()
    print(movie_list)
    sim_movies_list = []
    for movie_id in movie_list:
        # Call content based 
        movie_title = movie['title'][movie['movieId'] == movie_id].values[0]
        sim_movies = get_recommendations_new(movie_title)
#         print(sim_movies.values.tolist())
        sim_movies_list.extend(sim_movies)
    
    
    # Compute ratings for the popular movies
    for movie_id in sim_movies_list:
        pred_rating = 0.6*knnbaseline_algo.predict(userId, movie_id).est + 0.4*svdpp_algo.predict(userId, movie_id).est
        row_df = pd.DataFrame([[movie_id, pred_rating, 'Movie similarity']], columns=['movieId', 'est','Model'])
        recommend_list = pd.concat([recommend_list, row_df], ignore_index=True)
    
    # Popular based movies
    top_genre_list = user_top_genre(userId)
    print("User top genre list: ", top_genre_list)
    
    popular_movies = []
    for top_genre in top_genre_list:
        popular_movies.extend(genre_based_popularity(top_genre))
    print("Final list: ", popular_movies)
    
    # Compute ratings for the popular movies
    for movie_id in popular_movies:
        pred_rating = 0.6*knnbaseline_algo.predict(userId, movie_id).est + 0.4*svdpp_algo.predict(userId, movie_id).est
        row_df = pd.DataFrame([[movie_id, pred_rating, 'Popularity']], columns=['movieId', 'est','Model'])
        recommend_list = pd.concat([recommend_list, row_df], ignore_index=True)
    recommend_list = recommend_list.drop_duplicates(subset=['movieId'])
    train_movie_list = train_df[train_df['userId']==userId]['movieId'].values.tolist()
    
    # Remove movies in training for this user
    mask = recommend_list.movieId.apply(lambda x: x not in train_movie_list)
    recommend_list = recommend_list[mask]
    
    return recommend_list

# **Ví dụ**

In [40]:
user_movies = test_df[test_df['userId'] == 60]
user_movies['est'] = user_movies['movieId'].apply(lambda x: 0.6*knnbaseline_algo.predict(60,x).est + 0.4*svdpp_algo.predict(60, x).est)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_movies['est'] = user_movies['movieId'].apply(lambda x: 0.6*knnbaseline_algo.predict(60,x).est + 0.4*svdpp_algo.predict(60, x).est)


In [41]:
user_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,description,popularity,est
46539,60,1209,4.0,1125829215,Once Upon a Time in the West,['Western'],This classic western masterpiece is an epic fi...,15.589351,4.45139
62250,60,27773,4.5,1125829180,Oldboy,"['Drama', 'Thriller', 'Mystery', 'Action']","With no clue how he came to be imprisoned, dru...",10.616859,4.553091
21812,60,2174,4.0,1125828802,Beetlejuice,"['Fantasy', 'Comedy']","Thanks to an untimely demise via drowning, a y...",10.627364,4.004879
28655,60,30749,5.0,1125829142,Hotel Rwanda,"['Drama', 'History', 'War']","Inspired by true events, this film takes place...",10.262332,4.424902
25376,60,1485,2.5,1125828872,Liar Liar,['Comedy'],Fletcher Reede is a fast-talking attorney and ...,10.102471,3.686758


In [42]:
user_movies = user_movies.sort_values(by ='est', ascending=False).head(4)
user_movies['Model'] = 'SVD + CF'
user_movies

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,description,popularity,est,Model
14591,60,858,5.0,1125829174,The Godfather,"['Drama', 'Crime']","Spanning the years 1945 to 1955, a chronicle o...",41.109264,4.935545,SVD + CF
59079,60,5618,4.0,1125829190,Spirited Away,"['Fantasy', 'Adventure', 'Animation', 'Family']",A ten year old girl who wanders away from her ...,41.048867,4.680176,SVD + CF
17000,60,1208,4.5,1125829194,Apocalypse Now,"['Drama', 'War']","At the height of the Vietnam war, Captain Benj...",13.5963,4.584982,SVD + CF
62250,60,27773,4.5,1125829180,Oldboy,"['Drama', 'Thriller', 'Mystery', 'Action']","With no clue how he came to be imprisoned, dru...",10.616859,4.553091,SVD + CF


In [43]:
recommend_list = user_movies[['movieId', 'est', 'Model']]
type(recommend_list)

pandas.core.frame.DataFrame

In [44]:


#     top_movie = user_movies['movieId'].iloc[0]
#     print("Top movie id", top_movie)
#     top_movie_title = movies['title'][movies['movieId'] == top_movie].values[0]
#     print("Top movie title", top_movie_title)

    
movie_list = recommend_list['movieId'].values.tolist()

In [45]:
movie_list

[858, 5618, 1208, 27773]

# Test model hybrid

In [46]:
movie_ids = hybrid(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_movies['est'] = user_movies['movieId'].apply(lambda x: 0.6*knnbaseline_algo.predict(userId,x).est + 0.4*svdpp_algo.predict(userId, x).est)


     movieId       est     Model
211     1263  3.158445  SVD + CF
259     1287  3.108734  SVD + CF
42      1029  2.926555  SVD + CF
117     1129  2.634374  SVD + CF
[1263, 1287, 1029, 1129]
ALERT: Multiple values
User Vector:  [2.    2.25  1.5   2.25  1.    3.5   2.3   3.    2.875 3.    3.5   2.
 2.5   2.5   2.    0.    0.    0.    0.    3.    0.   ]
User top genre list:  ['Horror', 'Romance', 'Thriller']
Final list:  [1219, 112818, 103249, 1993, 8957, 1214, 1200, 121231, 1991, 1387, 356, 152017, 152017, 78772, 125916, 63992, 72407, 1721, 91104, 595, 115149, 112556, 116823, 296, 58559, 541, 114180, 112623, 88744, 159093]


# **Trả về dataframe**

In [47]:
#lấy title theo movieId
def get_title(x):
    mid = x['movieId']
    return movie['title'][movie['movieId'] == mid].values

In [48]:
def get_genre(x):
    mid = x['movieId']
    return movie['genres'][movie['movieId'] == mid].values

In [49]:
movie_ids['title'] = movie_ids.apply(get_title, axis=1)
movie_ids['genre'] = movie_ids.apply(get_genre, axis=1)

In [50]:
movie_ids.sort_values(by='est', ascending = False).head(10)

Unnamed: 0,movieId,est,Model,title,genre
55,152017,3.509355,Popularity,"[Me Before You, Me Before You]","[['Drama', 'Romance'], ['Drama', 'Romance']]"
67,296,3.372708,Popularity,[Pulp Fiction],"[['Thriller', 'Crime']]"
23,1224,3.337956,Movie similarity,[Henry V],"[['War', 'Drama', 'History', 'Action', 'Romanc..."
44,1219,3.272678,Popularity,[Psycho],"[['Drama', 'Horror', 'Thriller']]"
68,58559,3.224507,Popularity,[The Dark Knight],"[['Drama', 'Action', 'Crime', 'Thriller']]"
69,541,3.217233,Popularity,[Blade Runner],"[['Science Fiction', 'Drama', 'Thriller']]"
9,2202,3.182618,Movie similarity,[Lifeboat],"[['Drama', 'War']]"
51,121231,3.172046,Popularity,[It Follows],"[['Horror', 'Thriller']]"
54,356,3.17189,Popularity,[Forrest Gump],"[['Comedy', 'Drama', 'Romance']]"
0,1263,3.158445,SVD + CF,[The Deer Hunter],"[['Drama', 'War']]"
