In [1]:
import pandas as pd
import numpy as np

In [2]:
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from sklearn.model_selection import train_test_split
from ast import literal_eval
from surprise import dump

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [4]:
def read_data(data_path):
    '''read csv file, bỏ column Unnamed: 0'''
    df = pd.read_csv(data_path)
    col = 'Unnamed: 0'
    if col in df.columns:
        df.drop(col, axis = 1, inplace = True)
    return df

In [5]:
def train_test_df(rating_data, full_data): #full_data là file data cuối cùng của mình tuwf get_movie_features (input la dataframe)
    '''chia tập train và tập test theo sklearn'''
    rating_train, rating_test = train_test_split(rating_data, random_state=42, \
                                     stratify=rating_data['userId'], test_size=0.25)

    full_data = full_data[['movieId', 'title', 'genres', 'description', 'popularity']]
    
    '''Merge full_dataset với rating, chia train: 80, test: 20'''
    train_df = full_data.merge(rating_train, on = 'movieId')
    test_df = full_data.merge(rating_test, on = 'movieId')
    return train_df, test_df
    '''train_df, test_df = train_test(rating_path, full_data_path)'''
    

In [6]:
def convert_traintest_dataframe_forsurprise(train_df, test_df): #train, test_df lấy ở hàm train_test_df
    '''Dùng để convert trainset, testset để dùng cho thư viện surprise'''
    reader = Reader(rating_scale=(0, 5))
    train_convert = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)
    test_convert = Dataset.load_from_df(test_df[['userId', 'movieId', 'rating']], reader)
    train_convert = train_convert.construct_trainset(train_convert.raw_ratings)
    test_convert = test_convert.construct_testset(test_convert.raw_ratings)
    return train_convert, test_convert
    '''train_convert, test_convert = convert_traintest_dataframe_forsurprise(train_df, test_df)'''

In [7]:
def knnbaseline(train_convert, test_convert):
    sim_options = {'name': 'cosine', 'user_based': False} # compute  similarities between items
    knnbaseline_algo = KNNBaseline(sim_options=sim_options)

    knnbaseline_algo.fit(train_convert)
    knnbaseline_predictions = knnbaseline_algo.test(test_convert)

    file_name = 'KnnBaseline_model'
    dump.dump(file_name, algo=knnbaseline_predictions)

    accuracy.rmse(knnbaseline_predictions)
    accuracy.mae(knnbaseline_predictions)
    print("Done!")
    return knnbaseline_algo.fit(train_convert)#phải đặt tên 1 biến là knnbaseline_algo = knnbaseline(trainset, testset)

In [8]:
def svd(train_convert, test_convert):
    svd_algo = SVD()

    svd_algo.fit(train_convert)
    svd_predictions = svd_algo.test(test_convert)

    file_name = 'svd_model'
    dump.dump(file_name, algo=svd_algo)

    accuracy.rmse(svd_predictions)
    accuracy.mae(svd_predictions)
    print("Done!")
    return svd_algo.fit(train_convert)#phải đặt tên 1 biến là svd_algo = svd(train_convert, test_convert):

In [9]:
def svdpp(train_convert, test_convert):
    svdpp_algo = SVDpp()
    svdpp_algo.fit(train_convert)
    svdpp_predictions = svdpp_algo.test(test_convert)
    
    file_name = 'svdpp_model'
    dump.dump(file_name, algo=svdpp_algo)
    
    
    accuracy.rmse(svdpp_predictions)
    accuracy.mae(svdpp_predictions)
    print("Done!")
    return svdpp_algo.fit(train_convert) #phải đặt tên 1 biến là svdpp_algo = svdpp(train_convert, test_convert):

In [10]:
def cosine_similarity(full_data): #cái dataframe từ file data cuối cùng của mình - get_movie_feature
    '''tính cosine similarity dựa trên overview + tagline + 2*genres'''
    full_data['description'] = full_data['description'].astype('str')
    full_data['genres'] = full_data['genres'].astype('str')
    
    full_data['description_genre'] = full_data['description']+ full_data['genres']*2
    full_data['description_genre'] = full_data['description_genre'].fillna('')

    '''vẫn dùng TF-IDF matrix nhưng cộng với 2*genres để trở thành Count Vector'''

    tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
    tfidf_matrix = tfidf.fit_transform(full_data['description_genre'])
    cosine_sim= linear_kernel(tfidf_matrix, tfidf_matrix)
    return cosine_sim

In [11]:
def mapping_title_toIndex(full_data): #dataframe từ file data cuối cùng của mình - get_movie_feature
    '''map title với index của table movie, index của title = index của bảng, value = title'''
    titles = full_data['title']
    indices = pd.Series(full_data.index, index=full_data['title'])
    return indices

In [12]:
def get_recommendation_new(title, full_data, indices, cosine_sim):#dataframe từ file data cuối cùng của mình - get_movie_feature, type(title) = String
    '''Model recommendation dựa trên Movie Similarity'''
    #idx = mapping_title_toIndex(full_data)[title] #lấy ra index của title
    idx = indices[title] #indices = mapping_title_toIndex(full_data)
    if type(idx) != np.int64:
        if len(idx)>1:
            '''print("ALERT: Multiple values")'''
            idx = idx[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return full_data['movieId'].iloc[movie_indices]

In [13]:
def genre_based_popularity(genre, full_data):#dataframe từ file data cuối cùng của mình - get_movie_feature, type(genre) = String
    '''Model recommendation dựa trên popularity'''
    mask = full_data.genres.apply(lambda x: genre in x) # trả về dạng bool, check xem genre có trong cái list genres đó k
    filtered_movie = full_data[mask]# trả về dataframe các film match với genre
    filtered_movie = filtered_movie.sort_values(by='popularity', ascending=False) #xếp theo độ phổ biến nhất
    return filtered_movie['movieId'].head(10).values.tolist() #trả về list top 10 movie similar

In [14]:
def make_useinfo_df(full_data, train_df): #full_data - get_movie_feature; train_df: train của sklearn
    full_data['genres'] = full_data.genres.apply(lambda x: literal_eval(str(x)))
    train_df['genres'] = train_df.genres.apply(lambda x: literal_eval(str(x)))
    
    unique_genre = full_data['genres'].explode().unique() #unique genres của full_data
    genre_distribution = train_df['genres'].explode().value_counts() #unique genres của train_df
    
    # Make a dict assigning an index to a genre
    genre_dict = {k: v for v, k in enumerate(unique_genre)} #key-value: genre - encode

    user_ids = train_df['userId'].unique()
    user_df = pd.DataFrame(columns=['userId', 'user_vector', 'avg_rating', 'num_movies_rated'])
    for user_id in user_ids:
        user_rating_df = train_df[(train_df['userId'] == user_id)]
        user_vector = np.zeros(len(genre_dict))
        count_vector = np.zeros(len(genre_dict))

        user_avg_rating = 0
        movies_rated_count = 0
        for _, row in user_rating_df.iterrows():
            user_avg_rating += row.rating 
            movies_rated_count += 1
            genres = row.genres

            user_movie_vector = np.zeros(len(genre_dict))

            for g in genres:
                user_movie_vector[genre_dict[g]] = 1
                count_vector[genre_dict[g]] += 1
            
            user_vector += user_movie_vector*row.rating
        count_vector = np.where(count_vector==0, 1, count_vector)
        user_vector = np.divide(user_vector, count_vector)
        user_avg_rating /= movies_rated_count
        row_df = pd.DataFrame([[user_id, user_vector, user_avg_rating, movies_rated_count]], 
                          columns=['userId', 'user_vector', 'avg_rating', 'num_movies_rated'])
        user_df = pd.concat([user_df, row_df], ignore_index=True)
        
    return user_df
    '''đặt biến user_info = make_useinfo_df(full_data, train_df): trae về dataframe'''

In [15]:
def user_top_genre(userId, user_info, idx_to_genre): #user_info la dataframe dùng hàm make_useinfo_df(fulldata_path, trainset_path), type(userId) = int
    user_vec = user_info['user_vector'][user_info['userId'] == userId].values[0].copy()
    print("User Vector: ", user_vec)
    top_genre_indices = np.flip(np.argsort(user_vec))
    genre_list = []
    for i in top_genre_indices[:3]:
        genre_list.append(idx_to_genre[i])
    return genre_list 
    '''đặt 1 biến là genre_list = user_top_genre(userId, user_info'''
    

In [16]:
def hybrid(userId, full_data, train_df, test_df, knnbaseline_algo, svdpp_algo, indices, cosine_sim): #full_data là file data cuối cùng, train-test_df là qua sklearn
    user_movies = train_df[train_df['userId'] == userId]
    recommend_list = user_movies[['movieId']]
    movie_list = recommend_list['movieId'].values.tolist()
    
    sim_movies_list = []
    for movie_id in movie_list:
        # Call content based 
        movie_title = full_data['title'][full_data['movieId'] == movie_id].values[0]
        sim_movies = get_recommendation_new(movie_title, full_data,indices, cosine_sim) 
        sim_movies_list.extend(sim_movies)
    # Compute ratings for the popular movies
    for movie_id in sim_movies_list:
        pred_rating = 0.6*knnbaseline_algo.predict(userId, movie_id).est + 0.4*svdpp_algo.predict(userId, movie_id).est
        row_df = pd.DataFrame([[userId, movie_id, pred_rating, 'Movie similarity']], columns=['userId','movieId', 'est','Model'])
        recommend_list = pd.concat([recommend_list, row_df], ignore_index=True)
    recommend_list = recommend_list[recommend_list['est'].notnull()].head(10).sort_values(by = 'est',ascending = False)
    return recommend_list

    '''# Popular based movies
    top_genre_list = user_top_genre(userId, user_info) #data frame user_info
    print("User top genre list: ", top_genre_list)

    popular_movies = []
    for top_genre in top_genre_list:
        popular_movies.extend(genre_based_popularity(top_genre, full_data))
    print("Final list: ", popular_movies)

    # Compute ratings for the popular movies
    for movie_id in popular_movies:
        pred_rating = 0.6*knnbaseline_algo.predict(userId, movie_id).est + 0.4*svdpp_algo.predict(userId, movie_id).est
        row_df = pd.DataFrame([[movie_id, pred_rating, 'Popularity']], columns=['movieId', 'est','Model'])
        recommend_list = pd.concat([recommend_list, row_df], ignore_index=True)
    recommend_list = recommend_list.drop_duplicates(subset=['movieId'])
    train_movie_list = train_df[train_df['userId']==userId]['movieId'].values.tolist()

    # Remove movies in training for this user
    mask = recommend_list.movieId.apply(lambda x: x not in train_movie_list)
    recommend_list = recommend_list[mask]
    
    return recommend_list'''

In [17]:
def hybrid_test(userId, full_data, train_df, test_df, knnbaseline_algo, svdpp_algo, indices, cosine_sim): #full_data là file data cuối cùng, train-test_df là qua sklearn
    user_movies = train_df[train_df['userId'] == userId]
    recommend_list = user_movies[['movieId']]
    movie_list = recommend_list['movieId'].values.tolist()
    
    sim_movies_list = []
    for movie_id in movie_list:
        # Call content based 
        movie_title = test_df['title'][test_df['movieId'] == movie_id].values[0]
        sim_movies = get_recommendation_new(movie_title, test_df,indices, cosine_sim) 
        sim_movies_list.extend(sim_movies)
    # Compute ratings for the popular movies
    for movie_id in sim_movies_list:
        pred_rating = 0.6*knnbaseline_algo.predict(userId, movie_id).est + 0.4*svdpp_algo.predict(userId, movie_id).est
        row_df = pd.DataFrame([[userId, movie_id, pred_rating,rating, 'Movie similarity']], columns=['userId','movieId', 'est', 'true rating','Model'])
        recommend_list = pd.concat([recommend_list, row_df], ignore_index=True)
    recommend_list = recommend_list[recommend_list['est'].notnull()].head(10).sort_values(by = 'est',ascending = False)
    return recommend_list

In [18]:
def get_title(x):
    '''lấy ra title của hàm hybrid'''
    mid = x['movieId']
    return full_data['title'][full_data['movieId'] == mid].values

def get_genre(x):
    '''get genre của hybrid'''
    mid = x['movieId']
    return full_data['genres'][full_data['movieId'] == mid].values

# DataFrame

**Data path**

In [19]:
full_data_path = 'final_data'
rating_path = 'ratings_small.csv'

**Full Dataset**

In [20]:
#full_dataset
full_data = read_data(full_data_path)
full_data.head(3)

Unnamed: 0,id,movieId,title,genres,description,cast,director,keywords,popularity,vote_average,vote_count,year,wr,spoken_languages
0,862,1,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...","['tomhanks', 'timallen', 'donrickles']",johnlasseter,"['jealousi', 'toy', 'boy', 'friendship', 'frie...",21.946943,7,5415,1995,6.86977,['English']
1,8844,2,Jumanji,"['Adventure', 'Fantasy', 'Family']",When siblings Judy and Peter discover an encha...,"['robinwilliams', 'jonathanhyde', 'kirstendunst']",joejohnston,"['boardgam', 'disappear', ""basedonchildren'sbo...",17.015539,6,2413,1995,5.884891,"['English', 'Français']"
2,15602,3,Grumpier Old Men,"['Romance', 'Comedy']",A family wedding reignites the ancient feud be...,"['waltermatthau', 'jacklemmon', 'ann-margret']",howarddeutch,"['fish', 'bestfriend', 'duringcreditssting']",11.7129,6,92,1995,5.376968,['English']


In [21]:
full_data = full_data[['movieId', 'title', 'genres', 'description', 'popularity']]

In [22]:
full_data.head()

Unnamed: 0,movieId,title,genres,description,popularity
0,1,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...",21.946943
1,2,Jumanji,"['Adventure', 'Fantasy', 'Family']",When siblings Judy and Peter discover an encha...,17.015539
2,3,Grumpier Old Men,"['Romance', 'Comedy']",A family wedding reignites the ancient feud be...,11.7129
3,4,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']","Cheated on, mistreated and stepped on, the wom...",3.859495
4,5,Father of the Bride Part II,['Comedy'],Just when George Banks has recovered from his ...,8.387519


**Rating Dataset**

In [23]:
#rating_dataset
rating_ = read_data(rating_path)
rating_.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182


**Train_df, Test_df**

In [24]:
#train, test_df
train_df, test_df = train_test_df(rating_, full_data)

In [25]:
train_df.head(3)

Unnamed: 0,movieId,title,genres,description,popularity,userId,rating,timestamp
0,1,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...",21.946943,534,5.0,973376852
1,1,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...",21.946943,219,5.0,974475264
2,1,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...",21.946943,63,5.0,1079098216


In [26]:
test_df.head(3)

Unnamed: 0,movieId,title,genres,description,popularity,userId,rating,timestamp
0,1,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...",21.946943,333,4.0,1441197471
1,1,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...",21.946943,501,5.0,1283137657
2,1,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...",21.946943,468,4.0,1296195523


**Train_convert, Test_convert**

In [27]:
train_convert, test_convert = convert_traintest_dataframe_forsurprise(train_df, test_df)

In [28]:
print(train_convert)
print(test_convert)

<surprise.trainset.Trainset object at 0x000002B4D9BFEFD0>
[(333, 1, 4.0), (501, 1, 5.0), (468, 1, 4.0), (433, 1, 4.5), (455, 1, 3.5), (92, 1, 5.0), (584, 1, 5.0), (157, 1, 3.5), (179, 1, 5.0), (506, 1, 4.0), (187, 1, 4.0), (69, 1, 5.0), (596, 1, 3.5), (401, 1, 5.0), (564, 1, 4.0), (580, 1, 4.0), (224, 1, 4.0), (273, 1, 4.5), (26, 1, 5.0), (284, 1, 3.0), (470, 1, 3.5), (559, 1, 4.0), (142, 1, 4.0), (178, 1, 4.0), (486, 1, 5.0), (406, 1, 3.5), (671, 1, 5.0), (212, 1, 3.0), (615, 1, 4.0), (184, 1, 5.0), (37, 1, 4.0), (597, 1, 5.0), (380, 1, 4.0), (412, 1, 4.0), (670, 1, 4.0), (7, 1, 3.0), (471, 1, 3.5), (77, 1, 4.0), (472, 1, 5.0), (426, 1, 3.0), (560, 1, 4.5), (136, 1, 4.5), (475, 1, 4.5), (93, 1, 4.0), (237, 1, 3.0), (87, 1, 3.0), (79, 1, 2.0), (440, 1, 4.0), (382, 1, 3.5), (664, 1, 3.5), (13, 1, 5.0), (176, 1, 2.0), (525, 1, 2.0), (531, 1, 3.0), (379, 1, 3.5), (200, 1, 3.0), (220, 1, 2.0), (603, 1, 4.0), (153, 1, 5.0), (425, 1, 4.5), (417, 1, 3.5), (518, 1, 5.0), (48, 2, 3.5), (496, 2,

**User_info**

In [29]:
user_info = make_useinfo_df(full_data, train_df)

In [30]:
user_info.head(3)

Unnamed: 0,userId,user_vector,avg_rating,num_movies_rated
0,534,"[4.545454545454546, 3.140625, 3.8, 3.960784313...",3.651982,227
1,219,"[3.5, 3.8095238095238093, 3.5789473684210527, ...",3.682692,104
2,63,"[4.333333333333333, 3.5, 3.9375, 4.02083333333...",3.787671,73


# Metric - Dictionary 

**knnbaseline_algo, svd_algo, svdpp_algo**

In [31]:
knnbaseline_algo = knnbaseline(train_convert, test_convert)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9197
MAE:  0.7114
Done!
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [32]:
svd_algo = svd(train_convert, test_convert)

RMSE: 0.9069
MAE:  0.6972
Done!


In [33]:
svdpp_algo = svdpp(train_convert, test_convert)

RMSE: 0.8956
MAE:  0.6847
Done!


**cosine_similarity, indices**

In [34]:
cosine_sim = cosine_similarity(full_data)

In [35]:
indices = mapping_title_toIndex(full_data)

In [36]:
genre_to_idx = {'Animation': 0,
 'Comedy': 1,
 'Family': 2,
 'Adventure': 3,
 'Fantasy': 4,
 'Romance': 5,
 'Drama': 6,
 'Action': 7,
 'Crime': 8,
 'Thriller': 9,
 'Horror': 10,
 'History': 11,
 'Science Fiction': 12,
 'Mystery': 13,
 'War': 14,
 'Foreign': 15,
 'Music': 16,
 'Documentary': 17,
 None: 18,
 'Western': 19,
 'TV Movie': 20}

In [37]:
idx_to_genre = {0: 'Animation',
 1: 'Comedy',
 2: 'Family',
 3: 'Adventure',
 4: 'Fantasy',
 5: 'Romance',
 6: 'Drama',
 7: 'Action',
 8: 'Crime',
 9: 'Thriller',
 10: 'Horror',
 11: 'History',
 12: 'Science Fiction',
 13: 'Mystery',
 14: 'War',
 15: 'Foreign',
 16: 'Music',
 17: 'Documentary',
 18: None,
 19: 'Western',
 20: 'TV Movie'}

# Modeling 

**Movie Similarity model**

In [38]:
get_recommendation_new('Toy Story', full_data, indices, cosine_sim).to_frame()

Unnamed: 0,movieId
2522,3114
7629,78499
6267,35836
8432,103335
2751,3429
2567,3174
1432,1822
6809,54272
3833,4886
7254,67295


**Popularity Model**

In [39]:
popularity = genre_based_popularity('Animation', full_data)

In [40]:
popularity

[135887, 115617, 5618, 4886, 152081, 152081, 6377, 103335, 106696, 134853]

In [41]:
user_top_gen = user_top_genre(1, user_info, idx_to_genre)

User Vector:  [3.         2.33333333 2.         1.875      1.         3.75
 2.5        2.6        2.83333333 3.         3.         2.
 2.         2.5        0.         0.         0.         0.
 0.         3.         0.        ]


**Hybrid Model**

In [42]:
hybrid_result = hybrid(60, full_data, train_df, test_df, knnbaseline_algo, svdpp_algo, indices, cosine_sim)

In [43]:
hybrid_result['title'] = hybrid_result.apply(get_title, axis=1)
hybrid_result['genre'] = hybrid_result.apply(get_genre, axis=1)

In [44]:
#final result for hybrid model
hybrid_result['userId'] = hybrid_result['userId'].astype('int')

In [45]:
hybrid_result

Unnamed: 0,movieId,userId,est,Model,title,genre
48,2580,60,4.296952,Movie similarity,[Go],"[['Crime', 'Comedy', 'Thriller']]"
42,322,60,4.150402,Movie similarity,[Swimming with Sharks],"[['Crime', 'Comedy']]"
41,88118,60,3.848319,Movie similarity,[The Perfect Host],"[['Comedy', 'Thriller', 'Crime']]"
44,5568,60,3.848319,Movie similarity,[Johnny Dangerously],"[['Action', 'Comedy', 'Thriller', 'Crime']]"
49,6548,60,3.774694,Movie similarity,[Bad Boys II],"[['Adventure', 'Action', 'Comedy', 'Thriller',..."
45,5179,60,3.748676,Movie similarity,[Gloria],"[['Drama', 'Action', 'Thriller', 'Crime']]"
46,1689,60,3.73904,Movie similarity,[The Man Who Knew Too Little],"[['Comedy', 'Thriller', 'Crime', 'Action']]"
47,627,60,3.702644,Movie similarity,[The Last Supper],"[['Comedy', 'Thriller', 'Crime', 'Drama']]"
40,32019,60,3.578977,Movie similarity,[Be Cool],"[['Comedy', 'Crime']]"
43,6763,60,3.420548,Movie similarity,[Duplex],"[['Action', 'Comedy', 'Thriller']]"


# Evaluate model 

In [46]:
def split_data(rating_df):
    train_df, test_df = train_test_split(rating_df, \
                                         test_size=0.2, stratify=rating_df['userId'], \
                                        random_state=42)
    return train_df, test_df

def check_movieId(pred_df, val_df):
    result = pred_df['movieId'].isin(val_df[val_df['userId'] == \
                                   pred_df['userId'].iloc[0]]['movieId'])
    return result.reset_index(drop=True)

def evaluate(pred_df, val_df):
    """ Proportion of movies recommended that were actually watched by users

    Args:
    pred_df: dataframe, 2 columns: userId, movieId
    val_df: dataframe, 2 columns: userId, movieId

    Returns:
    
    """
    result = pred_df.groupby('userId').apply(lambda x: check_movieId(x, val_df))
    n_user = val_df.userId.nunique()
    top_k = pred_df.groupby('userId').count().iloc[0]
    top_k = int(top_k.iloc[0])

    return result.sum().sum() / (n_user*top_k)

def precision_recall_at_k(test_df, threshold):
    """Return recall and precision, F-1 scrore for collaborative + hybrid

    Args:
        test_df: prediction dataframe, with 4 columns: userId, movieId, true_rating, pred_rating
        threshold: if rating > threshold, movie is believed to be relevant

    Returns:
    Recall: Proportion of relevant items that are recommended, dict-like
    Precision: Proportion of recommended items that are relevant, dict-like
        Movie is relevant if true_rating > threshold
        Movie is recommend when pred_rating > threshold
    """
    recalls = dict()
    precisions = dict()

    for userId, group in test_df.groupby('userId'):

        filter_rel = group[group['rating'] > threshold]
        filter_rec = group[group['pred_rating'] > threshold]
        filter_rel_rec = group[(group['pred_rating'] > threshold) & \
                               (group['rating'] > threshold)]

        # Number of relevant items
        n_rel = len(filter_rel)

        # Number of recommended items in top k
        n_rec = len(filter_rec)

        # Number of relevant and recommended items in top k
        n_rel_rec = len(filter_rel_rec)

        recalls[userId] = n_rel_rec/n_rel if n_rel != 0 else 1
        precisions[userId] = n_rel_rec/n_rec if n_rec != 0 else 1

    precision = sum(prec for prec in precisions.values())/len(precisions)
    recall = sum(rec for rec in recalls.values())/len(recalls)
    fmeasure = (2*precision*recall)/(precision + recall)

    return recall, precision, fmeasure

In [47]:
check_movieId(hybrid_result, test_df)

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: movieId, dtype: bool

In [48]:
evaluate(hybrid_result, test_df)

0.0