In [1]:
import pandas as pd
import numpy as np

In [2]:
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from sklearn.model_selection import train_test_split
from ast import literal_eval
from surprise import dump

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [4]:
import collaborative_filtering as CF 
from evaluate_model import precision_recall_at_k, evaluate


In [5]:
from get_movie_features import movie_feature
import warnings; warnings.simplefilter('ignore')

movies_metadata = r'E:\School\DE_AN\Movie-Recommendation-System\src\data\movies_metadata.csv'
links_small = r'E:\School\DE_AN\Movie-Recommendation-System\src\data\links_small.csv'
credits_ = r'E:\School\DE_AN\Movie-Recommendation-System\src\data\credits.csv'
keywords = r'E:\School\DE_AN\Movie-Recommendation-System\src\data\keywords.csv'
rating = r'E:\School\DE_AN\Movie-Recommendation-System\src\data\ratings_small.csv'
full_data = movie_feature(movies_metadata, links_small, credits_, keywords)
rating_df = pd.read_csv(rating)

In [6]:
def cosine_similarity(full_data): #cái dataframe từ file data cuối cùng của mình - get_movie_feature
    '''tính cosine similarity dựa trên overview + tagline + 2*genres'''
    full_data['description'] = full_data['description'].astype('str')
    full_data['genres'] = full_data['genres'].astype('str')
    
    full_data['description_genre'] = full_data['description']+ full_data['genres']*2
    full_data['description_genre'] = full_data['description_genre'].fillna('')

    '''vẫn dùng TF-IDF matrix nhưng cộng với 2*genres để trở thành Count Vector'''

    tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
    tfidf_matrix = tfidf.fit_transform(full_data['description_genre'])
    cosine_sim= linear_kernel(tfidf_matrix, tfidf_matrix)
    return cosine_sim

In [7]:
def mapping_title_toIndex(full_data): #dataframe từ file data cuối cùng của mình - get_movie_feature
    '''map title với index của table movie, index của title = index của bảng, value = title'''
    titles = full_data['title']
    indices = pd.Series(full_data.index, index=full_data['title'])
    return indices

In [45]:
def get_recommendation_new(title, full_data):#dataframe từ file data cuối cùng của mình - get_movie_feature, type(title) = String
    '''Model recommendation dựa trên Movie Similarity'''
    #idx = mapping_title_toIndex(full_data)[title] #lấy ra index của title
    indices = mapping_title_toIndex(full_data)
    cosine_sim = cosine_similarity(full_data)
    idx = indices[title]

    if type(idx) != np.int64:
        if len(idx)>1:
            '''print("ALERT: Multiple values")'''
            idx = idx[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:6]

    movie_indices = [i[0] for i in sim_scores]
    movie_id = full_data['movieId'].iloc[movie_indices]
    score = [i[1] for i in sim_scores]

    return pd.DataFrame({'movieId': movie_id, 'sim_score': score})

In [59]:
def genre_based_popularity(genre, full_data):#dataframe từ file data cuối cùng của mình - get_movie_feature, type(genre) = String
    '''Model recommendation dựa trên popularity'''
    mask = full_data.genres.apply(lambda x: genre in x) # trả về dạng bool, check xem genre có trong cái list genres đó k
    filtered_movie = full_data[mask]# trả về dataframe các film match với genre
    filtered_movie = filtered_movie.sort_values(by='popularity', ascending=False) #xếp theo độ phổ biến nhất
    return filtered_movie['movieId'].head(3).values.tolist() #trả về list top 10 movie similar

In [10]:
def make_useinfo_df(full_data, train_df): #full_data - get_movie_feature; train_df: train của sklearn
    full_data['genres'] = full_data.genres.apply(lambda x: literal_eval(str(x)))
    train_df['genres'] = train_df.genres.apply(lambda x: literal_eval(str(x)))
    
    unique_genre = full_data['genres'].explode().unique() #unique genres của full_data
    genre_distribution = train_df['genres'].explode().value_counts() #unique genres của train_df
    
    # Make a dict assigning an index to a genre
    genre_dict = {k: v for v, k in enumerate(unique_genre)} #key-value: genre - encode

    user_ids = train_df['userId'].unique()
    user_df = pd.DataFrame(columns=['userId', 'user_vector', 'avg_rating', 'num_movies_rated'])
    for user_id in user_ids:
        user_rating_df = train_df[(train_df['userId'] == user_id)]
        user_vector = np.zeros(len(genre_dict))
        count_vector = np.zeros(len(genre_dict))

        user_avg_rating = 0
        movies_rated_count = 0
        for _, row in user_rating_df.iterrows():
            user_avg_rating += row.rating 
            movies_rated_count += 1
            genres = row.genres

            user_movie_vector = np.zeros(len(genre_dict))

            for g in genres:
                user_movie_vector[genre_dict[g]] = 1
                count_vector[genre_dict[g]] += 1
            
            user_vector += user_movie_vector*row.rating
        count_vector = np.where(count_vector==0, 1, count_vector)
        user_vector = np.divide(user_vector, count_vector)
        user_avg_rating /= movies_rated_count
        row_df = pd.DataFrame([[user_id, user_vector, user_avg_rating, movies_rated_count]], 
                          columns=['userId', 'user_vector', 'avg_rating', 'num_movies_rated'])
        user_df = pd.concat([user_df, row_df], ignore_index=True)
        
    return user_df
    '''đặt biến user_info = make_useinfo_df(full_data, train_df): trae về dataframe'''

In [11]:
def user_top_genre(userId, user_info, idx_to_genre): #user_info la dataframe dùng hàm make_useinfo_df(fulldata_path, trainset_path), type(userId) = int
    user_vec = user_info['user_vector'][user_info['userId'] == userId].values[0].copy()
    # print("User Vector: ", user_vec)
    top_genre_indices = np.flip(np.argsort(user_vec))
    genre_list = []
    for i in top_genre_indices[:3]:
        genre_list.append(idx_to_genre[i])
    return genre_list 
    '''đặt 1 biến là genre_list = user_top_genre(userId, user_info'''
    

In [13]:
def get_title(x):
    '''lấy ra title của hàm hybrid'''
    mid = x['movieId']
    return full_data['title'][full_data['movieId'] == mid].values

def get_genre(x):
    '''get genre của hybrid'''
    mid = x['movieId']
    return full_data['genres'][full_data['movieId'] == mid].values

In [62]:
def hybrid(userId, full_data, train_df, train_full, models, idx_to_genre): #full_data là file data movie features
    
    user_movies = train_df[train_df['userId'] == userId]['movieId'].values.tolist()
    
    # get 10 most similar movies

    sim_movies_list = pd.DataFrame(columns=['movieId', 'sim_score'])
    for movie_id in user_movies:
        movie_title = full_data['title'][full_data['movieId'] == movie_id].values[0]
        sim_movie = get_recommendation_new(movie_title, full_data)
        sim_movies_list = pd.concat([sim_movies_list, sim_movie], ignore_index=True)
    sim_movies_list.sort_values(by='sim_score', ascending=False)
    sim_movies_list['movieId'].head(10).values.tolist()

    recommend_list = pd.DataFrame(columns=['userId', 'movieId', 'est', "Model"])
    recommend_list.userId = userId
    recommend_list.movieId = user_movies

    # Compute ratings for the similar movies
    for movie_id in sim_movies_list:
        pred_rating = 0
        for model, w in models.items():
            pred_rating += model.algorithm.predict(userId, movie_id)[3] * w
        row_df = pd.DataFrame([[userId, movie_id, pred_rating, "Similarity"]], columns=['userId', 'movieId', 'est', "Model"])
        recommend_list = pd.concat([recommend_list, row_df], ignore_index=True)
    recommend_list = recommend_list[recommend_list['est'].notnull()].head(10).sort_values(by = 'est',ascending = False)

    # Popular based movies
    top_genre_list = user_top_genre(userId, make_useinfo_df(full_data, train_full), \
                                    idx_to_genre) #data frame user_info

    popular_movies = []
    for top_genre in top_genre_list:
        popular_movies.extend(genre_based_popularity(top_genre, full_data))

    # Compute ratings for the popular movies
    for movie_id in popular_movies:
        pred_rating = 0
        for model, w in models.items():
            pred_rating += model.algorithm.predict(userId, movie_id)[3] * w        
        row_df = pd.DataFrame([[userId, movie_id, pred_rating, 'Popularity']], columns=['userId', 'movieId', 'est','Model'])
        recommend_list = pd.concat([recommend_list, row_df], ignore_index=True)

    recommend_list = recommend_list.drop_duplicates(subset=['movieId'])
    train_movie_list = train_df[train_df['userId']==userId]['movieId'].values.tolist()

    # Remove movies in training for this user
    mask = recommend_list.movieId.apply(lambda x: x not in train_movie_list)
    recommend_list = recommend_list[mask]
    recommend_list.userId = recommend_list.userId.astype(int)
    
    recommend_list['title'] = recommend_list.apply(get_title, axis=1)
    recommend_list['genre'] = recommend_list.apply(get_genre, axis=1)

    return recommend_list.sort_values(by='est', ascending=False).head(10)


# DataFrame

**Full Dataset**

In [14]:
full_data.head(2)

Unnamed: 0,id,movieId,title,genres,description,cast,director,keywords,popularity,vote_average,vote_count,year,wr,spoken_languages
0,862,1,Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...","[tomhanks, timallen, donrickles]",johnlasseter,"[jealousi, toy, boy, friendship, friend, rival...",21.946943,7,5415,1995,6.86977,[English]
1,8844,2,Jumanji,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,"[robinwilliams, jonathanhyde, kirstendunst]",joejohnston,"[boardgam, disappear, basedonchildren'sbook, n...",17.015539,6,2413,1995,5.884891,"[English, Français]"


In [15]:
full_data = full_data[['movieId', 'title', 'genres', 'description', 'popularity']]

**Train_df, Test_df**

In [16]:
#train, test_df
train_df, test_df = train_test_split(rating_df, random_state=42, \
                                     stratify=rating_df['userId'], test_size=0.25)
train_full = full_data.merge(rating_df, on='movieId')

In [17]:
train_df.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
59679,431,2863,4.0,1165548515
85061,571,7173,2.0,1334343358


In [18]:
test_df.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
42031,302,593,5.0,843720636
26084,191,110,3.0,839925631


In [19]:
train_full.head(2)

Unnamed: 0,movieId,title,genres,description,popularity,userId,rating,timestamp
0,1,Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",21.946943,7,3.0,851866703
1,1,Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",21.946943,9,4.0,938629179


**User_info**

In [20]:
user_info = make_useinfo_df(full_data, train_full)

In [21]:
user_info.head(3)

Unnamed: 0,userId,user_vector,avg_rating,num_movies_rated
0,7,"[4.0, 3.5294117647058822, 3.625, 3.43333333333...",3.448276,87
1,9,"[3.3333333333333335, 3.5833333333333335, 3.333...",3.755556,45
2,13,"[4.0, 3.5277777777777777, 3.9705882352941178, ...",3.730769,52


# Algorithms

**knnbaseline_algo, svdpp_algo, BaselineOnly**

In [22]:
# train_set, test_set = train_test_split(train_df, test_size=0.25,\
                                    # stratify=train_df['userId'], random_state=42)

In [23]:
# Tuple: precision, recall, F1 score

knn_base_line = CF.CollaborativeFiltering(KNNBaseline())
knn_base_line.load_data(train_df, test_df)
knn_base_line.fit()
knn_base_line.predict()

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9035
MAE:  0.6917


(0.7525242667117319, 0.6784179454034541, 0.7135521792094823)

In [24]:
# svd_algo = svd(train_convert, val_convert)
svdpp = CF.CollaborativeFiltering(SVDpp())
svdpp.load_data(train_df, test_df)
svdpp.fit()
svdpp.predict()

RMSE: 0.8948
MAE:  0.6860


(0.7345044571195224, 0.6930292566015249, 0.7131643519103869)

In [25]:
base_line_only = CF.CollaborativeFiltering(BaselineOnly())
base_line_only.load_data(train_df, test_df)
base_line_only.fit()
base_line_only.predict()

Estimating biases using als...
RMSE: 0.8995
MAE:  0.6951


(0.7392863655981999, 0.695005133582501, 0.7164621969408549)

**cosine_similarity, indices**

In [26]:
cosine_sim = cosine_similarity(full_data)

In [27]:
indices = mapping_title_toIndex(full_data)

In [28]:
genre_to_idx = {'Animation': 0,
 'Comedy': 1,
 'Family': 2,
 'Adventure': 3,
 'Fantasy': 4,
 'Romance': 5,
 'Drama': 6,
 'Action': 7,
 'Crime': 8,
 'Thriller': 9,
 'Horror': 10,
 'History': 11,
 'Science Fiction': 12,
 'Mystery': 13,
 'War': 14,
 'Foreign': 15,
 'Music': 16,
 'Documentary': 17,
 None: 18,
 'Western': 19,
 'TV Movie': 20}

In [29]:
idx_to_genre = {0: 'Animation',
 1: 'Comedy',
 2: 'Family',
 3: 'Adventure',
 4: 'Fantasy',
 5: 'Romance',
 6: 'Drama',
 7: 'Action',
 8: 'Crime',
 9: 'Thriller',
 10: 'Horror',
 11: 'History',
 12: 'Science Fiction',
 13: 'Mystery',
 14: 'War',
 15: 'Foreign',
 16: 'Music',
 17: 'Documentary',
 18: None,
 19: 'Western',
 20: 'TV Movie'}

# Modeling 

**Movie Similarity model**

In [42]:
get_recommendation_new('Toy Story', full_data)

[(0, 1.0), (2522, 0.28756765994464184), (7629, 0.23259654531091614), (6267, 0.12302282475189674), (8432, 0.10715934388111009), (2751, 0.10508214486551863), (2567, 0.09708859375679496), (1432, 0.09487004679517726), (6809, 0.08934720473853719), (3833, 0.08823846005183036), (7254, 0.08724072071012798), (1219, 0.08653372561070624), (9093, 0.08153789976835765), (6534, 0.08094665654659736), (1662, 0.08088353117976962), (9045, 0.07962682309911445), (8519, 0.07916537457469214), (6519, 0.07912526931644351), (7854, 0.07910220544188537), (6733, 0.07884099787801448), (9134, 0.07730259261345862), (9135, 0.07730259261345862), (8758, 0.07675245877639231), (6708, 0.07666008339865733), (7163, 0.07547257957424267), (9182, 0.07533996383510742), (8230, 0.07392862852901624), (5456, 0.07389132746700759), (9129, 0.07250236727146582), (7454, 0.07217731655914686), (6130, 0.07153794726724119), (910, 0.07122721039608364), (4757, 0.07103921108081647), (6427, 0.07102037651786333), (8595, 0.06978494348578239), (864

Unnamed: 0,movieId,sim_score
2522,3114,0.287568
7629,78499,0.232597
6267,35836,0.123023
8432,103335,0.107159
2751,3429,0.105082
2567,3174,0.097089
1432,1822,0.09487
6809,54272,0.089347
3833,4886,0.088238
7254,67295,0.087241


**Popularity Model**

In [31]:
# get user's favourite genres
user_top_gen = user_top_genre(1, user_info, idx_to_genre)

In [32]:
# popular movie id based on genres
popularity = genre_based_popularity('Animation', full_data)
popularity

[135887, 115617, 5618, 4886, 152081, 152081, 6377, 103335, 106696, 134853]

**Hybrid Model**

In [61]:
# combine best algorithms by weight
models = {svdpp: 0.4, base_line_only: 0.4, knn_base_line: 0.2}

hybrid_result = hybrid(1, full_data, train_df, train_full, models, idx_to_genre)
hybrid_result


Unnamed: 0,userId,movieId,est,Model,title,genre
5,1,1219,3.41763,Popularity,[Psycho],"[[Drama, Horror, Thriller]]"
2,1,356,3.37157,Popularity,[Forrest Gump],"[[Comedy, Drama, Romance]]"
10,1,99114,3.276081,Popularity,[Django Unchained],"[[Drama, Western]]"
9,1,128360,3.20222,Popularity,[The Hateful Eight],"[[Crime, Drama, Mystery, Western]]"
8,1,139385,3.082193,Popularity,[The Revenant],"[[Western, Drama, Adventure, Thriller]]"
3,1,152017,2.936507,Popularity,"[Me Before You, Me Before You]","[[Drama, Romance], [Drama, Romance]]"
0,1,movieId,2.854043,Similarity,[],[]
1,1,sim_score,2.854043,Similarity,[],[]
6,1,112818,2.655662,Popularity,[The Purge: Anarchy],"[[Horror, Thriller]]"
7,1,103249,2.600605,Popularity,[World War Z],"[[Action, Drama, Horror, Science Fiction, Thri..."


# Evaluate model 

In [34]:
pred_df = pd.DataFrame(columns=['userId', 'movieId', 'est', 'Model'])

for user in test_df.userId.unique():
    pred_df = pd.concat([pred_df, hybrid(user, full_data, train_df, \
                              train_full, models, idx_to_genre)], ignore_index=True)
pred_df

Unnamed: 0,userId,movieId,est,Model
0,302,117529,3.150111,Similarity
1,302,60674,3.150111,Similarity
2,302,65552,3.150111,Similarity
3,302,8447,3.150111,Similarity
4,302,136800,3.150111,Similarity
...,...,...,...,...
69,191,7143,3.150111,Popularity
70,191,3155,3.150111,Popularity
71,191,33162,3.150111,Popularity
72,191,3342,3.150111,Popularity


In [35]:
evaluate(pred_df[['userId', 'movieId']], test_df[['userId', 'movieId']])

7.84375245117264e-05