In [1]:
import pandas as pd
import numpy as np

In [2]:
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from sklearn.model_selection import train_test_split
from ast import literal_eval
from surprise import dump

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [4]:
import collaborative_filtering as CF 
from evaluate_model import precision_recall_at_k, evaluate


In [5]:
from get_movie_features import movie_feature
import warnings; warnings.simplefilter('ignore')

# Single root folder for every input CSV, so relocating the data means editing one line.
# NOTE(review): this is an absolute, machine-specific path — consider a
# project-relative location so the notebook can run on other machines.
DATA_DIR = r'E:\School\DE_AN\Movie-Recommendation-System\src\data'

# The resulting strings are identical to the previously hard-coded full paths.
movies_metadata = DATA_DIR + r'\movies_metadata.csv'
links_small = DATA_DIR + r'\links_small.csv'
credits_ = DATA_DIR + r'\credits.csv'
keywords = DATA_DIR + r'\keywords.csv'
rating = DATA_DIR + r'\ratings_small.csv'

# Movie feature table assembled by the project helper from the four metadata files.
full_data = movie_feature(movies_metadata, links_small, credits_, keywords)
# Raw user ratings.
rating_df = pd.read_csv(rating)

In [6]:
def cosine_similarity(full_data):
    '''Compute pairwise movie similarity from description and genres (TF-IDF).

    Each movie's text is its description followed by its genres repeated
    twice, so genre tokens weigh more than description tokens. The text is
    vectorised with word-level TF-IDF (unigrams and bigrams, English stop
    words removed).

    The text is built in local Series, so the caller's ``full_data`` is not
    modified (no column is added and 'genres' is not converted to str).

    Note: this name shadows ``sklearn.metrics.pairwise.cosine_similarity``
    imported at the top of the notebook.

    Parameters
    ----------
    full_data : DataFrame
        Must contain the columns 'description' and 'genres'.

    Returns
    -------
    Square matrix whose entry [i, j] is the similarity between the movies at
    positional rows i and j of ``full_data``.
    '''
    # Fill missing values BEFORE casting to str; casting first turns NaN into
    # the literal token 'nan', which would then count as a shared word.
    description = full_data['description'].fillna('').astype('str')
    genres = full_data['genres'].fillna('').astype('str')

    # Explicit separators keep the last description word from fusing with the
    # first genre word when the genre text does not start with punctuation.
    description_genre = description + ' ' + genres + ' ' + genres

    tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
    tfidf_matrix = tfidf.fit_transform(description_genre)
    # TfidfVectorizer L2-normalises rows by default, so the plain dot product
    # computed by linear_kernel is the cosine similarity.
    return linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:
def mapping_title_toIndex(full_data):
    '''Build a lookup from movie title to row position in ``full_data``.

    The returned Series is indexed by title and its values are 0-based row
    positions. Positions (not index labels) are stored because the lookup is
    used to address rows of the similarity matrix and ``.iloc`` selections,
    both of which are positional. For a frame with the default RangeIndex the
    result is the same as mapping to the index labels.

    Titles that occur more than once yield several entries; looking such a
    title up returns a Series rather than a scalar.

    Parameters
    ----------
    full_data : DataFrame with a 'title' column.

    Returns
    -------
    pandas Series of int64 positions, indexed by title.
    '''
    # Fixed int64 dtype: numpy's default integer is platform dependent.
    positions = np.arange(len(full_data), dtype=np.int64)
    return pd.Series(positions, index=pd.Index(full_data['title'], name='title'))

In [8]:
def get_recommendation_new(title, full_data, indices, cosine_sim, top_n=5):
    '''Recommend the movies most similar to ``title`` (content-based).

    Parameters
    ----------
    title : title to look up in ``indices``.
    full_data : DataFrame with a 'movieId' column, in the same row order as
        ``cosine_sim``.
    indices : mapping (e.g. Series) from title to row position.
    cosine_sim : square similarity matrix; row i holds the similarities of
        the movie at position i to every movie.
    top_n : number of recommendations to return (default 5).

    Returns
    -------
    DataFrame with columns 'movieId' and 'sim_score', most similar first.
    The queried movie itself is never part of the result.

    Raises
    ------
    KeyError if ``title`` is not present in ``indices``.
    '''
    idx = indices[title]
    # A title shared by several movies yields several positions; use the first.
    if not np.isscalar(idx):
        idx = np.asarray(idx).ravel()[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx], dtype=float)
    # Stable descending order: ties keep their original row order.
    order = np.argsort(-scores, kind='stable')
    # Drop the queried movie by position instead of assuming it sorts first;
    # a movie with identical text would otherwise push it into the results.
    order = order[order != idx][:top_n]

    movie_id = full_data['movieId'].iloc[order]
    return pd.DataFrame({'movieId': movie_id, 'sim_score': scores[order]})

In [9]:
def genre_based_popularity(genre, full_data, top_n=3):
    '''Recommend the most popular movies of one genre.

    Parameters
    ----------
    genre : genre to look for in each movie's 'genres' cell.
    full_data : DataFrame with 'genres', 'popularity' and 'movieId' columns.
    top_n : number of movie ids to return (default 3).

    Returns
    -------
    List of up to ``top_n`` movie ids, most popular first.

    Notes
    -----
    Membership uses ``genre in cell``: for a list cell this is an exact
    match, for a string cell it is a substring match. Cells that cannot be
    searched (e.g. a missing value) are treated as not matching instead of
    raising.
    '''
    def _has_genre(cell):
        try:
            return genre in cell
        except TypeError:
            # e.g. a NaN cell, or a None genre tested against a string cell
            return False

    mask = full_data['genres'].apply(_has_genre).astype(bool)
    filtered_movie = full_data[mask].sort_values(by='popularity', ascending=False)
    return filtered_movie['movieId'].head(top_n).values.tolist()

In [10]:
def make_useinfo_df(full_data, train_df):
    """Build one profile row per user from their rated movies.

    Parameters
    ----------
    full_data : DataFrame with a 'genres' column; defines the genre positions.
    train_df : DataFrame with 'userId', 'rating' and 'genres' columns (ratings
        joined with movie metadata).

    Returns
    -------
    DataFrame with columns:
        userId
        user_vector: array with one entry per genre, holding the user's mean
            rating of the movies in that genre (0 where the user rated none).
        avg_rating: the user's mean rating over all their rated movies.
        num_movies_rated: number of ratings by the user.
    Users appear in order of first appearance in ``train_df``.

    Side effects
    ------------
    The 'genres' column of BOTH input frames is parsed in place with
    ``literal_eval(str(x))``, as before.
    """
    full_data['genres'] = full_data.genres.apply(lambda x: literal_eval(str(x)))
    train_df['genres'] = train_df.genres.apply(lambda x: literal_eval(str(x)))

    # Genre -> vector position, in order of first appearance in full_data.
    unique_genre = full_data['genres'].explode().unique()
    genre_dict = {genre: position for position, genre in enumerate(unique_genre)}
    n_genres = len(genre_dict)

    columns = ['userId', 'user_vector', 'avg_rating', 'num_movies_rated']
    records = []
    # One pass over the groups instead of re-filtering the whole frame per
    # user; sort=False keeps the users in order of first appearance.
    for user_id, user_rating_df in train_df.groupby('userId', sort=False):
        rating_sum = np.zeros(n_genres)
        count_vector = np.zeros(n_genres)
        rating_total = 0

        for rating, genres in zip(user_rating_df['rating'], user_rating_df['genres']):
            rating_total += rating
            # Indicator per movie: a genre listed twice adds the rating once,
            # while the count below is incremented per occurrence.
            movie_vector = np.zeros(n_genres)
            for g in genres:
                movie_vector[genre_dict[g]] = 1
                count_vector[genre_dict[g]] += 1
            rating_sum += movie_vector * rating

        # Avoid division by zero for genres the user never rated.
        count_vector = np.where(count_vector == 0, 1, count_vector)
        num_rated = len(user_rating_df)
        records.append([user_id, rating_sum / count_vector,
                        rating_total / num_rated, num_rated])

    # Build the frame once; growing it with concat per user is quadratic.
    return pd.DataFrame(records, columns=columns)


In [11]:
def user_top_genre(userId, user_info, idx_to_genre, top_n=3):
    """Return a user's favourite genres, best first.

    Parameters
    ----------
    userId : id to look up in ``user_info['userId']``.
    user_info : DataFrame with 'userId' and 'user_vector' columns, where
        'user_vector' holds one score per genre position.
    idx_to_genre : mapping from genre position to genre name.
    top_n : number of genres to return (default 3).

    Returns
    -------
    List with the names of the ``top_n`` highest-scoring genres.

    Raises
    ------
    IndexError if ``userId`` has no row in ``user_info``.
    """
    matches = user_info.loc[user_info['userId'] == userId, 'user_vector'].values
    if len(matches) == 0:
        raise IndexError('userId {!r} not found in user_info'.format(userId))

    user_vec = matches[0]
    # argsort is ascending, so flip it to get the highest scores first.
    top_genre_indices = np.flip(np.argsort(user_vec))
    return [idx_to_genre[i] for i in top_genre_indices[:top_n]]
    

In [12]:
def get_title(x):
    '''Look up the title(s) for one row of the hybrid result.

    ``x`` is a row exposing 'movieId'. The lookup reads the notebook-level
    ``full_data`` frame and returns an array holding the title of every
    matching row (more than one if the movie id is duplicated there).
    '''
    same_movie = full_data['movieId'] == x['movieId']
    return full_data.loc[same_movie, 'title'].values

def get_genre(x):
    '''Look up the genres for one row of the hybrid result.

    ``x`` is a row exposing 'movieId'. The lookup reads the notebook-level
    ``full_data`` frame and returns an array holding the 'genres' cell of
    every matching row.
    '''
    same_movie = full_data['movieId'] == x['movieId']
    return full_data.loc[same_movie, 'genres'].values

In [13]:
def hybrid(userId, full_data, train_df, train_full, models,
           idx_to_genre, cosine_sim, indices):
    """Hybrid recommendation for a single user.

    Two candidate pools are scored with the weighted rating models:
    - "Similarity": movies most similar (content-based) to movies the user
      has rated in ``train_df``;
    - "Popularity": the most popular movies of the user's favourite genres.
    Movies the user already rated are removed and the ten candidates with the
    highest estimated rating are returned.

    Parameters
    ----------
    userId : user to recommend for.
    full_data : movie table with 'movieId', 'title', 'genres', 'popularity'.
    train_df : training ratings with 'userId' and 'movieId'.
    train_full : training ratings joined with movie metadata.
    models : dict mapping a fitted model wrapper to its weight; each wrapper
        exposes ``.algorithm.predict(userId, movieId)``.
    idx_to_genre : mapping from genre position to genre name.
    cosine_sim : movie-to-movie similarity matrix.
    indices : mapping from title to row position.

    Returns
    -------
    DataFrame with columns userId, movieId, est, Model, title, genre, sorted
    by 'est' descending, at most 10 rows.

    Raises
    ------
    IndexError if the user has no rows in ``train_full``.
    """
    columns = ['userId', 'movieId', 'est', 'Model']

    def _weighted_estimate(movie_id):
        # Weighted sum of the models' estimates; element [3] of a prediction
        # is the estimated rating.
        return sum(model.algorithm.predict(userId, movie_id)[3] * w
                   for model, w in models.items())

    watched = train_df[train_df['userId'] == userId]['movieId'].values.tolist()

    # Cap the number of seed movies to bound the work per user.
    # NOTE: no seed is set here, so the sample differs between runs.
    seed_movies = watched
    if len(seed_movies) > 30:
        seed_movies = np.random.choice(seed_movies, size=30, replace=False).tolist()

    # --- Similarity candidates: ten highest similarity scores over all seeds
    sim_frames = []
    for movie_id in seed_movies:
        titles = full_data.loc[full_data['movieId'] == movie_id, 'title'].values
        if len(titles) == 0:
            # Rated movie without metadata: nothing to compare against.
            continue
        sim_frames.append(get_recommendation_new(titles[0], full_data, indices, cosine_sim))
    if sim_frames:
        sim_movie_ids = (pd.concat(sim_frames, ignore_index=True)
                         .sort_values(by='sim_score', ascending=False)['movieId']
                         .head(10).values.tolist())
    else:
        sim_movie_ids = []

    sim_recs = pd.DataFrame(
        [[userId, movie_id, _weighted_estimate(movie_id), "Similarity"]
         for movie_id in sim_movie_ids],
        columns=columns)
    sim_recs = sim_recs[sim_recs['est'].notnull()].head(10).sort_values(by='est', ascending=False)

    # --- Popularity candidates from the user's favourite genres.
    # Only this user's rows are profiled; the result is the same as profiling
    # every user and picking this one, at a fraction of the cost.
    user_rows = train_full[train_full['userId'] == userId].copy()
    top_genre_list = user_top_genre(userId, make_useinfo_df(full_data, user_rows),
                                    idx_to_genre)

    popular_movies = []
    for top_genre in top_genre_list:
        popular_movies.extend(genre_based_popularity(top_genre, full_data))

    pop_recs = pd.DataFrame(
        [[userId, movie_id, _weighted_estimate(movie_id), 'Popularity']
         for movie_id in popular_movies],
        columns=columns)

    # --- Merge pools; on duplicate movie ids the Similarity row is kept.
    frames = [frame for frame in (sim_recs, pop_recs) if not frame.empty]
    if frames:
        recommend_list = pd.concat(frames, ignore_index=True)
    else:
        recommend_list = pd.DataFrame(columns=columns)
    recommend_list = recommend_list.drop_duplicates(subset=['movieId'])

    # Remove movies the user already rated in training.
    recommend_list = recommend_list[~recommend_list['movieId'].isin(watched)].copy()

    recommend_list['userId'] = recommend_list['userId'].astype(int)

    def _lookup(column):
        # Read from the full_data ARGUMENT (not a notebook global). Each cell
        # holds an array with the value of every matching metadata row.
        values = [full_data.loc[full_data['movieId'] == movie_id, column].values
                  for movie_id in recommend_list['movieId']]
        return pd.Series(values, index=recommend_list.index, dtype=object)

    recommend_list['title'] = _lookup('title')
    recommend_list['genre'] = _lookup('genres')

    return recommend_list.sort_values(by='est', ascending=False).head(10)


In [14]:
def hybrid_predict(full_data, train_df, test_df, models, idx_to_genre):
    """Run the hybrid recommender for every user in the test set.

    Parameters
    ----------
    full_data : movie feature table.
    train_df : training ratings ('userId', 'movieId', ...).
    test_df : held-out ratings; only its 'userId' column is used here.
    models : dict mapping a fitted model wrapper to its weight.
    idx_to_genre : mapping from genre position to genre name.

    Returns
    -------
    DataFrame with columns userId, movieId, est, Model holding the
    recommendations of all users, in order of first appearance in
    ``test_df``.
    """
    indices = mapping_title_toIndex(full_data)
    cosine_sim = cosine_similarity(full_data)
    train_full = pd.merge(full_data, train_df, on='movieId')

    cols = ['userId', 'movieId', 'est', 'Model']

    # Users without any training rating joined to metadata cannot be
    # profiled; hybrid() would raise for them, so they are skipped.
    known_users = set(train_full['userId'].unique())

    results = []
    for user in test_df.userId.unique():
        if user not in known_users:
            continue
        hybrid_result = hybrid(user, full_data, train_df, train_full, models,
                               idx_to_genre, cosine_sim, indices)
        if not hybrid_result.empty:
            results.append(hybrid_result[cols])

    if not results:
        return pd.DataFrame(columns=cols)
    # Concatenate once; growing the frame inside the loop is quadratic.
    return pd.concat(results, ignore_index=True)

    

# DataFrame

**Full Dataset**

In [15]:
# Preview the first two rows of the movie feature table.
# NOTE(review): the saved output lists a 'description_genre' column, which in
# this notebook is only added by cosine_similarity — the output appears to
# come from a run in a different cell order; confirm with Restart & Run All.
full_data.head(2)

Unnamed: 0,id,movieId,title,genres,description,cast,director,keywords,popularity,vote_average,vote_count,year,wr,spoken_languages,description_genre
0,862,1,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...","[tomhanks, timallen, donrickles]",johnlasseter,"[jealousi, toy, boy, friendship, friend, rival...",21.946943,7,5415,1995,6.86977,[English],"Led by Woody, Andy's toys live happily in his ..."
1,8844,2,Jumanji,"['Adventure', 'Fantasy', 'Family']",When siblings Judy and Peter discover an encha...,"[robinwilliams, jonathanhyde, kirstendunst]",joejohnston,"[boardgam, disappear, basedonchildren'sbook, n...",17.015539,6,2413,1995,5.884891,"[English, Français]",When siblings Judy and Peter discover an encha...


In [16]:
# Keep only the columns used by the description/genre model. The explicit
# copy matters: later cells assign columns on this frame in place, which on a
# bare slice triggers pandas' SettingWithCopy behaviour.
full_data = full_data[['movieId', 'title', 'genres', 'description', 'popularity']].copy()

**Train_df, Test_df**

In [17]:
# Hold out 25% of the ratings, stratified by user, with a fixed seed.
train_df, test_df = train_test_split(
    rating_df,
    test_size=0.25,
    stratify=rating_df['userId'],
    random_state=42,
)
# Training ratings joined with the movie metadata (inner join on movieId).
train_full = full_data.merge(train_df, on='movieId')

# Drop training ratings of movies that have no metadata; the two shapes show
# how many rows that removes.
print(train_df.shape)
has_metadata = train_df['movieId'].isin(full_data['movieId'])
train_df = train_df[has_metadata]
print(train_df.shape)

(75003, 4)
(74866, 4)


In [18]:
# Preview the training ratings.
train_df.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
59679,431,2863,4.0,1165548515
85061,571,7173,2.0,1334343358


In [19]:
# Preview the held-out test ratings.
test_df.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
42031,302,593,5.0,843720636
26084,191,110,3.0,839925631


In [20]:
# Preview the training ratings joined with movie metadata.
train_full.head(2)

Unnamed: 0,movieId,title,genres,description,popularity,userId,rating,timestamp
0,1,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...",21.946943,534,5.0,973376852
1,1,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...",21.946943,219,5.0,974475264


**User_info**

In [21]:
# Per-user profile (genre vector, average rating, number of ratings) built
# from the training ratings joined with metadata.
# Note: make_useinfo_df also parses the 'genres' column of both frames in place.
user_info = make_useinfo_df(full_data, train_full)

In [22]:
# Preview the first three user profiles.
user_info.head(3)

Unnamed: 0,userId,user_vector,avg_rating,num_movies_rated
0,534,"[4.545454545454546, 3.140625, 3.8, 3.960784313...",3.651982,227
1,219,"[3.5, 3.8095238095238093, 3.5789473684210527, ...",3.682692,104
2,63,"[4.333333333333333, 3.5, 3.9375, 4.02083333333...",3.787671,73


# Algorithms

**knnbaseline_algo, svdpp_algo, BaselineOnly**

In [23]:
# train_set, test_set = train_test_split(train_df, test_size=0.25,\
                                    # stratify=train_df['userId'], random_state=42)

In [21]:
# KNNBaseline collaborative filter wrapped by the project's CollaborativeFiltering class.
# Steps must run in this order: load the train/test ratings, fit, then predict.
# The value displayed for predict() is a tuple: (precision, recall, F1 score).

knn_base_line = CF.CollaborativeFiltering(KNNBaseline())
knn_base_line.load_data(train_df, test_df)
knn_base_line.fit()
knn_base_line.predict()

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9038
MAE:  0.6921


(0.751916774772597, 0.6784242547818156, 0.713282450887898)

In [22]:
# SVD++ collaborative filter, loaded with train_df / test_df, then fitted and evaluated.
svdpp = CF.CollaborativeFiltering(SVDpp())
svdpp.load_data(train_df, test_df)
svdpp.fit()
svdpp.predict()

RMSE: 0.8908
MAE:  0.6828


(0.7294449021058556, 0.6936404660478344, 0.7110922692000845)

In [23]:
# BaselineOnly model, loaded with train_df / test_df, then fitted and evaluated.
base_line_only = CF.CollaborativeFiltering(BaselineOnly())
base_line_only.load_data(train_df, test_df)
base_line_only.fit()
base_line_only.predict()

Estimating biases using als...
RMSE: 0.8997
MAE:  0.6954


(0.738041092366959, 0.6947553609459112, 0.7157443813246984)

**cosine_similarity, indices**

In [27]:
# Movie-to-movie similarity matrix; rows/columns follow the row order of full_data.
cosine_sim = cosine_similarity(full_data)

In [28]:
# Title -> row lookup used to address rows of cosine_sim.
indices = mapping_title_toIndex(full_data)

In [29]:
# Genre -> vector position, listed in position order (None is a valid key).
# NOTE(review): these positions presumably mirror the genre order that
# make_useinfo_df derives from full_data — confirm they stay in sync.
genre_to_idx = {
    genre: position
    for position, genre in enumerate([
        'Animation', 'Comedy', 'Family', 'Adventure', 'Fantasy', 'Romance',
        'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'History',
        'Science Fiction', 'Mystery', 'War', 'Foreign', 'Music',
        'Documentary', None, 'Western', 'TV Movie',
    ])
}

In [25]:
# Position -> genre lookup used to decode user genre vectors. Derived from
# genre_to_idx so the two mappings cannot drift apart; the result is the same
# dictionary that was previously written out by hand.
idx_to_genre = {position: genre for genre, position in genre_to_idx.items()}

# Modeling 

**Movie Similarity model**

In [32]:
# Content-based neighbours of 'Toy Story' with their similarity scores.
get_recommendation_new('Toy Story', full_data,indices, cosine_sim)

Unnamed: 0,movieId,sim_score
2522,3114,0.287568
7629,78499,0.232597
6267,35836,0.123023
8432,103335,0.107159
2751,3429,0.105082


**Popularity Model**

In [33]:
# Favourite genres of user 1: the genres with the highest per-genre mean
# rating in that user's profile vector.
user_top_gen = user_top_genre(1, user_info, idx_to_genre)
user_top_gen

['Romance', 'Horror', 'Western']

In [34]:
# Ids of the most popular movies for a single genre ('Horror').
popularity = genre_based_popularity('Horror', full_data)
popularity

[1219, 112818, 103249]

**Hybrid Model**

In [36]:
# Blend the three fitted collaborative-filtering models; the values are the
# weights applied to each model's estimated rating.
models = {svdpp: 0.4, base_line_only: 0.4, knn_base_line: 0.2}

# Hybrid recommendations for user 1.
hybrid_result = hybrid(1, full_data, train_df, train_full, models, \
                       idx_to_genre, cosine_sim, indices)
hybrid_result


Unnamed: 0,userId,movieId,est,Model,title,genre
13,1,1219,3.432222,Popularity,[Psycho],"[[Drama, Horror, Thriller]]"
10,1,356,3.333278,Popularity,[Forrest Gump],"[[Comedy, Drama, Romance]]"
18,1,99114,3.239684,Popularity,[Django Unchained],"[[Drama, Western]]"
17,1,128360,3.220401,Popularity,[The Hateful Eight],"[[Crime, Drama, Mystery, Western]]"
16,1,139385,3.10586,Popularity,[The Revenant],"[[Western, Drama, Adventure, Thriller]]"
0,1,1344,3.005339,Similarity,[Cape Fear],"[[Drama, Thriller]]"
11,1,152017,2.893007,Popularity,"[Me Before You, Me Before You]","[[Drama, Romance], [Drama, Romance]]"
1,1,5489,2.889955,Similarity,[Nosferatu the Vampyre],"[[Drama, Horror]]"
2,1,147426,2.858733,Similarity,[İtirazım Var],"[[Drama, Action, Crime]]"
3,1,6014,2.851651,Similarity,[National Security],"[[Action, Comedy, Crime, Thriller]]"


# Evaluate model 

In [46]:
# Hybrid recommendations for every user that appears in the test set.
pred_df = hybrid_predict(full_data, train_df, test_df, models, idx_to_genre)

In [47]:
# Compare recommended (userId, movieId) pairs with the held-out test pairs;
# the metric itself is defined by the project's evaluate() helper.
evaluate(pred_df[['userId', 'movieId']], test_df[['userId', 'movieId']])
# Recorded result for the description + genre embedding: 0.041463818512998844

0.041463818512998844

# Try embedding with other features

In [29]:
def cosine_similarity(full_data):
    '''Compute pairwise movie similarity from keywords, cast, director and genres.

    Each movie's token list is keywords + cast + director (repeated three
    times so it weighs more) + genres, joined into one text and vectorised
    with word-level TF-IDF (unigrams and bigrams, English stop words
    removed).

    This redefines the description-based ``cosine_similarity`` from earlier
    in the notebook (and, like it, shadows the sklearn import of that name).

    The token lists are built in local Series, so ``full_data`` is not
    modified and the function can be called repeatedly on the same frame.

    Parameters
    ----------
    full_data : DataFrame whose 'keywords', 'cast' and 'genres' cells are
        lists and whose 'director' cells are single values.

    Returns
    -------
    Square matrix whose entry [i, j] is the similarity between the movies at
    positional rows i and j of ``full_data``.
    '''
    # Repeat the director locally; overwriting the column would nest the
    # lists on a second call and break the join below.
    director = full_data['director'].apply(lambda name: [name] * 3)
    soup = full_data['keywords'] + full_data['cast'] + director + full_data['genres']

    # Skip missing tokens (e.g. an unknown director) instead of failing in join.
    soup = soup.apply(lambda tokens: ' '.join(str(t) for t in tokens if pd.notnull(t)))

    tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
    tfidf_matrix = tfidf.fit_transform(soup)
    # TfidfVectorizer L2-normalises rows by default, so the plain dot product
    # computed by linear_kernel is the cosine similarity.
    return linear_kernel(tfidf_matrix, tfidf_matrix)

In [31]:
# Reload the complete feature table: the earlier column subset dropped the
# keywords, cast and director columns that the new cosine_similarity reads.
full_data = movie_feature(movies_metadata, links_small, credits_, keywords)

# Same model blend and weights as for the first embedding.
models = {svdpp: 0.4, base_line_only: 0.4, knn_base_line: 0.2}

# hybrid_predict looks cosine_similarity up at call time, so this run uses the
# definition from the cell above.
pred_df = hybrid_predict(full_data, train_df, test_df, models, idx_to_genre)

In [32]:
# Same evaluation as before, now for the keywords/cast/director/genres embedding.
evaluate(pred_df[['userId', 'movieId']], test_df[['userId', 'movieId']])

0.03507203179334327