In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
data = pd.read_csv(r'C:\Users\Admin\Downloads\References\23-24-1\thesis\Movie-Recommendation-System\data\train_set.csv', )
trainset, testset = train_test_split(data, test_size=0.2, random_state = 42, stratify=data['userId'])

trainset

Unnamed: 0,userId,movieId,rating,timestamp
27282,544,4963,4.5,1435790757
28513,99,5952,4.0,1044786919
956,396,29,4.0,839256240
44694,150,435,2.5,1114309597
26806,41,6333,4.0,1109812953
...,...,...,...,...
20943,55,79,3.0,855927172
71762,572,60072,3.0,1436779892
14928,134,1584,3.5,1361245898
1642,533,3196,4.0,965316395


In [2]:

import numpy as np
import pandas as pd



from surprise import Dataset, Reader
from surprise import NMF, SVD
# from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.dataset import DatasetAutoFolds

from collections import defaultdict



In [3]:
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader

In [4]:

def precision_recall_at_k(test_df, threshold):
    """Return recall and precision, F-1 scrore for collaborative + hybrid

    Args:
        test_df: prediction dataframe, with 4 columns: userId, movieId, true_rating, pred_rating
        threshold: if rating > threshold, movie is believed to be relevant

    Returns:
    Recall: Proportion of relevant items that are recommended, dict-like
    Precision: Proportion of recommended items that are relevant, dict-like
        Movie is relevant if true_rating > threshold
        Movie is recommend when pred_rating > threshold
    """
    recalls = dict()
    precisions = dict()

    for userId, group in test_df.groupby('userId'):

        filter_rel = group[group['rating'] > threshold]
        filter_rec = group[group['pred_rating'] > threshold]
        filter_rel_rec = group[(group['pred_rating'] > threshold) & \
                               (group['rating'] > threshold)]

        # Number of relevant items
        n_rel = len(filter_rel)

        # Number of recommended items in top k
        n_rec = len(filter_rec)

        # Number of relevant and recommended items in top k
        n_rel_rec = len(filter_rel_rec)

        recalls[userId] = n_rel_rec/n_rel if n_rel != 0 else 1
        precisions[userId] = n_rel_rec/n_rec if n_rec != 0 else 1

    precision = sum(prec for prec in precisions.values())/len(precisions)
    recall = sum(rec for rec in recalls.values())/len(recalls)
    fmeasure = (2*precision*recall)/(precision + recall)

    return recall, precision, fmeasure
    

In [5]:
class CollaborativeFiltering:
    def __init__(self, algorithm=None):
        self.trainset = None
        self.testset = None
        self.algorithm = algorithm
        self.predictions = None
        self.test_df = None
        
    def load_data(self, trainset, testset):
        reader = Reader()
  
        train = Dataset.load_from_df(trainset[['userId', 'movieId', 'rating']], reader)
        test = Dataset.load_from_df(testset[['userId', 'movieId', 'rating']], reader)
        
        self.trainset = train.build_full_trainset()
        full_testset = test.build_full_trainset()
        self.testset = full_testset.build_testset()
        
    def fit_predict(self):
    
        self.algorithm.fit(self.trainset)
        predictions_test = self.algorithm.test(self.testset)
        self.predictions = predictions_test
        rmse = accuracy.rmse(predictions_test)
        mae = accuracy.mae(predictions_test)

        test_df = pd.DataFrame(self.predictions).drop(columns='details')
        test_df.columns = ['userId', 'movieId', 'rating', 'pred_rating']
        self.test_df = test_df
        pre, recall, f_measure = precision_recall_at_k(test_df, 3.5)
        return rmse, mae, pre, recall, f_measure



In [6]:
svd = CollaborativeFiltering(SVD())
svd.load_data(trainset, testset)
svd.fit_predict()

RMSE: 0.9010
MAE:  0.6977


(0.901029533082979,
 0.6977321556357086,
 0.735032322112937,
 0.7119547542504887,
 0.7233095095379379)

In [7]:
# Chọn SVD

In [8]:
nmf = CollaborativeFiltering(NMF())
nmf.load_data(trainset, testset)
nmf.fit_predict()



RMSE: 0.9605
MAE:  0.7389


(0.9605113062753561,
 0.7389200315898368,
 0.6961718488296731,
 0.7023542123634607,
 0.6992493656317427)

In [9]:
svd_pp = CollaborativeFiltering(SVDpp())
svd_pp.load_data(trainset, testset)
svd_pp.fit_predict()

RMSE: 0.8936
MAE:  0.6875


(0.8936036762831866,
 0.687500726527451,
 0.736499964070086,
 0.7242152599765129,
 0.7303059544680887)

In [10]:
# Chọn SVD_pp

In [11]:
base = CollaborativeFiltering(BaselineOnly())
base.load_data(trainset, testset)
base.fit_predict()

Estimating biases using als...
RMSE: 0.8959
MAE:  0.6948


(0.8959359082434771,
 0.6947684945619227,
 0.7451798526402156,
 0.7125896660760395,
 0.7285204629976084)

In [12]:
# Chọn Baseline only

In [13]:
slope_one = CollaborativeFiltering(SlopeOne())
slope_one.load_data(trainset, testset)
slope_one.fit_predict()

RMSE: 0.9401
MAE:  0.7225


(0.9400987538992951,
 0.7224915394597742,
 0.720229979193013,
 0.7062908259511104,
 0.7131922998173031)

In [14]:
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore

In [15]:
knn_base = CollaborativeFiltering(KNNBaseline())
knn_base.load_data(trainset, testset)
knn_base.fit_predict()

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9064
MAE:  0.6960


(0.9064242777503783,
 0.6959879814186629,
 0.7676489360995878,
 0.6940766250240114,
 0.7290112411548947)

In [16]:
# KNN baseline tạm ổn

In [17]:
KNN_basic = CollaborativeFiltering(KNNBasic())
KNN_basic.load_data(trainset, testset)
KNN_basic.fit_predict()

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9822
MAE:  0.7561


(0.9821644552243276,
 0.7560629093594452,
 0.763898630304965,
 0.6660556172932007,
 0.7116297246737961)

In [18]:
KNN_means = CollaborativeFiltering(KNNWithMeans())
KNN_means.load_data(trainset, testset)
KNN_means.fit_predict()

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9279
MAE:  0.7117


(0.9278605725076455,
 0.7116961796504687,
 0.7437416289554162,
 0.7025126660306112,
 0.7225394820355117)

In [19]:
KNN_Z = CollaborativeFiltering(KNNWithZScore())
KNN_Z.load_data(trainset, testset)
KNN_Z.fit_predict()

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9270
MAE:  0.7068


(0.9270370628606168,
 0.7068406602644297,
 0.7467526909536043,
 0.6982505287417268,
 0.7216876117516187)

In [20]:
df = svd.test_df
df.columns = ['userId',	'movieId',	'rating',	'svd_rating']
df['svd_pp'] = svd_pp.test_df['pred_rating']
df['base'] = base.test_df['pred_rating']
df['knn_base'] = knn_base.test_df['pred_rating']
df.head()

Unnamed: 0,userId,movieId,rating,svd_rating,svd_pp,base,knn_base
0,212,916,3.5,3.507236,4.1831,3.525547,3.867603
1,212,5459,2.0,2.754342,2.717447,2.904066,2.857219
2,212,172,2.0,2.37652,2.151325,2.382521,2.126295
3,212,2421,2.0,2.623369,2.44099,2.563658,2.292795
4,212,3988,3.0,2.819597,2.676378,2.828012,2.730163


In [21]:
# Combine

In [22]:
import math

In [23]:

rmse = ((df.rating - df.svd_pp) ** 2).mean() ** .5
print(rmse)
mae = (((df.rating - df.svd_pp) ** 2) ** .5).mean()
print(mae)

0.8936036762831866
0.687500726527451


In [24]:
T = df.shape[0]
print(T)

15001


In [25]:

svd_wt = 0.25
knn_base_wt = 0.25
svdpp_wt = 0.3
base_wt = 0.2


In [26]:



combined_df = svd.test_df[['userId',	'movieId',	'rating']]
combined_df['pred_rating']  = svd_wt*df['svd_rating'] + svdpp_wt*df['svd_pp'] + knn_base_wt*df['knn_base'] + base_wt*df['base']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['pred_rating']  = svd_wt*df['svd_rating'] + svdpp_wt*df['svd_pp'] + knn_base_wt*df['knn_base'] + base_wt*df['base']


In [27]:
combined_df

Unnamed: 0,userId,movieId,rating,pred_rating
0,212,916,3.5,3.803749
1,212,5459,2.0,2.798938
2,212,172,2.0,2.247606
3,212,2421,2.0,2.474069
4,212,3988,3.0,2.755956
...,...,...,...,...
14996,100,745,4.0,4.180575
14997,100,708,3.0,3.277577
14998,76,2087,3.5,4.029178
14999,76,3671,3.5,4.052200


In [45]:

precision_recall_at_k(combined_df, 3.5)

(0.7515509954249983, 0.7202266518132968, 0.7355554802963483)

In [46]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

rmse = mean_squared_error(combined_df['rating'], combined_df['pred_rating'], squared=False)

mae = mean_absolute_error(combined_df['rating'], combined_df['pred_rating'])
rmse, mae

(0.8826253410448964, 0.6811048935546885)

## Evaluate

In [47]:
valid = pd.read_csv(r'C:\Users\Admin\Downloads\References\23-24-1\thesis\Movie-Recommendation-System\data\test_set.csv')
data = pd.read_csv(r'C:\Users\Admin\Downloads\References\23-24-1\thesis\Movie-Recommendation-System\data\train_set.csv')

In [48]:
from collections import defaultdict
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [49]:
reader = Reader()

train = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)

trainset = train.build_full_trainset()
anti_train = trainset.build_anti_testset()

In [50]:
model = SVD()
model.fit(trainset)
predictions = model.test(anti_train)

In [51]:
predict_svd = pd.DataFrame(predictions).sort_values(by='est', ascending=False)
predict_svd

Unnamed: 0,uid,iid,r_ui,est,details
1204613,652,527,3.544645,5.0,{'was_impossible': False}
4253715,46,55820,3.544645,5.0,{'was_impossible': False}
4253746,46,428,3.544645,5.0,{'was_impossible': False}
3913275,298,7123,3.544645,5.0,{'was_impossible': False}
1359342,113,74458,3.544645,5.0,{'was_impossible': False}
...,...,...,...,...,...
4709593,581,48,3.544645,1.0,{'was_impossible': False}
4709556,581,519,3.544645,1.0,{'was_impossible': False}
4709548,581,372,3.544645,1.0,{'was_impossible': False}
4709544,581,2471,3.544645,1.0,{'was_impossible': False}


In [52]:
top_10 = predict_svd.groupby('uid', sort=False).head(10)
top_10 = top_10[['uid', 'iid']]
top_10.columns = ['userId', 'movieId']
top_10.head()

Unnamed: 0,userId,movieId
1204613,652,527
4253715,46,55820
4253746,46,428
3913275,298,7123
1359342,113,74458


In [53]:
valid_set = valid[['userId', 'movieId']]

In [54]:
valid_set

Unnamed: 0,userId,movieId
0,302,593
1,191,110
2,457,5337
3,239,1042
4,292,5377
...,...,...
24996,353,34319
24997,428,743
24998,595,1097
24999,664,61248


In [55]:
def check_movieId(pred_df, val_df):
    result = pred_df['movieId'].isin(val_df[val_df['userId'] == \
                                   pred_df['userId'].iloc[0]]['movieId'])
    return result.reset_index(drop=True)

def evaluate(pred_df, val_df):
    """ Proportion of movies recommended that were actually watched by users

    Args:
    pred_df: dataframe, 2 columns: userId, movieId
    val_df: dataframe, 2 columns: userId, movieId

    Returns:
    
    """
    result = pred_df.groupby('userId').apply(lambda x: check_movieId(x, val_df))
    n_user = val_df.userId.nunique()
    top_k = pred_df.groupby('userId').count().iloc[0]
    top_k = int(top_k.iloc[0])

    print(result.sum() / (n_user*top_k))
    return result.sum() / (n_user*top_k)

In [56]:
top_10 = top_10.reset_index()

In [57]:
top_10

Unnamed: 0,index,userId,movieId
0,1204613,652,527
1,4253715,46,55820
2,4253746,46,428
3,3913275,298,7123
4,1359342,113,74458
...,...,...,...
6705,4710894,581,3462
6706,4708953,581,4306
6707,4708362,581,48516
6708,4709118,581,904


In [58]:
evaluate(top_10, valid_set)

movieId
0    0.009985
1    0.007601
2    0.006110
3    0.005365
4    0.005067
5    0.006855
6    0.006259
7    0.004471
8    0.003875
9    0.004173
dtype: float64


movieId
0    0.009985
1    0.007601
2    0.006110
3    0.005365
4    0.005067
5    0.006855
6    0.006259
7    0.004471
8    0.003875
9    0.004173
dtype: float64

In [60]:
top_10

Unnamed: 0,index,userId,movieId
0,1204613,652,527
1,4253715,46,55820
2,4253746,46,428
3,3913275,298,7123
4,1359342,113,74458
...,...,...,...
6705,4710894,581,3462
6706,4708953,581,4306
6707,4708362,581,48516
6708,4709118,581,904


In [61]:
valid_set

Unnamed: 0,userId,movieId
0,302,593
1,191,110
2,457,5337
3,239,1042
4,292,5377
...,...,...
24996,353,34319
24997,428,743
24998,595,1097
24999,664,61248
