In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
data = pd.read_csv(r'C:\Users\Admin\Downloads\References\23-24-1\thesis\Movie-Recommendation-System\data\train_set.csv', )
trainset, testset = train_test_split(data, test_size=0.2, random_state = 42, stratify=data['userId'])

trainset

Unnamed: 0,userId,movieId,rating,timestamp
27282,544,4963,4.5,1435790757
28513,99,5952,4.0,1044786919
956,396,29,4.0,839256240
44694,150,435,2.5,1114309597
26806,41,6333,4.0,1109812953
...,...,...,...,...
20943,55,79,3.0,855927172
71762,572,60072,3.0,1436779892
14928,134,1584,3.5,1361245898
1642,533,3196,4.0,965316395


In [2]:

import numpy as np
import pandas as pd



from surprise import Dataset, Reader
from surprise import NMF, SVD
# from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.dataset import DatasetAutoFolds

from collections import defaultdict



In [3]:
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader

In [4]:

def precision_recall_at_k(test_df, threshold):
    """Return recall and precision, F-1 scrore for collaborative + hybrid

    Args:
        test_df: prediction dataframe, with 4 columns: userId, movieId, true_rating, pred_rating
        threshold: if rating > threshold, movie is believed to be relevant

    Returns:
    Recall: Proportion of relevant items that are recommended, dict-like
    Precision: Proportion of recommended items that are relevant, dict-like
        Movie is relevant if true_rating > threshold
        Movie is recommend when pred_rating > threshold
    """
    recalls = dict()
    precisions = dict()

    for userId, group in test_df.groupby('userId'):

        filter_rel = group[group['rating'] > threshold]
        filter_rec = group[group['pred_rating'] > threshold]
        filter_rel_rec = group[(group['pred_rating'] > threshold) & \
                               (group['rating'] > threshold)]

        # Number of relevant items
        n_rel = len(filter_rel)

        # Number of recommended items in top k
        n_rec = len(filter_rec)

        # Number of relevant and recommended items in top k
        n_rel_rec = len(filter_rel_rec)

        recalls[userId] = n_rel_rec/n_rel if n_rel != 0 else 1
        precisions[userId] = n_rel_rec/n_rec if n_rec != 0 else 1

    precision = sum(prec for prec in precisions.values())/len(precisions)
    recall = sum(rec for rec in recalls.values())/len(recalls)
    fmeasure = (2*precision*recall)/(precision + recall)

    return recall, precision, fmeasure
    

In [5]:
class CollaborativeFiltering:
    def __init__(self, algorithm=None):
        self.trainset = None
        self.testset = None
        self.algorithm = algorithm
        self.predictions = None
        self.test_df = None
        
    def load_data(self, trainset, testset):
        reader = Reader()
  
        train = Dataset.load_from_df(trainset[['userId', 'movieId', 'rating']], reader)
        test = Dataset.load_from_df(testset[['userId', 'movieId', 'rating']], reader)
        
        self.trainset = train.build_full_trainset()
        full_testset = test.build_full_trainset()
        self.testset = full_testset.build_testset()
        
    def fit_predict(self):
    
        self.algorithm.fit(self.trainset)
        predictions_test = self.algorithm.test(self.testset)
        self.predictions = predictions_test
        rmse = accuracy.rmse(predictions_test)
        mae = accuracy.mae(predictions_test)

        test_df = pd.DataFrame(self.predictions).drop(columns='details')
        test_df.columns = ['userId', 'movieId', 'rating', 'pred_rating']
        self.test_df = test_df
        pre, recall, f_measure = precision_recall_at_k(test_df, 4)
        return rmse, mae, pre, recall, f_measure



In [6]:
svd = CollaborativeFiltering(SVD())
svd.load_data(trainset, testset)
svd.fit_predict()

RMSE: 0.9036
MAE:  0.6986


(0.9035793665407348,
 0.6985507969055574,
 0.4862125448466086,
 0.625165301118886,
 0.5470024674511051)

In [7]:
# Chọn SVD

In [8]:
nmf = CollaborativeFiltering(NMF())
nmf.load_data(trainset, testset)
nmf.fit_predict()



RMSE: 0.9636
MAE:  0.7435


(0.9636161866368311,
 0.7435195828175822,
 0.5159494541157457,
 0.569150338954535,
 0.5412457150369476)

In [9]:
svd_pp = CollaborativeFiltering(SVDpp())
svd_pp.load_data(trainset, testset)
svd_pp.fit_predict()

RMSE: 0.8924
MAE:  0.6875


(0.8924291286971074,
 0.6874550651171686,
 0.5121456154979761,
 0.6225867150333924,
 0.5619916658624284)

In [10]:
# Chọn SVD_pp

In [11]:
base = CollaborativeFiltering(BaselineOnly())
base.load_data(trainset, testset)
base.fit_predict()

Estimating biases using als...
RMSE: 0.8959
MAE:  0.6948


(0.8959359082434771,
 0.6947684945619227,
 0.4439966387653323,
 0.6654748047585647,
 0.5326294393974716)

In [12]:
# Chọn Baseline only

In [13]:
slope_one = CollaborativeFiltering(SlopeOne())
slope_one.load_data(trainset, testset)
slope_one.fit_predict()

RMSE: 0.9401
MAE:  0.7225


(0.9400987538992951,
 0.7224915394597742,
 0.5437412750177517,
 0.5476972842799368,
 0.5457121102075829)

In [15]:
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore

In [16]:
knn_base = CollaborativeFiltering(KNNBaseline())
knn_base.load_data(trainset, testset)
knn_base.fit_predict()

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9064
MAE:  0.6960


(0.9064242777503783,
 0.6959879814186629,
 0.5396098729617446,
 0.5639506634707828,
 0.5515118307073799)

In [17]:
# KNN baseline tạm ổn

In [18]:
KNN_basic = CollaborativeFiltering(KNNBasic())
KNN_basic.load_data(trainset, testset)
KNN_basic.fit_predict()

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9822
MAE:  0.7561


(0.9821644552243276,
 0.7560629093594452,
 0.5078568001439105,
 0.5318497492159886,
 0.5195764361789659)

In [19]:
KNN_means = CollaborativeFiltering(KNNWithMeans())
KNN_means.load_data(trainset, testset)
KNN_means.fit_predict()

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9279
MAE:  0.7117


(0.9278605725076455,
 0.7116961796504687,
 0.5404972762656267,
 0.5925479536533269,
 0.5653270435272965)

In [20]:
KNN_Z = CollaborativeFiltering(KNNWithZScore())
KNN_Z.load_data(trainset, testset)
KNN_Z.fit_predict()

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9270
MAE:  0.7068


(0.9270370628606168,
 0.7068406602644297,
 0.560540829377121,
 0.5908486795214486,
 0.5752958599946838)

In [21]:
df = svd.test_df
df.columns = ['userId',	'movieId',	'rating',	'svd_rating']
df['svd_pp'] = svd_pp.test_df['pred_rating']
df['base'] = base.test_df['pred_rating']
df['knn_base'] = knn_base.test_df['pred_rating']
df.head()

Unnamed: 0,userId,movieId,rating,svd_rating,svd_pp,base,knn_base
0,212,916,3.5,3.459605,3.671065,3.525547,3.867603
1,212,5459,2.0,2.911499,3.117193,2.904066,2.857219
2,212,172,2.0,2.426523,2.163626,2.382521,2.126295
3,212,2421,2.0,2.800626,2.649285,2.563658,2.292795
4,212,3988,3.0,2.739088,2.623174,2.828012,2.730163


In [22]:
# Combine

In [23]:
import math

In [24]:

rmse = ((df.rating - df.svd_pp) ** 2).mean() ** .5
print(rmse)
mae = (((df.rating - df.svd_pp) ** 2) ** .5).mean()
print(mae)

0.8924291286971074
0.6874550651171686


In [25]:
T = df.shape[0]
print(T)

15001


In [26]:

svd_wt = 0.25
knn_base_wt = 0.25
svdpp_wt = 0.3
base_wt = 0.2


In [27]:



combined_df = svd.test_df[['userId',	'movieId',	'rating']]
combined_df['pred_rating']  = svd_wt*df['svd_rating'] + svdpp_wt*df['svd_pp'] + knn_base_wt*df['knn_base'] + base_wt*df['base']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['pred_rating']  = svd_wt*df['svd_rating'] + svdpp_wt*df['svd_pp'] + knn_base_wt*df['knn_base'] + base_wt*df['base']


In [28]:
combined_df

Unnamed: 0,userId,movieId,rating,pred_rating
0,212,916,3.5,3.638231
1,212,5459,2.0,2.958151
2,212,172,2.0,2.263796
3,212,2421,2.0,2.580872
4,212,3988,3.0,2.719868
...,...,...,...,...
14996,100,745,4.0,4.220742
14997,100,708,3.0,3.233489
14998,76,2087,3.5,4.031782
14999,76,3671,3.5,4.077845


In [46]:

precision_recall_at_k(combined_df, 4)

(0.500513768889793, 0.651991609198833, 0.5662980560591095)

In [47]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

rmse = mean_squared_error(combined_df['rating'], combined_df['pred_rating'], squared=False)

mae = mean_absolute_error(combined_df['rating'], combined_df['pred_rating'])
rmse, mae

(0.8832231866912136, 0.6814622438213048)

## Evaluate

In [48]:
valid = pd.read_csv(r'C:\Users\Admin\Downloads\References\23-24-1\thesis\Movie-Recommendation-System\data\test_set.csv')
data = pd.read_csv(r'C:\Users\Admin\Downloads\References\23-24-1\thesis\Movie-Recommendation-System\data\train_set.csv')

In [49]:
from collections import defaultdict
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [50]:
reader = Reader()

train = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)

trainset = train.build_full_trainset()
anti_train = trainset.build_anti_testset()

In [51]:
model = SVD()
model.fit(trainset)
predictions = model.test(anti_train)

In [52]:
predict_svd = pd.DataFrame(predictions).sort_values(by='est', ascending=False)
predict_svd

Unnamed: 0,uid,iid,r_ui,est,details
4256415,46,3168,3.544645,5.0,{'was_impossible': False}
1834114,235,904,3.544645,5.0,{'was_impossible': False}
3913363,298,905,3.544645,5.0,{'was_impossible': False}
1204950,652,750,3.544645,5.0,{'was_impossible': False}
1334297,656,1266,3.544645,5.0,{'was_impossible': False}
...,...,...,...,...,...
4709536,581,3826,3.544645,1.0,{'was_impossible': False}
1321628,133,1831,3.544645,1.0,{'was_impossible': False}
4708941,581,3755,3.544645,1.0,{'was_impossible': False}
2797073,609,2735,3.544645,1.0,{'was_impossible': False}


In [53]:
top_10 = predict_svd.groupby('uid', sort=False).head(10)
top_10 = top_10[['uid', 'iid']]
top_10.columns = ['userId', 'movieId']
top_10.head()

Unnamed: 0,userId,movieId
4256415,46,3168
1834114,235,904
3913363,298,905
1204950,652,750
1334297,656,1266


In [54]:
valid_set = valid[['userId', 'movieId']]

In [55]:
valid_set

Unnamed: 0,userId,movieId
0,302,593
1,191,110
2,457,5337
3,239,1042
4,292,5377
...,...,...
24996,353,34319
24997,428,743
24998,595,1097
24999,664,61248


In [56]:
def check_movieId(pred_df, val_df):
    result = pred_df['movieId'].isin(val_df[val_df['userId'] == \
                                   pred_df['userId'].iloc[0]]['movieId'])
    return result.reset_index(drop=True)

def evaluate(pred_df, val_df):
    """ Proportion of movies recommended that were actually watched by users

    Args:
    pred_df: dataframe, 2 columns: userId, movieId
    val_df: dataframe, 2 columns: userId, movieId

    Returns:
    
    """
    result = pred_df.groupby('userId').apply(lambda x: check_movieId(x, val_df))
    n_user = val_df.userId.nunique()
    top_k = pred_df.groupby('userId').count().iloc[0]
    top_k = int(top_k.iloc[0])

    print(result.sum() / (n_user*top_k))
    return result.sum() / (n_user*top_k)

In [57]:
top_10 = top_10.reset_index()

In [58]:
top_10

Unnamed: 0,index,userId,movieId
0,4256415,46,3168
1,1834114,235,904
2,3913363,298,905
3,1204950,652,750
4,1334297,656,1266
...,...,...,...
6705,4708782,581,1225
6706,4709203,581,2289
6707,4710403,581,1287
6708,4708362,581,48516


In [59]:
evaluate(top_10, valid_set)

movieId
0    0.008048
1    0.009240
2    0.006706
3    0.005663
4    0.006408
5    0.006706
6    0.004620
7    0.004918
8    0.004173
9    0.005365
dtype: float64


movieId
0    0.008048
1    0.009240
2    0.006706
3    0.005663
4    0.006408
5    0.006706
6    0.004620
7    0.004918
8    0.004173
9    0.005365
dtype: float64

In [60]:
print(evaluate(top_10, valid_set))

movieId
0    0.008048
1    0.009240
2    0.006706
3    0.005663
4    0.006408
5    0.006706
6    0.004620
7    0.004918
8    0.004173
9    0.005365
dtype: float64
movieId
0    0.008048
1    0.009240
2    0.006706
3    0.005663
4    0.006408
5    0.006706
6    0.004620
7    0.004918
8    0.004173
9    0.005365
dtype: float64


In [61]:
top_10

Unnamed: 0,index,userId,movieId
0,4256415,46,3168
1,1834114,235,904
2,3913363,298,905
3,1204950,652,750
4,1334297,656,1266
...,...,...,...
6705,4708782,581,1225
6706,4709203,581,2289
6707,4710403,581,1287
6708,4708362,581,48516


In [62]:
valid_set

Unnamed: 0,userId,movieId
0,302,593
1,191,110
2,457,5337
3,239,1042
4,292,5377
...,...,...
24996,353,34319
24997,428,743
24998,595,1097
24999,664,61248
