In [261]:
import pandas as pd
from sklearn.model_selection import train_test_split
data = pd.read_csv(r'C:\Users\Admin\Downloads\References\23-24-1\thesis\Movie-Recommendation-System\data\train_set.csv', )
trainset, validset = train_test_split(data, test_size=0.2, random_state = 42, stratify=data['userId'])

trainset

Unnamed: 0,userId,movieId,rating,timestamp
27282,544,4963,4.5,1435790757
28513,99,5952,4.0,1044786919
956,396,29,4.0,839256240
44694,150,435,2.5,1114309597
26806,41,6333,4.0,1109812953
...,...,...,...,...
20943,55,79,3.0,855927172
71762,572,60072,3.0,1436779892
14928,134,1584,3.5,1361245898
1642,533,3196,4.0,965316395


In [262]:
import numpy as np

from surprise import Dataset, Reader
from surprise import NMF, SVD
# from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.dataset import DatasetAutoFolds

from collections import defaultdict
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader


Metrics sử dụng trong bài để so sánh các thuật toán là RMSE, MAE, precision, recall, f-measure

In [263]:
# Tính precision, recall, f-measure
def precision_recall_at_k(test_df, threshold):
    """Return recall and precision, F-1 scrore for collaborative + hybrid

    Args:
        test_df: prediction dataframe, with 4 columns: userId, movieId, true_rating, pred_rating
        threshold: if rating > threshold, movie is believed to be relevant

    Returns:
    Recall: Proportion of relevant items that are recommended, dict-like
    Precision: Proportion of recommended items that are relevant, dict-like
        Movie is relevant if true_rating > threshold
        Movie is recommend when pred_rating > threshold
    """
    recalls = dict()
    precisions = dict()

    for userId, group in test_df.groupby('userId'):

        filter_rel = group[group['rating'] > threshold]
        filter_rec = group[group['pred_rating'] > threshold]
        filter_rel_rec = group[(group['pred_rating'] > threshold) & \
                               (group['rating'] > threshold)]

        # Number of relevant items
        n_rel = len(filter_rel)

        # Number of recommended items in top k
        n_rec = len(filter_rec)

        # Number of relevant and recommended items in top k
        n_rel_rec = len(filter_rel_rec)

        recalls[userId] = n_rel_rec/n_rel if n_rel != 0 else 1
        precisions[userId] = n_rel_rec/n_rec if n_rec != 0 else 1

    precision = sum(prec for prec in precisions.values())/len(precisions)
    recall = sum(rec for rec in recalls.values())/len(recalls)
    fmeasure = (2*precision*recall)/(precision + recall)

    return recall, precision, fmeasure
    

Tạo class CollaborativeFiltering, trong đó: \
input thuật toán, trainset, testset -> trả ra predictions trên tập test, các metrics đánh giá

In [264]:
class CollaborativeFiltering:
    def __init__(self, algorithm=None):
        self.trainset = None
        self.testset = None
        self.algorithm = algorithm
        self.predictions = None
        self.test_df = None
        
    def load_data(self, trainset: pd.DataFrame, testset: pd.DataFrame):
        reader = Reader()
  
        train = Dataset.load_from_df(trainset[['userId', 'movieId', 'rating']], reader)
        test = Dataset.load_from_df(testset[['userId', 'movieId', 'rating']], reader)
        
        self.trainset = train.build_full_trainset()
        full_testset = test.build_full_trainset()
        self.testset = full_testset.build_testset()
        
    def fit_predict(self):
        # Fit
        self.algorithm.fit(self.trainset)
        # Predict
        predictions_test = self.algorithm.test(self.testset)
        self.predictions = predictions_test

        # Tính rmse, mae
        rmse = accuracy.rmse(predictions_test)
        mae = accuracy.mae(predictions_test)

        # Tính precision, recall, f-measure với threshold rating = 3.75
        test_df = pd.DataFrame(self.predictions).drop(columns='details')
        test_df.columns = ['userId', 'movieId', 'rating', 'pred_rating']
        self.test_df = test_df
        pre, recall, f_measure = precision_recall_at_k(test_df, 3.75)

        # trả ra 1 tuple gồm 5 metrics đã tính
        return rmse, mae, pre, recall, f_measure



#### Đánh giá các thuật toán
Test các thuật toán khác nhau: SVD, NMF, KNN,... 
- Thấy rằng SVD, SVDpp, KNN Baseline, Baseline cho kết quả RMSE và precision tốt nhất so với các phương pháp còn lại \
RMSE khoảng 0.89 - 0,91 \
MAE khoảng 0.7 \
precision khoảng 0.52 - 0.57 
- Đặc biệt, KNN Baseline tuy có RMSE cao hơn 3 thuật toán còn lại một chút (cao hơn khoảng 0.01) nhưng precision cao hơn khá nhiều (=0.6011) \
- Nhìn chung các thuật toán KNN cho precision xấp xỉ 0.6, cao hơn các thuật toán còn lại. 


? Nên chọn thuật toán dựa theo RMSE hay precision?

In [265]:
svd = CollaborativeFiltering(SVD())
svd.load_data(trainset, validset)
svd.fit_predict()
# rmse, mae, precision, recall, f-measure

RMSE: 0.9002
MAE:  0.6957


(0.900186850964241,
 0.6957369304817678,
 0.5475501492428418,
 0.781600109478028,
 0.6439682101928421)

In [266]:
# Chọn SVD

In [267]:
nmf = CollaborativeFiltering(NMF())
nmf.load_data(trainset, validset)
nmf.fit_predict()



RMSE: 0.9719
MAE:  0.7493


(0.9718850471356919,
 0.7493135207165066,
 0.5237040053423342,
 0.7534961204255481,
 0.6179281199796646)

In [268]:
svd_pp = CollaborativeFiltering(SVDpp())
svd_pp.load_data(trainset, validset)
svd_pp.fit_predict()

RMSE: 0.8943
MAE:  0.6876


(0.8942677172094614,
 0.6876153915042712,
 0.5593134964515624,
 0.7846234388357155,
 0.6530819526576143)

In [269]:
# Chọn SVD_pp

In [270]:
base = CollaborativeFiltering(BaselineOnly())
base.load_data(trainset, validset)
base.fit_predict()

Estimating biases using als...
RMSE: 0.8959
MAE:  0.6948


(0.8959359082434771,
 0.6947684945619227,
 0.5242606261989056,
 0.7954302358494028,
 0.6319855134810897)

In [271]:
# Chọn Baseline only

In [272]:
slope_one = CollaborativeFiltering(SlopeOne())
slope_one.load_data(trainset, validset)
slope_one.fit_predict()

RMSE: 0.9401
MAE:  0.7225


(0.9400987538992951,
 0.7224915394597742,
 0.5579230180763148,
 0.7641522126082897,
 0.6449528723223139)

In [273]:
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore

In [274]:
knn_base = CollaborativeFiltering(KNNBaseline())
knn_base.load_data(trainset, validset)
knn_base.fit_predict()

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9064
MAE:  0.6960


(0.9064242777503783,
 0.6959879814186629,
 0.6011469016679707,
 0.7557408715283147,
 0.6696372277169577)

In [275]:
# KNN baseline ổn

In [276]:
KNN_basic = CollaborativeFiltering(KNNBasic())
KNN_basic.load_data(trainset, validset)
KNN_basic.fit_predict()

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9822
MAE:  0.7561


(0.9821644552243276,
 0.7560629093594452,
 0.5984823503523305,
 0.7297152062764107,
 0.6576155325095433)

In [277]:
KNN_means = CollaborativeFiltering(KNNWithMeans())
KNN_means.load_data(trainset, validset)
KNN_means.fit_predict()

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9279
MAE:  0.7117


(0.9278605725076455,
 0.7116961796504687,
 0.5771583922079696,
 0.7760328842089919,
 0.6619816422945439)

In [278]:
KNN_Z = CollaborativeFiltering(KNNWithZScore())
KNN_Z.load_data(trainset, validset)
KNN_Z.fit_predict()

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9270
MAE:  0.7068


(0.9270370628606168,
 0.7068406602644297,
 0.5907476145784287,
 0.7689852586590943,
 0.668184488497663)

In [279]:
df = svd.test_df
df.columns = ['userId',	'movieId',	'rating',	'svd_rating']
df['svd_pp'] = svd_pp.test_df['pred_rating']
df['base'] = base.test_df['pred_rating']
df['knn_base'] = knn_base.test_df['pred_rating']
df.head()

Unnamed: 0,userId,movieId,rating,svd_rating,svd_pp,base,knn_base
0,212,916,3.5,3.606142,3.539146,3.525547,3.867603
1,212,5459,2.0,2.798074,2.831244,2.904066,2.857219
2,212,172,2.0,2.181245,2.489776,2.382521,2.126295
3,212,2421,2.0,2.404826,2.703115,2.563658,2.292795
4,212,3988,3.0,2.814111,2.724827,2.828012,2.730163


### (Linear) Combine 
Dùng 4 thuật toán cho kết quả tốt nhất phía trên, linear combine kết quả để tạo ra rating mới, sau đó so sánh với real rating xem các metrics có cải thiện không

In [281]:
# Weight cho các thuật toán khác nhau
svd_wt = 0.25
knn_base_wt = 0.25
svdpp_wt = 0.3
base_wt = 0.2


In [282]:
combined_df = svd.test_df[['userId',	'movieId',	'rating']]
combined_df['pred_rating']  = svd_wt*df['svd_rating'] + svdpp_wt*df['svd_pp'] + knn_base_wt*df['knn_base'] + base_wt*df['base']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['pred_rating']  = svd_wt*df['svd_rating'] + svdpp_wt*df['svd_pp'] + knn_base_wt*df['knn_base'] + base_wt*df['base']


In [283]:
combined_df

Unnamed: 0,userId,movieId,rating,pred_rating
0,212,916,3.5,3.635289
1,212,5459,2.0,2.844010
2,212,172,2.0,2.300322
3,212,2421,2.0,2.498071
4,212,3988,3.0,2.769119
...,...,...,...,...
14996,100,745,4.0,4.179029
14997,100,708,3.0,3.277764
14998,76,2087,3.5,4.042492
14999,76,3671,3.5,4.095225


In [284]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

rmse = mean_squared_error(combined_df['rating'], combined_df['pred_rating'], squared=False)

mae = mean_absolute_error(combined_df['rating'], combined_df['pred_rating'])
rmse, mae

(0.8829829214359362, 0.6806949748949307)

In [285]:
precision, recall, f_measure = precision_recall_at_k(combined_df, 3.75)

precision, recall, f_measure 

(0.5608002572401722, 0.7923857418064926, 0.6567761241271927)

Có thể thấy cả RMSE, MAE đều giảm so với tất cả các thuật toán riêng lẻ -> Combine có thể cho kết quả tốt hơn một chút \
Tuy nhiên precision, recall không có sự khác biệt lớn

## Evaluate trên metric tự build

Idea: 
- Train lại model đạt kết quả tốt trên cả tập train (= tập train + valid phía trên), predict cho tập anti của train
- Chọn top n movies có đánh giá predicted cao nhất của mỗi user 
- Dùng metric tính xem có bao nhiêu movies người dùng thực sự xem trong số được đề xuất 

In [286]:
test = pd.read_csv(r'C:\Users\Admin\Downloads\References\23-24-1\thesis\Movie-Recommendation-System\data\test_set.csv')
data = pd.read_csv(r'C:\Users\Admin\Downloads\References\23-24-1\thesis\Movie-Recommendation-System\data\train_set.csv')

In [287]:
reader = Reader()

train = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)

trainset = train.build_full_trainset()
anti_train = trainset.build_anti_testset()

In [288]:
model = SVD()
model.fit(trainset)
predictions = model.test(anti_train)

In [289]:
def top_n(predictions, n_predict):
    predict_df = pd.DataFrame(predictions).sort_values(by='est', ascending=False)
    top_n = predict_df.groupby('uid', sort=False).head(n_predict)
    top_n = top_n[['uid', 'iid']]
    top_n.columns = ['userId', 'movieId']
    return top_n



In [290]:
test_set = test[['userId', 'movieId']]

In [291]:
test_set

Unnamed: 0,userId,movieId
0,302,593
1,191,110
2,457,5337
3,239,1042
4,292,5377
...,...,...
24996,353,34319
24997,428,743
24998,595,1097
24999,664,61248


In [292]:
def check_movieId(pred_df, val_df):
    result = pred_df['movieId'].isin(val_df[val_df['userId'] == \
                                   pred_df['userId'].iloc[0]]['movieId'])
    return result.reset_index(drop=True)

def evaluate(pred_df, val_df):
    """ Proportion of movies recommended that were actually watched by users

    Args:
    pred_df: dataframe, 2 columns: userId, movieId
    val_df: dataframe, 2 columns: userId, movieId

    Returns:
    
    """
    result = pred_df.groupby('userId').apply(lambda x: check_movieId(x, val_df))
    n_user = val_df.userId.nunique()
    top_k = pred_df.groupby('userId').count().iloc[0]
    top_k = int(top_k.iloc[0])

    return result.sum().sum() / (n_user*top_k)

In [293]:
evaluate(top_n(predictions=predictions, n_predict=5), test_set)

0.07272727272727272

In [294]:
evaluate(top_n(predictions=predictions, n_predict=50), test_set)

0.0414903129657228

## Thử dùng các phương pháp đánh giá trên tập test

In [295]:

data.shape, test.shape

((75003, 4), (25001, 4))

In [296]:
model = CollaborativeFiltering(SVD())
model.load_data(data, test)
model.fit_predict()

RMSE: 0.9041
MAE:  0.6965


(0.9041375569513747,
 0.6965072831095146,
 0.5385899267829255,
 0.7537953204432878,
 0.6282748388194812)

So sánh với khi đánh giá model SVD() bên trên: \
(0.900186850964241, \
 0.6957369304817678,\
 0.5475501492428418,\
 0.781600109478028,\
 0.6439682101928421)
 


In [297]:
model = CollaborativeFiltering(SVDpp())
model.load_data(data, test)
model.fit_predict()

RMSE: 0.8951
MAE:  0.6863


(0.8951495916932949,
 0.686327639716623,
 0.5616877029367903,
 0.7702269743576452,
 0.6496317329360918)

So sánh với khi đánh giá model SVDpp() bên trên: \
(0.8942677172094614, \
 0.6876153915042712, \
 0.5593134964515624, \
 0.7846234388357155, \
 0.6530819526576143)
 
  -> Không có sự khác biệt lớn giữa các metrics đánh giá khi sử dụng trên tập valid với tập test

In [306]:
pd.concat(pd.DataFrame(svd.fit_predict()), pd.DataFrame(base.fit_predict()))

RMSE: 0.9033
MAE:  0.6989
Estimating biases using als...
RMSE: 0.8959
MAE:  0.6948


  pd.concat(pd.DataFrame(svd.fit_predict()), pd.DataFrame(base.fit_predict()))


TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"