In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
data = pd.read_csv(r'C:\Users\Admin\Downloads\References\23-24-1\thesis\Movie-Recommendation-System\data\train_set.csv', )
trainset, testset = train_test_split(data, test_size=0.2, random_state = 42, stratify=data['userId'])

trainset

Unnamed: 0,userId,movieId,rating,timestamp
27282,544,4963,4.5,1435790757
28513,99,5952,4.0,1044786919
956,396,29,4.0,839256240
44694,150,435,2.5,1114309597
26806,41,6333,4.0,1109812953
...,...,...,...,...
20943,55,79,3.0,855927172
71762,572,60072,3.0,1436779892
14928,134,1584,3.5,1361245898
1642,533,3196,4.0,965316395


In [188]:

import numpy as np
import pandas as pd



from surprise import Dataset, Reader
from surprise import NMF, SVD
# from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.dataset import DatasetAutoFolds

from collections import defaultdict



In [16]:
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader

In [31]:

def precision_recall_at_k(test_df, threshold):
    """Return recall and precision, F-1 scrore for collaborative + hybrid

    Args:
        test_df: prediction dataframe, with 4 columns: userId, movieId, true_rating, pred_rating
        threshold: if rating > threshold, movie is believed to be relevant

    Returns:
    Recall: Proportion of relevant items that are recommended, dict-like
    Precision: Proportion of recommended items that are relevant, dict-like
        Movie is relevant if true_rating > threshold
        Movie is recommend when pred_rating > threshold
    """
    recalls = dict()
    precisions = dict()

    for userId, group in test_df.groupby('userId'):

        filter_rel = group[group['rating'] > threshold]
        filter_rec = group[group['pred_rating'] > threshold]
        filter_rel_rec = group[(group['pred_rating'] > threshold) & \
                               (group['rating'] > threshold)]

        # Number of relevant items
        n_rel = len(filter_rel)

        # Number of recommended items in top k
        n_rec = len(filter_rec)

        # Number of relevant and recommended items in top k
        n_rel_rec = len(filter_rel_rec)

        recalls[userId] = n_rel_rec/n_rel if n_rel != 0 else 1
        precisions[userId] = n_rel_rec/n_rec if n_rec != 0 else 1

    precision = sum(prec for prec in precisions.values())/len(precisions)
    recall = sum(rec for rec in recalls.values())/len(recalls)
    fmeasure = (2*precision*recall)/(precision + recall)

    return recall, precision, fmeasure
    

In [53]:
class CollaborativeFiltering:
    def __init__(self, algorithm=None):
        self.trainset = None
        self.testset = None
        self.algorithm = algorithm
        self.predictions = None
        self.test_df = None
        
    def load_data(self, trainset, testset):
        reader = Reader()
  
        train = Dataset.load_from_df(trainset[['userId', 'movieId', 'rating']], reader)
        test = Dataset.load_from_df(testset[['userId', 'movieId', 'rating']], reader)
        
        self.trainset = train.build_full_trainset()
        full_testset = test.build_full_trainset()
        self.testset = full_testset.build_testset()
        
    def fit_predict(self):
    
        self.algorithm.fit(self.trainset)
        predictions_test = self.algorithm.test(self.testset)
        self.predictions = predictions_test
        rmse = accuracy.rmse(predictions_test)
        mae = accuracy.mae(predictions_test)

        test_df = pd.DataFrame(self.predictions).drop(columns='details')
        test_df.columns = ['userId', 'movieId', 'rating', 'pred_rating']
        self.test_df = test_df
        pre, recall, f_measure = precision_recall_at_k(test_df, 3.75)
        return rmse, mae, pre, recall, f_measure



In [95]:
svd = CollaborativeFiltering(SVD())
svd.load_data(trainset, testset)
svd.fit_predict()

RMSE: 0.9033
MAE:  0.6983


(0.9033328655394621,
 0.6982772478959498,
 0.5426266643744838,
 0.7796542951230435,
 0.6398960924138568)

In [96]:
# Chọn SVD

In [97]:
nmf = CollaborativeFiltering(NMF())
nmf.load_data(trainset, testset)
nmf.fit_predict()



RMSE: 0.9690
MAE:  0.7461


(0.9689660860279911,
 0.7460943841660753,
 0.5350170503752232,
 0.7635166795824692,
 0.6291626199587533)

In [98]:
svd_pp = CollaborativeFiltering(SVDpp())
svd_pp.load_data(trainset, testset)
svd_pp.fit_predict()

RMSE: 0.8942
MAE:  0.6877


(0.8942053065639488,
 0.6876926336496808,
 0.5631132180529149,
 0.7824053779152618,
 0.6548892174362642)

In [99]:
# Chọn SVD_pp

In [100]:
base = CollaborativeFiltering(BaselineOnly())
base.load_data(trainset, testset)
base.fit_predict()

Estimating biases using als...
RMSE: 0.8959
MAE:  0.6948


(0.8959359082434771,
 0.6947684945619227,
 0.5242606261989056,
 0.7954302358494028,
 0.6319855134810897)

In [101]:
# Chọn Baseline only

In [102]:
slope_one = CollaborativeFiltering(SlopeOne())
slope_one.load_data(trainset, testset)
slope_one.fit_predict()

RMSE: 0.9401
MAE:  0.7225


(0.9400987538992951,
 0.7224915394597742,
 0.5579230180763148,
 0.7641522126082897,
 0.6449528723223139)

In [103]:
df['slope_one_rating'] = pd.DataFrame(slope_one.predictions)['est']

In [104]:
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore

In [105]:
knn_base = CollaborativeFiltering(KNNBaseline())
knn_base.load_data(trainset, testset)
knn_base.fit_predict()

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9064
MAE:  0.6960


(0.9064242777503783,
 0.6959879814186629,
 0.6011469016679707,
 0.7557408715283147,
 0.6696372277169577)

In [106]:
# KNN baseline tạm ổn

In [107]:
KNN_basic = CollaborativeFiltering(KNNBasic())
KNN_basic.load_data(trainset, testset)
KNN_basic.fit_predict()

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9822
MAE:  0.7561


(0.9821644552243276,
 0.7560629093594452,
 0.5984823503523305,
 0.7297152062764107,
 0.6576155325095433)

In [108]:
KNN_means = CollaborativeFiltering(KNNWithMeans())
KNN_means.load_data(trainset, testset)
KNN_means.fit_predict()

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9279
MAE:  0.7117


(0.9278605725076455,
 0.7116961796504687,
 0.5771583922079696,
 0.7760328842089919,
 0.6619816422945439)

In [109]:
KNN_Z = CollaborativeFiltering(KNNWithZScore())
KNN_Z.load_data(trainset, testset)
KNN_Z.fit_predict()

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9270
MAE:  0.7068


(0.9270370628606168,
 0.7068406602644297,
 0.5907476145784287,
 0.7689852586590943,
 0.668184488497663)

In [110]:
df = svd.test_df
df.columns = ['userId',	'movieId',	'rating',	'svd_rating']
df['svd_pp'] = svd_pp.test_df['pred_rating']
df['base'] = base.test_df['pred_rating']
df['knn_base'] = knn_base.test_df['pred_rating']
df.head()

Unnamed: 0,userId,movieId,rating,svd_rating,svd_pp,base,knn_base
0,212,916,3.5,3.37096,3.230922,3.525547,3.867603
1,212,5459,2.0,3.237054,2.942856,2.904066,2.857219
2,212,172,2.0,2.491079,2.463335,2.382521,2.126295
3,212,2421,2.0,2.647185,2.559857,2.563658,2.292795
4,212,3988,3.0,2.955878,2.838898,2.828012,2.730163


In [111]:
# Combine

In [112]:
import math

In [113]:

rmse = ((df.rating - df.svd_pp) ** 2).mean() ** .5
print(rmse)
mae = (((df.rating - df.svd_pp) ** 2) ** .5).mean()
print(mae)

0.8942053065639488
0.6876926336496808


In [114]:
T = df.shape[0]
print(T)

15001


In [115]:

svd_wt = 0.25
knn_base_wt = 0.25
svdpp_wt = 0.3
base_wt = 0.2


In [119]:



combined_df = svd.test_df[['userId',	'movieId',	'rating']]
combined_df['pred_rating']  = svd_wt*df['svd_rating'] + svdpp_wt*df['svd_pp'] + knn_base_wt*df['knn_base'] + base_wt*df['base']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['pred_rating']  = svd_wt*df['svd_rating'] + svdpp_wt*df['svd_pp'] + knn_base_wt*df['knn_base'] + base_wt*df['base']


In [155]:
combined_df

Unnamed: 0,userId,movieId,rating,pred_rating
0,212,916,3.5,3.484027
1,212,5459,2.0,2.987238
2,212,172,2.0,2.369848
3,212,2421,2.0,2.515684
4,212,3988,3.0,2.838782
...,...,...,...,...
14996,100,745,4.0,4.128507
14997,100,708,3.0,3.173059
14998,76,2087,3.5,4.022493
14999,76,3671,3.5,4.057804


In [156]:

precision_recall_at_k(combined_df, 3.75)

(0.5567834157394573, 0.7928009395121743, 0.6541546119519683)

In [157]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

rmse = mean_squared_error(combined_df['rating'], combined_df['pred_rating'], squared=False)

mae = mean_absolute_error(combined_df['rating'], combined_df['pred_rating'])
rmse, mae

(0.8839001599847013, 0.6818315753475657)

## Evaluate

In [158]:
valid = pd.read_csv(r'C:\Users\Admin\Downloads\References\23-24-1\thesis\Movie-Recommendation-System\data\test_set.csv')
data = pd.read_csv(r'C:\Users\Admin\Downloads\References\23-24-1\thesis\Movie-Recommendation-System\data\train_set.csv')

In [159]:
reader = Reader()

train = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)

trainset = train.build_full_trainset()
anti_train = trainset.build_anti_testset()

In [160]:
model = SVD()
model.fit(trainset)
predictions = model.test(anti_train)

In [169]:
predict_svd = pd.DataFrame(predictions).sort_values(by='est', ascending=False)
predict_svd

Unnamed: 0,uid,iid,r_ui,est,details
3913363,298,905,3.544645,5.0,{'was_impossible': False}
1336442,656,6732,3.544645,5.0,{'was_impossible': False}
2836668,517,1259,3.544645,5.0,{'was_impossible': False}
3852818,401,778,3.544645,5.0,{'was_impossible': False}
4254212,46,2130,3.544645,5.0,{'was_impossible': False}
...,...,...,...,...,...
4711154,581,1600,3.544645,1.0,{'was_impossible': False}
4712846,581,2381,3.544645,1.0,{'was_impossible': False}
2797156,609,455,3.544645,1.0,{'was_impossible': False}
4713105,581,829,3.544645,1.0,{'was_impossible': False}


In [183]:
top_10 = predict_svd.groupby('uid', sort=False).head(10)
top_10 = top_10[['uid', 'iid']]
top_10.columns = ['userId', 'movieId']
top_10.head()

Unnamed: 0,userId,movieId
3913363,298,905
1336442,656,6732
2836668,517,1259
3852818,401,778
4254212,46,2130


In [163]:
valid_set = valid[['userId', 'movieId']]

In [164]:
valid_set

Unnamed: 0,userId,movieId
0,302,593
1,191,110
2,457,5337
3,239,1042
4,292,5377
...,...,...
24996,353,34319
24997,428,743
24998,595,1097
24999,664,61248


In [204]:
def check_movieId(pred_df, val_df):
    result = pred_df['movieId'].isin(val_df[val_df['userId'] == \
                                   pred_df['userId'].iloc[0]]['movieId'])
    return result.reset_index(drop=True)

def evaluate(pred_df, val_df):
    """ Proportion of movies recommended that were actually watched by users

    Args:
    pred_df: dataframe, 2 columns: userId, movieId
    val_df: dataframe, 2 columns: userId, movieId

    Returns:
    
    """
    result = pred_df.groupby('userId').apply(lambda x: check_movieId(x, val_df))
    n_user = val_df.userId.nunique()
    top_k = pred_df.groupby('userId').count().iloc[0]
    top_k = int(top_k.iloc[0])


    return result.sum() / (n_user*top_k)

In [195]:
top_10 = top_10.reset_index()

In [197]:
top_10

Unnamed: 0,index,userId,movieId
0,3913363,298,905
1,1336442,656,6732
2,2836668,517,1259
3,3852818,401,778
4,4254212,46,2130
...,...,...,...
6705,4708515,581,7153
6706,4709413,581,2186
6707,4711043,581,1233
6708,4709190,581,1212


In [205]:
evaluate(top_10, valid_set)

movieId
0    0.010581
1    0.006557
2    0.006557
3    0.006110
4    0.005216
5    0.005365
6    0.005812
7    0.005216
8    0.004918
9    0.005514
dtype: float64

In [198]:
print(evaluate(top_10, valid_set))

movieId
0    0.010581
1    0.006557
2    0.006557
3    0.006110
4    0.005216
5    0.005365
6    0.005812
7    0.005216
8    0.004918
9    0.005514
dtype: float64


In [207]:
top_10.infer_objects

<bound method NDFrame.infer_objects of         index  userId  movieId
0     3913363     298      905
1     1336442     656     6732
2     2836668     517     1259
3     3852818     401      778
4     4254212      46     2130
...       ...     ...      ...
6705  4708515     581     7153
6706  4709413     581     2186
6707  4711043     581     1233
6708  4709190     581     1212
6709  4708735     581      246

[6710 rows x 3 columns]>