In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [32]:
from surprise import Reader, Dataset

In [33]:
from surprise import Dataset, NMF
from surprise.model_selection import cross_validate


# Load the ratings and wrap them in a Surprise dataset.
ratings = pd.read_csv('/kaggle/input/the-movies-dataset/ratings_small.csv')
# Take the rating scale from the file instead of relying on Reader's default
# (1, 5), in case the data use a different range (e.g. half-star ratings).
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Algorithm under evaluation: NMF with its default hyper-parameters.
algo = NMF()

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9542  0.9408  0.9381  0.9519  0.9434  0.9457  0.0063  
MAE (testset)     0.7358  0.7229  0.7212  0.7290  0.7274  0.7273  0.0051  
Fit time          4.23    4.14    4.28    4.27    4.34    4.25    0.07    
Test time         0.19    0.19    0.19    0.20    0.20    0.19    0.00    


{'test_rmse': array([0.95416647, 0.94078762, 0.93807338, 0.95190664, 0.94338842]),
 'test_mae': array([0.73577937, 0.72294559, 0.72124739, 0.72896879, 0.72742718]),
 'fit_time': (4.227799892425537,
  4.135069131851196,
  4.276002407073975,
  4.270809173583984,
  4.342686176300049),
 'test_time': (0.19092965126037598,
  0.1880505084991455,
  0.18812060356140137,
  0.19843053817749023,
  0.19754600524902344)}

In [34]:
from surprise import accuracy
from surprise.model_selection import train_test_split

# Sample a random trainset and testset: 25% of the ratings are held out for
# testing, the remaining 75% are used for training.
# NOTE(review): no random_state is passed, so presumably the split (and the
# RMSE below) changes from run to run.
trainset, testset = train_test_split(data, test_size=0.25)

# Train the algorithm on the trainset, and predict ratings for the testset.
# `algo` is the same NMF instance that was cross-validated above.
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE on the held-out predictions (printed and also returned).
accuracy.rmse(predictions)

RMSE: 0.9573


0.9573339726174547

In [35]:
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
# Note: this rebinds `predictions`, replacing the test-set predictions
# computed in the previous cell.
anti_testset = trainset.build_anti_testset()
predictions = algo.test(anti_testset)

In [36]:
# Tabulate the predictions: one row per (user, item) pair with the columns
# uid, iid, r_ui, est and details.
pd.DataFrame(predictions)

Unnamed: 0,uid,iid,r_ui,est,details
0,10,4951,3.543958,2.489881,{'was_impossible': False}
1,10,376,3.543958,3.584095,{'was_impossible': False}
2,10,3175,3.543958,3.602146,{'was_impossible': False}
3,10,1957,3.543958,3.572168,{'was_impossible': False}
4,10,133195,3.543958,2.205196,{'was_impossible': False}
...,...,...,...,...,...
5429876,583,96075,3.543958,4.097640,{'was_impossible': False}
5429877,583,8840,3.543958,2.535734,{'was_impossible': False}
5429878,583,39398,3.543958,2.370248,{'was_impossible': False}
5429879,583,5562,3.543958,1.712539,{'was_impossible': False}


In [37]:
# predictions

In [38]:
from collections import defaultdict
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n best-rated items.

    Args:
        predictions(list of Prediction objects): Predictions as returned by
            the test method of an algorithm. Each one unpacks into
            (user id, item id, true rating, estimated rating, details).
        n(int): How many recommendations to keep per user. Default is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] holding at most n entries,
        ordered from the highest estimation to the lowest.
    """

    # Bucket every (item, estimate) pair under the user it belongs to.
    per_user = defaultdict(list)
    for prediction in predictions:
        user_id, item_id, _true_rating, estimate, _details = prediction
        per_user[user_id].append((item_id, estimate))

    # Rank each bucket by estimate (best first) and truncate it to n entries.
    for user_id in list(per_user):
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user

In [39]:
# Keep the 10 items with the highest estimated rating for every user.
top_n = get_top_n(predictions, n=10)

# Print the recommended item ids for each user (highest estimate first)
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

10 [1178, 3022, 8955, 102684, 3309, 1217, 65037, 6918, 8132, 52885]
128 [1233, 4848, 5618, 81847, 3020, 40819, 2692, 8154, 2542, 1251]
468 [92494, 26587, 116897, 3216, 97957, 58047, 1860, 309, 8955, 5114]
103 [4914, 65037, 67504, 4755, 1860, 1192, 26865, 83318, 6271, 4630]
206 [2973, 67504, 26865, 83318, 5114, 2929, 2920, 83411, 50068, 57353]
475 [67504, 83318, 83411, 8609, 25764, 71755, 3153, 73290, 5017, 5121]
115 [5475, 318, 5522, 2232, 296, 1178, 912, 116797, 968, 97866]
213 [8208, 872, 5114, 50641, 2636, 326, 7700, 8797, 4404, 65037]
534 [65514, 994, 1066, 1934, 1260, 927, 898, 8955, 497, 102684]
240 [65037, 67504, 4755, 2920, 2612, 5017, 6413, 2066, 89904, 1860]
73 [65037, 67504, 4754, 26865, 83318, 8609, 83411, 3216, 92494, 7574]
232 [5475, 5522, 90428, 2692, 6187, 1172, 1023, 54997, 1228, 109487]
564 [5522, 4973, 1251, 3307, 940, 3462, 3022, 88810, 899, 1232]
624 [67504, 83318, 83411, 6433, 5121, 3216, 92494, 26974, 119145, 5059]
272 [5475, 902, 3468, 5414, 8969, 3435, 3741, 86

In [40]:
# Top-10 (item id, estimated rating) pairs for user 14, highest estimate first.
top_n[14]


[(4754, 5),
 (3216, 5),
 (92494, 5),
 (97957, 5),
 (26974, 4.797207943831751),
 (4037, 4.797157926352313),
 (3284, 4.755461910607027),
 (7327, 4.710869387147349),
 (27156, 4.6940704231623425),
 (2920, 4.688164092134249)]

In [41]:
# Inspect the dataset object built from the ratings DataFrame.
data

<surprise.dataset.DatasetAutoFolds at 0x7f41e5792080>

## Now we wrap the pipeline above in a function so it is easy to reuse

In [45]:
# collaborative algorithm

import numpy as np
import pandas as pd



from surprise import Dataset, Reader
from surprise import NMF, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.dataset import DatasetAutoFolds

from collections import defaultdict


def collaborative_filtering(input, algorithm, n_suggestion=10):
    """Fit a Surprise algorithm on a ratings CSV and return top-N items per user.

    The ratings are first split 75/25 to report a hold-out RMSE (printed by
    ``accuracy.rmse``). The algorithm is then refitted on every rating and used
    to score each (user, item) pair that has no rating in the file.

    Args:
        input (str): Path to a CSV file with ``userId``, ``movieId`` and
            ``rating`` columns. The name shadows the ``input`` builtin; it is
            kept so existing keyword calls continue to work.
        algorithm: A Surprise algorithm instance exposing ``fit`` and ``test``
            (e.g. ``NMF()`` or ``SVD()``). ``fit`` is called on this very
            object, so the instance passed in ends up trained.
        n_suggestion (int): Number of recommendations to keep for each user.
            Default is 10.

    Returns:
        A defaultdict where keys are user (raw) ids and values are lists of
        tuples ``[(raw item id, rating estimation), ...]`` with at most
        ``n_suggestion`` entries, sorted by decreasing estimation.
    """

    def load_data(path_csv):
        """Read the ratings CSV and wrap it in a Surprise dataset."""
        ratings = pd.read_csv(path_csv)
        # Take the rating scale from the data instead of relying on Reader's
        # default (1, 5), in case the file uses a different range.
        scale = (float(ratings['rating'].min()), float(ratings['rating'].max()))
        reader = Reader(rating_scale=scale)
        return Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

    def fit_predict(algo, data):
        """Report the hold-out RMSE, then predict every unrated (user, item) pair."""
        # Hold-out evaluation: fit on 75% of the ratings, score the other 25%.
        trainset, testset = train_test_split(data, test_size=0.25)
        algo.fit(trainset)
        rmse = accuracy.rmse(algo.test(testset))

        # Real predict: refit on ALL ratings so the model has seen the same
        # data the anti-testset is derived from, then predict ratings for all
        # pairs (u, i) that have no rating.
        full_trainset = data.build_full_trainset()
        algo.fit(full_trainset)
        predictions = algo.test(full_trainset.build_anti_testset())

        return predictions, rmse

    def top_n_per_user(predictions, n):
        """Group predictions by user and keep the n highest estimates of each."""
        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, _true_r, est, _details in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the n highest ones.
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n

    data = load_data(input)
    predictions, _rmse = fit_predict(algorithm, data)
    # Forward the caller's n_suggestion instead of silently using the default.
    return top_n_per_user(predictions, n_suggestion)


## Example: NMF-based recommendations for the users in ratings_small.csv

top_n = collaborative_filtering(input='/kaggle/input/the-movies-dataset/ratings_small.csv', algorithm=NMF())

# Uncomment to print the recommended item ids for each user:
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])

RMSE: 0.9547


In [46]:
top_n[14]
# For user 14, this run suggested 10 movies (121231, ..., 3320) with estimated
# ratings from 5 down to about 4.778.

[(121231, 5),
 (4406, 5),
 (3746, 4.96738308234904),
 (71033, 4.88195737231622),
 (1941, 4.860410279778331),
 (31435, 4.793104483477439),
 (79091, 4.791133125523275),
 (80906, 4.7814997830595285),
 (7459, 4.781208589777139),
 (3320, 4.777818261034626)]

In [47]:
## Example: the same pipeline with SVD instead of NMF

top_n = collaborative_filtering(input='/kaggle/input/the-movies-dataset/ratings_small.csv', algorithm=SVD())

# Uncomment to print the recommended item ids for each user:
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])

RMSE: 0.8938


In [48]:
top_n[14]
# For user 14, this run suggested 10 movies (1172, ..., 527) with estimated
# ratings from about 4.367 down to about 4.247.

[(1172, 4.366917098291678),
 (318, 4.343879957402736),
 (926, 4.339618101924625),
 (111, 4.335590660394622),
 (858, 4.305880984415794),
 (922, 4.286166418789119),
 (48516, 4.285424350053995),
 (1193, 4.277516683474715),
 (3462, 4.2636946312405195),
 (527, 4.246943945834868)]