In [1]:
!gsutil cp gs://icentris-data-model-ml-d1/product-reviews.csv .
    

Copying gs://icentris-data-model-ml-d1/product-reviews.csv...
/ [1 files][  6.7 KiB/  6.7 KiB]                                                
Operation completed over 1 objects/6.7 KiB.                                      


In [8]:
import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold

In [4]:
reviews = pd.read_csv('product-reviews.csv', names=['product_id', 'user_id', 'rating'])

In [7]:
reader = Reader(rating_scale=(1,5))
dataset = Dataset.load_from_df(reviews[['user_id', 'product_id', 'rating']], reader)

In [9]:
kf = KFold(n_splits=3)

algo = SVD()
for trainset, testset in kf.split(dataset):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.1105
RMSE: 1.7487
RMSE: 1.4287


In [10]:
from collections import defaultdict

from surprise import SVD
from surprise import Dataset


def get_top_n(predictions, n=5):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [11]:
top_n = get_top_n(predictions)

In [13]:
#predicted rating for top 5 products for each user
top_n

defaultdict(list,
            {444: [('10-0011', 4.721952627073553)],
             793: [('15-0029', 4.873941697151362)],
             810: [('03-0010', 4.473249962206511),
              ('10-0002', 4.248276861045902)],
             992: [('15-0029', 4.873941697151362)],
             2001: [('15-0029', 4.873941697151362)],
             3538: [('10-0024', 4.702121050848317)],
             5324: [('15-0029', 4.873941697151362)],
             9678: [('06-0003', 4.435717694477046)],
             19335: [('03-0006', 4.71894435237806)],
             26849: [('16-0006', 4.055683272024348)],
             35404: [('15-0029', 4.873941697151362)],
             35759: [('06-0003', 4.196357753295739)],
             42457: [('15-0029', 4.873941697151362)],
             54327: [('15-0022', 4.5611484171710615)],
             63328: [('13-0010', 4.374955638180056)],
             71250: [('15-0029', 4.922089307424837)],
             72387: [('06-0002', 4.6260510025409385)],
             79907: [('15-001