In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

In [2]:
with open("data/product.json","r") as f: 
    datas = json.load(f)

In [3]:
train_data = pd.read_csv('data/review.csv')
valid_data = pd.read_csv('data/validation.csv')

In [4]:
print(train_data.Star)

0        5.0
1        3.0
2        4.0
3        2.0
4        2.0
        ... 
52507    5.0
52508    5.0
52509    5.0
52510    4.0
52511    5.0
Name: Star, Length: 52512, dtype: float64


In [5]:
train_reviewer_product_matrix = train_data.pivot_table(index='ReviewerID', columns='ProductID', values='Star').fillna(0)
train_product_reviewer_matrix = train_reviewer_product_matrix.T

In [8]:
def similarity(matrix):
    return cosine_similarity(matrix)

In [9]:
def predict_reviewer_based(reviewer_product_matrix, reviewer_similarity):
    reviewer_ratings_mean = reviewer_product_matrix.mean(axis=1).values.reshape(-1, 1) # (num_reviewers, 1)
    centered_reviewer_product_matrix = reviewer_product_matrix - reviewer_ratings_mean # (num_reviewers, num_products)
    sim_sum = np.abs(reviewer_similarity).sum(axis=1)[:,None] # (num_reviewers, 1)
    pred_ratings = reviewer_ratings_mean + np.dot(reviewer_similarity, centered_reviewer_product_matrix) / sim_sum # (num_reviewers, num_products)
    # return as pd.DataFrame
    return pd.DataFrame(pred_ratings, index=reviewer_product_matrix.index, columns=reviewer_product_matrix.columns)

In [19]:
reviewers_similarity_matrix = similarity(train_reviewer_product_matrix)
reviewer_based_pred = predict_reviewer_based(train_reviewer_product_matrix, reviewers_similarity_matrix)

In [21]:
print(reviewers_similarity_matrix)
print(reviewer_based_pred)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
ProductID             B000FBFMHU  B000FC27TA  B000FCKPG2  B000GCFWXW  \
ReviewerID                                                             
A0020356UF96ZV361ST    -0.005530   -0.005530   -0.005530   -0.005530   
A00463782V7TKAP9EMNL    0.001526    0.001526    0.001526    0.001526   
A0099735VDZ3HDCAAYKL    0.003831    0.008341    0.003831    0.019766   
A01631062UX24GI4LJKF   -0.004234   -0.004234   -0.004234   -0.004234   
A0178408Z1TQAM7D75FY   -0.009944   -0.009944   -0.009944   -0.009944   
...                          ...         ...         ...         ...   
AZQSQSF2QI02F          -0.007578   -0.003412   -0.007578    0.021187   
AZRPAHQG1VHR0          -0.003311   -0.003311   -0.003311   -0.003311   
AZULU4TOTOLEU          -0.004318   -0.004318   -0.004318   -0.004318   
AZYERRDY2VW61           0.003639    0.005344    0.00

In [11]:
def predict_product_based(product_reviewer_matrix, product_similarity):
    nom = np.dot(product_similarity, product_reviewer_matrix) # (num_products, num_reviewers)
    denom =  np.abs(product_similarity).sum(axis=1)[:,None] # (num_products, 1)
    pred_ratings = nom / denom # division through broadcasting
    return pd.DataFrame(pred_ratings, index=product_reviewer_matrix.index, columns=product_reviewer_matrix.columns)

In [12]:
products_similarity_matrix = similarity(train_product_reviewer_matrix)
product_based_pred = predict_product_based(train_product_reviewer_matrix, products_similarity_matrix)

In [13]:
def svd_rating_prediction(pivot_table, n_components=2):
    U, s, Vh = np.linalg.svd(pivot_table)
    # select top n_components
    U = U[:, :n_components]
    s = np.diag(s[:n_components])
    Vh = Vh[:n_components, :]
    return np.dot(np.dot(U, s), Vh)

In [14]:
def svd_rating_prediction(pivot_table, n_components=2):
    svd = TruncatedSVD(n_components=n_components)
    matrix = svd.fit_transform(pivot_table)
    return svd.inverse_transform(matrix)

In [15]:
svd_pred = svd_rating_prediction(train_reviewer_product_matrix, n_components=2)

In [16]:
def rmse(pred, actual):
    '''
    params:
        pred <np.array>: an array containing all predicted ratings
        actual <np.array>: an array containing all ground truth ratings

    return:
        a scalar whose value is the rmse
    '''
    return np.sqrt(mean_squared_error(actual, pred))

In [17]:
def get_predictions(pred_matrix, reviewer_id_map, business_id_map, valid_data):
    predictions = []
    actuals = []
    
    for _, row in valid_data.iterrows():
        reviewer_id = row['ReviewerID']
        business_id = row['ProductID']
        actual_rating = row['Star']
        
        if reviewer_id in reviewer_id_map and business_id in business_id_map:
            reviewer_idx = reviewer_id_map[reviewer_id]
            business_idx = business_id_map[business_id]
            pred_rating = pred_matrix[reviewer_idx, business_idx]
            predictions.append(pred_rating)
            actuals.append(actual_rating)
    
    return np.array(predictions), np.array(actuals)

In [18]:
reviewer_id_map = {reviewer_id: idx for idx, reviewer_id in enumerate(train_reviewer_product_matrix.index)}
business_id_map = {business_id: idx for idx, business_id in enumerate(train_reviewer_product_matrix.columns)}

# Get predictions for User-based CF
reviewer_filtered_pred, reviewer_filtered_actual = get_predictions(reviewer_based_pred.values, reviewer_id_map, business_id_map, valid_data)

# Get predictions for Item-based CF
product_filtered_pred, product_filtered_actual = get_predictions(product_based_pred.T.values, reviewer_id_map, business_id_map, valid_data)

# Get predictions for SVD
svd_filtered_pred, svd_filtered_actual = get_predictions(svd_pred, reviewer_id_map, business_id_map, valid_data)

# Calculate RMSE for User-based CF
reviewer_based_rmse = rmse(reviewer_filtered_pred, reviewer_filtered_actual)
print(f'User-based RMSE: {reviewer_based_rmse}')

# Calculate RMSE for Item-based CF
product_based_rmse = rmse(product_filtered_pred, product_filtered_actual)
print(f'Item-based RMSE: {product_based_rmse}')

# Calculate RMSE for SVD
svd_rmse = rmse(svd_filtered_pred, svd_filtered_actual)
print(f'SVD RMSE: {svd_rmse}')

User-based RMSE: 4.228321475101317
Item-based RMSE: 4.22752745102103
SVD RMSE: 4.25468433011097
