In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from surprise import Reader
from surprise import Dataset

# Collaborative Filtering Recommender System
In this lab session, we will work with the training set created last week.

In [2]:
import gzip
import os
import json
import pandas as pd
import numpy as np
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('All_Beauty_5.json.gz')

df = df.sort_values(by=['reviewerID', 'asin', 'unixReviewTime'])
cleaned_dataset = df.dropna(subset=['overall']).drop_duplicates(subset=['reviewerID', 'asin'], keep = 'last').reset_index(drop=True)
# print(len(cleaned_dataset))
# cleaned_dataset.head()
cleaned_dataset = cleaned_dataset.sort_values(by=['reviewerID', 'unixReviewTime']).reset_index(drop=True)
# extracting the latest (in time) positively rated item (rating  ≥4 ) by each user. 
test_data_pre = cleaned_dataset[cleaned_dataset.overall >= 4.0].drop_duplicates(subset=['reviewerID'], keep='last')
# generate training data
training_data = cleaned_dataset.drop(test_data_pre.index)

# Remove users that do not appear in the training set.
user_in_training = test_data_pre['reviewerID'].isin(training_data['reviewerID'])
test_data = test_data_pre[user_in_training]

In [3]:
matrix = pd.DataFrame(columns = training_data['asin'].drop_duplicates(), index = training_data['reviewerID'].drop_duplicates())
for row_i in range(len(training_data)):
    reviewerID = list(training_data.reviewerID)[row_i]
    asin = list(training_data.asin)[row_i]
    rate = list(training_data.overall)[row_i]
    matrix.loc[reviewerID, asin] = rate
matrix = matrix.fillna(0)

## Exercise 1
In this exercise, we are going to predict the rating of a single user-item pair using a neighborhood-based method.
### 1.1
- Represent the ratings from the training set in a user-item matrix where the rows represent users and the columns represent items.
- Fill unobserved ratings with $0$.

Compute the cosine similarities between the user with 'reviewerID'='A25C2M3QF9G7OQ' and all users that have rated the item with 'asin'='B00EYZY6LQ'.<br>
What are the similarities and what are the ratings given by these users on item 'B00EYZY6LQ'?

In [5]:
cosine_similarities = cosine_similarity(np.array(matrix.loc[matrix.index == 'A25C2M3QF9G7OQ']),matrix[matrix.loc[:,'B00EYZY6LQ']>0])

result = pd.DataFrame(matrix[matrix.loc[:,'B00EYZY6LQ']>0].loc[:,'B00EYZY6LQ'])
result.insert(result.shape[1],'cos_sim',np.array(cosine_similarities).T)

### 1.2
Predict the rating for user 'A25C2M3QF9G7OQ' on item 'B00EYZY6LQ' based on the ratings from the $3$ most similar users, using a weighted (by similarity) average. What is the prediction?

In [38]:
result_sort = result.sort_values(by='cos_sim', ascending=False).head(3)
print(result_sort)

                B00EYZY6LQ   cos_sim
reviewerID                          
A2ZY49IDE6TY5I         4.0  0.682835
A2LW5AL0KQ9P1M         4.0  0.275810
A1R1BFJCMWX0Y3         3.0  0.245145


In [39]:
result_sort = result.sort_values(by='cos_sim', ascending=False).head(3)
prediction = 0
for index in result_sort.index:
    prediction += result_sort.loc[index][0]*result_sort.loc[index][1]
prediction /= result_sort.loc[:,'cos_sim'].sum()
print('prediction is {:.8f}'.format(prediction))

prediction is 3.79635550


## Exercise 2
In this exercise, we are going to predict the rating of the same user-item pair as in exercise 1, now using a latent factor method.
### 2.1
- Represent the ratings from the training set in a user-item matrix where the rows represent users and the columns represent items.
- Subtract the row mean (i.e. mean rating per user) from each non-missing element in the matrix.
- Replace missing values with $0$.

Factorize the user-item matrix by performing Singular Value Decomposition (SVD) of rank $5$ using eigendecomposition. What is ther user factors of user 'A25C2M3QF9G7OQ' and the item factors of item 'B00EYZY6LQ'?

In [40]:
matrix = pd.DataFrame(columns = training_data['asin'].drop_duplicates(), index = training_data['reviewerID'].drop_duplicates())
for row_i in range(len(training_data)):
    reviewerID = list(training_data.reviewerID)[row_i]
    asin = list(training_data.asin)[row_i]
    rate = list(training_data.overall)[row_i]
    matrix.loc[reviewerID, asin] = rate
matrix_0 = matrix.fillna(0)
matrix = matrix.fillna(0)
for index in matrix.index:
    mean_ = np.mean([i for i  in matrix.loc[index] if i != 0])
    if index=='A25C2M3QF9G7OQ':
        A25C2M3QF9G7OQ_mean = mean_
    for item in range(len(matrix.loc[index])):
        if matrix.loc[index][item] != 0:
            matrix.loc[index][item] -= mean_

""" user_item_pre = training_data.pivot('reviewerID', 'asin', 'overall')
user_item_mean = user_item_pre.mean(axis = 1)
user_item_sub = user_item_pre.sub(user_item_mean, axis = 0)
user_item_sub = user_item_sub.fillna(0)
A25C2M3QF9G7OQ_mean = user_item_mean.loc['A25C2M3QF9G7OQ']
print(A25C2M3QF9G7OQ_mean) """

" user_item_pre = training_data.pivot('reviewerID', 'asin', 'overall')\nuser_item_mean = user_item_pre.mean(axis = 1)\nuser_item_sub = user_item_pre.sub(user_item_mean, axis = 0)\nuser_item_sub = user_item_sub.fillna(0)\nA25C2M3QF9G7OQ_mean = user_item_mean.loc['A25C2M3QF9G7OQ']\nprint(A25C2M3QF9G7OQ_mean) "

In [41]:
Q, sigma, P = svds(matrix, k=5)
U = np.dot(Q, np.diag(sigma))

In [42]:
user_factors = pd.DataFrame(data = U, index = matrix.index)
print(user_factors.loc['A25C2M3QF9G7OQ'])
item_factors = pd.DataFrame(data = P, columns = matrix.columns)
print(item_factors.loc[:,'B00EYZY6LQ'])
u_A25C2M3QF9G7OQ = np.array(user_factors.loc['A25C2M3QF9G7OQ'])
i_B00EYZY6LQ = np.array(item_factors.loc[:,'B00EYZY6LQ'])

0   -0.553446
1   -0.421214
2   -0.063396
3    0.656496
4    0.251410
Name: A25C2M3QF9G7OQ, dtype: float64
0    0.054085
1   -0.009215
2    0.040723
3    0.042454
4    0.152673
Name: B00EYZY6LQ, dtype: float64


### 2.2
Predict the rating for user 'A25C2M3QF9G7OQ' on item 'B00EYZY6LQ' by taking the dot product between the user factors and item factors and adding back the mean rating of this user. What is the prediction?

In [43]:
print("{:.6f}".format(A25C2M3QF9G7OQ_mean+np.dot(u_A25C2M3QF9G7OQ,i_B00EYZY6LQ)))

4.437621


<br>
<br>
For the rest of the exercises, you can use the python library Scikit-Surprise. Please find the documentation here: https://surprise.readthedocs.io/en/stable/getting_started.html. <br>
You can convert the training set to the format required in Scikit-Surprise as follows:

In [44]:
reader = Reader(rating_scale=(1, 5))
training = Dataset.load_from_df(training_data[['reviewerID', 'asin', 'overall']], reader=reader)

## Exercise 3
### 3.1
Define a user-based neighborhood model that takes into account the mean rating of each user.<br>
Use cosine as similarity measure and try to vary the (maximum) number of neighbors to take into account when predicting ratings. Keep Scikit-Surprise's default setting for all other parameters. <br>
Is it better to use $1$ or $10$ neighbors? You should determine this based on the Root Mean Square Error (RMSE) over 3-fold cross-validation.

In [46]:
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
param_grid = {'k': [1, 10],
'sim_options' : {'name': ['cosine'],
               'user_based': [True]  # compute  similarities between items
               }
               }
sim_options = {'name': 'cosine',
               'user_based': True  # compute  similarities between items
               }


algo_1 = KNNWithMeans(k = 1 , min_k = 1, sim_options = sim_options)

print(cross_validate(algo_1, training, measures=['RMSE'], cv=3, verbose=False)['test_rmse'].mean())

algo_10 = KNNWithMeans(k = 10 , min_k = 1, sim_options = sim_options)

print(cross_validate(algo_10, training, measures=['RMSE'], cv=3, verbose=False)['test_rmse'].mean())

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
0.4205100191988406
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
0.43072198050222493


In [49]:

gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=3)
gs.fit(training)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
0.37609365440256365
{'k': 10, 'sim_options': {'name': 'cosine', 'user_based': True}}


### 3.2
Fit the neigborhood-based model defined in exercise 3.1 on the full training set with cosine as similarity measure and either $1$ or $10$ neighbors based on what you found to be better in exercise 3.1. Keep Scikit-Surprise's default setting for all other parameters, but set the random state to $0$ for comparable results. <br>
Use the model to predict the unobserved ratings for the users in the training set. How many predictions are there and what is the average of all the predictions?

In [50]:
trainset = training.build_full_trainset()
testset = trainset.build_anti_testset()
sim_options = {'name': 'cosine',
               'user_based': True  # compute  similarities between items
               }


algo_10 = KNNWithMeans(k = 10 , min_k = 1, sim_options = sim_options)
predictions = [i.est for i in algo_10.fit(trainset).test(testset)]
""" for user in training_data.reviewerID:
    for item in training_data.asin:
        if matrix_0.loc[user, item] == 0:
            predictions.append(algo_10.predict(uid=user,iid=item)) """

print(len(predictions))
print(np.mean(predictions))


Computing the cosine similarity matrix...
Done computing similarity matrix.
54746
4.628144189949582


## Exercise 4
### 4.1
Define an SVD model with user and item biases that uses Stochastic Gradient Descend (SGD) to estimate the low-rank matrix based on only observed ratings. <br>
Set the number of latent factors to $30$ and try to iterate the SGD procedure for different number of epochs. Keep Scikit-Surprise's default setting for all other parameters. <br>
Is it better to run for $100$ or $500$ epochs? You should determine this based on the RMSE over 3-fold cross-validation.

In [51]:
from surprise import SVD
from surprise.model_selection import GridSearchCV
param_grid = {'n_epochs': [100, 500],'n_factors':[30]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(training)
# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])


0.36163921298099866
{'n_epochs': 500, 'n_factors': 30}


### 4.2
Fit the latent factor model defined in exercise 4.1 on the full training set with $30$ latent factors and run for either $100$ or $500$ epochs based on what you found to be better in exercise 4.1. Keep Scikit-Surprise's default setting for all other parameters, but set the random state to $0$ for comparable results.<br>
Use the model to predict the unobserved ratings for the users in the training set. How many predictions are there and what is the average of all the predictions?

In [52]:
algo = SVD(n_epochs=500, n_factors=30, random_state=0)
predictions = [i.est for i in algo.fit(trainset).test(testset)]
print(len(predictions))
print(np.mean(predictions))

54746
4.403720461682863
