# Recommender System
Read in the data.

In [29]:
import pandas as pd
import numpy as np

data_df = pd.read_csv('./train_art_features.csv')
data_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0.317651,0.588484,0.734532,0.0,0.0,0.0,0.0,0.0,1.635034,0.529136,...,0.012182,0.435283,0.234793,0.0,0.0,0.36646,0.39303,1.197851,1.220937,3.171708
1,0.876346,0.003836,0.0,0.0,0.014527,0.0,0.033951,0.0,0.083843,0.68614,...,0.0,0.427031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.195149
2,0.0,0.0,0.0,0.0,0.0,2.784533,0.0,0.0,0.0,0.70897,...,0.642855,1.964759,1.939867,0.0,0.0,0.0,0.0,1.791351,0.700842,0.0
3,0.021457,1.689611,0.0,0.745829,0.0,0.519452,0.0,0.765993,0.0,0.0,...,0.961703,1.618219,0.177426,0.0,1.644428,0.0,1.327781,0.920276,0.730517,2.443821
4,0.523536,0.0,0.0,0.0,0.0,0.0,0.146831,0.0,1.15344,0.360616,...,0.0,0.0,0.41359,0.0,0.320506,0.0,1.930118,1.061933,0.0,0.172906


### Train-test split the data, and generate the two `(num_art, num_features)` "ratings" matricies.

In [30]:
data = data_df.to_numpy()

#https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/
test = np.zeros(data.shape)
train = data.copy()
for user in range(data.shape[0]):
    test_ratings = np.random.choice(data[user, :].nonzero()[0],
                                    size=10,
                                    replace=False)
    train[user, test_ratings] = 0.
    test[user, test_ratings] = data_df.iloc[user, test_ratings]

print(train)
print(test)
print(np.all((train * test) == 0))

# generate train_mat and test_mat
num_art = 6924
num_features = 2048

train_mat = train
test_mat = test

[[0.31765118 0.5884839  0.7345319  ... 1.1978512  1.2209369  3.1717079 ]
 [0.8763461  0.00383636 0.         ... 0.         0.         0.19514911]
 [0.         0.         0.         ... 1.7913507  0.70084167 0.        ]
 ...
 [0.5443084  0.         0.         ... 0.         0.9720184  0.6643692 ]
 [0.         0.40954348 0.         ... 0.         0.         0.25679904]
 [0.4958547  0.49603662 0.         ... 0.15636368 0.         0.        ]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
True


Cool, it randomly grabs 10 entries from train_mat and puts them into test_mat. And the dot product is zero, so we know we're good.

### Run Baseline Estimate, print RMSE.

In [31]:
indicator_mat = (train_mat > 0).astype(float)
mu = np.sum(train_mat) / np.sum(indicator_mat)

num_rating_items = np.sum(indicator_mat, axis=0, keepdims=True)
num_rating_items[num_rating_items == 0] = 1
mu_items = np.sum(train_mat, axis=0, keepdims=True) / num_rating_items
mu_items[mu_items == 0] = mu

num_rating_users = np.sum(indicator_mat, axis=1, keepdims=True)
num_rating_users[num_rating_users == 0] = 1
mu_users = np.sum(train_mat, axis=1, keepdims=True) / num_rating_users
mu_users[mu_users == 0] = mu
bi = mu_items - mu
bu = mu_users - mu
prediction_mat = np.ones_like(train_mat)
prediction_mat = mu + bi + bu

indicator_mat = (test_mat > 0).astype(float)
test_rmse = (np.sum(((prediction_mat - test_mat) * indicator_mat) ** 2) / np.sum(indicator_mat)) ** 0.5
print('test_rmse = ', test_rmse)
train_rmse = (np.sum(((prediction_mat - train_mat) * indicator_mat) ** 2) / np.sum(indicator_mat)) ** 0.5
print('train_rmse = ', train_rmse)

test_rmse =  0.4765302457042466
train_rmse =  0.5541170480751484


These values are close, and because train_rmse $>$ test_rmse, we know we're still underfitting, so that's good, there's room for improvement.
### Run User-User Collaborative Filtering with Jaccard Similarity, print RMSE.

In [32]:
# binary matrix to indicate whether there is a rating for a user-movie pair
indicator_mat = (train_mat > 0).astype(float)  # size = (#user, #movie)

# calculate the number of ratings for each user
num_rating_per_user = np.sum(indicator_mat, axis=1, keepdims=True)  # size = (#user, 1)

# calculate the numerator of Jaccard similarity: for two users, calculate the number of movies both of they rated
numerator = np.matmul(indicator_mat, indicator_mat.T)  # size = (#user, #user)

# calculate the denominator of Jaccard similarity: for two users, calculate the number of movies they rated in total
denominator = num_rating_per_user + num_rating_per_user.T - numerator  # size = (#user, #user)

# set 0 to be 1 to avoid error in division
denominator[denominator == 0] = 1

# calculate Jaccard similarity matrix
Jaccard_mat = numerator / denominator  # size = (#user, #user)

prediction_mat = train_mat.copy()

num_rating_users[num_rating_users == 0] = 1
mu_users = np.sum(train_mat, axis=1, keepdims=True) / num_rating_users
deviation_mat = (train_mat - mu_users) * indicator_mat
for u in range(num_art):
    similarities = Jaccard_mat[u, :]
    similarities[u] = -1
    N_idx = np.argpartition(similarities, -10)[-10:]
    N_sim = similarities[N_idx]
    prediction_mat[u, :] = np.sum(N_sim.reshape((-1, 1)) * deviation_mat[N_idx, :], axis=0) / np.sum(N_sim)
prediction_mat += mu_users

indicator_mat = (test_mat > 0).astype(float)
test_rmse = (np.sum(((prediction_mat - test_mat) * indicator_mat) ** 2) / np.sum(indicator_mat)) ** 0.5
print('test_rmse = ', test_rmse)
train_rmse = (np.sum(((prediction_mat - train_mat) * indicator_mat) ** 2) / np.sum(indicator_mat)) ** 0.5
print('train_rmse = ', train_rmse)

test_rmse =  0.4598470515052233
train_rmse =  0.5995344883230586


Same deal here.
### Run Implicit User-User Collaborative Filtering for 10NN with Jaccard Similarity, print Precision@k and Recall@k for Top 50 List.

In [34]:
train_mat = (train_mat > 0).astype(float)
test_mat = (test_mat > 0).astype(float)

user_train_like = []
for u in range(num_art):
    user_train_like.append(np.where(train_mat[u, :] > 0)[0])

numer = np.matmul(train_mat, train_mat.T)
denom = np.sum(train_mat ** 2, axis=1, keepdims=True) ** 0.5
Cosine = numer / np.matmul(denom, denom.T)

recommendation = []
for u in range(num_art):
    similarities = Jaccard_mat[u, :]
    similarities[u] = -1
    N_idx = np.argpartition(similarities, -10)[-10:]
    N_sim = similarities[N_idx]
    scores = np.sum(N_sim.reshape((-1, 1)) * train_mat[N_idx, :], axis=0) / np.sum(N_sim)

    train_like = user_train_like[u]
    scores[train_like] = -9999
    top50_iid = np.argpartition(scores, -50)[-50:]
    top50_iid = top50_iid[np.argsort(scores[top50_iid])[-1::-1]]
    recommendation.append(top50_iid)
recommendation = np.array(recommendation)

user_test_like = []
for u in range(num_art):
    user_test_like.append(np.where(test_mat[u, :] > 0)[0])

recalls = np.zeros(3)
precisions = np.zeros(3)
user_count = 0.

for u in range(num_art):
    test_like = user_test_like[u]
    test_like_num = len(test_like)
    if test_like_num == 0:
        continue
    rec = recommendation[u, :]
    hits = np.zeros(3)
    for k in range(50):
        if rec[k] in test_like:
            if k < 50:
                hits[2] += 1
                if k < 20:
                    hits[1] += 1
                    if k < 5:
                        hits[0] += 1
    recalls[0] += (hits[0] / test_like_num)
    recalls[1] += (hits[1] / test_like_num)
    recalls[2] += (hits[2] / test_like_num)
    precisions[0] += (hits[0] / 5.)
    precisions[1] += (hits[1] / 20.)
    precisions[2] += (hits[2] / 50.)
    user_count += 1

recalls /= user_count
precisions /= user_count

print('recall@5\t[%.6f],\t||\t recall@20\t[%.6f],\t||\t recall@50\t[%.6f]' % (recalls[0], recalls[1], recalls[2]))
print('precision@5\t[%.6f],\t||\t precision@20\t[%.6f],\t||\t precision@50\t[%.6f]' % (precisions[0], precisions[1], precisions[2]))

recall@5	[0.069295],	||	 recall@20	[0.159763],	||	 recall@50	[0.266566]
precision@5	[0.138590],	||	 precision@20	[0.079882],	||	 precision@50	[0.053313]


Recall@k is going up, Precision@k is going down. @50 we're recalling ~0.25ish relevant information. Looks like a good start.

