In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os
from matplotlib import pyplot as plt
import json
from collections import Counter
import math
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity
tqdm.pandas()

In [2]:
# Load the train/test rating tables (CSV) and the saved user/book id arrays (.npy).
# Paths are relative to the notebook's working directory.
train_df = pd.read_csv('./data/train_ratings.csv')
test_df = pd.read_csv('./data/test_ratings.csv')
user_ids = np.load('./data/user_ids.npy')
book_ids = np.load('./data/book_ids.npy')

In [3]:
# Preview the first rows of the training ratings.
train_df.head()

Unnamed: 0,user_id,book_id,rating
0,1,1180,4
1,1,6285,4
2,2,8034,4
3,2,9762,4
4,3,9014,1


In [4]:
# Lookup tables from a raw id to its position in the corresponding id array
# (if an id occurs more than once, the last position wins).
uidd = {uid: position for position, uid in enumerate(user_ids)}
bidd = {bid: position for position, bid in enumerate(book_ids)}

In [5]:
# Number of training rows per (book, user) cell, i.e. how sparsely the
# book x user grid is filled.
len(train_df) / len(book_ids) / len(user_ids)

0.0012408077629163388

In [6]:
# Translate raw ids into matrix coordinates with a vectorised dictionary
# lookup (Series.map) instead of a Python-level row-by-row apply, which
# builds a Series object for every row just to read one value from it.
train_df['user_coo'] = train_df['user_id'].map(uidd)
train_df['book_coo'] = train_df['book_id'].map(bidd)

# Series.map yields NaN for ids missing from the lookup table; fail loudly,
# as the dictionary indexing it replaces would have done.
if train_df[['user_coo', 'book_coo']].isna().any().any():
    raise KeyError('train_df contains a user_id or book_id that is missing from user_ids/book_ids')

In [7]:
# Sparse rating matrix: rows are book coordinates, columns are user coordinates.
# The shape is inferred from the largest coordinate present in train_df.
rating_values = train_df['rating']
rating_positions = (train_df['book_coo'], train_df['user_coo'])
matrix = coo_matrix((rating_values, rating_positions), dtype=np.float32)
matrix.shape

(10000, 53382)

In [8]:
# Mean stored rating per user (per matrix column): column sum divided by the
# number of stored entries in that column, flattened to a 1-D float64 array.
column_totals = matrix.sum(axis=0)
column_counts = matrix.getnnz(axis=0)
user_mean = np.asarray(column_totals / column_counts, dtype=np.float64).ravel()
user_mean

array([4.        , 4.        , 1.        , ..., 4.08333333, 5.        ,
       4.27272727])

In [9]:
# Attach each rating's user mean by indexing user_mean with the whole user_coo
# column at once.
# A row-wise apply casts every row to one common dtype, so as soon as the float
# 'user_mean' column exists (e.g. when this cell is re-run) user_coo arrives as
# a float and fails as an array index. Direct array indexing has no such
# dependence on the other columns and is safe to re-run.
train_df['user_mean'] = user_mean[train_df['user_coo'].to_numpy()]

In [10]:
# Sparse matrix holding, at every (book, user) position that has a rating,
# the mean rating of that user.
mean_values = train_df['user_mean']
mean_positions = (train_df['book_coo'], train_df['user_coo'])
expand_user_mean = coo_matrix((mean_values, mean_positions), dtype=np.float32)

In [11]:
# Mean-centre the ratings: subtract each user's mean from that user's stored
# ratings. A rating equal to the user's mean becomes 0 after this step.
norm_matrix = matrix - expand_user_mean
norm_matrix.shape

(10000, 53382)

In [12]:
# User-user cosine similarity on the mean-centred ratings. Transposing puts one
# user per row, which is the sample axis cosine_similarity compares.
# NOTE(review): the result has one row and one column per user; with the
# 53382 user columns reported by matrix.shape this is a very large array if it
# is returned dense — confirm it fits in memory.
user_vectors = norm_matrix.T
cosine_matrix = cosine_similarity(user_vectors, user_vectors)
# Force self-similarity to 1 for every user.
np.fill_diagonal(cosine_matrix, 1)

In [121]:
# For the first 100 users, print for every book in the user's test set the
# coordinates of users who rated that book in train AND have a non-zero
# cosine similarity with the user.

# Index the training raters of every book once, so the inner loop does a
# dictionary lookup instead of scanning all of train_df per (user, book) pair.
raters_by_book = train_df.groupby('book_id')['user_coo'].apply(list).to_dict()

for uid in tqdm(user_ids[:100]):
    user_coo = uidd[uid]
    cosine = cosine_matrix[user_coo]
    # The set of similar users depends only on the user, so build it once per
    # user rather than once per test book.
    neighbours = set(np.where(cosine != 0)[0].tolist())
    uid_test_df = test_df[test_df['user_id'] == uid]
    for bid in uid_test_df['book_id'].values:
        # A book with no training ratings has no entry and yields an empty set.
        book_raters = set(raters_by_book.get(bid, ()))
        intersection = list(book_raters.intersection(neighbours))
        if len(intersection) > 0:
            print(uid, bid, np.array(intersection))

  0%|          | 0/100 [00:00<?, ?it/s]

7 585 [35970 22147 42018 26341 50054 21996 41646 14830 15762  2770 33205 36216
  8698  8383]
7 956 [36323 43125]
7 1199 [45476 14924  9454 40112 19444 20440 47866 12476]
7 1464 [37059 14501  6214 46348 17197 49710 14924 15505  7160]
7 1484 [18976  1922 33475 51111 20879 41298 51634 20092]
7 1519 [ 4418 13992 41298 42997 10263 35485]
7 1620 [40577 36323  2763  9806 21230 44048 40944 21810 28980 31192 11196 20830]
7 1873 [ 1729  3777 51111 17197 15184]
7 1969 [18304 35873 28197 42281  6922 17770 19244 30701  9454 41298 24531 10263
 10486 30967 11196 18302]
7 1991 [ 3712 29449  2828   653 46990 11152 32272 15762 52759 33303 35873 11429
  6198 20539  3771 11196 21056 26182 11336  4171 29516  4431 15955  9301
  5975 28641 29669  2547 13044 29306  2301]
7 2084 [  900 38536 19593 46990 52368 10263  4125 17950 32546 11429 13861 35622
 13992   427 17197 37549 18991 39854 31156 17077  4418  1878  2903 30173
  2547 42997  1528 29306]
7 2102 [17153 31234 40577 39047 21384 19593 38536 46348 39949 4