In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os
from matplotlib import pyplot as plt
import json
from collections import Counter
import math
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity
tqdm.pandas()

In [2]:
train_df = pd.read_csv('./data/train_ratings.csv')
test_df = pd.read_csv('./data/test_ratings.csv')
user_ids = np.load('./data/user_ids.npy')
book_ids = np.load('./data/book_ids.npy')

In [3]:
train_df.head()

Unnamed: 0,user_id,book_id,rating
0,1,1180,4
1,1,6285,4
2,2,8034,4
3,2,9762,4
4,3,9014,1


In [4]:
uidd = dict()
for i, uid in enumerate(user_ids):
    uidd[uid] = i
bidd = dict()
for i, bid in enumerate(book_ids):
    bidd[bid] = i

In [5]:
train_df.shape[0] / book_ids.shape[0] / user_ids.shape[0]

0.0012408077629163388

In [6]:
train_df['user_coo'] = train_df.apply(lambda row: uidd[row['user_id']], axis=1)
train_df['book_coo'] = train_df.apply(lambda row: bidd[row['book_id']], axis=1)

In [7]:
matrix = coo_matrix((train_df['rating'], (train_df['book_coo'], train_df['user_coo'])), dtype=np.float32)
matrix.shape

(10000, 53382)

In [8]:
user_mean = np.array((matrix.sum(axis=0) / matrix.getnnz(axis=0)).tolist()[0])
user_mean

array([4.        , 4.        , 1.        , ..., 4.08333333, 5.        ,
       4.27272727])

In [9]:
train_df['user_mean'] = train_df.apply(lambda row: user_mean[row['user_coo']], axis=1)

In [10]:
expand_user_mean = coo_matrix((train_df['user_mean'], (train_df['book_coo'], train_df['user_coo'])), dtype=np.float32)

In [11]:
norm_matrix = matrix - expand_user_mean
norm_matrix.shape

(10000, 53382)

In [12]:
cosine_matrix = cosine_similarity(norm_matrix.T, norm_matrix.T)
np.fill_diagonal(cosine_matrix, 1)

In [45]:
k = 3
os.makedirs('./result/user-user-{}'.format(k), exist_ok=True)
for uid in tqdm(user_ids[:100]):
    user_coo = uidd[uid]
    cosine = cosine_matrix[user_coo]
    indices = np.where(cosine != 0)[0]
    uid_test_df = test_df[test_df['user_id'] == uid]
    uid_mean = user_mean[user_coo]
    predict = list()
    for bid in uid_test_df['book_id'].values:
        bid_train_df = train_df[train_df['book_id'] == bid]
        intersection = list(set(bid_train_df['user_coo'].to_list()).intersection(set(indices.tolist())))
        if len(intersection) > 0:
            intersection = np.array(intersection)
            choices = np.argsort(cosine[intersection])[::-1][:k]
            used_cosine = cosine[intersection][choices]
            used_rating = bid_train_df[bid_train_df['user_coo'].isin(intersection[choices])]['rating'].values
            predict.append(used_cosine.dot(used_rating) / np.abs(used_cosine).sum() + uid_mean)
        else:
            predict.append(uid_mean)
    uid_test_df['predict'] = np.array(predict)
    uid_test_df.to_csv('./result/user-user-{}/{}.csv'.format(k, uid), index=False)

  0%|          | 0/100 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uid_test_df['predict'] = np.array(predict)
