- https://www.kaggle.com/code/chocozzz/03-goodbooks-10k-collaborative-filtering-model

In [1]:
import pandas as pd
import numpy as np 
import os, sys, gc 

from tqdm.notebook import tqdm 

import matplotlib.pyplot as plt 
plt.rcParams['figure.figsize'] = (20,10)
plt.ion()

<matplotlib.pyplot._IonContext at 0x1039dc460>

In [2]:
path = "./data/books/"

In [3]:
books = pd.read_csv(path + "books.csv")
book_tags = pd.read_csv(path + "book_tags.csv")
ratings = pd.read_csv(path + "ratings.csv")
to_read = pd.read_csv(path + "to_read.csv")

train = pd.read_csv(path + 'eda_train.csv')
test = pd.read_csv(path + 'eda_test.csv')

In [4]:
train['book_id'] = train['book_id'].astype(str)
test['book_id'] = test['book_id'].astype(str)

books['book_id'] = books['book_id'].astype(str)

In [5]:
popular_rec_model = books.sort_values(by='books_count', ascending=False)['book_id'].values[0:500]

In [6]:
sol = test.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
gt = {}
for user in tqdm(sol['user_id'].unique()):
  gt[user] = list(sol[sol['user_id']==user]['unique'].values[0])

  0%|          | 0/53424 [00:00<?, ?it/s]

In [8]:
rec_df = pd.DataFrame()
rec_df['user_id'] = train['user_id'].unique()
rec_df.head()

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5


### SGD를 이용한 협업필터링 진행

In [9]:
import numpy as np 
from tqdm import tqdm_notebook as tqdm 

class MatrixFactorization():
  def __init__(self, R, k, learning_rate, reg_param, epochs, verbose=False):
    """
      R: rating matrix
      k: latent parameter
      learning_rate: beta on weight update 
      epochs: training epochs
      verbose: print status
    """
    self._R = R
    self._num_users, self._num_items = R.shape
    self._k = k 
    self._learning_rate = learning_rate 
    self._reg_param = reg_param 
    self._epochs = epochs 
    self._verbose = verbose 

  def fit(self):
    """
      training Matrix Factorization: Update matrix latent weight and bias 
      참고: self._b에 대한 설명 
      - global bias: input R에서 평가가 매겨진 rating의 평균값을 global bias로 사용 
      - 정규화 기능. 최종 rating에 음수가 들어가는 것 대신 latent feature에 음수가 포함되도록 해줌.

      return: training_process
    """
    # inint latent features 
    self._P = np.random.normal(size=(self._num_users, self._k))
    self._Q = np.random.normal(size=(self._num_items, self._k))

    # init biases 
    self._b_P = np.zeros(self._num_users)
    self._b_Q = np.zeros(self._num_items)
    self._b = np.mean(self._R[np.where(self._R != 0)])

    # train while epochs 
    self._training_process = [] 
    for epoch in range(self._epochs):
      # rating이 존재하는 index를 기준으로 training 
      xi, yi = self._R.nonzero()
      for i, j in zip(xi, yi):
        self.gradient_descent(i, j, self._R[i, j])
      
      cost = self.cost()
      self._training_process.append((epoch, cost))

      # print status 
      if self._verbose and ((epoch +1) % 10 == 0):
        print("Iteration: %d; cost: %.4f" % (epoch +1, cost))

  def cost(self):
    """
      compute root mean square error
      return rmse cost
    """
    # xi, yi: R[xi, yi]는 nonzero인 value를 의미한다. 
    xi, yi = self._R.nonzero()
    cost = 0
    for x, y in zip(xi, yi):
      cost += pow(self._R[x, y] - self.get_prediction(x, y), 2)

    return np.sqrt(cost/len(xi))

  def gradient(self, error, i, j):
    """
      gradient of latent feature for GD
      param
        error: rating - prediction error 
        i: user index
        j: item index
      return gradient of leatent feature tuple
    """
    dp = (error * self._Q[j, :]) - (self._reg_param * self._P[i, :])
    dq = (error * self._P[i, :]) - (self._reg_param * self._Q[j, :])
    return dp, dq 

  def gradient_descent(self, i, j, rating):
    """
      graident descent function
      param
        i: user index of matrix
        j: item index of matrix
        rating: rating of (i,j)
    """
    # get error
    prediction = self.get_prediction(i,j)
    error = rating - prediction 

    # update biases
    self._b_P[i] += self._learning_rate * (error - self._reg_param * self._b_P[i])
    self._b_Q[j] += self._learning_rate * (error - self._reg_param * self._b_Q[j])

    # update latent feature
    dp, dq = self.gradient(error, i, j)
    self._P[i, :] += self._learning_rate * dp 
    self._Q[j, :] += self._learning_rate * dq 

  def get_prediction(self, i, j):
    """
      get predicted rating
      param
        i: user_i
        j: item_j
      return prediction of r_ij
    """
    return self._b + self._b_P[i] + self._b_Q[j] + self._P[i, :].dot(self._Q[j, :].T)

  def get_complete_amtrix(self):
    """
      computer complete matrix PXQ + P.bias + Q.bias + global bias
      - PXQ 행렬에 b_P[:, np.newaxis]를 더하는 것은 각 열마다 bias를 더해주는 것
      - b_Q[np.newaxis, :]를 더하는 것은 각 행마다 bias를 더해주는 것 
      - b를 더하는 것은 각 element마다 bias를 더해주는 것
      - newaxis: 차원을 추가해줌. 1차원인 Latent들로 2차원의 R에 행/열 단위 연산을 해주기위해 차원을 추가하는 것.
      return complete matrix R^
    """
    return self._b + self._b_P[:, np.newaxis] + self._b_Q[np.newaxis, :] + self._P.dot(self._Q.T)

In [10]:
user2idx = {}
for i, l in enumerate(train['user_id'].unique()):
  user2idx[l] = i

book2idx = {}
for i, l in enumerate(train['book_id'].unique()):
  book2idx[l] = i 

In [12]:
idx2user = {i: user for user, i in user2idx.items()}
idx2book = {i: book for book, i in book2idx.items()}

In [13]:
data = train[['user_id', 'book_id']].reset_index(drop=True)
useridx = data['useridx'] = train['user_id'].apply(lambda x: user2idx[x]).values 
bookidx = data['bookidx'] = train['book_id'].apply(lambda x: book2idx[x]).values 

rating = np.ones(len(data))

In [14]:
from scipy.sparse import csr_matrix

purchase_sparse = csr_matrix((rating, (useridx, bookidx)), shape=(len(set(useridx)), len(set(bookidx))))
purchase_sparse 

<53382x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 387039 stored elements in Compressed Sparse Row format>

In [15]:
R = purchase_sparse.toarray()
R

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
%%time 
factorizer = MatrixFactorization(R, k=20, learning_rate=0.01, reg_param=0.01, epochs=100, verbose=True)
factorizer.fit()

Iteration: 10; cost: 0.4632
Iteration: 20; cost: 0.3142
Iteration: 30; cost: 0.2600
Iteration: 40; cost: 0.2303
Iteration: 50; cost: 0.2102
Iteration: 60; cost: 0.1950
Iteration: 70; cost: 0.1828
Iteration: 80; cost: 0.1726
Iteration: 90; cost: 0.1638
Iteration: 100; cost: 0.1563
CPU times: user 13min 35s, sys: 4.13 s, total: 13min 39s
Wall time: 13min 40s


In [17]:
del R 
gc.collect()

sgd_rec_model = factorizer.get_complete_amtrix()

In [18]:
# 내가 읽은 책의 목록을 추출
read_list = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list.head()

Unnamed: 0,user_id,unique
0,1,[1180]
1,2,[8034]
2,3,[9049]
3,4,[8464]
4,5,"[4829, 7487]"


In [19]:
total_rec_list = {}
for user in tqdm(data['useridx'].unique()):
  rec_list = [] 

  # 기존에 만든 book id를 변경 
  rating_scores = [
    (idx2book[i], c) for i, c in enumerate(sgd_rec_model[user]) if i != user # 자기 자신이 추천안되도록
  ]
  rating_scores = sorted(rating_scores, key= lambda x: x[1], reverse=True) # 평점이 높은 순서대로 정렬

  seen = read_list[read_list['user_id'] == idx2user[user]]['unique'].values[0]
  for rec in rating_scores[0:250]:
    if rec[0] not in seen:
      rec_list.append(rec[0])

  if len(rec_list) < 200:
    for i in popular_rec_model[0:200]:
      if i not in seen:
        rec_list.append(i)

  total_rec_list[idx2user[user]] = rec_list[0:200]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for user in tqdm(data['useridx'].unique()):


  0%|          | 0/53382 [00:00<?, ?it/s]

In [20]:
import six 
import math 

class evaluate():
  def __init__(self, recs, gt, topn=100):
    self.recs = recs 
    self.gt = gt 
    self.topn = topn 

  def _ndcg(self):
    Q, S = 0.0, 0.0
    for u, seen in six.iteritems(self.gt):
      seen = list(set(seen))
      rec = self.recs.get(u, [])
      if not rec or len(seen) == 0:
        continue 

      dcg = 0.0 
      idcg = sum([
        1.0 / math.log(i +2, 2) for i in range(min(len(seen), len(rec)))
      ])
      for i, r in enumerate(rec):
        if r not in seen:
          continue 
        rank = i +1 
        dcg += 1.0 / math.log(rank +1, 2)
      
      ndcg = dcg / idcg 
      S += ndcg 
      Q += 1 
    
    return S / Q 

  def _map(self):
    n, ap = 0.0, 0.0 
    for u, seen in six.iteritems(self.gt):
      seen = list(set(seen))
      rec = self.recs.get(u, [])
      if not rec or len(seen) == 0:
        continue 

      _ap, correct = 0.0, 0.0 
      for i, r in enumerate(rec):
        if r in seen:
          correct += 1
          _ap += (correct / (i + 1.0))

      _ap /= min(len(seen), len(rec))
      ap += _ap 
      n += 1.0 

    return ap / n 

  def _entropy_diversity(self):
    sz = float(len(self.recs)) * self.topn 
    freq = {} 
    for u, rec in six.iteritems(self.recs):
      for r in rec:
        freq[r] = freq.get(r, 0) + 1
    
    ent = -sum([
      v / sz * math.log(v/sz) for v in six.itervalues(freq)
    ])
    return ent 

  def _evaluate(self):
    print('MAP@%s: %s' % (self.topn, self._map()))
    print('NDCG@%s: %s' % (self.topn, self._ndcg()))
    print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))

In [21]:
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()

MAP@200: 0.0006076238705365617
NDCG@200: 0.0061132877401394335
EntDiv@200: 8.302659538139352


## ALS 방식을 이용한 협업필터링 

In [59]:
from scipy.sparse import csr_matrix

purchase_sparse = csr_matrix((rating, (useridx, bookidx)), shape=(len(set(useridx)), len(set(bookidx))))
purchase_sparse 

<53382x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 387039 stored elements in Compressed Sparse Row format>

In [60]:
from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR

als_model = ALS(factors=20, regularization=0.01, iterations = 100)
als_model.fit(purchase_sparse.T)

  0%|          | 0/100 [00:00<?, ?it/s]

In [61]:
purchase_sparse_array = purchase_sparse.toarray()
purchase_sparse_array.shape

(53382, 10000)

In [62]:
als_model.recommend(0, purchase_sparse, N=100)[0:10]

ValueError: user_items must contain 1 row for every user in userids

In [None]:
total_rec_list = {}
for user in tqdm(data['useridx'].unique()):
  rec_list = []
  
  # 기존에 만든 Book ID를 변경 
  seen = read_list[read_list['user_id'] == idx2user[user]]['unique'].values[0]
  recs = als_model.recommend(user, purchase_sparse, N=250)
  recs = [idx2book[x[0]] for x in recs][0:250]  
  
  for rec in recs: 
    if rec not in seen:
      rec_list.append(rec)
  
  if len(rec_list) < 200:
    for i in popular_rec_model[0:200]:
      if rec not in seen:
        rec_list.append(rec)

  total_rec_list[idx2user[user]] = rec_list[0:200]

  0%|          | 0/53382 [00:00<?, ?it/s]

ValueError: user_items must contain 1 row for every user in userids

In [None]:
evaluate_func = evaluate(recs=total_rec_list, gt = gt, topn=200)
evaluate_func._evaluate()