In [1]:
!pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.6.2-cp39-cp39-manylinux2014_x86_64.whl (18.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.6.2


In [2]:
import pandas as pd
import numpy as np
from numpy.linalg import inv
from implicit.datasets.lastfm import get_lastfm
from tqdm import tqdm
import random
from scipy.stats import rankdata



#Load Data

In [3]:
def preference_mat(R):
  P = np.copy(R)
  P[P>0] = 1
  return P

def confidence_mat(R):
  alpha = 40
  C = 1 + alpha * np.log1p(R/(10**-8))
  # C = 1 + alpha * R
  return C

def latent_factor(R, factor_dim=100):
  user_num = R.shape[0]
  item_num = R.shape[1]
  X_user = np.random.normal(size=(user_num, factor_dim))
  Y_item = np.random.normal(size=(item_num, factor_dim))

  return (X_user, Y_item)

In [4]:
def optimize_X(X, R, Y, C, P, user_num, item_num, data_lambda=40):
  factor_dim = Y.shape[1]
  Yt = np.transpose(Y)
  for user in range(user_num):
    C_user = np.identity(item_num) # (item,item)
    np.fill_diagonal(C_user, C[user])
    C_user = np.diag(C[user])

    Yt_Cu_Y = np.matmul(np.matmul(Yt, C_user), Y)

    identity = np.identity(factor_dim)
    lab_ident = np.dot(data_lambda, identity)

    Yt_Cu_Y_I = np.add(Yt_Cu_Y, lab_ident)
    first_inv = inv(Yt_Cu_Y_I)
    inv_Yt_Cu = np.matmul(np.matmul(first_inv, Yt), C_user)
    X[user] = np.matmul(inv_Yt_Cu, P[user])


def optimize_Y(X, R, Y, C, P, user_num, item_num, data_lambda=400):
  factor_dim = Y.shape[1]
  Xt = np.transpose(X)
  for item in range(item_num):
    C_item = np.identity(user_num) # (user, user)
    np.fill_diagonal(C_item, C[:, item])

    Xt_Ci_X = np.matmul(np.matmul(Xt, C_item), X)

    identity = np.identity(factor_dim)
    lab_ident = np.dot(data_lambda, identity)

    Xt_Ci_X_I = np.add(Xt_Ci_X, lab_ident)
    first_inv = inv(Xt_Ci_X_I)
    inv_Xt_Ci = np.matmul(np.matmul(first_inv, Xt), C_item)
    Y[item] = np.matmul(inv_Xt_Ci, P[:, item])

def als_fun(predict, C, P, X_user, Y_item, data_lambda=40):
  predict_loss = np.square(P - predict)
  confidence_loss = np.sum(C * predict_loss)
  regularization = data_lambda * ((np.sum(np.square(X_user))) + np.sum(np.square(Y_item)))
  loss = confidence_loss + regularization
  return predict_loss, loss

def divide_matrix(A):
  n, m = A.shape
  B = np.zeros((n, m))
  C = np.zeros((n, m))
    
  for i in range(n):
    for j in range(m):
      if A[i][j] > 0:
        if (random.uniform(0, 1) < 0.8):
          B[i][j] = A[i][j]
        else:
          C[i][j] = A[i][j]
        
  return B, C

def rank(predict, test_data):
  rank_sum = 0
  user_num = test_data.shape[0]
  for user in range(user_num):
    if np.sum(test_data[user]) == 0:
      continue
    user_vec = predict[user]
    count_zero = np.count_nonzero(user_vec == 0)
    ranks = rankdata(user_vec)
    max_norm = np.max(ranks) - count_zero
    ranks = 100 - (ranks - count_zero - 1)*(100/(max_norm - 1))
    ranks[ranks<0] = 0
    rank_sum += np.inner(ranks, test_data[user])/np.sum(test_data[user])

  return(rank_sum/user_num)


In [5]:
def run(train_data, test_data, epochs = 10, data_lambda=40, factor_dim=200):

  R = train_data
  P = preference_mat(R)
  C = confidence_mat(R)

  X_user, Y_item = latent_factor(R, factor_dim)
  user_num = R.shape[0]
  item_num = R.shape[1]

  loss_list = []
  for epoch in range(epochs):
    if epoch != 0:
      optimize_X(X_user, R, Y_item, C, P, user_num, item_num, data_lambda)
      optimize_Y(X_user, R, Y_item, C, P, user_num, item_num, data_lambda)
    predict = np.matmul(X_user, np.transpose(Y_item))
    predict_loss, total_loss = als_fun(predict, C, P, X_user, Y_item, data_lambda)
    loss_list.append(total_loss)

  rank_score = rank(predict, test_data)

  print(f"rank bar score is : {rank_score}")
  return rank_score

In [None]:
import matplotlib.pyplot as plt

factor_dims = range(10, 150, 10)
epochs = 10

_, _, artist_user_plays = get_lastfm()

# artist_user_plays = artist_user_plays[:,np.argsort(artist_user_plays.sum(axis=0))[::-1]]


train_data = artist_user_plays[:600, :4000]

train_data = train_data.toarray()


# train_data = [[random.randint(0, 5) for j in range(1000)] for i in range(2000)]
train_data = np.array(train_data)

train_data, test_data = divide_matrix(train_data)

rank_scores = []
for factor in tqdm(factor_dims):
  rank_scores.append(run(train_data, test_data, epochs = epochs, factor_dim = factor))

plt.plot(factor_dims, rank_scores)
plt.show()

  7%|▋         | 1/14 [13:28<2:55:06, 808.20s/it]

rank bar score is : 24.855072463768114


 14%|█▍        | 2/14 [29:02<2:56:26, 882.20s/it]

rank bar score is : 24.871635610766045


 21%|██▏       | 3/14 [46:09<2:53:53, 948.50s/it]

rank bar score is : 24.9120082815735


 29%|██▊       | 4/14 [1:04:40<2:48:46, 1012.69s/it]

rank bar score is : 24.910973084886123


 36%|███▌      | 5/14 [1:25:35<2:45:00, 1100.04s/it]

rank bar score is : 24.987577639751546


 43%|████▎     | 6/14 [1:53:25<2:52:29, 1293.66s/it]

rank bar score is : 24.82712215320911


 50%|█████     | 7/14 [2:19:17<2:40:48, 1378.33s/it]

rank bar score is : 24.9824016563147


 57%|█████▋    | 8/14 [2:45:45<2:24:29, 1444.88s/it]

rank bar score is : 24.980331262939956


 64%|██████▍   | 9/14 [3:19:10<2:15:00, 1620.03s/it]

rank bar score is : 24.814699792960656


 71%|███████▏  | 10/14 [3:50:27<1:53:17, 1699.48s/it]

rank bar score is : 24.97619047619047


 79%|███████▊  | 11/14 [4:24:18<1:30:02, 1800.98s/it]

rank bar score is : 24.81366459627329
