In [1]:
from tqdm import tqdm
from sklearn.decomposition import NMF
import pandas as pd
from scipy import sparse
import numpy as np
import implicit

# Load the Amazon review ratings table.
ratings = pd.read_feather('data/amazon_review_ratings.feather')

# Give every distinct user id a dense 0-based index (order of first appearance).
userid_unique = pd.Series(ratings["userId"].unique())
# dense index -> original user id
index_userid_dict = userid_unique.to_dict()
# original user id -> dense index
userid_index_dict = {user_id: idx for idx, user_id in index_userid_dict.items()}

# Same dense indexing for item ids.
itemid_unique = pd.Series(ratings["itemId"].unique())
# dense index -> original item id
index_itemid_dict = itemid_unique.to_dict()
# original item id -> dense index
itemid_index_dict = {item_id: idx for idx, item_id in index_itemid_dict.items()}

# Attach the dense indices to the ratings table as extra columns.
ratings["userId_reindex"] = ratings["userId"].map(userid_index_dict)
ratings["itemid_reindex"] = ratings["itemId"].map(itemid_index_dict)

# Lookup from the re-indexed item id to the item's `category` value.
# If an item appears on several rows, the last row's category is kept.
itemid_genres_dict = ratings.set_index('itemid_reindex')['category'].to_dict()

In [2]:
import cloudpickle


def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in a context manager so the handle is closed as soon
    as loading finishes (the previous bare `open(...)` calls leaked it).

    NOTE(review): unpickling executes arbitrary code; only load files that
    were produced by this project.
    """
    with open(path, "rb") as fh:
        return cloudpickle.load(fh)


X_train = _load_pickle("output/Amazon-X_train.pkl")
X_test = _load_pickle("output/Amazon-X_test.pkl")
test_movies_and_TVs_pos_items_dict = _load_pickle('output/test_movies_and_TVs_pos_items_dict.pkl')

In [3]:
# Split the aggregated training matrix into its book part and its movies part.
# Column positions (concat item ids) whose category contains 'book'.
book_columns = [col for col in range(X_train.shape[1]) if 'book' in itemid_genres_dict[col]]
# Column positions (concat item ids) whose category contains 'movies_and_TVs'.
movies_columns = [col for col in range(X_train.shape[1]) if 'movies_and_TVs' in itemid_genres_dict[col]]

# Take the selected columns; users (rows) are left untouched.
book_train = X_train[:, book_columns]
movies_train = X_train[:, movies_columns]

# Map each movie's concat item id to its column position inside `movies_train`.
# `movies_columns` is already in ascending order, so enumerating it yields the
# same mapping as re-scanning every column with a manual counter.
movies_concat_itemid_dict = {col: pos for pos, col in enumerate(movies_columns)}

In [4]:
def _concat_to_local_item_index(category_keyword):
    """Map concat item ids to 0-based positions within one category.

    Scans every column of `X_train` in ascending order and keeps those whose
    entry in `itemid_genres_dict` contains `category_keyword`; the n-th kept
    column is assigned position n.

    Returns:
        dict: concat item id -> position among the kept columns.
    """
    kept_columns = (
        col for col in range(X_train.shape[1])
        if category_keyword in itemid_genres_dict[col]
    )
    return {col: pos for pos, col in enumerate(kept_columns)}


# Books: concat item id -> book-local index, and the inverse lookup.
book_concat_itemid_dict = _concat_to_local_item_index('book')
inverse_book_concat_itemid_dict = {pos: col for col, pos in book_concat_itemid_dict.items()}

# Movies: concat item id -> movie-local index, and the inverse lookup.
movies_concat_itemid_dict = _concat_to_local_item_index('movies_and_TVs')
inverse_movies_concat_itemid_dict = {pos: col for col, pos in movies_concat_itemid_dict.items()}

In [5]:
# Drop users that have no stored entry in each sub-matrix, and keep the
# correspondence between the full user index and the reduced one.

# Boolean masks over all users: True where the row holds at least one stored entry.
book_users = book_train.getnnz(1) > 0
movies_users = movies_train.getnnz(1) > 0

# Reuse the masks instead of recomputing the per-row counts.
book_train_selected = book_train[book_users]
movies_train_selected = movies_train[movies_users]

# Full user index -> row position in `book_train_selected`.
# `.tolist()` keeps the keys as plain Python ints, as the original loop did.
book_train_action_users = {
    user: pos for pos, user in enumerate(np.flatnonzero(book_users).tolist())
}
# Row position in `book_train_selected` -> full user index.
inverse_book_train_action_users = {pos: user for user, pos in book_train_action_users.items()}

# Full user index -> row position in `movies_train_selected`.
movies_train_action_users = {
    user: pos for pos, user in enumerate(np.flatnonzero(movies_users).tolist())
}
# Row position in `movies_train_selected` -> full user index.
inverse_movies_train_action_users = {pos: user for user, pos in movies_train_action_users.items()}

In [8]:
# Start from an ALS model fitted on the book data only.
# Per the original note, the model saved by the "mediate" experiment is reused as-is.
# NOTE(review): unpickling executes arbitrary code; only load files produced by this project.
import cloudpickle
# Context manager so the file handle is closed once the model is loaded.
with open('output/book_ALS.pkl', 'rb') as model_file:
    book_ALS = cloudpickle.load(model_file)

In [12]:
# Side information: the latent factors of the pre-trained `book_ALS` model.

# One row per user factor vector; the row position is translated back to the
# concat user index via `inverse_book_train_action_users`.
user_attributes = pd.DataFrame(book_ALS.user_factors)
user_attributes['UserId'] = (
    user_attributes.index.to_series().map(inverse_book_train_action_users)
)

# One row per item factor vector; the row position is translated back to the
# concat item id via `inverse_book_concat_itemid_dict`.
item_attributes = pd.DataFrame(book_ALS.item_factors)
item_attributes['ItemId'] = (
    item_attributes.index.to_series().map(inverse_book_concat_itemid_dict)
)

In [14]:
# Rebuild a long-format (user, item, rating) table from X_train, which is in
# wide (user x item) form, keeping only items whose category contains 'movies'.
#
# The stored entries are read straight from the CSR arrays instead of slicing
# the matrix twice per user, and the global `ratings` DataFrame is no longer
# overwritten by a per-row scratch variable.
# NOTE(review): assumes X_train is a scipy CSR matrix (the original relied on
# `X_train[i, :].indices` being column ids) - `tocsr()` is then a no-op.
train_csr = X_train.tocsr()

# User (row) index of every stored entry, in the same row-major order the
# original nested loop visited them.
stored_user_ids = np.repeat(np.arange(train_csr.shape[0]), np.diff(train_csr.indptr))

# Per-column flag: does this item's category contain 'movies'?
is_movie_item = np.array(
    ['movies' in itemid_genres_dict[col] for col in range(train_csr.shape[1])],
    dtype=bool,
)
movie_entry_mask = is_movie_item[train_csr.indices]

# Same names as before, now numpy arrays rather than Python lists.
userid_stacked = stored_user_ids[movie_entry_mask]
itemid_stacked = train_csr.indices[movie_entry_mask].astype(np.int64)
ratings_stacked = train_csr.data[movie_entry_mask]

movies_ratings_stacked = pd.DataFrame(
    {
        'UserId': userid_stacked,
        'ItemId': itemid_stacked,
        'Rating': ratings_stacked,
    },
    columns=['UserId', 'ItemId', 'Rating'],
)


  0%|          | 0/690240 [00:00<?, ?it/s][A
  0%|          | 336/690240 [00:00<03:25, 3354.07it/s][A
  0%|          | 727/690240 [00:00<03:16, 3502.15it/s][A
  0%|          | 1188/690240 [00:00<03:02, 3770.64it/s][A
  0%|          | 1602/690240 [00:00<02:57, 3874.14it/s][A
  0%|          | 2078/690240 [00:00<02:47, 4101.39it/s][A
  0%|          | 2454/690240 [00:00<02:52, 3992.38it/s][A
  0%|          | 2929/690240 [00:00<02:43, 4192.25it/s][A
  0%|          | 3360/690240 [00:00<02:42, 4217.97it/s][A
  1%|          | 3830/690240 [00:00<02:37, 4351.51it/s][A
  1%|          | 4314/690240 [00:01<02:32, 4487.19it/s][A
  1%|          | 4818/690240 [00:01<02:27, 4639.13it/s][A
  1%|          | 5280/690240 [00:01<02:30, 4556.50it/s][A
  1%|          | 5763/690240 [00:01<02:27, 4633.17it/s][A
  1%|          | 6283/690240 [00:01<02:22, 4788.35it/s][A
  1%|          | 6763/690240 [00:01<02:27, 4642.02it/s][A
  1%|          | 7229/690240 [00:01<02:34, 4420.76it/s][A
  1%|      

In [None]:
from cmfrec import CMF

# Fit a collective matrix factorization model on the movie ratings, passing the
# book-ALS user/item factors as side information.
# NOTE(review): hyperparameter values are copied verbatim; see the cmfrec docs
# for the meaning of k / k_main / k_user / k_item.
cmf_hyperparams = dict(k=20, k_main=3, k_user=2, k_item=1, reg_param=1e-4)
recommender = CMF(**cmf_hyperparams)
recommender.fit(
    ratings=movies_ratings_stacked,
    user_info=user_attributes,
    item_info=item_attributes,
    cols_bin_user=None,
    cols_bin_item=None,
)






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.







In [17]:
from lib.recommend_util import ndcg

# Ranking cut-offs to evaluate; the top-N request below covers the largest one.
NDCG_CUTOFFS = (5, 10, 20, 50, 100)

# One list of per-user scores per cut-off, keyed 'ndcg5', 'ndcg10', ...
ndcgs = {'ndcg{}'.format(cutoff): [] for cutoff in NDCG_CUTOFFS}
# Number of test users for whom no recommendation could be scored.
count = 0
for userid, pos_items in tqdm(test_movies_and_TVs_pos_items_dict.items()):
    pos_items = np.array(pos_items)
    try:
        recommended_items = np.array(recommender.topN(user=userid, n=max(NDCG_CUTOFFS)))
        # Compute every cut-off before storing anything, so a failure half-way
        # through cannot leave the per-cut-off lists with different lengths.
        user_scores = [ndcg(recommended_items[:cutoff], pos_items) for cutoff in NDCG_CUTOFFS]
    except Exception:
        # Users that cannot be recommended to score 0 at every cut-off.
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        count += 1
        user_scores = [0] * len(NDCG_CUTOFFS)
    for cutoff, score in zip(NDCG_CUTOFFS, user_scores):
        ndcgs['ndcg{}'.format(cutoff)].append(score)


  0%|          | 0/10250 [00:00<?, ?it/s][A
  0%|          | 20/10250 [00:00<00:52, 194.75it/s][A
  0%|          | 43/10250 [00:00<00:50, 202.75it/s][A
  1%|          | 69/10250 [00:00<00:47, 216.07it/s][A
  1%|          | 94/10250 [00:00<00:45, 223.57it/s][A
  1%|          | 118/10250 [00:00<00:44, 228.25it/s][A
  1%|▏         | 144/10250 [00:00<00:42, 235.62it/s][A
  2%|▏         | 168/10250 [00:00<00:42, 236.54it/s][A
  2%|▏         | 192/10250 [00:00<00:42, 236.25it/s][A
  2%|▏         | 216/10250 [00:00<00:42, 236.09it/s][A
  2%|▏         | 240/10250 [00:01<00:42, 235.49it/s][A
  3%|▎         | 264/10250 [00:01<00:43, 231.33it/s][A
  3%|▎         | 287/10250 [00:01<00:44, 223.17it/s][A
  3%|▎         | 311/10250 [00:01<00:43, 226.99it/s][A
  3%|▎         | 336/10250 [00:01<00:42, 232.70it/s][A
  4%|▎         | 360/10250 [00:01<00:42, 232.88it/s][A
  4%|▍         | 386/10250 [00:01<00:41, 238.04it/s][A
  4%|▍         | 411/10250 [00:01<00:40, 241.21it/s][A
  4%|▍

In [18]:
# Report the mean per-user nDCG at each cut-off, one line per cut-off.
ndcg_report_rows = [
    ("ndcg@5: {}", 'ndcg5'),
    ("ndcg@10: {}", 'ndcg10'),
    ("ndcg@20: {}", 'ndcg20'),
    ("ndcg@50: {}", 'ndcg50'),
    ("ndcg@100: {}", 'ndcg100'),
]
for line_template, scores_key in ndcg_report_rows:
    print(line_template.format(np.mean(ndcgs[scores_key])))

ndcg@5: 0.0035867129080451843
ndcg@10: 0.005330545702154135
ndcg@20: 0.006959322362161446
ndcg@50: 0.009617060395639117
ndcg@100: 0.012869096688653638
