In [1]:
from tqdm import tqdm
from sklearn.decomposition import NMF
import pandas as pd
from scipy import sparse
import numpy as np
import cloudpickle
import implicit

In [2]:
# Load the review ratings table from its feather file.
ratings_path = 'data/amazon_review_ratings.feather'
ratings = pd.read_feather(ratings_path)

In [3]:
# Assign a dense integer index to every distinct user id and item id.
# The position in the unique-value Series becomes the new index.
userid_unique = pd.Series(ratings["userId"].unique())
itemid_unique = pd.Series(ratings["itemId"].unique())

# Forward maps: dense index -> original id.
index_userid_dict = userid_unique.to_dict()
index_itemid_dict = itemid_unique.to_dict()

# Inverse maps: original id -> dense index.
userid_index_dict = {raw_id: idx for idx, raw_id in index_userid_dict.items()}
itemid_index_dict = {raw_id: idx for idx, raw_id in index_itemid_dict.items()}

# Attach the dense indices to the ratings frame as new columns.
ratings["userId_reindex"] = ratings["userId"].map(userid_index_dict)
ratings["itemid_reindex"] = ratings["itemId"].map(itemid_index_dict)

In [4]:
# Build a dict from the re-indexed item id to the item's category (genre).
item_category_frame = ratings[['itemid_reindex', 'category']]
category_by_item = item_category_frame.set_index('itemid_reindex')['category']
itemid_genres_dict = category_by_item.to_dict()

In [5]:
import cloudpickle

# Load the pre-computed train/test matrices and the per-user positive test items.
# Later cells slice X_train by column and call getnnz on it, so it is
# presumably a scipy sparse user x item matrix -- TODO confirm.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by this project.
# Context managers make sure each file handle is closed after loading.
with open("output/Amazon-X_train.pkl", "rb") as f:
    X_train = cloudpickle.load(f)
with open("output/Amazon-X_test.pkl", "rb") as f:
    X_test = cloudpickle.load(f)
with open('output/test_movies_and_TVs_pos_items_dict.pkl', 'rb') as f:
    test_movies_and_TVs_pos_items_dict = cloudpickle.load(f)

In [6]:
# Split the aggregated training matrix into a book part and a movies part.
# Walk over every column once and sort it into the matching list(s) based on
# the category string of the item at that column.
book_columns = []
movies_columns = []
for col in range(X_train.shape[1]):
    genre = itemid_genres_dict[col]
    if 'book' in genre:
        book_columns.append(col)
    if 'movies_and_TVs' in genre:
        movies_columns.append(col)

# Take the selected columns out of the training matrix.
book_train = X_train[:, book_columns]
movies_train = X_train[:, movies_columns]

In [7]:
# For movies only, the mapping from the item id (column) in the concatenated
# matrix to its column position in the movies-only matrix is needed later,
# so keep it as a dict.
movies_cols_in_order = (
    col for col in range(X_train.shape[1])
    if 'movies_and_TVs' in itemid_genres_dict[col]
)
movies_concat_itemid_dict = {
    col: position for position, col in enumerate(movies_cols_in_order)
}

In [8]:
# Drop the users (rows) that have no action in each domain.
# The row correspondence between all users and the reduced matrices is
# built separately as dicts.
book_rows_with_action = book_train.getnnz(axis=1) > 0
movies_rows_with_action = movies_train.getnnz(axis=1) > 0

book_train_selected = book_train[book_rows_with_action]
movies_train_selected = movies_train[movies_rows_with_action]

In [9]:
# Row masks: True for users that have at least one stored entry in the domain.
book_users = book_train.getnnz(1)>0
movies_users = movies_train.getnnz(1)>0

# Map: row in the full matrix -> row in the reduced (action-only) matrix.
# flatnonzero lists the kept rows in ascending order, so enumerate yields the
# same consecutive positions a running counter would.
book_train_action_users = {
    full_row: reduced_row
    for reduced_row, full_row in enumerate(np.flatnonzero(book_users).tolist())
}
movies_train_action_users = {
    full_row: reduced_row
    for reduced_row, full_row in enumerate(np.flatnonzero(movies_users).tolist())
}

# Inverse maps: row in the reduced matrix -> row in the full matrix.
inverse_book_train_action_users = {
    reduced_row: full_row
    for full_row, reduced_row in book_train_action_users.items()
}
inverse_movies_train_action_users = {
    reduced_row: full_row
    for full_row, reduced_row in movies_train_action_users.items()
}

In [10]:
# Run ALS on each domain separately; this cell handles the book domain.
np.random.seed(42)
# The transposed matrix is what gets passed to fit.
# NOTE(review): presumably this implicit version expects an item x user
# matrix -- confirm against the installed version.
book_item_user = book_train_selected.transpose()
book_ALS = implicit.als.AlternatingLeastSquares(factors=100)
book_ALS.fit(book_item_user)



HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [11]:
# ALS for the movies domain, fitted on the transposed action-only matrix.
movies_item_user = movies_train_selected.transpose()
movies_ALS = implicit.als.AlternatingLeastSquares(factors=100)
movies_ALS.fit(movies_item_user)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [14]:
# Build the nearest-neighbour search space over the book-side user factors.
from sklearn.neighbors import NearestNeighbors
book_user_factors = book_ALS.user_factors
neigh = NearestNeighbors()
neigh.fit(book_user_factors)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [None]:
# For each test user, collect similar users from the book latent space who
# also appear in the movies training data. The result maps a user's row in
# the concatenated matrix to a list of up to N_NEIGHBORS neighbour rows
# (also in the concatenated matrix).
N_CANDIDATES = 100  # neighbours requested per query (the query user is normally among them)
N_NEIGHBORS = 10    # neighbours kept per user after filtering

neighbors_users = {}
# Iterate over the test users (described originally as users with no
# movies-side actions).
for userid in tqdm(test_movies_and_TVs_pos_items_dict.keys()):
    # Users without any book-side action have no book vector, so skip them.
    # An explicit membership test replaces the former bare `except:`, which
    # also hid unrelated errors and swallowed KeyboardInterrupt.
    if userid not in book_train_action_users:
        continue
    book_user_id = book_train_action_users[userid]
    # This user's vector in the book latent space.
    book_user_vector = book_ALS.user_factors[book_user_id, :]
    # Candidate users, expressed as rows of the reduced book matrix.
    nearest = neigh.kneighbors([book_user_vector], N_CANDIDATES, return_distance=False)[0]
    # Remove the query user by id instead of dropping position 0: when another
    # user has an identical vector, the query user is not guaranteed to be
    # returned first.
    candidate_users = [v for v in nearest if v != book_user_id]
    # Convert back to rows of the concatenated matrix.
    candidate_users_ = [inverse_book_train_action_users[v] for v in candidate_users]
    # Keep only candidates that also exist in the movies training data.
    candidates_ = [c for c in candidate_users_ if c in movies_train_action_users]
    neighbors_users[userid] = candidates_[:N_NEIGHBORS]

 11%|█         | 1120/10250 [02:59<27:03,  5.62it/s]

In [19]:
from lib.recommend_util import ndcg

# Recommend movies to each test user from the summed predicted scores of the
# user's neighbours, and record NDCG at several cut-offs.
# NOTE(review): ndcg is called as ndcg(ranked_item_indices, positive_items);
# its exact contract lives in lib.recommend_util -- confirm there.
NDCG_CUTOFFS = (5, 10, 20, 50, 100)
ndcgs = {'ndcg{}'.format(k): [] for k in NDCG_CUTOFFS}
# Number of test users for whom no recommendation could be produced.
count = 0
for userid, pos_items in tqdm(test_movies_and_TVs_pos_items_dict.items()):
    # Convert the positive items to column positions of the movies matrix.
    pos_items = np.array([movies_concat_itemid_dict[v] for v in pos_items])
    neighs = neighbors_users.get(userid)
    # A missing entry and an empty neighbour list are both "cannot recommend".
    # Before, an empty list produced an all-zero score vector whose argsort
    # order is arbitrary, so the user was scored on a meaningless ranking.
    if neighs:
        sum_ratings = np.zeros(movies_ALS.item_factors.shape[0])
        for v in neighs:
            # Row of this neighbour in the reduced movies matrix / user factors.
            v_movies = movies_train_action_users[v]
            sum_ratings += np.dot(movies_ALS.user_factors[v_movies, :], movies_ALS.item_factors.T)
        # Item positions ordered by descending summed score.
        sorted_indices = np.argsort(-sum_ratings)
        for k in NDCG_CUTOFFS:
            ndcgs['ndcg{}'.format(k)].append(ndcg(sorted_indices[:k], pos_items))
    else:
        count += 1
        # Users that cannot be recommended to always get a score of 0.
        for k in NDCG_CUTOFFS:
            ndcgs['ndcg{}'.format(k)].append(0)

100%|██████████| 10250/10250 [02:22<00:00, 71.97it/s]


In [20]:
# Report the mean NDCG over all test users for every cut-off.
ndcg_report_rows = [
    ("ndcg@5: {}", 'ndcg5'),
    ("ndcg@10: {}", 'ndcg10'),
    ("ndcg@20: {}", 'ndcg20'),
    ("ndcg@50: {}", 'ndcg50'),
    ("ndcg@100: {}", 'ndcg100'),
]
for template, key in ndcg_report_rows:
    print(template.format(np.mean(ndcgs[key])))

ndcg@5: 0.004831150312612696
ndcg@10: 0.006653583658177678
ndcg@20: 0.009500250696250033
ndcg@50: 0.014420981659326794
ndcg@100: 0.0197377973405826


In [21]:
# Save the fitted models.
import cloudpickle
# Context managers guarantee the files are flushed and closed after writing.
with open("output/book_ALS.pkl", "wb") as f:
    cloudpickle.dump(book_ALS, f)
with open("output/movies_ALS.pkl", "wb") as f:
    cloudpickle.dump(movies_ALS, f)