In [1]:
from tqdm import tqdm
import pandas as pd
from scipy import sparse
import numpy as np

# Load the MovieLens-20M movie metadata and the raw user ratings.
movies = pd.read_csv("data/ml-20m/movies.csv")
ratings = pd.read_csv("data/ml-20m/ratings.csv")

# Attach the movie metadata to every rating. The join key is spelled out so
# that a column the two files might come to share can never silently become
# part of the key.
ratings_joined = pd.merge(ratings, movies, on="movieId")

# Keep only the ratings of movies whose genre string mentions Action and/or
# Adventure (original note: the ratings are meant to be converted into a wide
# sparse matrix). A single vectorised regex match replaces the python-engine
# query; na=False makes a missing genre string count as "no match".
action_adventure_ratings = ratings_joined[
    ratings_joined["genres"].str.contains("Action|Adventure", na=False)
].reset_index(drop=True)
# Assign dense, zero-based indices to the raw ids, numbered in the order in
# which each id first appears in the filtered ratings.

# Users: position -> raw userId, followed by the reverse lookup.
userid_unique = pd.Series(action_adventure_ratings["userId"].unique())
index_userid_dict = userid_unique.to_dict()
userid_index_dict = {raw_id: position for position, raw_id in index_userid_dict.items()}

# Items: position -> raw movieId, followed by the reverse lookup.
itemid_unique = pd.Series(action_adventure_ratings["movieId"].unique())
index_itemid_dict = itemid_unique.to_dict()
itemid_index_dict = {raw_id: position for position, raw_id in index_itemid_dict.items()}

# Store the dense indices next to the raw ids.
action_adventure_ratings["user_id"] = action_adventure_ratings["userId"].map(userid_index_dict)
action_adventure_ratings["item_id"] = action_adventure_ratings["movieId"].map(itemid_index_dict)

# Look-up from the re-indexed item id to that movie's genre string. An item id
# occurs once per rating row; later rows simply overwrite earlier ones.
itemid_genres_dict = dict(
    zip(action_adventure_ratings["item_id"], action_adventure_ratings["genres"])
)

In [2]:
import cloudpickle

# Load the pre-built train / test matrices. The files are opened through
# context managers so the handles are closed as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code — only load files you produced
# yourself or otherwise trust.
with open("output/ML-20M-X_train.pkl", "rb") as train_file:
    X_train = cloudpickle.load(train_file)
with open("output/ML-20M-X_test.pkl", "rb") as test_file:
    X_test = cloudpickle.load(test_file)

In [3]:
# Evaluation dictionary: user row index -> the Adventure items stored for that
# user in X_test, ordered from the highest stored value downwards.
# NOTE(review): relies on X_train / X_test rows exposing `.indices` (CSR-style
# sparse matrices) and on both matrices sharing the same row order — confirm.
test_adventure_pos_items_dict = {}
for i in tqdm(range(X_test.shape[0])):
    # Only users with no Adventure item at all in the training data qualify.
    # any() stops at the first hit instead of building a throw-away list.
    rated_items = X_train[i, :].indices
    if any('Adventure' in itemid_genres_dict[v] for v in rated_items):
        continue

    # Slice the user's test row once and reuse it; users with nothing stored
    # in X_test are skipped.
    selected_user_ratings = X_test[i, :]
    if selected_user_ratings.nnz == 0:
        continue

    # Column indices ordered by descending value; only the leading
    # len(value_indices) positions are inspected.
    # NOTE(review): those positions are exactly the stored entries only if
    # every stored value is > 0 — confirm.
    value_indices = selected_user_ratings.indices
    sorted_indices = np.argsort(-selected_user_ratings.toarray())[0]

    # Of those entries, keep the ones whose genre string mentions Adventure.
    test_items = [
        v for v in sorted_indices[:len(value_indices)]
        if 'Adventure' in itemid_genres_dict[v]
    ]
    if test_items:
        test_adventure_pos_items_dict[i] = test_items

100%|██████████| 138389/138389 [00:21<00:00, 6490.66it/s]


In [9]:
# Build the popular-items baseline.
# Sum the values in X_train per item (column); a later cell sorts the items by
# this total in descending order.
# NOTE(review): presumably X_train is a scipy sparse matrix, in which case the
# result is a 1 x n_items np.matrix rather than a flat array — confirm.
item_sum_ratings = X_train.sum(axis=0)

In [20]:
# Item indices ordered from the largest to the smallest summed training value;
# the argsort result is two-dimensional, so row 0 of it is taken.
sorted_indices = np.argsort(-item_sum_ratings).tolist()[0]

# The popularity ranking restricted to items whose genre string mentions
# Adventure, with the ordering preserved.
adventure_popular_items = [
    item for item in sorted_indices
    if 'Adventure' in itemid_genres_dict[item]
]

In [22]:
from lib.recommend_util import ndcg

# Evaluate the popularity baseline with nDCG@k for several cut-offs k.
# The cut-offs live in one tuple that drives both the result keys and the
# length of the recommendation list, so adding a cut-off is a one-place edit.
ndcg_cutoffs = (5, 10, 20, 50, 100)
ndcgs = {f'ndcg{k}': [] for k in ndcg_cutoffs}

# Every evaluated user is scored against the same Adventure popularity
# ranking, truncated to the top k items; only the user's positive test items
# differ, so the user id itself is not needed here.
# NOTE(review): `ndcg` comes from lib.recommend_util and is called as
# ndcg(ranked_items, positive_items); its exact definition is not visible here.
for pos_itemid in tqdm(test_adventure_pos_items_dict.values()):
    relevant_items = np.array(pos_itemid)
    for k in ndcg_cutoffs:
        ndcgs[f'ndcg{k}'].append(ndcg(adventure_popular_items[:k], relevant_items))

100%|██████████| 18523/18523 [00:06<00:00, 2688.56it/s]


In [23]:
# Report the mean nDCG over all evaluated users, one line per cut-off.
for cutoff in (5, 10, 20, 50, 100):
    per_user_scores = ndcgs[f'ndcg{cutoff}']
    print(f"ndcg@{cutoff}: {np.mean(per_user_scores)}")

ndcg@5: 0.12653749128521544
ndcg@10: 0.1811593858743771
ndcg@20: 0.24991661947963087
ndcg@50: 0.3610562709197507
ndcg@100: 0.4375180027506518
