In [1]:
from tqdm import tqdm
from sklearn.decomposition import NMF
import pandas as pd
from scipy import sparse
import numpy as np

In [2]:
# Load the two raw CSV tables (presumably the MovieLens 20M dump, judging by
# the "ml-20m" directory name): movie metadata and individual user ratings.
# Later cells rely on the columns userId, movieId and genres being present.
movies = pd.read_csv("data/ml-20m/movies.csv")
ratings = pd.read_csv("data/ml-20m/ratings.csv")

In [3]:
# Attach movie metadata (notably `genres`) to every rating row.
# The join key and join type are spelled out so the result does not silently
# change if the two tables ever come to share another column name; with no
# `on=` pandas would join on *every* common column.
ratings_joined = ratings.merge(movies, on="movieId", how="inner")

In [4]:
# Keep only the ratings whose movie is tagged Action and/or Adventure.
# A single vectorised regex test replaces the python-engine query string;
# na=False drops rows with a missing `genres` value instead of producing a
# NaN mask that cannot be used for boolean indexing.
action_adventure_ratings = (
    ratings_joined
    .loc[lambda df: df["genres"].str.contains("Action|Adventure", regex=True, na=False)]
    .reset_index(drop=True)
)

In [5]:
# Re-index raw ids to dense 0-based positions, numbered in order of first
# appearance in the filtered ratings table.

# users: position -> raw userId, plus the reverse lookup raw userId -> position
userid_unique = pd.Series(action_adventure_ratings["userId"].unique())
index_userid_dict = dict(enumerate(userid_unique))
userid_index_dict = {raw_id: position for position, raw_id in index_userid_dict.items()}

# items: position -> raw movieId, plus the reverse lookup raw movieId -> position
itemid_unique = pd.Series(action_adventure_ratings["movieId"].unique())
index_itemid_dict = dict(enumerate(itemid_unique))
itemid_index_dict = {raw_id: position for position, raw_id in index_itemid_dict.items()}

# Store the dense ids next to the raw ones on every rating row.
for dense_column, raw_column, lookup in (
    ("user_id", "userId", userid_index_dict),
    ("item_id", "movieId", itemid_index_dict),
):
    action_adventure_ratings[dense_column] = action_adventure_ratings[raw_column].map(lookup)

In [9]:
# Map each re-indexed item id to its genre string.
# The ratings table holds one row per rating, so every item repeats many times;
# de-duplicating first avoids indexing and iterating all rating rows just to
# collapse them again inside to_dict(). keep='last' reproduces the value the
# original "last write wins" dict construction would have stored.
itemid_genres_dict = (
    action_adventure_ratings[['item_id', 'genres']]
    .drop_duplicates(subset='item_id', keep='last')
    .set_index('item_id')['genres']
    .to_dict()
)

In [6]:
import cloudpickle

# Load the train/test matrices written by an earlier step. Later cells use
# them as scipy-style sparse matrices (.shape, .indices, .getnnz, column
# slicing); presumably users are rows and re-indexed items are columns.
# NOTE(review): unpickling executes arbitrary code, so only load files that
# this project produced itself.
# Context managers make sure the file handles are closed once loaded.
with open("output/X_train.pkl", "rb") as train_file:
    X_train = cloudpickle.load(train_file)
with open("output/X_test.pkl", "rb") as test_file:
    X_test = cloudpickle.load(test_file)

In [10]:
# Split the aggregated training matrix into an Action part and an Adventure
# part. A column belongs to a genre when the genre name occurs in the item's
# genre string, so an item tagged with both lands in both matrices.
action_columns, adventure_columns = (
    [col for col in range(X_train.shape[1]) if genre in itemid_genres_dict[col]]
    for genre in ('Action', 'Adventure')
)

# Column-slice the training matrix with each genre's column list.
action_train = X_train[:, action_columns]
adventure_train = X_train[:, adventure_columns]

In [32]:
# Only the Adventure side needs a lookup from the aggregated ("concat") item
# column to its column position inside the Adventure-only matrix: Adventure
# columns are numbered consecutively in ascending concat-column order.
adventure_concat_itemid_dict = {
    concat_col: adventure_col
    for adventure_col, concat_col in enumerate(
        col for col in range(X_train.shape[1]) if 'Adventure' in itemid_genres_dict[col]
    )
}

In [19]:
# Drop the users (rows) that have no stored entry in the respective genre
# matrix. The mapping between original rows and the surviving rows is built
# separately as dictionaries.
action_train_selected, adventure_train_selected = (
    genre_matrix[genre_matrix.getnnz(axis=1) > 0]
    for genre_matrix in (action_train, adventure_train)
)

In [27]:
# Users (rows) with at least one stored entry in the Action matrix.
action_users = action_train.getnnz(axis=1) > 0

# full row index -> row index inside action_train_selected
# (surviving rows keep their relative order, so they are numbered consecutively)
action_train_action_users = {
    full_row: compact_row
    for compact_row, full_row in enumerate(np.flatnonzero(action_users).tolist())
}

# inverse: row index inside action_train_selected -> full row index
inverse_action_train_action_users = {
    compact_row: full_row
    for full_row, compact_row in action_train_action_users.items()
}

In [28]:
# Users (rows) with at least one stored entry in the Adventure matrix.
adventure_users = adventure_train.getnnz(axis=1) > 0

# full row index -> row index inside adventure_train_selected
# (surviving rows keep their relative order, so they are numbered consecutively)
adventure_train_action_users = {
    full_row: compact_row
    for compact_row, full_row in enumerate(np.flatnonzero(adventure_users).tolist())
}

# inverse: row index inside adventure_train_selected -> full row index
inverse_adventure_train_action_users = {
    compact_row: full_row
    for full_row, compact_row in adventure_train_action_users.items()
}

In [None]:
# Factorise each genre matrix separately with NMF (100 components, fixed seed).
# Action side: fit_transform returns the factors for the rows of
# action_train_selected (users); components_ holds the matching factors for
# its columns (Action items). components_ only exists after the fit.
action_NMF = NMF(n_components=100, random_state=42)
action_NMF_fitted = action_NMF.fit_transform(action_train_selected)
action_NMF_components = action_NMF.components_

In [None]:
# Adventure side: same NMF setup (100 components, fixed seed) fitted on the
# users that have at least one Adventure entry. fit_transform returns the row
# (user) factors; components_ holds the column (Adventure item) factors.
adventure_NMF = NMF(n_components=100, random_state=42)
adventure_NMF_fitted = adventure_NMF.fit_transform(adventure_train_selected)
adventure_NMF_components = adventure_NMF.components_

In [23]:
# Recommendation plan for users who have not interacted with any Adventure item:
#   1. look the user up in the Action-side latent user space and find their
#      nearest neighbours, keeping up to 10 users that have interacted with
#      both genres;
#   2. because those neighbours did interact with Adventure items, combine
#      their Adventure ratings and rank the items in descending order.
# (The evaluation cell sums the neighbours' predicted ratings rather than
# averaging them; for a fixed set of neighbours that yields the same ranking.)
#
# Index the Action-side user factors for nearest-neighbour queries. The bare
# last expression is what makes the notebook display the fitted estimator.
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors()
neigh.fit(action_NMF_fitted)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [24]:
# Dense reconstruction of the Action matrix from its NMF factors
# (rows: users kept in action_train_selected, columns: Action items).
# NOTE(review): no later cell visible here reads this matrix — confirm it is
# still needed, since the dense product can be large.
action_predicted_ratings = action_NMF_fitted @ action_NMF_components

In [25]:
# Dense reconstruction of the Adventure matrix from its NMF factors
# (rows: users kept in adventure_train_selected, columns: Adventure items).
# The evaluation cell sums rows of this matrix over a user's neighbours.
adventure_predicted_ratings = adventure_NMF_fitted @ adventure_NMF_components

In [26]:
# Build the evaluation set: user row -> that user's Adventure items in the
# test matrix, ordered by descending test rating.
test_adventure_pos_items_dict = {}
for i in tqdm(range(X_test.shape[0])):
    # Only users with no Adventure interaction in train are evaluated.
    rated_items = X_train[i, :].indices
    if any('Adventure' in itemid_genres_dict[v] for v in rated_items):
        continue

    # ...and only those with at least one stored entry in test.
    selected_user_ratings = X_test[i, :]
    if selected_user_ratings.nnz == 0:
        continue

    # Rank all columns by descending rating and keep as many leading columns
    # as the row has stored entries (this presumes stored ratings are positive,
    # so they sort ahead of the implicit zeros — TODO confirm).
    value_indices = selected_user_ratings.indices
    sorted_indices = np.argsort(-selected_user_ratings.toarray())[0]
    test_items = [
        v for v in sorted_indices[:len(value_indices)]
        if 'Adventure' in itemid_genres_dict[v]
    ]

    # Users with no Adventure item in test are left out entirely.
    if test_items:
        test_adventure_pos_items_dict[i] = test_items

100%|██████████| 138389/138389 [00:34<00:00, 4045.07it/s]


In [30]:
# How many nearest neighbours to request from the Action latent space (the
# query user itself is normally among them) and how many to keep per user.
N_CANDIDATES = 100
N_NEIGHBORS = 10

# evaluated user (full row index) -> up to N_NEIGHBORS neighbour users (full
# row indices) that also have Adventure interactions in train
neighbors_users = {}
for userid in tqdm(test_adventure_pos_items_dict.keys()):
    # Users without any Action interaction in train have no row in the Action
    # factor matrix, so no neighbours can be found for them. An explicit
    # lookup replaces the bare `except:`, which would also have hidden
    # unrelated errors and KeyboardInterrupt.
    action_user_id = action_train_action_users.get(userid)
    if action_user_id is None:
        continue

    # This user's vector in the Action latent space.
    action_user_vector = action_NMF_fitted[action_user_id, :]

    # Nearest users, as row indices of the Action factor matrix.
    candidate_users = neigh.kneighbors(
        [action_user_vector], N_CANDIDATES, return_distance=False
    )[0]

    # Map back to full row indices, dropping the query user by identity rather
    # than by position: when other users sit at distance 0 the query user is
    # not guaranteed to come first, and slicing off element 0 would discard a
    # genuine neighbour instead.
    candidate_users_ = [
        inverse_action_train_action_users[v]
        for v in candidate_users
        if v != action_user_id
    ]

    # Keep only neighbours that also appear in the Adventure training matrix,
    # preserving nearest-first order.
    candidates_ = [c for c in candidate_users_ if c in adventure_train_action_users]
    neighbors_users[userid] = candidates_[:N_NEIGHBORS]

100%|██████████| 643/643 [00:15<00:00, 40.60it/s]


In [48]:
from lib.recommend_util import ndcg

# Recommend Adventure items through each user's neighbours and score the
# ranking with nDCG at several cutoffs.
# result key -> number of top-ranked items passed to ndcg
ndcg_cutoffs = {
    'ndcg5': 5,
    'ndcg10': 10,
    'ndcg20': 20,
    'ndcg50': 50,
    'ndcg100': 100,
}
ndcgs = {key: [] for key in ndcg_cutoffs}

# Number of evaluated users for whom no recommendation could be produced.
count = 0
for userid, pos_items in tqdm(test_adventure_pos_items_dict.items()):
    # Translate the positive items from concat columns to Adventure-matrix columns.
    pos_items = np.array([adventure_concat_itemid_dict[v] for v in pos_items])

    if userid not in neighbors_users:
        # No neighbours were looked up for this user: score 0 at every cutoff.
        count += 1
        for key in ndcg_cutoffs:
            ndcgs[key].append(0)
        continue

    # Sum the neighbours' predicted Adventure ratings.
    # NOTE(review): if the neighbour list is empty the scores stay all-zero and
    # the ranking is just argsort's order among ties — confirm this is intended
    # rather than counting the user as unrecommendable.
    sum_ratings = np.zeros(adventure_predicted_ratings.shape[1])
    for v in neighbors_users[userid]:
        sum_ratings += adventure_predicted_ratings[adventure_train_action_users[v], :]

    # Rank Adventure items by descending summed score and evaluate each cutoff.
    sorted_indices = np.argsort(-sum_ratings)
    for key, top_k in ndcg_cutoffs.items():
        ndcgs[key].append(ndcg(sorted_indices[:top_k], pos_items))

100%|██████████| 643/643 [00:00<00:00, 1402.07it/s]


In [50]:
# Report the mean nDCG over all evaluated users at each cutoff.
for template, key in (
    ("ndcg@5: {}", 'ndcg5'),
    ("ndcg@10: {}", 'ndcg10'),
    ("ndcg@20: {}", 'ndcg20'),
    ("ndcg@50: {}", 'ndcg50'),
    ("ndcg@100: {}", 'ndcg100'),
):
    print(template.format(np.mean(ndcgs[key])))

ndcg@5: 0.04040423253461325
ndcg@10: 0.05406263943408177
ndcg@20: 0.06519978638292286
ndcg@50: 0.07787088829659007
ndcg@100: 0.08891129873774099


In [52]:
# Persist the fitted NMF models.
import cloudpickle

# Context managers guarantee the files are flushed and closed as soon as each
# dump finishes, instead of whenever the handle happens to be garbage-collected.
with open("output/action_NMF.pkl", "wb") as action_model_file:
    cloudpickle.dump(action_NMF, action_model_file)
with open("output/adventure_NMF.pkl", "wb") as adventure_model_file:
    cloudpickle.dump(adventure_NMF, adventure_model_file)