In [1]:
from tqdm import tqdm
import pandas as pd
from scipy import sparse
import numpy as np
import implicit # 予め入れておく

In [2]:
# Load the two MovieLens-20M CSV tables used below:
# the per-user ratings and the movie metadata.
ratings = pd.read_csv("data/ml-20m/ratings.csv")
movies = pd.read_csv("data/ml-20m/movies.csv")

In [3]:
# Attach the movie metadata to every rating row.
# No join key is given, so pandas joins on the columns the two frames share.
ratings_joined = ratings.merge(movies)

In [4]:
# Keep only the ratings whose movie is tagged 'Action' or 'Adventure'
# (substring match on the `genres` column), then renumber the rows from 0.
action_adventure_ratings = ratings_joined[
    ratings_joined["genres"].str.contains('Action')
    | ratings_joined["genres"].str.contains('Adventure')
].reset_index(drop=True)

In [5]:
# Re-index the raw ids to contiguous 0-based integers.
# For each id kind we keep the unique raw ids (in the order `unique()` returns
# them), a position -> raw id dict, and the inverse raw id -> position dict.

# users
userid_unique = pd.Series(action_adventure_ratings["userId"].unique())
index_userid_dict = userid_unique.to_dict()
userid_index_dict = {raw_id: position for position, raw_id in index_userid_dict.items()}

# items
itemid_unique = pd.Series(action_adventure_ratings["movieId"].unique())
index_itemid_dict = itemid_unique.to_dict()
itemid_index_dict = {raw_id: position for position, raw_id in index_itemid_dict.items()}

# Add the re-indexed ids as new columns next to the raw ones.
for raw_column, dense_column, lookup in (
    ("userId", "user_id", userid_index_dict),
    ("movieId", "item_id", itemid_index_dict),
):
    action_adventure_ratings[dense_column] = action_adventure_ratings[raw_column].map(lookup)

In [6]:
# Build a lookup from the re-indexed item id to that item's genres string.
itemid_genres_dict = (
    action_adventure_ratings
    .loc[:, ['item_id', 'genres']]
    .set_index('item_id')['genres']
    .to_dict()
)

In [7]:
# Pull the coordinate arrays out of the frame: one (user, item, rating)
# triple per rating row.
user_id_values = action_adventure_ratings["user_id"].values
item_id_values = action_adventure_ratings["item_id"].values
rating_values = action_adventure_ratings["rating"].values

# Sparse rating matrix in CSR form: rows are users, columns are items.
coordinates = (user_id_values, item_id_values)
X = sparse.csr_matrix((rating_values, coordinates))

In [8]:
from lib.recommend_util import split_train_validation_cold_start_user_wise

# Split the rating matrix into a train part and a held-out test part with the
# project-local helper.
# NOTE(review): the meaning of `cold_items` and `full_train_percentage` is
# defined in lib.recommend_util, not here — confirm against that module.
# NOTE(review): if the helper samples randomly, no seed has been set at this
# point in the notebook — verify reproducibility.
X_train, X_test = split_train_validation_cold_start_user_wise(
    X,
    verbose=True,
    cold_items=2,
    full_train_percentage=0.2,
)

Users enough items: 137318
Users no enough items: 1071


In [9]:
# Matrix factorisation with ALS: create the `implicit` model with 100 latent factors.
model = implicit.als.AlternatingLeastSquares(factors=100)



In [10]:
# Seed NumPy's global RNG right before training.
# NOTE(review): this only makes the fit repeatable if the installed `implicit`
# version draws its initial factors from NumPy's global RNG — confirm.
np.random.seed(42)

# The model is fitted on the transposed train matrix.
# NOTE(review): presumably this `implicit` version expects an item-by-user
# matrix — verify against the installed version's `fit` documentation.
model.fit(X_train.T)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [11]:
# Reconstruct the full score matrix as (user factors) x (item factors)^T.
# The result is dense, with one row per user factor vector and one column per
# item factor vector, so it can be large.
predicted_ratings = model.user_factors.dot(model.item_factors.T)

In [12]:
# Build the evaluation set for the 'Adventure' genre:
# user row index -> list of that user's held-out Adventure items.
test_adventure_pos_items_dict = {}
for user in tqdm(range(X_test.shape[0])):
    # Only users with no Adventure item stored in their train row qualify.
    train_items = X_train[user, :].indices
    if any('Adventure' in itemid_genres_dict[item] for item in train_items):
        continue

    # ...and they must have at least one entry stored in the test matrix.
    held_out_row = X_test[user, :]
    if held_out_row.nnz <= 0:
        continue

    # Rank every column of the densified test row by descending value and keep
    # as many leading positions as the row has stored entries.
    # NOTE(review): this equals "the stored items, best-rated first" only if
    # all stored test values are positive — confirm.
    ranked_columns = np.argsort(-held_out_row.toarray())[0]
    top_columns = ranked_columns[:len(held_out_row.indices)]

    # Of those, keep the items whose genres include Adventure.
    adventure_items = [item for item in top_columns
                       if 'Adventure' in itemid_genres_dict[item]]
    if adventure_items:
        test_adventure_pos_items_dict[user] = adventure_items

100%|██████████| 138389/138389 [00:20<00:00, 6777.41it/s]


In [13]:
from lib.recommend_util import ndcg

# Evaluate the model with NDCG@k for several cutoffs k.
# `ndcgs` maps 'ndcg<k>' to the list of per-user scores at that cutoff.
NDCG_CUTOFFS = (5, 10, 20, 50, 100)
ndcgs = {'ndcg{}'.format(k): [] for k in NDCG_CUTOFFS}

# Whether an item is an Adventure item does not depend on the user, so compute
# the membership mask once (indexed by item column) instead of re-testing every
# item's genres string inside the per-user loop.
is_adventure_item = np.array(
    ['Adventure' in itemid_genres_dict[item]
     for item in range(predicted_ratings.shape[1])],
    dtype=bool,
)

for userid, pos_itemid in tqdm(test_adventure_pos_items_dict.items()):
    pos_itemid = np.array(pos_itemid)
    # All items ordered by descending predicted score for this user...
    ranked_items = np.argsort(-predicted_ratings[userid, :])
    # ...restricted to Adventure items; the mask keeps the ranking order intact.
    sorted_indices = ranked_items[is_adventure_item[ranked_items]]
    for k in NDCG_CUTOFFS:
        ndcgs['ndcg{}'.format(k)].append(ndcg(sorted_indices[:k], pos_itemid))

100%|██████████| 18523/18523 [00:56<00:00, 330.59it/s]


In [14]:
# Report the mean NDCG over all evaluated users, one line per cutoff.
report_rows = (
    ("ndcg@5: {}", 'ndcg5'),
    ("ndcg@10: {}", 'ndcg10'),
    ("ndcg@20: {}", 'ndcg20'),
    ("ndcg@50: {}", 'ndcg50'),
    ("ndcg@100: {}", 'ndcg100'),
)
for line_template, metric_key in report_rows:
    print(line_template.format(np.mean(ndcgs[metric_key])))

ndcg@5: 0.11311527224488772
ndcg@10: 0.15169676531050724
ndcg@20: 0.19715082901785136
ndcg@50: 0.2652105891266386
ndcg@100: 0.3172433919718339


In [None]:
# Persist the fitted model and the train/test matrices with cloudpickle.
import cloudpickle

# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes (the bare `open(...)` calls were never closed).
artifacts = (
    (model, "output/ML-20M-aggregate_ALS.pkl"),
    (X_train, "output/ML-20M-X_train.pkl"),
    (X_test, "output/ML-20M-X_test.pkl"),
)
for artifact, output_path in artifacts:
    with open(output_path, "wb") as output_file:
        cloudpickle.dump(artifact, output_file)