In [1]:
from tqdm import tqdm
from sklearn.decomposition import NMF
import pandas as pd
from scipy import sparse
import numpy as np

# read data
movies = pd.read_csv("data/ml-20m/movies.csv")
ratings = pd.read_csv("data/ml-20m/ratings.csv")

# join
ratings_joined = pd.merge(ratings, movies)

# ratingsをsparse matrixに変換して横持ちにする
action_adventure_ratings = ratings_joined.query("genres.str.contains('Action') or genres.str.contains('Adventure')", 
                                                engine='python').reset_index(drop=True)
# indexing ids
# userid
userid_unique = pd.Series(action_adventure_ratings["userId"].unique())
index_userid_dict = userid_unique.to_dict()
# inverse
userid_index_dict = dict(map(reversed, index_userid_dict.items()))

# itemid
itemid_unique = pd.Series(action_adventure_ratings["movieId"].unique())
index_itemid_dict = itemid_unique.to_dict()
# inverse
itemid_index_dict = dict(map(reversed, index_itemid_dict.items()))

action_adventure_ratings["user_id"] = action_adventure_ratings["userId"].map(userid_index_dict)
action_adventure_ratings["item_id"] = action_adventure_ratings["movieId"].map(itemid_index_dict)

# reindexしたidを使って、アイテムとジャンルの対応が取れるdictを作る
itemid_genres_dict = action_adventure_ratings[['item_id', 'genres']].set_index('item_id')['genres'].to_dict()

In [2]:
import cloudpickle
X_train = cloudpickle.load(open("output/X_train.pkl","rb"))
X_test = cloudpickle.load(open("output/X_test.pkl","rb"))

In [3]:
# aggregateのtrainをactionとadventureに分離する
# actionの列
action_columns = [v for v in range(X_train.shape[1]) if 'Action' in itemid_genres_dict[v]]
# adventureの列
adventure_columns = [v for v in range(X_train.shape[1]) if 'Adventure' in itemid_genres_dict[v]]

# 選んだカラムに応じてとってくる
action_train = X_train[:, action_columns]
adventure_train = X_train[:, adventure_columns]

# adventureのみ、アイテムidのconcatとの対応関係が必要なので辞書として持っておく
adventure_concat_itemid_dict = {}
count = 0
for v in range(X_train.shape[1]):
    if 'Adventure' in itemid_genres_dict[v]:
        adventure_concat_itemid_dict[v] = count
        count += 1

In [9]:
# collective matrix factorizationのパッケージ
!pip3 install cmfrec

Collecting cmfrec
  Using cached https://files.pythonhosted.org/packages/aa/a4/7e7f6396225ed0646a8a686273396370ca43eecec1e2fd10c43d14715646/cmfrec-0.5.2.3.tar.gz
Collecting tensorflow>=1.0.0 (from cmfrec)
  Downloading https://files.pythonhosted.org/packages/7c/fb/7b2c5b3e85ad335b53ca67deb2ef4af574dc0a8759f43b7f45e15005e449/tensorflow-1.14.0-cp35-cp35m-manylinux1_x86_64.whl (109.2MB)
[K    100% |████████████████████████████████| 109.2MB 13kB/s  eta 0:00:01
Collecting astor>=0.6.0 (from tensorflow>=1.0.0->cmfrec)
  Using cached https://files.pythonhosted.org/packages/d1/4f/950dfae467b384fc96bc6469de25d832534f6b4441033c39f914efd13418/astor-0.8.0-py2.py3-none-any.whl
Collecting termcolor>=1.1.0 (from tensorflow>=1.0.0->cmfrec)
  Using cached https://files.pythonhosted.org/packages/8a/48/a76be51647d0eb9f10e2a4511bf3ffb8cc1e6b14e9e4fab46173aa79f981/termcolor-1.1.0.tar.gz
Collecting keras-applications>=1.0.6 (from tensorflow>=1.0.0->cmfrec)
  Downloading https://files.pythonhosted.org/packa

In [4]:
# アイテムidのconcatとの対応関係が必要なので辞書として持っておく
action_concat_itemid_dict = {}
count = 0
for v in range(X_train.shape[1]):
    if 'Action' in itemid_genres_dict[v]:
        action_concat_itemid_dict[v] = count
        count += 1
# inverse
inverse_action_concat_itemid_dict = dict(map(reversed, action_concat_itemid_dict.items()))

adventure_concat_itemid_dict = {}
count = 0
for v in range(X_train.shape[1]):
    if 'Adventure' in itemid_genres_dict[v]:
        adventure_concat_itemid_dict[v] = count
        count += 1
# inverse
inverse_adventure_concat_itemid_dict = dict(map(reversed, adventure_concat_itemid_dict.items()))

In [5]:
# それぞれにアクションしていないユーザを削る
# 全ユーザと、削ったあとでの対応関係を辞書として持っておく
action_train_selected = action_train[action_train.getnnz(1)>0]
adventure_train_selected = adventure_train[adventure_train.getnnz(1)>0]

action_train_action_users = {}
action_users = action_train.getnnz(1)>0
count = 0
for i in range(action_train.shape[0]):
    if action_users[i]:
        action_train_action_users[i] = count
        count += 1

# inverse
inverse_action_train_action_users = dict(map(reversed, action_train_action_users.items()))

adventure_train_action_users = {}
adventure_users = adventure_train.getnnz(1)>0
count = 0
for i in range(adventure_train.shape[0]):
    if adventure_users[i]:
        adventure_train_action_users[i] = count
        count += 1

# inverse
inverse_adventure_train_action_users = dict(map(reversed, adventure_train_action_users.items()))

In [6]:
# actionだけでNMFをまず行う
action_NMF = NMF(n_components=100, random_state=42)
action_NMF_fitted = action_NMF.fit_transform(action_train_selected)
action_NMF_components = action_NMF.components_

In [34]:
adventure_train_selected.shape

(137244, 2287)

In [7]:
# side informationとしてaction_NMFで得られたベクトルを使う
user_attributes = pd.DataFrame(action_NMF_fitted)
user_attributes['UserId'] = user_attributes.index
# useridをconcatの次元に戻す
user_attributes['UserId'] = user_attributes['UserId'].map(inverse_action_train_action_users)

item_attributes = pd.DataFrame(action_NMF_components)
item_attributes['ItemId'] = item_attributes.index
# itemidをconcatの次元に戻す
item_attributes['ItemId'] = item_attributes['ItemId'].map(inverse_action_concat_itemid_dict)

In [17]:
# X_trainから縦持ちに復元する(X_trainが既に横持ちなので)
userid_stacked = []
itemid_stacked = []
ratings_stacked = []

for i in tqdm(range(X_train.shape[0])):
    ratings = X_train[i, :].data
    item_idx = X_train[i, :].indices
    for idx, v in enumerate(item_idx):
        if 'Adventure' in itemid_genres_dict[v]:
            userid_stacked.append(i)
            itemid_stacked.append(v)
            ratings_stacked.append(ratings[idx])

adventure_ratings_stacked = pd.DataFrame()
adventure_ratings_stacked['UserId'] = userid_stacked
adventure_ratings_stacked['ItemId'] = itemid_stacked
adventure_ratings_stacked['Rating'] = ratings_stacked



  0%|          | 0/138389 [00:00<?, ?it/s][A[A

  0%|          | 189/138389 [00:00<01:13, 1887.01it/s][A[A

  0%|          | 374/138389 [00:00<01:13, 1873.19it/s][A[A

  0%|          | 569/138389 [00:00<01:12, 1895.05it/s][A[A

  1%|          | 755/138389 [00:00<01:13, 1884.38it/s][A[A

  1%|          | 927/138389 [00:00<01:15, 1825.38it/s][A[A

  1%|          | 1115/138389 [00:00<01:14, 1840.77it/s][A[A

  1%|          | 1295/138389 [00:00<01:15, 1825.16it/s][A[A

  1%|          | 1473/138389 [00:00<01:15, 1805.37it/s][A[A

  1%|          | 1647/138389 [00:00<01:16, 1782.97it/s][A[A

  1%|▏         | 1820/138389 [00:01<01:17, 1766.65it/s][A[A

  1%|▏         | 1995/138389 [00:01<01:17, 1761.29it/s][A[A

  2%|▏         | 2176/138389 [00:01<01:16, 1775.16it/s][A[A

  2%|▏         | 2352/138389 [00:01<01:17, 1756.58it/s][A[A

  2%|▏         | 2539/138389 [00:01<01:16, 1781.34it/s][A[A

  2%|▏         | 2717/138389 [00:01<01:16, 1766.96it/s][A[A

  2%|▏  

In [20]:
adventure_ratings_stacked.head(5)

Unnamed: 0,UserId,ItemId,Rating
0,0,0,3.5
1,0,1,3.5
2,0,2,3.5
3,0,4,4.0
4,0,9,3.0


In [21]:
from cmfrec import CMF

# fitting a model and making some recommendations
recommender = CMF(k=20, k_main=3, k_user=2, k_item=1, reg_param=1e-4)
recommender.fit(ratings=adventure_ratings_stacked, user_info=user_attributes, item_info=item_attributes,
                cols_bin_user=None, cols_bin_item=None)




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.





INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
  Objective function value: 1.449462
  Number of iterations: 410
  Number of functions evaluations: 426


<cmfrec.CMF at 0x7f609d323550>

In [35]:
X_test.shape

(138389, 4796)

In [24]:
# 評価対象のユーザ
test_adventure_pos_items_dict = {}
for i in tqdm(range(X_test.shape[0])):
    # trainでadventureにアクションしていないユーザに
    rated_items = X_train[i, :].indices
    if len([v for v in rated_items if 'Adventure' in itemid_genres_dict[v]]) == 0:
        # X_testの中でstoreしているアイテムが0以上のユーザに
        if X_test[i, :].nnz > 0:
            test_items = []
            selected_user_ratings = X_test[i, :]
            value_indices = selected_user_ratings.indices
            sorted_indices = np.argsort(-X_test[i, :].toarray())[0]
            # valueがあるアイテムのジャンルがadventureの場合に
            for v in sorted_indices[:len(value_indices)]:
                if 'Adventure' in itemid_genres_dict[v]:
                    test_items.append(v)
            if len(test_items) > 0:
                test_adventure_pos_items_dict[i] = test_items



  0%|          | 0/138389 [00:00<?, ?it/s][A[A

  0%|          | 249/138389 [00:00<00:55, 2480.95it/s][A[A

  0%|          | 501/138389 [00:00<00:55, 2491.33it/s][A[A

  1%|          | 730/138389 [00:00<00:56, 2425.98it/s][A[A

  1%|          | 983/138389 [00:00<00:56, 2451.60it/s][A[A

  1%|          | 1241/138389 [00:00<00:55, 2486.06it/s][A[A

  1%|          | 1498/138389 [00:00<00:54, 2510.37it/s][A[A

  1%|▏         | 1745/138389 [00:00<00:54, 2494.88it/s][A[A

  1%|▏         | 2008/138389 [00:00<00:53, 2533.30it/s][A[A

  2%|▏         | 2267/138389 [00:00<00:53, 2549.92it/s][A[A

  2%|▏         | 2539/138389 [00:01<00:52, 2594.25it/s][A[A

  2%|▏         | 2792/138389 [00:01<00:53, 2557.07it/s][A[A

  2%|▏         | 3059/138389 [00:01<00:52, 2587.49it/s][A[A

  2%|▏         | 3315/138389 [00:01<00:53, 2545.02it/s][A[A

  3%|▎         | 3592/138389 [00:01<00:51, 2608.15it/s][A[A

  3%|▎         | 3852/138389 [00:01<00:52, 2545.71it/s][A[A

  3%|▎ 

In [32]:
recommender.A.shape

(138079, 25)

In [27]:
from lib.recommend_util import ndcg
ndcgs = {
    'ndcg5':  [],
    'ndcg10':  [],
    'ndcg20':  [],
    'ndcg50':  [],
    'ndcg100':  []
}
count = 0
for userid, pos_items in tqdm(test_adventure_pos_items_dict.items()):
    pos_items = np.array(pos_items)
    try:
        recommended_items = np.array(recommender.topN(user=userid, n=100))
        ndcgs['ndcg5'].append(ndcg(recommended_items[:5], pos_items))
    ndcgs['ndcg10'].append(ndcg(recommended_items[:10], pos_items))
    ndcgs['ndcg20'].append(ndcg(recommended_items[:20], pos_items))
    ndcgs['ndcg50'].append(ndcg(recommended_items[:50], pos_items))
    ndcgs['ndcg100'].append(ndcg(recommended_items[:100], pos_items))
    except:
        continue



  0%|          | 0/643 [00:00<?, ?it/s][A[A


ValueError: Can only predict for users who were in the training set.