In [1]:
from tqdm import tqdm
from sklearn.decomposition import NMF
import pandas as pd
from scipy import sparse
import numpy as np
import cloudpickle
import implicit

In [2]:
# read data
ratings_book = pd.read_json("data/reviews_Books_5.json.gz", compression='gzip', lines=True)
ratings_movies = pd.read_json("data/reviews_Movies_and_TV_5.json.gz", compression='gzip', lines=True)

In [3]:
ratings_book.head(5)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,000100039X,"[0, 0]",5,Spiritually and mentally inspiring! A book tha...,"12 16, 2012",A10000012B7CGYKOMPQ4L,Adam,Wonderful!,1355616000
1,000100039X,"[0, 2]",5,This is one my must have books. It is a master...,"12 11, 2003",A2S166WSCFIFP5,"adead_poet@hotmail.com ""adead_poet@hotmail.com""",close to god,1071100800
2,000100039X,"[0, 0]",5,This book provides a reflection that you can a...,"01 18, 2014",A1BM81XB4QHOA3,"Ahoro Blethends ""Seriously""",Must Read for Life Afficianados,1390003200
3,000100039X,"[0, 0]",5,I first read THE PROPHET in college back in th...,"09 27, 2011",A1MOSTXNIO5MPJ,Alan Krug,Timeless for every good and bad time in your l...,1317081600
4,000100039X,"[7, 9]",5,A timeless classic. It is a very demanding an...,"10 7, 2002",A2XQ5LZHTD4AFT,Alaturka,A Modern Rumi,1033948800


In [4]:
rating_book_ = ratings_book[['asin', 'reviewerID', 'overall']]
rating_book_.columns = ['itemId', 'userId', 'rating']
rating_book_['category'] = 'book'

ratings_movies_ = ratings_movies[['asin', 'reviewerID', 'overall']]
ratings_movies_.columns = ['itemId', 'userId', 'rating']
ratings_movies_['category'] = 'movies_and_TVs'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [5]:
# concat
ratings = pd.concat([rating_book_, ratings_movies_]).reset_index(drop=True)

In [6]:
# featherで保存しておく
ratings.to_feather('data/amazon_review_ratings.feather')

In [8]:
ratings = pd.read_feather('data/amazon_review_ratings.feather')

In [9]:
# indexing ids
# userid
userid_unique = pd.Series(ratings["userId"].unique())
index_userid_dict = userid_unique.to_dict()
# inverse
userid_index_dict = dict(map(reversed, index_userid_dict.items()))

# itemid
itemid_unique = pd.Series(ratings["itemId"].unique())
index_itemid_dict = itemid_unique.to_dict()
# inverse
itemid_index_dict = dict(map(reversed, index_itemid_dict.items()))

ratings["userId_reindex"] = ratings["userId"].map(userid_index_dict)
ratings["itemid_reindex"] = ratings["itemId"].map(itemid_index_dict)

In [10]:
# reindexしたidを使って、アイテムとジャンルの対応が取れるdictを作る
itemid_genres_dict = ratings[['itemid_reindex', 'category']].set_index('itemid_reindex')['category'].to_dict()

In [11]:
item_id_values = ratings["itemid_reindex"].values
user_id_values = ratings["userId_reindex"].values
rating_values = ratings["rating"].values

X = sparse.csr_matrix(
    (rating_values, (user_id_values, item_id_values)))

In [12]:
from lib.recommend_util import split_train_validation_cold_start_user_wise
X_train, X_test = split_train_validation_cold_start_user_wise(X, verbose=True, cold_items=2, full_train_percentage=0.2)

Users enough items: 690240
Users no enough items: 0


In [13]:
test_movies_and_TVs_pos_items_dict = {}
for i in tqdm(range(X_test.shape[0])):
    rated_items = X_train[i, :].indices
    # trainでmovies_and_TVsにアクションしていないユーザ
    if len([v for v in rated_items if 'movies_and_TVs' in itemid_genres_dict[v]]) == 0:
        # X_testの中でstoreしているアイテムが0以上のユーザに
        if X_test[i, :].nnz > 0:
            test_items = []
            value_indices = X_test[i, :].indices
            sorted_indices = np.argsort(-X_test[i, :].toarray())[0]
            # valueがあるアイテムのジャンルがmovies_and_TVsの場合に
            for v in sorted_indices[:len(value_indices)]:
                if 'movies_and_TVs' in itemid_genres_dict[v]:
                    test_items.append(v)
            if len(test_items) > 0:
                test_movies_and_TVs_pos_items_dict[i] = test_items

100%|██████████| 690240/690240 [1:12:40<00:00, 158.30it/s]  


In [14]:
import cloudpickle
cloudpickle.dump(test_movies_and_TVs_pos_items_dict, open('output/test_movies_and_TVs_pos_items_dict.pkl', 'wb'))

In [15]:
test_movies_and_TVs_pos_items_dict = cloudpickle.load(open('output/test_movies_and_TVs_pos_items_dict.pkl', 'rb'))

In [16]:
# ALSする
model = implicit.als.AlternatingLeastSquares(factors=100)



In [17]:
np.random.seed(42)
model.fit(X_train.transpose())

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [18]:
user_factors = model.user_factors
item_factors_transpose = model.item_factors.T

In [20]:
from lib.recommend_util import ndcg
# 評価する
# ndcg@kでk各種
ndcgs = {
    'ndcg5':  [],
    'ndcg10':  [],
    'ndcg20':  [],
    'ndcg50':  [],
    'ndcg100':  []
}
for userid, pos_itemid in tqdm(test_movies_and_TVs_pos_items_dict.items()):
    pos_itemid = np.array(pos_itemid)
    # 予測した評価値の中でadventureのアイテムを持ってくる
    predicted_ratings = np.dot(user_factors[userid, :], item_factors_transpose)
    sorted_indices = np.array([v for v in np.argsort(-predicted_ratings) if 'movies_and_TVs' in itemid_genres_dict[v]])
    ndcgs['ndcg5'].append(ndcg(sorted_indices[:5], pos_itemid))
    ndcgs['ndcg10'].append(ndcg(sorted_indices[:10], pos_itemid))
    ndcgs['ndcg20'].append(ndcg(sorted_indices[:20], pos_itemid))
    ndcgs['ndcg50'].append(ndcg(sorted_indices[:50], pos_itemid))
    ndcgs['ndcg100'].append(ndcg(sorted_indices[:100], pos_itemid))

100%|██████████| 10339/10339 [1:14:24<00:00,  2.32it/s]


In [21]:
print("ndcg@5: {}".format(np.mean(ndcgs['ndcg5'])))
print("ndcg@10: {}".format(np.mean(ndcgs['ndcg10'])))
print("ndcg@20: {}".format(np.mean(ndcgs['ndcg20'])))
print("ndcg@50: {}".format(np.mean(ndcgs['ndcg50'])))
print("ndcg@100: {}".format(np.mean(ndcgs['ndcg100'])))

ndcg@5: 0.008728212364549787
ndcg@10: 0.011954262401059736
ndcg@20: 0.015760902606399604
ndcg@50: 0.022376684596318332
ndcg@100: 0.028002380689556346


In [22]:
# モデルなどを保存
import cloudpickle
cloudpickle.dump(model, open("output/Amazon-aggregate_ALS.pkl", "wb"))
cloudpickle.dump(X_train, open("output/Amazon-X_train.pkl", "wb"))
cloudpickle.dump(X_test, open("output/Amazon-X_test.pkl", "wb"))