In [1]:
from tqdm import tqdm
from sklearn.decomposition import NMF
import pandas as pd
from scipy import sparse
import numpy as np
import implicit

# Load the ratings table.
ratings = pd.read_feather('data/amazon_review_ratings.feather')

# Give every distinct raw id a dense integer index (its position in the
# `.unique()` result) and keep lookup tables in both directions.

# Users: index -> raw id, then the inverse raw id -> index.
userid_unique = pd.Series(ratings["userId"].unique())
index_userid_dict = userid_unique.to_dict()
userid_index_dict = {raw_id: idx for idx, raw_id in index_userid_dict.items()}

# Items: index -> raw id, then the inverse raw id -> index.
itemid_unique = pd.Series(ratings["itemId"].unique())
index_itemid_dict = itemid_unique.to_dict()
itemid_index_dict = {raw_id: idx for idx, raw_id in index_itemid_dict.items()}

# Attach the dense indices to every rating row.
ratings["userId_reindex"] = ratings["userId"].map(userid_index_dict)
ratings["itemid_reindex"] = ratings["itemId"].map(itemid_index_dict)

# Using the re-indexed ids, build a dict that maps each item to its
# genre (the `category` column).
itemid_genres_dict = (
    ratings[['itemid_reindex', 'category']]
    .set_index('itemid_reindex')['category']
    .to_dict()
)

In [2]:
import cloudpickle
# Load the pre-computed train/test matrices and the held-out positive items
# per user. Each file is opened in a `with` block so its handle is closed as
# soon as the object has been read.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by this project.
with open("output/Amazon-X_train.pkl", "rb") as f:
    X_train = cloudpickle.load(f)
with open("output/Amazon-X_test.pkl", "rb") as f:
    X_test = cloudpickle.load(f)
with open('output/test_movies_and_TVs_pos_items_dict.pkl', 'rb') as f:
    test_movies_and_TVs_pos_items_dict = cloudpickle.load(f)

In [3]:
# Split the aggregated training matrix into a book part and a movies part.
# Columns of X_train are the re-indexed item ids, so the category of column v
# is itemid_genres_dict[v].

# Columns whose category contains 'book'.
book_columns = [v for v in range(X_train.shape[1]) if 'book' in itemid_genres_dict[v]]
# Columns whose category contains 'movies_and_TVs'.
movies_columns = [v for v in range(X_train.shape[1]) if 'movies_and_TVs' in itemid_genres_dict[v]]

# Slice out the selected columns.
book_train = X_train[:, book_columns]
movies_train = X_train[:, movies_columns]

# For movies we need the correspondence between the item id in the combined
# matrix and the column position inside movies_train, so keep it as a dict.
# It is derived from movies_columns instead of re-scanning every column with
# a hand-maintained counter.
movies_concat_itemid_dict = {item_id: position for position, item_id in enumerate(movies_columns)}

In [4]:
def _build_concat_itemid_dict(category_keyword):
    """Map combined-matrix item ids of one category to per-category positions.

    Scans the columns of X_train in ascending order and numbers, starting at
    0, every column v whose itemid_genres_dict[v] contains `category_keyword`.

    Returns a dict {combined item id: position among the matching columns}.
    """
    matching_ids = (
        v for v in range(X_train.shape[1]) if category_keyword in itemid_genres_dict[v]
    )
    return {item_id: position for position, item_id in enumerate(matching_ids)}


# We need the correspondence between the item ids of the combined matrix and
# the per-domain column positions, so keep it as dicts (plus the inverses).
book_concat_itemid_dict = _build_concat_itemid_dict('book')
# inverse: book column position -> combined item id
inverse_book_concat_itemid_dict = {
    position: item_id for item_id, position in book_concat_itemid_dict.items()
}

# Same mapping as the one built in the column-splitting cell above.
movies_concat_itemid_dict = _build_concat_itemid_dict('movies_and_TVs')
# inverse: movies column position -> combined item id
inverse_movies_concat_itemid_dict = {
    position: item_id for item_id, position in movies_concat_itemid_dict.items()
}

In [5]:
# Drop the users who have no interaction in each domain, and keep the
# correspondence between the original row index (all users) and the row index
# after the drop as dicts.

# Boolean masks over rows: True where the user has at least one stored entry.
# Computed once and reused for both the row selection and the index maps.
book_users = book_train.getnnz(1) > 0
movies_users = movies_train.getnnz(1) > 0

book_train_selected = book_train[book_users]
movies_train_selected = movies_train[movies_users]

# original row index -> row index inside book_train_selected
# (.tolist() keeps the keys plain Python ints, as in a range() loop)
book_train_action_users = {
    row: position for position, row in enumerate(np.flatnonzero(book_users).tolist())
}
# inverse
inverse_book_train_action_users = {
    position: row for row, position in book_train_action_users.items()
}

# original row index -> row index inside movies_train_selected
movies_train_action_users = {
    row: position for position, row in enumerate(np.flatnonzero(movies_users).tolist())
}
# inverse
inverse_movies_train_action_users = {
    position: row for row, position in movies_train_action_users.items()
}

In [6]:
# ALS per domain. Here the models are not re-trained: the ones saved by the
# "mediate" experiment are loaded instead.
# Files are opened in `with` blocks so the handles are closed after loading.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by this project.
with open('output/book_ALS.pkl', 'rb') as f:
    book_ALS = cloudpickle.load(f)
with open("output/movies_ALS.pkl", "rb") as f:
    movies_ALS = cloudpickle.load(f)

# User latent factors of each domain model.
book_ALS_user_vectors = book_ALS.user_factors
movies_ALS_user_vectors = movies_ALS.user_factors

In [7]:
# For the users who appear in both the book and the movies domain, build two
# row-aligned lists: entry i of each list is the same user's vector in the
# respective domain.
# NOTE(review): assumes row k of each model's user_factors is the k-th user
# of the corresponding *_train_selected matrix -- confirm against how the
# pickled ALS models were fit.
overlap_book_user_vectors = []
overlap_movies_user_vectors = []
for u in tqdm(range(X_train.shape[0])):
    # One lookup per dict; None means the user has no action in that domain
    # (0 is a valid row index, hence the explicit `is None` test).
    book_row = book_train_action_users.get(u)
    movies_row = movies_train_action_users.get(u)
    if book_row is None or movies_row is None:
        continue
    overlap_book_user_vectors.append(book_ALS_user_vectors[book_row].tolist())
    overlap_movies_user_vectors.append(movies_ALS_user_vectors[movies_row].tolist())

100%|██████████| 690240/690240 [00:00<00:00, 1044388.17it/s]


In [8]:
# AutoEncoderの学習をする(movielensで良かったモデルがAutoEncoderだったので、こちらではこれに絞る)
from keras.layers import Input, Dense, Dropout
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf
from keras import backend as K

# Fix the NumPy and TensorFlow seeds, then create a session on the default
# graph and register it as the Keras backend session.
np.random.seed(0)
tf.set_random_seed(0)
sess = tf.Session(graph=tf.get_default_graph())
K.set_session(sess)

def build_model(input_dim, output_dim, hidden_units=(128, 64, 32), output_activation='sigmoid'):
    """Build and compile the dense encoder/decoder network.

    The network maps an `input_dim`-sized vector to an `output_dim`-sized
    vector through ReLU layers whose widths first follow `hidden_units`
    (encoder) and then mirror them back without repeating the bottleneck
    (decoder). With the default widths this is 128 -> 64 -> 32 -> 64 -> 128.

    Parameters
    ----------
    input_dim : int
        Length of each input vector.
    output_dim : int
        Length of each output vector.
    hidden_units : sequence of int, optional
        Encoder layer widths; the last entry is the bottleneck width.
        Defaults to (128, 64, 32).
    output_activation : str, optional
        Activation of the final layer. Defaults to 'sigmoid'.
        NOTE(review): sigmoid bounds every output to (0, 1); confirm that the
        target vectors actually lie in that range, otherwise pass 'linear'.

    Returns
    -------
    keras.models.Model
        Model compiled with the Adam optimizer, mean-squared-error loss and
        MAE/MSE metrics.

    Raises
    ------
    ValueError
        If `hidden_units` is empty.
    """
    hidden_units = tuple(hidden_units)
    if not hidden_units:
        raise ValueError("hidden_units must contain at least one layer width")

    inputs = Input(shape=(input_dim,))

    # Encoder: widths in the given order, ending at the bottleneck.
    encoded = inputs
    for units in hidden_units:
        encoded = Dense(units, activation='relu')(encoded)

    # Decoder: the encoder widths mirrored, skipping the bottleneck itself.
    decoded = encoded
    for units in reversed(hidden_units[:-1]):
        decoded = Dense(units, activation='relu')(decoded)
    decoded = Dense(output_dim, activation=output_activation)(decoded)

    autoencoder = Model(inputs, decoded)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae','mse'])
    return autoencoder

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Paired data: row i of X is a user's book-domain vector and row i of y is
# the same user's movies-domain vector.
X = np.array(overlap_book_user_vectors)
y = np.array(overlap_movies_user_vectors)
epoch_size = 100
batch_size = 128
es_cb = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=0,
    mode='auto')
models = []
rmses_ = []
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# fold_number is 1-based because the checkpoint files are named
# Amazon-model_k_1 ... Amazon-model_k_10 (the evaluation cell loads
# range(1, 11)); the progress line keeps its original 0-based label.
for fold_number, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print(f"learning_count: {fold_number - 1}")
    X_train_vector, X_test_vector = X[train_index], X[test_index]
    y_train_vector, y_test_vector = y[train_index], y[test_index]
    # Carve a validation set out of the fold's training part; it drives both
    # early stopping and best-checkpoint selection.
    X_train_vector, X_val_vector, y_train_vector, y_val_vector = train_test_split(
        X_train_vector, y_train_vector, random_state=42)
    model = build_model(X_train_vector.shape[1], y_train_vector.shape[1])
    # Single source of truth for the checkpoint path (written by the
    # callback, read back by load_model below).
    model_path = f'output/Amazon-model_k_{fold_number}.h5'
    mcheck = ModelCheckpoint(
        model_path,
        monitor='val_loss',
        save_best_only=True,
        verbose=0
    )
    model.fit(
        X_train_vector,
        y_train_vector,
        batch_size=batch_size,
        epochs=epoch_size,
        validation_data=(
            X_val_vector,
            y_val_vector),
        callbacks=[
            mcheck,
            es_cb],
        shuffle=True,
        verbose=0)
    # Score the checkpoint with the best validation loss, not the last epoch.
    best_model = load_model(model_path)
    y_pred = best_model.predict(X_test_vector)
    # Arguments in the documented (y_true, y_pred) order; the value is the
    # same because the squared error is symmetric.
    rmse_ = np.sqrt(mean_squared_error(y_test_vector, y_pred))
    print('rmse: {}'.format(rmse_))
    rmses_.append(rmse_)
    models.append(best_model)

learning_count: 0

rmse: 0.10707534265854471
learning_count: 1
rmse: 0.10620641601163848
learning_count: 2
rmse: 0.09128998814209639
learning_count: 3
rmse: 0.10536068081350755
learning_count: 4
rmse: 0.10353830496714704
learning_count: 5
rmse: 0.09654737544047319
learning_count: 6
rmse: 0.0838253663763926
learning_count: 7
rmse: 0.08520135045128895
learning_count: 8
rmse: 0.09761120742132519
learning_count: 9
rmse: 0.10885488643645487


In [10]:
# Item vectors (item factors) of the movies ALS model.
movies_item_vectors = movies_ALS.item_factors

In [11]:
from lib.recommend_util import ndcg

# Evaluate each saved fold model: for every test user, map the book-domain
# user vector into the movies domain, rank all movies items, and record NDCG
# at several cutoffs. ndcg_values gets one dict per fold model, keyed
# 'ndcg5' ... 'ndcg100', each holding one score per test user.
ndcg_cutoffs = (5, 10, 20, 50, 100)

ndcg_values = []
for learning_count in range(1,11):
    ndcgs = {f'ndcg{k}': [] for k in ndcg_cutoffs}
    best_model = load_model(f'output/Amazon-model_k_{learning_count}.h5')

    for userid, pos_items in tqdm(test_movies_and_TVs_pos_items_dict.items()):
        # Convert pos_items from combined item ids to movies-matrix columns.
        pos_items = np.array([movies_concat_itemid_dict[v] for v in pos_items])

        # Users without a book-domain vector cannot be recommended to, so
        # they unconditionally score 0 at every cutoff. An explicit
        # membership test replaces the former bare `except:`, which would
        # also have swallowed KeyboardInterrupt and unrelated errors.
        if userid not in book_train_action_users:
            for k in ndcg_cutoffs:
                ndcgs[f'ndcg{k}'].append(0)
            continue

        # Book-domain user vector for this user id.
        book_userid = book_train_action_users[userid]
        book_user_vector = book_ALS_user_vectors[book_userid, :]
        # Use the AutoEncoder to map it into the movies vector space.
        movies_user_vector_action_AE = best_model.predict(book_user_vector.reshape(1, -1))
        # Score every movies item by dot product with the item vectors.
        movies_predict = np.dot(movies_user_vector_action_AE, movies_item_vectors.T)
        # Item columns ordered by descending score; [0] drops the batch axis
        # of size 1.
        sorted_indices = np.argsort(-movies_predict)[0]
        for k in ndcg_cutoffs:
            ndcgs[f'ndcg{k}'].append(ndcg(sorted_indices[:k], pos_items))

    ndcg_values.append(ndcgs)

100%|██████████| 10189/10189 [02:12<00:00, 76.76it/s]
100%|██████████| 10189/10189 [02:03<00:00, 82.62it/s]
100%|██████████| 10189/10189 [02:01<00:00, 83.54it/s]
100%|██████████| 10189/10189 [02:02<00:00, 82.89it/s]
100%|██████████| 10189/10189 [02:09<00:00, 78.96it/s]
100%|██████████| 10189/10189 [02:08<00:00, 79.16it/s]
100%|██████████| 10189/10189 [02:07<00:00, 79.92it/s]
100%|██████████| 10189/10189 [02:03<00:00, 82.64it/s]
100%|██████████| 10189/10189 [02:04<00:00, 81.89it/s]
100%|██████████| 10189/10189 [02:05<00:00, 81.42it/s]


In [13]:
# Display the per-fold mean NDCG@5 values.
# NOTE(review): `ndcg5` is only assigned in the aggregation cell that follows
# (it was executed earlier, as In [12]); on a top-to-bottom run of a fresh
# kernel this line raises NameError -- move this cell below that one.
ndcg5

[0.007450934447675713,
 0.007216759371651771,
 0.007303198221356147,
 0.007560777501411066,
 0.006897607838423477,
 0.007876377621849687,
 0.006323827456290612,
 0.006492817899959384,
 0.007009467216250075,
 0.008314511155281302]

In [12]:
# Per-fold averages: entry i of each list is the mean per-user score of the
# i-th element of ndcg_values at that cutoff.
ndcg5 = [np.mean(fold_scores['ndcg5']) for fold_scores in ndcg_values]
ndcg10 = [np.mean(fold_scores['ndcg10']) for fold_scores in ndcg_values]
ndcg20 = [np.mean(fold_scores['ndcg20']) for fold_scores in ndcg_values]
ndcg50 = [np.mean(fold_scores['ndcg50']) for fold_scores in ndcg_values]
ndcg100 = [np.mean(fold_scores['ndcg100']) for fold_scores in ndcg_values]

# Report the mean over folds for every cutoff.
for line_template, per_fold_means in (
    ("ndcg@5: {}", ndcg5),
    ("ndcg@10: {}", ndcg10),
    ("ndcg@20: {}", ndcg20),
    ("ndcg@50: {}", ndcg50),
    ("ndcg@100: {}", ndcg100),
):
    print(line_template.format(np.mean(per_fold_means)))

ndcg@5: 0.007244627873014923
ndcg@10: 0.010229685288534119
ndcg@20: 0.014418256732126808
ndcg@50: 0.02120226903428524
ndcg@100: 0.02818490020565011
