In [1]:
from tqdm import tqdm
from sklearn.decomposition import NMF
import pandas as pd
from scipy import sparse
import numpy as np

# read data
movies = pd.read_csv("data/ml-20m/movies.csv")
ratings = pd.read_csv("data/ml-20m/ratings.csv")

# join
ratings_joined = pd.merge(ratings, movies)

# ratingsをsparse matrixに変換して横持ちにする
action_adventure_ratings = ratings_joined.query("genres.str.contains('Action') or genres.str.contains('Adventure')", 
                                                engine='python').reset_index(drop=True)
# indexing ids
# userid
userid_unique = pd.Series(action_adventure_ratings["userId"].unique())
index_userid_dict = userid_unique.to_dict()
# inverse
userid_index_dict = dict(map(reversed, index_userid_dict.items()))

# itemid
itemid_unique = pd.Series(action_adventure_ratings["movieId"].unique())
index_itemid_dict = itemid_unique.to_dict()
# inverse
itemid_index_dict = dict(map(reversed, index_itemid_dict.items()))

action_adventure_ratings["user_id"] = action_adventure_ratings["userId"].map(userid_index_dict)
action_adventure_ratings["item_id"] = action_adventure_ratings["movieId"].map(itemid_index_dict)

# reindexしたidを使って、アイテムとジャンルの対応が取れるdictを作る
itemid_genres_dict = action_adventure_ratings[['item_id', 'genres']].set_index('item_id')['genres'].to_dict()

In [2]:
import cloudpickle
X_train = cloudpickle.load(open("output/ML-20M-X_train.pkl","rb"))
X_test = cloudpickle.load(open("output/ML-20M-X_test.pkl","rb"))

In [3]:
# aggregateのtrainをactionとadventureに分離する
# actionの列
action_columns = [v for v in range(X_train.shape[1]) if 'Action' in itemid_genres_dict[v]]
# adventureの列
adventure_columns = [v for v in range(X_train.shape[1]) if 'Adventure' in itemid_genres_dict[v]]

# 選んだカラムに応じてとってくる
action_train = X_train[:, action_columns]
adventure_train = X_train[:, adventure_columns]

# adventureのみ、アイテムidのconcatとの対応関係が必要なので辞書として持っておく
adventure_concat_itemid_dict = {}
count = 0
for v in range(X_train.shape[1]):
    if 'Adventure' in itemid_genres_dict[v]:
        adventure_concat_itemid_dict[v] = count
        count += 1

In [4]:
# アイテムidのconcatとの対応関係が必要なので辞書として持っておく
action_concat_itemid_dict = {}
count = 0
for v in range(X_train.shape[1]):
    if 'Action' in itemid_genres_dict[v]:
        action_concat_itemid_dict[v] = count
        count += 1
# inverse
inverse_action_concat_itemid_dict = dict(map(reversed, action_concat_itemid_dict.items()))

adventure_concat_itemid_dict = {}
count = 0
for v in range(X_train.shape[1]):
    if 'Adventure' in itemid_genres_dict[v]:
        adventure_concat_itemid_dict[v] = count
        count += 1
# inverse
inverse_adventure_concat_itemid_dict = dict(map(reversed, adventure_concat_itemid_dict.items()))

In [5]:
# それぞれにアクションしていないユーザを削る
# 全ユーザと、削ったあとでの対応関係を辞書として持っておく
action_train_selected = action_train[action_train.getnnz(1)>0]
adventure_train_selected = adventure_train[adventure_train.getnnz(1)>0]

action_train_action_users = {}
action_users = action_train.getnnz(1)>0
count = 0
for i in range(action_train.shape[0]):
    if action_users[i]:
        action_train_action_users[i] = count
        count += 1

# inverse
inverse_action_train_action_users = dict(map(reversed, action_train_action_users.items()))

adventure_train_action_users = {}
adventure_users = adventure_train.getnnz(1)>0
count = 0
for i in range(adventure_train.shape[0]):
    if adventure_users[i]:
        adventure_train_action_users[i] = count
        count += 1

# inverse
inverse_adventure_train_action_users = dict(map(reversed, adventure_train_action_users.items()))

In [6]:
# それぞれでALSする
# 今回は mediateでやったときのものを使う
action_ALS = cloudpickle.load(open('output/ML-20M-action_ALS.pkl', 'rb'))
adventure_ALS = cloudpickle.load(open("output/ML-20M-adventure_ALS.pkl","rb"))

action_ALS_user_vectors = action_ALS.user_factors
adventure_ALS_user_vectors = adventure_ALS.user_factors

In [7]:
# actionとadventureでoverlapしているユーザで、ベクトルの対応表を作る
overlap_action_user_vectors = []
overlap_adventure_user_vectors = []
count = 0
for u in tqdm(range(X_train.shape[0])):
    if u in action_train_action_users and u in adventure_train_action_users:
        overlap_action_user_vectors.append(action_ALS_user_vectors[action_train_action_users[u]].tolist())
        overlap_adventure_user_vectors.append(adventure_ALS_user_vectors[adventure_train_action_users[u]].tolist())

100%|██████████| 138389/138389 [00:01<00:00, 80989.41it/s]


In [8]:
# AutoEncoderの学習をする
from keras.layers import Input, Dense
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf
from keras import backend as K

np.random.seed(0)
tf.set_random_seed(0)
sess = tf.Session(graph=tf.get_default_graph())
K.set_session(sess)

def build_model(input_dim, output_dim):
    inputs = Input(shape=(input_dim,))
    encoded = Dense(128, activation='relu')(inputs)
    encoded = Dense(64, activation='relu')(encoded)
    encoded = Dense(32, activation='relu')(encoded)

    decoded = Dense(64, activation='relu')(encoded)
    decoded = Dense(128, activation='relu')(decoded)
    decoded = Dense(output_dim, activation='sigmoid')(decoded)
    autoencoder = Model(inputs, decoded)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae','mse'])
    return autoencoder

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [15]:
from sklearn.model_selection import train_test_split
X_train_vector, X_test_vector, y_train_vector, y_test_vector = train_test_split(overlap_action_user_vectors, overlap_adventure_user_vectors, random_state=42)
X_train_vector, X_val_vector, y_train_vector, y_val_vector = train_test_split(X_train_vector, y_train_vector, random_state=42)

epoch_size = 100
batch_size = 256
mcheck = ModelCheckpoint(
    'output/ml-20m-model.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1)
es_cb = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=1,
    mode='auto')
model = build_model(np.array(X_train_vector).shape[1], np.array(y_train_vector).shape[1])
model.fit(
    np.array(X_train_vector),
    np.array(y_train_vector),
    batch_size=batch_size,
    epochs=epoch_size,
    validation_data=(
        np.array(X_val_vector),
        np.array(y_val_vector)),
    callbacks=[
        mcheck,
        es_cb],
    shuffle=True,
    verbose=1)

Train on 61805 samples, validate on 20602 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.34584, saving model to output/ml-20m-model.h5
Epoch 2/100

Epoch 00002: val_loss improved from 0.34584 to 0.33902, saving model to output/ml-20m-model.h5
Epoch 3/100

Epoch 00003: val_loss improved from 0.33902 to 0.32669, saving model to output/ml-20m-model.h5
Epoch 4/100

Epoch 00004: val_loss improved from 0.32669 to 0.30927, saving model to output/ml-20m-model.h5
Epoch 5/100

Epoch 00005: val_loss improved from 0.30927 to 0.30083, saving model to output/ml-20m-model.h5
Epoch 6/100

Epoch 00006: val_loss improved from 0.30083 to 0.29502, saving model to output/ml-20m-model.h5
Epoch 7/100

Epoch 00007: val_loss improved from 0.29502 to 0.29096, saving model to output/ml-20m-model.h5
Epoch 8/100

Epoch 00008: val_loss improved from 0.29096 to 0.28856, saving model to output/ml-20m-model.h5
Epoch 9/100

Epoch 00009: val_loss improved from 0.28856 to 0.28621, saving model to outpu

Epoch 32/100

Epoch 00032: val_loss improved from 0.27152 to 0.27118, saving model to output/ml-20m-model.h5
Epoch 33/100

Epoch 00033: val_loss improved from 0.27118 to 0.27106, saving model to output/ml-20m-model.h5
Epoch 34/100

Epoch 00034: val_loss improved from 0.27106 to 0.27067, saving model to output/ml-20m-model.h5
Epoch 35/100

Epoch 00035: val_loss did not improve from 0.27067
Epoch 36/100

Epoch 00036: val_loss did not improve from 0.27067
Epoch 37/100

Epoch 00037: val_loss improved from 0.27067 to 0.27062, saving model to output/ml-20m-model.h5
Epoch 38/100

Epoch 00038: val_loss improved from 0.27062 to 0.27024, saving model to output/ml-20m-model.h5
Epoch 39/100

Epoch 00039: val_loss improved from 0.27024 to 0.27016, saving model to output/ml-20m-model.h5
Epoch 40/100

Epoch 00040: val_loss improved from 0.27016 to 0.27009, saving model to output/ml-20m-model.h5
Epoch 41/100

Epoch 00041: val_loss improved from 0.27009 to 0.26993, saving model to output/ml-20m-model.h


Epoch 00065: val_loss did not improve from 0.26839
Epoch 66/100

Epoch 00066: val_loss did not improve from 0.26839
Epoch 67/100

Epoch 00067: val_loss did not improve from 0.26839
Epoch 68/100

Epoch 00068: val_loss did not improve from 0.26839
Epoch 69/100

Epoch 00069: val_loss improved from 0.26839 to 0.26837, saving model to output/ml-20m-model.h5
Epoch 70/100

Epoch 00070: val_loss improved from 0.26837 to 0.26812, saving model to output/ml-20m-model.h5
Epoch 71/100

Epoch 00071: val_loss improved from 0.26812 to 0.26792, saving model to output/ml-20m-model.h5
Epoch 72/100

Epoch 00072: val_loss did not improve from 0.26792
Epoch 73/100

Epoch 00073: val_loss did not improve from 0.26792
Epoch 74/100

Epoch 00074: val_loss did not improve from 0.26792
Epoch 75/100

Epoch 00075: val_loss did not improve from 0.26792
Epoch 76/100

Epoch 00076: val_loss did not improve from 0.26792
Epoch 77/100

Epoch 00077: val_loss improved from 0.26792 to 0.26789, saving model to output/ml-20m-m


Epoch 00099: val_loss did not improve from 0.26740
Epoch 100/100

Epoch 00100: val_loss did not improve from 0.26740


<keras.callbacks.callbacks.History at 0x7fc9ac46bf28>

In [16]:
# テストデータに対するRMSE計算
from sklearn.metrics import mean_squared_error
best_model = load_model('output/ml-20m-model.h5')
y_pred = best_model.predict(np.array(X_test_vector))
rmse_ = np.sqrt(mean_squared_error(y_pred, np.array(y_test_vector)))
print('rmse: {}'.format(rmse_))

rmse: 0.5176917184448359


In [17]:
# vaeとで、rmseを比較し、良い方を選ぶ
# ref. https://keras.io/examples/variational_autoencoder/

from keras.layers import Lambda, Input, Dense
from keras.models import Model
from keras.losses import mse, binary_crossentropy
from keras.utils import plot_model
from keras import backend as K

class VAE():
    def __init__(self, input_dim, intermediate_dim, latent_dim, original_dim):
        self.input_dim = input_dim
        self.original_dim = original_dim
        self.intermediate_dim = intermediate_dim
        self.latent_dim = latent_dim
        self.z_mean = None
        self.z_log_var = None


    # reparameterization trick
    # instead of sampling from Q(z|X), sample epsilon = N(0,I)
    # z = z_mean + sqrt(var) * epsilon
    def sampling(self, args):
        """Reparameterization trick by sampling from an isotropic unit Gaussian.

        # Arguments
            args (tensor): mean and log of variance of Q(z|X)

        # Returns
            z (tensor): sampled latent vector
        """

        z_mean, z_log_var = args
        batch = K.shape(z_mean)[0]
        dim = K.int_shape(z_mean)[1]
        # by default, random_normal has mean = 0 and std = 1.0
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + K.exp(0.5 * z_log_var) * epsilon


    def vae_loss(self, y_true, y_pred):
        reconstruction_loss = mse(y_true, y_pred)
        reconstruction_loss *= self.original_dim
        kl_loss = 1 + self.z_log_var - K.square(self.z_mean) - K.exp(self.z_log_var)
        kl_loss = K.sum(kl_loss, axis=-1)
        kl_loss *= -0.5
        vae_loss = K.mean(reconstruction_loss + kl_loss)
        return vae_loss


    def build_vae(self):
        # VAE model = encoder + decoder
        # build encoder model
        inputs = Input(shape=(self.input_dim,), name='encoder_input')
        x = Dense(128, activation='relu')(inputs)
        x = Dense(64, activation='relu')(x)
        x = Dense(32, activation='relu')(x)
        self.z_mean = Dense(self.latent_dim, name='z_mean')(x)
        self.z_log_var = Dense(self.latent_dim, name='z_log_var')(x)

        # use reparameterization trick to push the sampling out as input
        # note that "output_shape" isn't necessary with the TensorFlow backend
        z = Lambda(self.sampling, output_shape=(self.latent_dim,), name='z')([self.z_mean, self.z_log_var])

        # instantiate encoder model
        encoder = Model(inputs, [self.z_mean, self.z_log_var, z], name='encoder')

        # build decoder model
        latent_inputs = Input(shape=(self.latent_dim,), name='z_sampling')
        x = Dense(32, activation='relu')(latent_inputs)
        x = Dense(64, activation='relu')(latent_inputs)
        x = Dense(128, activation='relu')(latent_inputs)
        decoder_outputs = Dense(self.original_dim, activation='sigmoid')(x)

        # instantiate decoder model
        decoder = Model(latent_inputs, decoder_outputs, name='decoder')

        # instantiate VAE model
        outputs = decoder(encoder(inputs)[2])
        vae = Model(inputs, outputs, name='vae')
        vae.compile(optimizer='adam', loss=self.vae_loss)
        return vae

In [18]:
vae = VAE(np.array(X_train_vector).shape[1], 256, 2, np.array(y_train_vector).shape[1])
model = vae.build_vae()

In [19]:
model.summary()

Model: "vae"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_input (InputLayer)   (None, 100)               0         
_________________________________________________________________
encoder (Model)              [(None, 2), (None, 2), (N 23396     
_________________________________________________________________
decoder (Model)              (None, 100)               13284     
Total params: 36,680
Trainable params: 36,680
Non-trainable params: 0
_________________________________________________________________


In [20]:
epoch_size = 100
batch_size = 256
mcheck = ModelCheckpoint(
    'output/ml-20m-vae.h5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1)
es_cb = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=1,
    mode='auto')
model.fit(
    np.array(X_train_vector),
    np.array(y_train_vector),
    batch_size=batch_size,
    epochs=epoch_size,
    validation_data=(
        np.array(X_val_vector),
        np.array(y_val_vector)),
    callbacks=[
        mcheck,
        es_cb],
    shuffle=True,
    verbose=1)

Train on 61805 samples, validate on 20602 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 34.33868, saving model to output/ml-20m-vae.h5
Epoch 2/100

Epoch 00002: val_loss improved from 34.33868 to 33.36791, saving model to output/ml-20m-vae.h5
Epoch 3/100

Epoch 00003: val_loss improved from 33.36791 to 32.79220, saving model to output/ml-20m-vae.h5
Epoch 4/100

Epoch 00004: val_loss improved from 32.79220 to 32.61888, saving model to output/ml-20m-vae.h5
Epoch 5/100

Epoch 00005: val_loss improved from 32.61888 to 32.51373, saving model to output/ml-20m-vae.h5
Epoch 6/100

Epoch 00006: val_loss improved from 32.51373 to 32.44305, saving model to output/ml-20m-vae.h5
Epoch 7/100

Epoch 00007: val_loss improved from 32.44305 to 32.42496, saving model to output/ml-20m-vae.h5
Epoch 8/100

Epoch 00008: val_loss improved from 32.42496 to 32.36423, saving model to output/ml-20m-vae.h5
Epoch 9/100

Epoch 00009: val_loss improved from 32.36423 to 32.30604, saving model to outp


Epoch 00045: val_loss did not improve from 32.01083
Epoch 46/100

Epoch 00046: val_loss did not improve from 32.01083
Epoch 00046: early stopping


<keras.callbacks.callbacks.History at 0x7fc9844d8c50>

In [21]:
# テストデータに対するRMSE計算
from sklearn.metrics import mean_squared_error
best_model_vae = vae.build_vae()
best_model_vae.load_weights('output/ml-20m-vae.h5')
y_pred = best_model_vae.predict(np.array(X_test_vector))
rmse_ = np.sqrt(mean_squared_error(y_pred, np.array(y_test_vector)))
print('rmse: {}'.format(rmse_))

rmse: 0.5572697988473447


(VAEのやりかたが悪かったようなだけな気もするが)今回は素のautoencoderを採用する

学習結果が固定できないので、10回学習した値の平均値を採用する

In [25]:
X = np.array(overlap_action_user_vectors)
y = np.array(overlap_adventure_user_vectors)
epoch_size = 100
batch_size = 256
es_cb = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=0,
    mode='auto')
models = []
rmses_ = []
for learning_count in range(10):
    print(f"learning_count: {learning_count}")
    X_train_vector, X_test_vector, y_train_vector, y_test_vector = train_test_split(X, y, random_state=42)
    X_train_vector, X_val_vector, y_train_vector, y_val_vector = train_test_split(X_train_vector, y_train_vector, random_state=42)
    model = build_model(X_train_vector.shape[1], y_train_vector.shape[1])
    mcheck = ModelCheckpoint(
        f'output/ml-20m-model_{learning_count}.h5',
        monitor='val_loss',
        save_best_only=True,
        verbose=0
    )
    model.fit(
        X_train_vector,
        y_train_vector,
        batch_size=batch_size,
        epochs=epoch_size,
        validation_data=(
            X_val_vector,
            y_val_vector),
        callbacks=[
            mcheck,
            es_cb],
        shuffle=True,
        verbose=0)
    best_model = load_model(f'output/ml-20m-model_{learning_count}.h5')
    y_pred = best_model.predict(X_test_vector)
    rmse_ = np.sqrt(mean_squared_error(y_pred, y_test_vector))
    print('rmse: {}'.format(rmse_))
    rmses_.append(rmse_)
    models.append(best_model)

learning_count: 0
rmse: 0.5183830380815655
learning_count: 1
rmse: 0.5187530618319904
learning_count: 2
rmse: 0.5181147621281058
learning_count: 3
rmse: 0.5179883554745452
learning_count: 4
rmse: 0.5177950388480708
learning_count: 5
rmse: 0.51769281554704
learning_count: 6
rmse: 0.5166800007327981
learning_count: 7
rmse: 0.5181607680538545
learning_count: 8
rmse: 0.5173283245405115
learning_count: 9
rmse: 0.5167998675632172


In [16]:
# 評価対象のユーザ
test_adventure_pos_items_dict = {}
for i in tqdm(range(X_test.shape[0])):
    # trainでadventureにアクションしていないユーザに
    rated_items = X_train[i, :].indices
    if len([v for v in rated_items if 'Adventure' in itemid_genres_dict[v]]) == 0:
        # X_testの中でstoreしているアイテムが0以上のユーザに
        if X_test[i, :].nnz > 0:
            test_items = []
            selected_user_ratings = X_test[i, :]
            value_indices = selected_user_ratings.indices
            sorted_indices = np.argsort(-X_test[i, :].toarray())[0]
            # valueがあるアイテムのジャンルがadventureの場合に
            for v in sorted_indices[:len(value_indices)]:
                if 'Adventure' in itemid_genres_dict[v]:
                    test_items.append(v)
            if len(test_items) > 0:
                test_adventure_pos_items_dict[i] = test_items

100%|██████████| 138389/138389 [00:20<00:00, 6720.76it/s]


In [18]:
# adventureのitemのベクトル
adventure_item_vectors = adventure_ALS.item_factors

In [28]:
from lib.recommend_util import ndcg

ndcg_values = []
for learning_count in range(10):
    ndcgs = {
        'ndcg5':  [],
        'ndcg10':  [],
        'ndcg20':  [],
        'ndcg50':  [],
        'ndcg100':  []
    }
    best_model = load_model(f'output/ml-20m-model_{learning_count}.h5')

    for userid, pos_items in tqdm(test_adventure_pos_items_dict.items()):
       # pos_itemsをadventure_matrixの次元に変換する
        pos_items = np.array([adventure_concat_itemid_dict[v] for v in pos_items])
        # useridに対応するユーザベクトル(action)を得る
        try:
            action_userid = action_train_action_users[userid]
        except:
            # 推薦できないユーザの場合は無条件で0を入れる
            ndcgs['ndcg5'].append(0)
            ndcgs['ndcg10'].append(0)
            ndcgs['ndcg20'].append(0)
            ndcgs['ndcg50'].append(0)
            ndcgs['ndcg100'].append(0)
            continue

        action_user_vector = action_ALS_user_vectors[action_userid, :]
        # autoencoderを使ってadventureの次元に変換する
        adventure_user_vector_action_AE = best_model.predict(action_user_vector.reshape(1, -1))
        # adventureのitemのベクトルと掛け合わせる
        adv_predict = np.dot(adventure_user_vector_action_AE, adventure_item_vectors.T)
        # sum_ratingsをargsort
        sorted_indices = np.array([v for v in np.argsort(-adv_predict)])[0]
        ndcgs['ndcg5'].append(ndcg(sorted_indices[:5], pos_items))
        ndcgs['ndcg10'].append(ndcg(sorted_indices[:10], pos_items))
        ndcgs['ndcg20'].append(ndcg(sorted_indices[:20], pos_items))
        ndcgs['ndcg50'].append(ndcg(sorted_indices[:50], pos_items))
        ndcgs['ndcg100'].append(ndcg(sorted_indices[:100], pos_items))
    
    ndcg_values.append(ndcgs)

100%|██████████| 18523/18523 [01:13<00:00, 251.67it/s]
100%|██████████| 18523/18523 [01:14<00:00, 248.33it/s]
100%|██████████| 18523/18523 [01:16<00:00, 242.66it/s]
100%|██████████| 18523/18523 [01:17<00:00, 239.73it/s]
100%|██████████| 18523/18523 [01:17<00:00, 239.98it/s]
100%|██████████| 18523/18523 [01:18<00:00, 235.97it/s]
100%|██████████| 18523/18523 [01:21<00:00, 226.29it/s]
100%|██████████| 18523/18523 [01:19<00:00, 231.56it/s]
100%|██████████| 18523/18523 [01:21<00:00, 226.48it/s]
100%|██████████| 18523/18523 [01:24<00:00, 220.29it/s]


In [33]:
ndcg5 = []
ndcg10 = []
ndcg20 = []
ndcg50 = []
ndcg100 = []
for ndcgs in  ndcg_values:
    ndcg5.append(np.mean(ndcgs['ndcg5']))
    ndcg10.append(np.mean(ndcgs['ndcg10']))
    ndcg20.append(np.mean(ndcgs['ndcg20']))
    ndcg50.append(np.mean(ndcgs['ndcg50']))
    ndcg100.append(np.mean(ndcgs['ndcg100']))
print("ndcg@5: {}".format(np.mean(ndcg5)))
print("ndcg@10: {}".format(np.mean(ndcg10)))
print("ndcg@20: {}".format(np.mean(ndcg20)))
print("ndcg@50: {}".format(np.mean(ndcg50)))
print("ndcg@100: {}".format(np.mean(ndcg100)))

ndcg@5: 0.13987014738496223
ndcg@10: 0.1872858615974345
ndcg@20: 0.23374553053393812
ndcg@50: 0.30161603726932323
ndcg@100: 0.34618679444756456


In [13]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error

X = np.array(overlap_action_user_vectors)
y = np.array(overlap_adventure_user_vectors)
epoch_size = 100
batch_size = 256
es_cb = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=0,
    mode='auto')
models = []
rmses_ = []
kf = KFold(n_splits=10, shuffle=True, random_state=42)
count = 0
for train_index, test_index in kf.split(X):
    print(f"learning_count: {count}")
    count += 1
    X_train_vector, X_test_vector = X[train_index], X[test_index]
    y_train_vector, y_test_vector = y[train_index], y[test_index]
    X_train_vector, X_val_vector, y_train_vector, y_val_vector = train_test_split(X_train_vector, y_train_vector, random_state=42)
    model = build_model(X_train_vector.shape[1], y_train_vector.shape[1])
    mcheck = ModelCheckpoint(
        f'output/ml-20m-model_k_{count}.h5',
        monitor='val_loss',
        save_best_only=True,
        verbose=0
    )
    model.fit(
        X_train_vector,
        y_train_vector,
        batch_size=batch_size,
        epochs=epoch_size,
        validation_data=(
            X_val_vector,
            y_val_vector),
        callbacks=[
            mcheck,
            es_cb],
        shuffle=True,
        verbose=0)
    best_model = load_model(f'output/ml-20m-model_k_{count}.h5')
    y_pred = best_model.predict(X_test_vector)
    rmse_ = np.sqrt(mean_squared_error(y_pred, y_test_vector))
    print('rmse: {}'.format(rmse_))
    rmses_.append(rmse_)
    models.append(best_model)

learning_count: 0
rmse: 0.5251619330752034
learning_count: 1
rmse: 0.507233192190287
learning_count: 2
rmse: 0.5161260012470608
learning_count: 3
rmse: 0.5108282018275829
learning_count: 4
rmse: 0.5257376499367238
learning_count: 5
rmse: 0.5233868038857069
learning_count: 6
rmse: 0.5217946238073435
learning_count: 7
rmse: 0.5153032209228041
learning_count: 8
rmse: 0.5137775134628675
learning_count: 9
rmse: 0.5158392218917118


In [19]:
from lib.recommend_util import ndcg

ndcg_values = []
for learning_count in range(1,11):
    ndcgs = {
        'ndcg5':  [],
        'ndcg10':  [],
        'ndcg20':  [],
        'ndcg50':  [],
        'ndcg100':  []
    }
    best_model = load_model(f'output/ml-20m-model_k_{learning_count}.h5')

    for userid, pos_items in tqdm(test_adventure_pos_items_dict.items()):
       # pos_itemsをadventure_matrixの次元に変換する
        pos_items = np.array([adventure_concat_itemid_dict[v] for v in pos_items])
        # useridに対応するユーザベクトル(action)を得る
        try:
            action_userid = action_train_action_users[userid]
        except:
            # 推薦できないユーザの場合は無条件で0を入れる
            ndcgs['ndcg5'].append(0)
            ndcgs['ndcg10'].append(0)
            ndcgs['ndcg20'].append(0)
            ndcgs['ndcg50'].append(0)
            ndcgs['ndcg100'].append(0)
            continue

        action_user_vector = action_ALS_user_vectors[action_userid, :]
        # autoencoderを使ってadventureの次元に変換する
        adventure_user_vector_action_AE = best_model.predict(action_user_vector.reshape(1, -1))
        # adventureのitemのベクトルと掛け合わせる
        adv_predict = np.dot(adventure_user_vector_action_AE, adventure_item_vectors.T)
        # sum_ratingsをargsort
        sorted_indices = np.array([v for v in np.argsort(-adv_predict)])[0]
        ndcgs['ndcg5'].append(ndcg(sorted_indices[:5], pos_items))
        ndcgs['ndcg10'].append(ndcg(sorted_indices[:10], pos_items))
        ndcgs['ndcg20'].append(ndcg(sorted_indices[:20], pos_items))
        ndcgs['ndcg50'].append(ndcg(sorted_indices[:50], pos_items))
        ndcgs['ndcg100'].append(ndcg(sorted_indices[:100], pos_items))
    
    ndcg_values.append(ndcgs)

100%|██████████| 18523/18523 [01:01<00:00, 303.13it/s]
100%|██████████| 18523/18523 [01:02<00:00, 297.20it/s]
100%|██████████| 18523/18523 [01:03<00:00, 292.42it/s]
100%|██████████| 18523/18523 [01:04<00:00, 288.73it/s]
100%|██████████| 18523/18523 [01:04<00:00, 285.32it/s]
100%|██████████| 18523/18523 [01:05<00:00, 282.67it/s]
100%|██████████| 18523/18523 [01:05<00:00, 283.13it/s]
100%|██████████| 18523/18523 [01:07<00:00, 274.51it/s]
100%|██████████| 18523/18523 [01:08<00:00, 271.65it/s]
100%|██████████| 18523/18523 [01:08<00:00, 271.63it/s]


In [20]:
ndcg5 = []
ndcg10 = []
ndcg20 = []
ndcg50 = []
ndcg100 = []
for ndcgs in  ndcg_values:
    ndcg5.append(np.mean(ndcgs['ndcg5']))
    ndcg10.append(np.mean(ndcgs['ndcg10']))
    ndcg20.append(np.mean(ndcgs['ndcg20']))
    ndcg50.append(np.mean(ndcgs['ndcg50']))
    ndcg100.append(np.mean(ndcgs['ndcg100']))
print("ndcg@5: {}".format(np.mean(ndcg5)))
print("ndcg@10: {}".format(np.mean(ndcg10)))
print("ndcg@20: {}".format(np.mean(ndcg20)))
print("ndcg@50: {}".format(np.mean(ndcg50)))
print("ndcg@100: {}".format(np.mean(ndcg100)))

ndcg@5: 0.13976131508579417
ndcg@10: 0.1865852172889774
ndcg@20: 0.23292729816527377
ndcg@50: 0.29863769196583057
ndcg@100: 0.34043177618780235
