# Neural Collaborative Filtering


* 参考  
https://github.com/microsoft/recommenders/blob/main/examples/02_model_collaborative_filtering/ncf_deep_dive.ipynb
* 一般化MFと深層学習を両方使い線形性と非線形性の両方を使いscoreを計算する
* 事前学習をすることもできる

In [3]:
%%capture
!pip install recommenders
!pip install tensorflow
!pip install tf_slim

In [44]:
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.timer import Timer

import pandas as pd

In [27]:
# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# Model parameters
EPOCHS = 100
BATCH_SIZE = 256

SEED = 3655

## データの用意

In [28]:
df = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=["userID", "itemID", "rating", "timestamp"]
)

df.head()


INFO:recommenders.datasets.download_utils:Downloading https://files.grouplens.org/datasets/movielens/ml-100k.zip
100%|██████████| 4.81k/4.81k [00:01<00:00, 3.63kKB/s]


Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [29]:
train, test = python_chrono_split(df, 0.75)

In [36]:
test = test[test["userID"].isin(train["userID"].unique())]
test = test[test["itemID"].isin(train["itemID"].unique())]
leave_one_out_test = test.groupby("userID").last().reset_index()

In [37]:
# なぜかデータをファイルに保存しないといけない
train_file = "/workspace/train.csv"
test_file = "/workspace/test.csv"
leave_one_out_test_file = "/workspace/leave_one_out_test.csv"
train.to_csv(train_file, index=False)
test.to_csv(test_file, index=False)
leave_one_out_test.to_csv(leave_one_out_test_file, index=False)

In [38]:
data = NCFDataset(train_file=train_file, test_file=leave_one_out_test_file, seed=SEED, overwrite_test_file_full=True)

INFO:recommenders.models.ncf.dataset:Indexing /workspace/train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing /workspace/leave_one_out_test.csv ...
INFO:recommenders.models.ncf.dataset:Creating full leave-one-out test file /workspace/leave_one_out_test_full.csv ...
100%|██████████| 943/943 [00:04<00:00, 228.36it/s]
INFO:recommenders.models.ncf.dataset:Indexing /workspace/leave_one_out_test_full.csv ...


## NCFの学習
* 割と時間がかかる

In [39]:
model = NCF (
    n_users=data.n_users, 
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[16,8,4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)

2023-01-13 00:47:18.450961: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:357] MLIR V1 optimization pass is not enabled


In [40]:
with Timer() as train_time:
    model.fit(data)

print("Took {} seconds for training.".format(train_time.interval))

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [2.11s]: train_loss = 0.293230 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [2.28s]: train_loss = 0.257240 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 30 [2.06s]: train_loss = 0.242564 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 40 [1.95s]: train_loss = 0.236010 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 50 [2.22s]: train_loss = 0.231343 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 60 [2.14s]: train_loss = 0.227967 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 70 [1.86s]: train_loss = 0.226076 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 80 [1.93s]: train_loss = 0.224949 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 90 [1.85s]: train_loss = 0.223493 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 100 [1.85s]: train_loss = 0.221815 


Took 203.15090401098132 seconds for training.


## NCFの予測

In [41]:
predictions = [[row.userID, row.itemID, model.predict(row.userID, row.itemID)]
               for (_, row) in test.iterrows()]


predictions = pd.DataFrame(predictions, columns=['userID', 'itemID', 'prediction'])
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,1.0,149.0,0.006061
1,1.0,88.0,0.590026
2,1.0,101.0,0.507564
3,1.0,110.0,0.058106
4,1.0,103.0,0.020737


## 予測値の評価

In [42]:
with Timer() as test_time:

    users, items, preds = [], [], []
    item = list(train.itemID.unique())
    for user in train.userID.unique():
        user = [user] * len(item) 
        users.extend(user)
        items.extend(item)
        preds.extend(list(model.predict(user, item, is_list=True)))

    all_predictions = pd.DataFrame(data={"userID": users, "itemID":items, "prediction":preds})

    merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer")
    all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)

print("Took {} seconds for prediction.".format(test_time.interval))

Took 5.704244461026974 seconds for prediction.


In [45]:
eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.047425
NDCG:	0.195586
Precision@K:	0.175716
Recall@K:	0.099128
