In [1]:
import os
import sys
import numpy as np
import zipfile
from tqdm import tqdm
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.nrms import NRMSModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set
from recommenders.utils.notebook_utils import store_metadata

# Record the interpreter and TensorFlow versions next to the results so the
# captured outputs below can be tied to a concrete environment.
python_version = sys.version
tensorflow_version = tf.__version__
print("System version: {}".format(python_version))
print("Tensorflow version: {}".format(tensorflow_version))

2024-06-13 13:12:46.567182: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


System version: 3.9.18 (main, Sep 11 2023, 08:20:50) 
[Clang 14.0.6 ]
Tensorflow version: 2.12.0


## Prepare parameters

In [2]:
# Dataset variant handed to get_mind_data_set below.
# Options: demo, small, large
MIND_type = "demo"

# Seed passed to the NRMSModel constructor.
seed = 42

# Training-loop settings forwarded to prepare_hparams.
epochs = 5
batch_size = 32

In [4]:
# All downloaded artifacts live in a throwaway directory. Keep `tmpdir`
# referenced for the lifetime of the notebook: the paths below point inside it.
tmpdir = TemporaryDirectory()
data_path = tmpdir.name

# The three sub-directories the resources are downloaded into.
train_dir = os.path.join(data_path, "train")
valid_dir = os.path.join(data_path, "valid")
utils_dir = os.path.join(data_path, "utils")

# Train / validation splits: news file plus behaviors file.
train_news_file = os.path.join(train_dir, "news.tsv")
train_behaviors_file = os.path.join(train_dir, "behaviors.tsv")
valid_news_file = os.path.join(valid_dir, "news.tsv")
valid_behaviors_file = os.path.join(valid_dir, "behaviors.tsv")

# Auxiliary resources consumed by prepare_hparams in a later cell.
wordEmb_file = os.path.join(utils_dir, "embedding.npy")
userDict_file = os.path.join(utils_dir, "uid2index.pkl")
wordDict_file = os.path.join(utils_dir, "word_dict.pkl")
yaml_file = os.path.join(utils_dir, "nrms.yaml")

# Resolve the base URL and resource names for the selected MIND variant.
mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

# Each entry: (sentinel file, base URL, destination directory, resource name).
# A resource is fetched only when its sentinel file is not on disk yet.
pending_downloads = (
    (train_news_file, mind_url, train_dir, mind_train_dataset),
    (valid_news_file, mind_url, valid_dir, mind_dev_dataset),
    (
        yaml_file,
        "https://recodatasets.z20.web.core.windows.net/newsrec/",
        utils_dir,
        mind_utils,
    ),
)
for sentinel_file, base_url, target_dir, resource_name in pending_downloads:
    if not os.path.exists(sentinel_file):
        download_deeprec_resources(base_url, target_dir, resource_name)

100%|██████████| 17.0k/17.0k [00:04<00:00, 4.11kKB/s]
100%|██████████| 9.84k/9.84k [00:01<00:00, 5.36kKB/s]
100%|██████████| 95.0k/95.0k [00:14<00:00, 6.35kKB/s]


In [5]:
# Values supplied from this notebook; the remaining hyperparameters shown in
# the printout below presumably come from the YAML file (confirm against
# prepare_hparams).
hparam_overrides = dict(
    wordEmb_file=wordEmb_file,
    wordDict_file=wordDict_file,
    userDict_file=userDict_file,
    batch_size=batch_size,
    epochs=epochs,
    show_step=10,
)
hparams = prepare_hparams(yaml_file, **hparam_overrides)

# Print the resolved configuration so the run is self-documenting.
print(hparams)

HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 20, 'head_dim': 20, 'filter_num': 200, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 5, 'batch_size': 32, 'show_step': 10, 'title_size': 30, 'his_size': 50, 'data_format': 'news', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'model_type': 'nrms', 'loss': 'cross_entropy_loss', 'wordEmb_file': '/var/folders/qd/_v2sn93n1gg2903qwd76wgfw0000gp/T/tmpird1zytn/utils/embedding.npy', 'wordDict_file': '/var/folders/qd/_v2sn93n1gg2903qwd76wgfw0000gp/T/tmpird1zytn/utils/word_dict.pkl', 'userDict_file': '/var/folders/qd/_v2sn93n1gg2903qwd76wgfw0000gp/T/tmpird1zytn/utils/uid2index.pkl'}


In [6]:
# Bind the iterator *class* (not an instance): it is handed to NRMSModel
# uninstantiated in the next cell, which presumably constructs it itself.
iterator = MINDIterator

In [7]:
# Build the NRMS model from the prepared hyperparameters and the iterator
# class; `seed` comes from the parameter cell.
model = NRMSModel(hparams, iterator, seed=seed)

2024-06-13 13:14:48.837880: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:353] MLIR V1 optimization pass is not enabled
2024-06-13 13:14:48.944694: W tensorflow/c/c_api.cc:300] Operation '{name:'embedding/embeddings/Assign' id:26 op device:{requested: '', assigned: ''} def:{{{node embedding/embeddings/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](embedding/embeddings, embedding/embeddings/Initializer/stateless_random_uniform)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
  super().__init__(name, **kwargs)


In [8]:
# Score the model on the validation split before any call to fit, giving a
# baseline to compare the per-epoch metrics against.
baseline_metrics = model.run_eval(valid_news_file, valid_behaviors_file)
print(baseline_metrics)

  updates=self.state_updates,
2024-06-13 13:14:54.956280: W tensorflow/c/c_api.cc:300] Operation '{name:'att_layer2_1/q/Assign' id:814 op device:{requested: '', assigned: ''} def:{{{node att_layer2_1/q/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](att_layer2_1/q, att_layer2_1/q/Initializer/random_uniform)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
586it [00:18, 32.19it/s]
0it [00:00, ?it/s]2024-06-13 13:15:12.656937: W tensorflow/c/c_api.cc:300] Operation '{name:'att_layer2_1/Sum_1' id:864 op device:{requested: '', assigned: ''} def:{{{node att_layer2_1/Sum_1}} = Sum[T=DT_FLOAT, Tidx=DT_INT32, _has_manual_control_dependencies=true, keep_dims=false](att_layer2_1/mul, att_layer2_1/Sum_1/reduction_indices)}}' was changed by setting attribute after it was run by a 

{'group_auc': 0.4792, 'mean_mrr': 0.2059, 'ndcg@5': 0.2045, 'ndcg@10': 0.2701}


In [9]:

%%time
# Train on the train split; the validation files are passed as well, and the
# recorded output reports validation metrics after every epoch.
# NOTE: the recorded run of this cell took about 5 hours of wall time.
model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)

0it [00:00, ?it/s]2024-06-13 13:20:16.879031: W tensorflow/c/c_api.cc:300] Operation '{name:'loss/mul' id:2002 op device:{requested: '', assigned: ''} def:{{{node loss/mul}} = Mul[T=DT_FLOAT, _has_manual_control_dependencies=true](loss/mul/x, loss/activation_loss/value)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2024-06-13 13:20:17.077067: W tensorflow/c/c_api.cc:300] Operation '{name:'training/Adam/self_attention_1/WV/v/Assign' id:2795 op device:{requested: '', assigned: ''} def:{{{node training/Adam/self_attention_1/WV/v/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](training/Adam/self_attention_1/WV/v, training/Adam/self_attention_1/WV/v/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, an

at epoch 1
train info: logloss loss:1.5145636053814193
eval info: group_auc:0.5804, mean_mrr:0.2434, ndcg@10:0.3307, ndcg@5:0.2586
at epoch 1 , train time: 3532.4 eval time: 290.4


step 1080 , total_loss: 1.4202, data_loss: 1.3225: : 1086it [1:00:33,  3.35s/it]
586it [00:09, 61.29it/s]
236it [04:12,  1.07s/it]
7538it [00:00, 8872.94it/s] 


at epoch 2
train info: logloss loss:1.4201377816841307
eval info: group_auc:0.6001, mean_mrr:0.2538, ndcg@10:0.3446, ndcg@5:0.2695
at epoch 2 , train time: 3633.4 eval time: 270.3


step 1080 , total_loss: 1.3774, data_loss: 1.1489: : 1086it [52:58,  2.93s/it]
586it [00:08, 71.93it/s]
236it [03:26,  1.14it/s]
7538it [00:00, 13012.97it/s]


at epoch 3
train info: logloss loss:1.3773771422344019
eval info: group_auc:0.6089, mean_mrr:0.265, ndcg@10:0.3569, ndcg@5:0.2837
at epoch 3 , train time: 3178.2 eval time: 222.1


step 1080 , total_loss: 1.3525, data_loss: 1.2344: : 1086it [56:26,  3.12s/it]
586it [00:08, 67.91it/s]
236it [03:26,  1.14it/s]
7538it [00:00, 12872.11it/s]


at epoch 4
train info: logloss loss:1.3525672707000052
eval info: group_auc:0.6156, mean_mrr:0.2705, ndcg@10:0.3624, ndcg@5:0.293
at epoch 4 , train time: 3386.1 eval time: 222.8


step 1080 , total_loss: 1.3314, data_loss: 1.3578: : 1086it [48:50,  2.70s/it]
586it [00:08, 68.29it/s]
236it [03:26,  1.14it/s]
7538it [00:00, 13604.46it/s]


at epoch 5
train info: logloss loss:1.3315283992013878
eval info: group_auc:0.6156, mean_mrr:0.2714, ndcg@10:0.3649, ndcg@5:0.2921
at epoch 5 , train time: 2930.9 eval time: 222.4
CPU times: user 11h 47min 26s, sys: 2h 19min 28s, total: 14h 6min 55s
Wall time: 4h 58min 8s


<recommenders.models.newsrec.models.nrms.NRMSModel at 0x7fdbde171490>

In [10]:
%%time
# Re-evaluate the trained model on the validation split and keep the metrics
# in `res_syn`; the printed result is a dict keyed by metric name.
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)

586it [00:10, 54.83it/s]
236it [03:29,  1.13it/s]
7538it [00:00, 13517.93it/s]


{'group_auc': 0.6156, 'mean_mrr': 0.2714, 'ndcg@5': 0.2921, 'ndcg@10': 0.3649}
CPU times: user 8min, sys: 1min 46s, total: 9min 46s
Wall time: 3min 47s
