In [1]:
import sys
import os
import numpy as np
import zipfile
from tqdm import tqdm
import scrapbook as sb
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.npa import NPAModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set

# Record the interpreter and TensorFlow versions this run was executed with.
print(f"System version: {sys.version}")
print(f"Tensorflow version: {tf.__version__}")

System version: 3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]
Tensorflow version: 2.8.0


^C
^C


In [2]:
# All inputs and outputs of this notebook live under "<current working dir>/dataset".
path = os.getcwd() 
data_path = os.path.join(path,'dataset')
# Bare expression so the resolved directory is shown as the cell result.
data_path

'C:\\Users\\chira\\OneDrive\\Documents\\GitHub\\finstories\\neural__network_training_for_news_recomendation\\dataset'

In [3]:
# Run configuration used by the cells below.
epochs = 5  # forwarded to prepare_hparams(epochs=...)
seed = 42  # forwarded to NPAModel(seed=...)
batch_size = 32  # forwarded to prepare_hparams(batch_size=...)

# Options: demo, small, large
# Passed to get_mind_data_set() to select the download URL and archive names.
MIND_type = 'demo'

In [4]:
# Sub-directories of data_path that hold each group of files.
train_dir = os.path.join(data_path, 'train')
valid_dir = os.path.join(data_path, 'valid')
utils_dir = os.path.join(data_path, 'utils')

# News/behaviors files for the training and validation splits.
train_news_file = os.path.join(train_dir, 'news.tsv')
train_behaviors_file = os.path.join(train_dir, 'behaviors.tsv')
valid_news_file = os.path.join(valid_dir, 'news.tsv')
valid_behaviors_file = os.path.join(valid_dir, 'behaviors.tsv')

# Shared resources: word embeddings, user/word dictionaries and the model yaml.
wordEmb_file = os.path.join(utils_dir, "embedding.npy")
userDict_file = os.path.join(utils_dir, "uid2index.pkl")
wordDict_file = os.path.join(utils_dir, "word_dict.pkl")
yaml_file = os.path.join(utils_dir, 'npa.yaml')

mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

# Each entry: (file whose absence triggers the download, base URL, target dir, archive name).
# NOTE(review): the utils archive is always fetched from this fixed URL rather than
# mind_url -- confirm this is intended for MIND_type values other than 'demo'.
pending_downloads = [
    (train_news_file, mind_url, train_dir, mind_train_dataset),
    (valid_news_file, mind_url, valid_dir, mind_dev_dataset),
    (yaml_file, 'https://recodatasets.z20.web.core.windows.net/newsrec/', utils_dir, mind_utils),
]

# Only fetch an archive when its marker file is not already on disk.
for marker_file, base_url, target_dir, archive_name in pending_downloads:
    if not os.path.exists(marker_file):
        download_deeprec_resources(base_url, target_dir, archive_name)

100%|████████████████████████████████████████████████████████████████████████████| 95.0k/95.0k [00:12<00:00, 7.48kKB/s]


In [5]:
# Values supplied on top of whatever the yaml file defines: resource file
# locations plus the batch size and epoch count chosen in the config cell.
hparam_overrides = {
    "wordEmb_file": wordEmb_file,
    "wordDict_file": wordDict_file,
    "userDict_file": userDict_file,
    "batch_size": batch_size,
    "epochs": epochs,
}
hparams = prepare_hparams(yaml_file, **hparam_overrides)

# Dump the resolved hyper-parameters so the run is self-describing.
print(hparams)

HParams object with values {'support_quick_scoring': False, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 4, 'head_dim': 100, 'filter_num': 400, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 100, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 5, 'batch_size': 32, 'show_step': 100000, 'title_size': 10, 'his_size': 50, 'data_format': 'news', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'cnn_activation': 'relu', 'model_type': 'npa', 'loss': 'cross_entropy_loss', 'wordEmb_file': 'C:\\Users\\chira\\OneDrive\\Documents\\GitHub\\finstories\\neural__network_training_for_news_recomendation\\dataset\\utils\\embedding.npy', 'wordDict_file': 'C:\\Users\\chira\\OneDrive\\Documents\\GitHub\\finstories\\neural__network_training_for_news_recomendation\\dataset\\utils\\word_dict.pkl', 'userDict_file': 'C:\\Users\\chira\\OneDrive\\Documents\\GitHub\\finstories\\neural__network_tr

In [6]:
# Bind the iterator *class* (not an instance); it is handed to NPAModel in the next cell.
iterator = MINDIterator


In [7]:
# Build the NPA model from the hyper-parameters and the iterator class.
# `seed` is passed through to the model -- presumably for reproducible initialisation; confirm in NPAModel.
model = NPAModel(hparams, iterator, seed=seed)

  super(Adam, self).__init__(name, **kwargs)


In [8]:
# Show the GPUs visible to TensorFlow via the public device API, using the `tf`
# module imported at the top of the notebook. This replaces the private
# `keras.backend._get_available_gpus()` helper, which is not a supported API.
# An empty list means no GPU was detected.
tf.config.list_physical_devices('GPU')

[]

In [9]:
# Evaluate on the validation split before any call to model.fit(), giving a
# pre-training baseline to compare the later metrics against.
print(model.run_eval(valid_news_file, valid_behaviors_file))


  updates=self.state_updates,
8874it [05:38, 26.19it/s]


{'group_auc': 0.5228, 'mean_mrr': 0.2328, 'ndcg@5': 0.2376, 'ndcg@10': 0.303}


In [11]:
%%time
# Train on the training split; the validation files are passed in as well and
# the logged output reports validation metrics after every epoch.
# (%%time must stay on the first line of the cell.)
model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)

1086it [06:25,  2.81it/s]
8874it [05:02, 29.36it/s]


at epoch 1
train info: logloss loss:1.5233437564930643
eval info: group_auc:0.571, mean_mrr:0.2477, ndcg@10:0.3306, ndcg@5:0.2621
at epoch 1 , train time: 386.0 eval time: 307.1


1086it [06:17,  2.88it/s]
8874it [05:02, 29.36it/s]


at epoch 2
train info: logloss loss:1.4132603485939912
eval info: group_auc:0.5964, mean_mrr:0.2623, ndcg@10:0.3503, ndcg@5:0.2863
at epoch 2 , train time: 377.6 eval time: 307.1


1086it [06:15,  2.90it/s]
8874it [05:08, 28.79it/s]


at epoch 3
train info: logloss loss:1.3502503389811646
eval info: group_auc:0.5982, mean_mrr:0.2712, ndcg@10:0.3581, ndcg@5:0.2948
at epoch 3 , train time: 375.1 eval time: 313.3


1086it [06:41,  2.71it/s]
8874it [05:37, 26.33it/s]


at epoch 4
train info: logloss loss:1.3029499739463377
eval info: group_auc:0.5885, mean_mrr:0.2681, ndcg@10:0.3524, ndcg@5:0.2876
at epoch 4 , train time: 401.3 eval time: 342.0


1086it [06:43,  2.69it/s]
8874it [05:48, 25.47it/s]


at epoch 5
train info: logloss loss:1.2625307827693981
eval info: group_auc:0.5942, mean_mrr:0.2708, ndcg@10:0.3559, ndcg@5:0.2919
at epoch 5 , train time: 403.2 eval time: 353.6
Wall time: 59min 26s


<recommenders.models.newsrec.models.npa.NPAModel at 0x1386bb99cd0>

In [12]:
%%time
# Re-run evaluation on the validation split with the trained model and keep the
# resulting metrics dict in `res_syn`.
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)

8874it [05:24, 27.37it/s]


{'group_auc': 0.5942, 'mean_mrr': 0.2708, 'ndcg@5': 0.2919, 'ndcg@10': 0.3559}
Wall time: 5min 29s


In [14]:
# Make sure <data_path>/model exists (no error if it is already there).
model_path = os.path.join(data_path, "model")
os.makedirs(model_path, exist_ok=True)

# Store the trained weights of the underlying Keras model under the "npa_ckpt" name.
checkpoint_path = os.path.join(model_path, "npa_ckpt")
model.model.save_weights(checkpoint_path)

In [16]:
# Collect per-impression results on the validation split. The three returned
# sequences are consumed in parallel below (indexes zipped with predictions);
# presumably labels line up with them the same way -- confirm in run_slow_eval.
group_impr_indexes, group_labels, group_preds = model.run_slow_eval(valid_news_file, valid_behaviors_file)


8874it [05:30, 26.83it/s]


In [17]:
# Write one line per impression in the form "<impression id> [r1,r2,...]".
with open(os.path.join(data_path, 'prediction.txt'), 'w') as f:
    for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):
        # Ids are shifted up by one before being written.
        impr_index += 1
        # Indices of the candidates ordered from highest to lowest score ...
        descending_order = np.argsort(preds)[::-1]
        # ... and the position of each candidate in that ordering, starting at 1.
        ranks = (np.argsort(descending_order) + 1).tolist()
        pred_rank = '[' + ','.join(map(str, ranks)) + ']'
        f.write(f"{impr_index} {pred_rank}\n")

7538it [00:00, 44976.16it/s]


In [21]:
# Peek at the predicted scores of the first impression group as a sanity check.
group_preds[0]

[0.98031837,
 0.95835686,
 0.9597107,
 0.9662012,
 0.9431334,
 0.9617912,
 0.9491403,
 0.9441322,
 0.9927767,
 0.97449994,
 0.9718654,
 0.97154987,
 0.94522274,
 0.9490602,
 0.9668648,
 0.9454719,
 0.97620535,
 0.97525084,
 0.97036743,
 0.9484103,
 0.9900291,
 0.99278927,
 0.98265254,
 0.9831247,
 0.97974974,
 0.90888953,
 0.9563956,
 0.9532484]

In [18]:
# Pack prediction.txt into prediction.zip (deflate-compressed), stored at the
# archive root as "prediction.txt".
# The context manager guarantees the archive is finalised and the handle closed
# even if adding the file raises, which the manual open/close did not.
with zipfile.ZipFile(os.path.join(data_path, 'prediction.zip'), 'w', zipfile.ZIP_DEFLATED) as f:
    f.write(os.path.join(data_path, 'prediction.txt'), arcname='prediction.txt')