#(1) Skeleton code

In [None]:
import pandas as pd

# Entity-overlap recommender: show a random sample of articles, ask the user
# which ones they like, then suggest other articles that share entities with
# the chosen ones.

# read in the news.tsv file (tab-separated, no header row) as a DataFrame
# NOTE(review): the six column names below are kept as-is; confirm they match
# the real column count/order of news.tsv. If the file has more columns than
# names, pandas may silently move the leading columns into the index.
news_df = pd.read_csv('news.tsv', sep='\t', header=None, names=['id', 'title', 'abstract', 'url', 'category', 'entities'])

# convert the IDs to strings so they compare equal to what the user types
news_df['id'] = news_df['id'].astype(str)

# randomly sample up to 10 articles (DataFrame.sample raises if asked for more
# rows than the frame holds, so cap the sample size for small files)
sampled_articles = news_df.sample(min(10, len(news_df)))

# display the sampled articles
print("Here are 10 randomly sampled articles:")
print(sampled_articles[['id', 'title', 'abstract']])
print()

# prompt the user to choose which articles they're interested in
chosen_ids_str = input("Please enter the IDs of the articles you're interested in, separated by commas: ")

# convert the input string into a list of IDs; each ID is used verbatim
# (only surrounding whitespace is removed) so it matches the 'id' column
# exactly as displayed above, and blank entries are ignored
chosen_ids = [token.strip() for token in chosen_ids_str.split(',') if token.strip()]
print("Chosen IDs:", chosen_ids)

# filter the sampled_articles DataFrame to only include the chosen articles
chosen_articles = sampled_articles[sampled_articles['id'].isin(chosen_ids)]

# collect the distinct, non-empty entities of the chosen articles
chosen_entities = set()
for entities_str in chosen_articles['entities']:
    if pd.isna(entities_str):
        continue
    for raw_entity in str(entities_str).split(';'):
        entity = raw_entity.strip()
        # an empty entity would match every row as a substring, so skip it
        if entity:
            chosen_entities.add(entity)

# recommend articles based on the chosen entities, never re-suggesting an
# article the user already picked
candidate_articles = news_df[~news_df['id'].isin(chosen_ids)]
recommendation_frames = []
# sorted() keeps the output order reproducible; set order varies between runs
for entity in sorted(chosen_entities):
    # regex=False: the entity is literal text, not a pattern, so characters
    # such as '[', '(' or '+' must not be interpreted as regex syntax
    matches = candidate_articles['entities'].str.contains(entity, na=False, regex=False)
    # score = how many ';'-separated entries equal the entity exactly; tokens
    # are stripped the same way the chosen entities were
    match_counts = candidate_articles.loc[matches, 'entities'].apply(
        lambda cell: [part.strip() for part in cell.split(';')].count(entity)
    )
    # .assign builds new frames instead of writing into a filtered slice,
    # which avoids SettingWithCopyWarning
    entity_articles = (
        candidate_articles.loc[matches, ['id', 'title', 'abstract']]
        .assign(score=match_counts)
        .sort_values('score', ascending=False)
        .head(5)
        .assign(score=lambda frame: frame['score'].map(
            lambda count: f"{count} (matched on '{entity}')"
        ))
    )
    recommendation_frames.append(entity_articles)

# concatenate once at the end (DataFrame.append was removed in pandas 2.0);
# pd.concat rejects an empty list, so fall back to an empty, typed frame
if recommendation_frames:
    recommended_articles = pd.concat(recommendation_frames)
else:
    recommended_articles = pd.DataFrame(columns=['id', 'title', 'abstract', 'score'])

# display the recommended articles
print("Here are some articles you might be interested in:")
print(recommended_articles[['id', 'title', 'abstract', 'score']])



In [None]:
!pip install scrapbook
pip install -e git+https://github.com/microsoft/recommenders/#egg=recommenders

In [None]:
!pip install recommenders[examples]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import sys
import os
import numpy as np
import zipfile
from tqdm import tqdm
import scrapbook as sb
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.nrms import NRMSModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set

# Report the Python interpreter and TensorFlow versions used for this run.
print(f"System version: {sys.version}")
print(f"Tensorflow version: {tf.__version__}")

System version: 3.9.16 (main, Dec  7 2022, 01:11:51) 
[GCC 9.4.0]
Tensorflow version: 2.11.0


#(2) Set Parameters

In [None]:
# Which MIND dataset variant to download.
# Options: demo, small, large
MIND_type = 'demo'

# Seed handed to the model constructor.
seed = 42

# Training settings forwarded to prepare_hparams.
batch_size = 32
epochs = 5

#(3) Download and load data

In [None]:
# All data lives in a fresh temporary directory, laid out as train/, valid/ and utils/.
tmpdir = TemporaryDirectory()
data_path = tmpdir.name

train_dir = os.path.join(data_path, 'train')
valid_dir = os.path.join(data_path, 'valid')
utils_dir = os.path.join(data_path, 'utils')

# News and behaviour logs for the training and validation splits.
train_news_file = os.path.join(train_dir, 'news.tsv')
train_behaviors_file = os.path.join(train_dir, 'behaviors.tsv')
valid_news_file = os.path.join(valid_dir, 'news.tsv')
valid_behaviors_file = os.path.join(valid_dir, 'behaviors.tsv')

# Auxiliary resources: word embeddings, lookup dictionaries and the model config.
wordEmb_file = os.path.join(utils_dir, 'embedding.npy')
userDict_file = os.path.join(utils_dir, 'uid2index.pkl')
wordDict_file = os.path.join(utils_dir, 'word_dict.pkl')
yaml_file = os.path.join(utils_dir, 'nrms.yaml')

mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

# Each entry: (file whose presence means "already downloaded", base URL,
# destination directory, resource name). Resources are fetched only when the
# marker file is missing.
download_plan = (
    (train_news_file, mind_url, train_dir, mind_train_dataset),
    (valid_news_file, mind_url, valid_dir, mind_dev_dataset),
    (yaml_file, 'https://recodatasets.z20.web.core.windows.net/newsrec/', utils_dir, mind_utils),
)
for marker_file, base_url, destination_dir, resource_name in download_plan:
    if not os.path.exists(marker_file):
        download_deeprec_resources(base_url, destination_dir, resource_name)


100%|██████████| 17.0k/17.0k [00:01<00:00, 9.15kKB/s]
100%|██████████| 9.84k/9.84k [00:01<00:00, 5.74kKB/s]
100%|██████████| 95.0k/95.0k [00:13<00:00, 7.16kKB/s]


#(4) Create hyper-parameters

In [None]:
# Values passed to prepare_hparams alongside the YAML config: resource file
# locations plus the run settings chosen in the parameters cell.
hparam_overrides = {
    'wordEmb_file': wordEmb_file,
    'wordDict_file': wordDict_file,
    'userDict_file': userDict_file,
    'batch_size': batch_size,
    'epochs': epochs,
    'show_step': 10,
}
hparams = prepare_hparams(yaml_file, **hparam_overrides)
print(hparams)

HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 20, 'head_dim': 20, 'filter_num': 200, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 5, 'batch_size': 32, 'show_step': 10, 'title_size': 30, 'his_size': 50, 'data_format': 'news', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'model_type': 'nrms', 'loss': 'cross_entropy_loss', 'wordEmb_file': '/tmp/tmpsogykk5m/utils/embedding.npy', 'wordDict_file': '/tmp/tmpsogykk5m/utils/word_dict.pkl', 'userDict_file': '/tmp/tmpsogykk5m/utils/uid2index.pkl'}


#(5) Train the NRMS model

In [None]:
# Build the NRMS model, handing it the MIND iterator class and the seed.
iterator = MINDIterator
model = NRMSModel(hparams, iterator, seed=seed)

# Evaluate on the validation files before any training has been run.
baseline_metrics = model.run_eval(valid_news_file, valid_behaviors_file)
print(baseline_metrics)

  super().__init__(name, **kwargs)
  updates=self.state_updates,
1326it [00:49, 26.82it/s]
2286it [1:12:58,  1.92s/it]
73152it [00:14, 4979.76it/s]


{'group_auc': 0.5024, 'mean_mrr': 0.2136, 'ndcg@5': 0.2174, 'ndcg@10': 0.2846}


In [None]:
%%time
# Fit the model on the training news/behaviors files; the validation files are
# passed to fit() as well. %%time reports how long the whole cell takes.
model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)

step 270 , total_loss: 1.6199, data_loss: 1.5256: : 279it [29:54,  6.59s/it]

In [None]:
%%time
# Run evaluation on the validation files after fitting and print the result
# returned by run_eval. %%time reports how long the whole cell takes.
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)

In [None]:
# Attach the evaluation result to the notebook under the name "res_syn" using
# scrapbook. NOTE(review): presumably read back by a downstream test/report; confirm.
sb.glue("res_syn", res_syn)