# **RANDOM MODEL**

### **Initial Setup**

In [None]:
# Fetch the Microsoft Recommenders sources into ./recommenders.
# On a re-run git reports "destination path ... already exists" and leaves the checkout untouched.
!git clone https://github.com/microsoft/recommenders.git

fatal: destination path 'recommenders' already exists and is not an empty directory.


In [None]:
# Work from inside the cloned repository.
# NOTE(review): the `recommenders` imports below presumably resolve from this checkout,
# since the package itself is not pip-installed in this notebook - confirm.
%cd recommenders

/content/recommenders


In [None]:
# Use the %pip magic so the package is installed into the environment of the running kernel.
%pip install retrying

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Use the %pip magic so the package is installed into the environment of the running kernel.
%pip install scrapbook

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


###  **Importing the needed libraries**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import sys
import os
import numpy as np
import pandas as pd
import zipfile
from tqdm import tqdm
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.nrms import NRMSModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set
from sklearn.metrics import ndcg_score
from recommenders.evaluation.python_evaluation import ndcg_at_k

import warnings
# Avoid printing some FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)


print("System version: {}".format(sys.version))
print("Tensorflow version: {}".format(tf.__version__))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
System version: 3.10.12 (main, Jun  7 2023, 12:45:35) [GCC 9.4.0]
Tensorflow version: 2.12.0


### **Loading the behavior and news dataframes**

In [None]:
# Which MIND dataset variant to use; the value is passed to get_mind_data_set() below.
# Options: demo, small, large
MIND_type = 'demo'

In [None]:
# Everything is downloaded into a temporary directory; keep `tmpdir` referenced
# so the directory object stays alive for the rest of the notebook.
tmpdir = TemporaryDirectory()
data_path = tmpdir.name

# Sub-folders for the two data splits and for the shared model resources
train_dir = os.path.join(data_path, 'train')
valid_dir = os.path.join(data_path, 'valid')
utils_dir = os.path.join(data_path, 'utils')

# Files expected inside each folder once the archives are fetched
train_news_file = os.path.join(train_dir, r'news.tsv')
train_behaviors_file = os.path.join(train_dir, r'behaviors.tsv')
valid_news_file = os.path.join(valid_dir, r'news.tsv')
valid_behaviors_file = os.path.join(valid_dir, r'behaviors.tsv')
wordEmb_file = os.path.join(utils_dir, "embedding.npy")
userDict_file = os.path.join(utils_dir, "uid2index.pkl")
wordDict_file = os.path.join(utils_dir, "word_dict.pkl")
yaml_file = os.path.join(utils_dir, r'nrms.yaml')

mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

# (file whose presence means "already downloaded", base URL, target folder, archive name)
download_plan = (
    (train_news_file, mind_url, train_dir, mind_train_dataset),
    (valid_news_file, mind_url, valid_dir, mind_dev_dataset),
    (yaml_file, r'https://recodatasets.z20.web.core.windows.net/newsrec/', utils_dir, mind_utils),
)
for marker_file, base_url, target_dir, archive_name in download_plan:
    # Skip the download when the marker file is already on disk
    if not os.path.exists(marker_file):
        download_deeprec_resources(base_url, target_dir, archive_name)

100%|██████████| 17.0k/17.0k [00:02<00:00, 8.19kKB/s]
100%|██████████| 9.84k/9.84k [00:02<00:00, 4.73kKB/s]
100%|██████████| 95.0k/95.0k [00:05<00:00, 18.0kKB/s]


## **RANDOM MODEL: Randomizing the results from an NRMS model**

Setting up the parameters

In [None]:
# Training configuration used below.
epochs = 5        # passed to prepare_hparams(epochs=...)
seed = 42         # passed to NRMSModel(seed=...)
batch_size = 32   # passed to prepare_hparams(batch_size=...)

In [None]:
# Build the hyper-parameter object from the downloaded YAML file, passing the
# embedding/dictionary file paths plus batch_size, epochs and show_step as keyword arguments.
# NOTE(review): show_step=10 is presumably the logging interval in training steps - confirm.
hparams = prepare_hparams(yaml_file,
                          wordEmb_file=wordEmb_file,
                          wordDict_file=wordDict_file,
                          userDict_file=userDict_file,
                          batch_size=batch_size,
                          epochs=epochs,
                          show_step=10)
# Print the resulting values so the configuration is recorded with the run.
print(hparams)

HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 20, 'head_dim': 20, 'filter_num': 200, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 5, 'batch_size': 32, 'show_step': 10, 'title_size': 30, 'his_size': 50, 'data_format': 'news', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'model_type': 'nrms', 'loss': 'cross_entropy_loss', 'wordEmb_file': '/tmp/tmp_4b4dz8k/utils/embedding.npy', 'wordDict_file': '/tmp/tmp_4b4dz8k/utils/word_dict.pkl', 'userDict_file': '/tmp/tmp_4b4dz8k/utils/uid2index.pkl'}


Instantiating the model

In [None]:
# The iterator class itself (not an instance) is handed to the model constructor in the next cell.
iterator = MINDIterator

In [None]:
# Create the NRMS model from the hyper-parameters, the iterator class and the seed defined above.
model = NRMSModel(hparams, iterator, seed=seed)

  super().__init__(name, **kwargs)


Training the NRMS model

In [None]:
%%time
# Train on the train split; the validation files are passed as well, and the
# recorded log shows evaluation metrics being reported after every epoch.
model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)

step 1080 , total_loss: 1.5140, data_loss: 1.3620: : 1086it [01:28, 12.24it/s]
  updates=self.state_updates,
586it [00:01, 410.18it/s]
236it [00:02, 80.80it/s]
7538it [00:00, 8078.11it/s]


at epoch 1
train info: logloss loss:1.5136202369605638
eval info: group_auc:0.5791, mean_mrr:0.2432, ndcg@10:0.331, ndcg@5:0.2586
at epoch 1 , train time: 88.8 eval time: 15.4


step 1080 , total_loss: 1.4178, data_loss: 1.2990: : 1086it [01:23, 12.97it/s]
586it [00:00, 839.56it/s]
236it [00:02, 100.52it/s]
7538it [00:00, 7684.18it/s]


at epoch 2
train info: logloss loss:1.418128491216382
eval info: group_auc:0.6019, mean_mrr:0.2579, ndcg@10:0.3484, ndcg@5:0.2748
at epoch 2 , train time: 83.7 eval time: 14.3


step 1080 , total_loss: 1.3776, data_loss: 1.1791: : 1086it [01:23, 12.98it/s]
586it [00:00, 829.13it/s]
236it [00:02, 100.16it/s]
7538it [00:00, 7652.13it/s]


at epoch 3
train info: logloss loss:1.3776330051062076
eval info: group_auc:0.6102, mean_mrr:0.2668, ndcg@10:0.3589, ndcg@5:0.2866
at epoch 3 , train time: 83.7 eval time: 14.5


step 1080 , total_loss: 1.3531, data_loss: 1.2992: : 1086it [01:23, 12.97it/s]
586it [00:00, 826.94it/s]
236it [00:02, 100.56it/s]
7538it [00:00, 7975.61it/s]


at epoch 4
train info: logloss loss:1.3530798823903718
eval info: group_auc:0.609, mean_mrr:0.2688, ndcg@10:0.3599, ndcg@5:0.2891
at epoch 4 , train time: 83.8 eval time: 14.4


step 1080 , total_loss: 1.3259, data_loss: 1.3403: : 1086it [01:23, 12.96it/s]
586it [00:00, 849.33it/s]
236it [00:02, 100.38it/s]
7538it [00:00, 7936.32it/s]


at epoch 5
train info: logloss loss:1.326026021117005
eval info: group_auc:0.6125, mean_mrr:0.2699, ndcg@10:0.3631, ndcg@5:0.2888
at epoch 5 , train time: 83.8 eval time: 14.3
CPU times: user 4min 33s, sys: 21.7 s, total: 4min 55s
Wall time: 8min 16s


<recommenders.models.newsrec.models.nrms.NRMSModel at 0x7fe265ac2ce0>

Evaluating the NRMS model

In [None]:
%%time
# Evaluate the trained model on the validation split; the result is a dict of
# metric name -> value (group_auc, mean_mrr, ndcg@5, ndcg@10 in the recorded output).
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)

586it [00:00, 843.42it/s]
236it [00:02, 99.36it/s] 
7538it [00:00, 7555.91it/s]


{'group_auc': 0.6125, 'mean_mrr': 0.2699, 'ndcg@5': 0.2888, 'ndcg@10': 0.3631}
CPU times: user 14.3 s, sys: 1.14 s, total: 15.4 s
Wall time: 14.3 s


In [None]:
# run_fast_eval returns three values, unpacked here as impression indexes, labels and predictions.
# group_labels is zipped with the prediction lists further down to compute nDCG per impression.
group_impr_indexes, group_labels, group_preds = model.run_fast_eval(valid_news_file, valid_behaviors_file)

586it [00:01, 543.10it/s]
236it [00:02, 97.24it/s]
7538it [00:00, 8387.96it/s]


| Model   | group_auc | mean_mrr | ndcg@5 | ndcg@10 |
|----------|-----------|----------|--------|---------|
| NRMS    |   0.6125  |   0.2699  | 0.2888 |  0.3631 |

We can see that the NRMS model gets a pretty decent score on both the ndcg@5 and ndcg@10 metrics. Now let's shuffle the predictions to obtain the baseline that a random model would achieve on this specific problem, and measure by how much our models outperform it.

-----------------------

Aux functions

In [None]:
def dcg_score(y_true, y_score, k=10):
    """Discounted cumulative gain of one ranked list, truncated at position k.

    Args:
        y_true (np.ndarray): Ground-truth relevance labels.
        y_score (np.ndarray): Predicted scores; a higher score is ranked first.
        k (int): Number of top-ranked items taken into account. It is capped
            at the size of the last axis of ``y_true``.

    Returns:
        numpy scalar: the DCG value (sum of ``(2**label - 1) / log2(rank + 1)``
        over the top-k items).
    """
    cutoff = min(np.shape(y_true)[-1], k)
    # Indices ordered by descending score, limited to the first `cutoff` places
    ranking = np.argsort(y_score)[::-1][:cutoff]
    ranked_labels = np.take(y_true, ranking)
    gains = 2 ** ranked_labels - 1
    # Ranks are 1-based, so the discount of rank r is log2(r + 1): log2(2), log2(3), ...
    discounts = np.log2(np.arange(2, len(ranked_labels) + 2))
    return np.sum(gains / discounts)

In [None]:
def ndcg_score(y_true, y_score, k):
    """Normalized discounted cumulative gain at k for one ranked list.

    NOTE: this definition replaces the ``ndcg_score`` name imported from
    ``sklearn.metrics`` at the top of the notebook; from here on the name
    refers to this function.

    Args:
        y_true (np.ndarray): Ground-truth relevance labels.
        y_score (np.ndarray): Predicted scores; a higher score is ranked first.
        k (int): Number of top-ranked items taken into account.

    Returns:
        float: DCG of the predicted ranking divided by the DCG of the ideal
        ranking. Returns 0.0 when the ideal DCG is zero (no relevant item),
        instead of dividing by zero and producing NaN.
    """
    # Ranking the labels by themselves gives the best achievable DCG
    ideal = dcg_score(y_true, y_true, k)
    if ideal == 0:
        # No positive label: the ratio is undefined, and a NaN here would
        # propagate into any mean taken over many impressions.
        return 0.0
    return dcg_score(y_true, y_score, k) / ideal

In [None]:
def process_impression(impression_list):
    """
    Split an impression string into clicked and non-clicked news ids.

    The string is split on whitespace; each entry is then split on '-' and
    its first field is used as the id, its second field as the click flag.

    Args:
        impression_list (str): Whitespace-separated entries such as "N1-0 N2-1".

    Returns:
        tuple: (click, non_click) - two lists of ids, holding the entries whose
        flag is '1' and '0' respectively, in their original order.
    """
    click = []
    non_click = []
    for entry in impression_list.split():
        fields = entry.split('-')
        news_id, flag = fields[0], fields[1]
        if flag == '1':
            click.append(news_id)
        elif flag == '0':
            non_click.append(news_id)
    return click, non_click

Getting the validation dataset with the real labels, for later comparisons.

In [None]:
# Load the validation behaviors log: tab-separated, no header row
behav_df_demo = pd.read_csv(
    valid_behaviors_file,
    sep='\t',
    header=None,
    names=['Impression_ID', 'User_ID', 'Time', 'History', 'Impressions'],
)

# Keep only the identifiers and the raw impression string
behav_true_df = behav_df_demo.loc[:, ["Impression_ID", "User_ID", "Impressions"]]

# Turn every "<id>-<flag>" entry into its integer flag, one list per impression
impression_tokens = behav_true_df["Impressions"].str.split()
behav_true_df["Impressions array"] = impression_tokens.apply(
    lambda tokens: [int(token.split("-")[1]) for token in tokens]
)

# Preview the first rows
behav_true_df.head()

Unnamed: 0,Impression_ID,User_ID,Impressions,Impressions array
0,1,U41827,N23699-0 N21291-0 N1901-0 N27292-0 N17443-0 N1...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,2,U61881,N26916-0 N4641-0 N25522-0 N14893-0 N19035-0 N3...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3,U54180,N13528-0 N27689-0 N10879-0 N11662-0 N14409-0 N...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,4,U41164,N20150-0 N1807-1 N26916-0 N28138-0 N9576-0 N19...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,5,U8588,N21325-0 N5982-0 N19737-1 N9576-0 N20150-0 N25...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"


Random Model Predictions (Extracting the predictions from the NRMS model and randomizing them)

In [None]:
# Start from the identifiers and the raw impression string
behav_random_df = behav_df_demo.loc[:, ["Impression_ID", "User_ID", "Impressions"]]
# Number of entries in each impression string
behav_random_df["Impressions count"] = behav_random_df["Impressions"].str.split().apply(len)

# Draw the permutations from a generator seeded with the notebook-wide `seed`,
# so that the random baseline is the same on every run.
rng = np.random.default_rng(seed)
# One random permutation of the ranks 1..n per impression
behav_random_df["Random prediction"] = behav_random_df["Impressions count"].apply(
    lambda n_items: rng.permutation(np.arange(1, n_items + 1))
)

# Preview the first rows
behav_random_df.head()

Unnamed: 0,Impression_ID,User_ID,Impressions,Impressions count,Random prediction
0,1,U41827,N23699-0 N21291-0 N1901-0 N27292-0 N17443-0 N1...,28,"[23, 12, 7, 18, 10, 21, 19, 2, 16, 6, 14, 11, ..."
1,2,U61881,N26916-0 N4641-0 N25522-0 N14893-0 N19035-0 N3...,61,"[45, 43, 59, 35, 6, 53, 28, 36, 7, 48, 61, 27,..."
2,3,U54180,N13528-0 N27689-0 N10879-0 N11662-0 N14409-0 N...,54,"[41, 18, 38, 25, 2, 33, 11, 5, 19, 15, 26, 13,..."
3,4,U41164,N20150-0 N1807-1 N26916-0 N28138-0 N9576-0 N19...,13,"[4, 3, 10, 9, 11, 6, 5, 2, 8, 13, 7, 12, 1]"
4,5,U8588,N21325-0 N5982-0 N19737-1 N9576-0 N20150-0 N25...,10,"[9, 3, 5, 6, 4, 7, 2, 10, 1, 8]"


In [None]:
# Split every impression string once, then spread the (clicked, not clicked)
# pairs over two columns and count the clicked ids.
click_pairs = behav_random_df['Impressions'].map(process_impression)
behav_random_df['Clicked'] = click_pairs.map(lambda pair: pair[0])
behav_random_df['Not clicked'] = click_pairs.map(lambda pair: pair[1])
behav_random_df["Clicks count"] = behav_random_df["Clicked"].apply(len)

In [None]:
# For every impression: build a flag list with `Clicks count` ones followed by
# zeros (same length as the random ranks), pair each flag with a random rank,
# and read the flags back in ascending rank order.
shuffled_flags = []
for ranks, n_clicks in zip(behav_random_df['Random prediction'], behav_random_df['Clicks count']):
    flags = [1] * n_clicks + [0] * (len(ranks) - n_clicks)
    shuffled_flags.append([flag for _, flag in sorted(zip(ranks, flags))])

# Only the reordered flags are stored; the intermediate flag lists are not kept as a column
behav_random_df['Sorted array of clicks'] = pd.Series(shuffled_flags, index=behav_random_df.index)

# Display the updated DataFrame
behav_random_df.head()

Unnamed: 0,Impression_ID,User_ID,Impressions,Impressions count,Random prediction,Clicked,Not clicked,Clicks count,Sorted array of clicks
0,1,U41827,N23699-0 N21291-0 N1901-0 N27292-0 N17443-0 N1...,28,"[23, 12, 7, 18, 10, 21, 19, 2, 16, 6, 14, 11, ...",[N8620],"[N23699, N21291, N1901, N27292, N17443, N18282...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,U61881,N26916-0 N4641-0 N25522-0 N14893-0 N19035-0 N3...,61,"[45, 43, 59, 35, 6, 53, 28, 36, 7, 48, 61, 27,...",[N19829],"[N26916, N4641, N25522, N14893, N19035, N3877,...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3,U54180,N13528-0 N27689-0 N10879-0 N11662-0 N14409-0 N...,54,"[41, 18, 38, 25, 2, 33, 11, 5, 19, 15, 26, 13,...",[N13530],"[N13528, N27689, N10879, N11662, N14409, N6849...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,4,U41164,N20150-0 N1807-1 N26916-0 N28138-0 N9576-0 N19...,13,"[4, 3, 10, 9, 11, 6, 5, 2, 8, 13, 7, 12, 1]","[N1807, N16798]","[N20150, N26916, N28138, N9576, N19737, N24553...",2,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,5,U8588,N21325-0 N5982-0 N19737-1 N9576-0 N20150-0 N25...,10,"[9, 3, 5, 6, 4, 7, 2, 10, 1, 8]",[N19737],"[N21325, N5982, N9576, N20150, N25701, N10908,...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"


These are the actual predictions that came out of our randomized model

In [None]:
# One list of shuffled 0/1 flags per impression; used as the prediction list when computing nDCG below.
new_preds = behav_random_df["Sorted array of clicks"]

In [None]:
# nDCG@5 of the random predictions, averaged over all impressions
ndcg_list = [5]
for k in ndcg_list:
    per_impression_ndcg = []
    for true_labels, random_scores in zip(group_labels, new_preds):
        per_impression_ndcg.append(ndcg_score(true_labels, random_scores, k))
    ndcg_temp = np.mean(per_impression_ndcg)

In [None]:
# ndcg_temp holds the mean nDCG@5 computed in the previous cell
print(f'The ndcg@5 for the random model is {ndcg_temp}')

The ndcg@5 for the random model is 0.21947139890765144


In [None]:
# nDCG@10 of the random predictions, averaged over all impressions
ndcg_list = [10]
for k in ndcg_list:
    ndcg_temp = np.mean(
        [
            ndcg_score(each_labels, each_preds, k)
            # Score the shuffled predictions (new_preds); zipping group_preds here
            # would just reproduce the trained model's own nDCG@10.
            for each_labels, each_preds in zip(group_labels, new_preds)
        ]
    )

In [None]:
# ndcg_temp holds the mean nDCG@10 computed in the previous cell
print(f'The ndcg@10 for the random model is {ndcg_temp}')

The ndcg@10 for the random model is 0.36310388212378397


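| Model   | group_auc | mean_mrr | ndcg@5 | ndcg@10 |
|----------|-----------|----------|--------|---------|
| NRMS    |   0.6125  |   0.2699  | 0.2888 |  0.3631 |
| Random    |   -  |   -  | 0.2194 |  0.3631 |

Note: the recorded Random ndcg@10 (0.3631) is identical to the NRMS ndcg@10 because that run scored the NRMS predictions (`group_preds`) rather than the shuffled ones (`new_preds`); it is not a valid random baseline and has to be recomputed.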

-------------