# **RANDOM MODEL**

### **Initial Setup**

In [135]:
!git clone https://github.com/microsoft/recommenders.git

Cloning into 'recommenders'...
remote: Enumerating objects: 37174, done.[K
remote: Counting objects: 100% (706/706), done.[K
remote: Compressing objects: 100% (245/245), done.[K
remote: Total 37174 (delta 469), reused 600 (delta 445), pack-reused 36468[K
Receiving objects: 100% (37174/37174), 205.34 MiB | 6.44 MiB/s, done.
Resolving deltas: 100% (25111/25111), done.


In [136]:
%cd recommenders

/content/recommenders/recommenders/recommenders


In [137]:
!pip install retrying



In [138]:
!pip install scrapbook



###  **Importing the needed libraries**

In [139]:
from google.colab import drive
drive.mount('/content/drive')

import sys
import os
import numpy as np
import pandas as pd
import zipfile
from tqdm import tqdm
import copy
import random
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.nrms import NRMSModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set
from sklearn.metrics import ndcg_score
from recommenders.evaluation.python_evaluation import ndcg_at_k

import warnings
# Avoid printing some FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)


print("System version: {}".format(sys.version))
print("Tensorflow version: {}".format(tf.__version__))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
System version: 3.10.12 (main, Jun  7 2023, 12:45:35) [GCC 9.4.0]
Tensorflow version: 2.12.0


### **Loading the behavior and news dataframes**

In [140]:
# Options: demo, small, large
MIND_type = 'small'

In [141]:
tmpdir = TemporaryDirectory()
data_path = tmpdir.name

train_news_file = os.path.join(data_path, 'train', r'news.tsv')
train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')
valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')
valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')

mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

if not os.path.exists(train_news_file):
    download_deeprec_resources(mind_url, os.path.join(data_path, 'train'), mind_train_dataset)

if not os.path.exists(valid_news_file):
    download_deeprec_resources(mind_url, \
                               os.path.join(data_path, 'valid'), mind_dev_dataset)

100%|██████████| 51.7k/51.7k [00:07<00:00, 6.67kKB/s]
100%|██████████| 30.2k/30.2k [00:06<00:00, 4.76kKB/s]


Aux functions

In [142]:
def dcg_score(y_true, y_score, k=10):
    """Computing dcg score metric at k.

    Args:
        y_true (np.ndarray): Ground-truth labels.
        y_score (np.ndarray): Predicted labels.

    Returns:
        np.ndarray: dcg scores.
    """
    k = min(np.shape(y_true)[-1], k)
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)

In [143]:
def ndcg_score(y_true, y_score, k):
    """Computing ndcg score metric at k.

    Args:
        y_true (np.ndarray): Ground-truth labels.
        y_score (np.ndarray): Predicted labels.

    Returns:
        numpy.ndarray: ndcg scores.
    """
    best = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_score, k)
    return actual / best

In [144]:
def process_impression(impression_list):
    """
    Process the impression list and extract click and non-click information.

    Args:
        impression_list (str): List of impressions in string format.

    Returns:
        tuple: A tuple containing two lists - click and non-click.
    """
    list_of_strings = impression_list.split()
    click = [x.split('-')[0] for x in list_of_strings if x.split('-')[1] == '1']
    non_click = [x.split('-')[0] for x in list_of_strings if x.split('-')[1] == '0']
    return click, non_click

Getting the validation dataset with the real labels, for later comparisons.

In [145]:
# Read the file
behav_df_demo = pd.read_csv(valid_behaviors_file,sep='\t', header=None, names=['Impression_ID', 'User_ID', 'Time', 'History', 'Impressions'])
# Select a subset
behav_true_df = behav_df_demo.loc[:, ["Impression_ID", "User_ID", "Impressions"]]
# Create the Impressions_True
behav_true_df["Impressions array"] = behav_true_df["Impressions"].str.split().apply(lambda x: [int(item.split("-")[1]) for item in x])
#Display
behav_true_df.head()

Unnamed: 0,Impression_ID,User_ID,Impressions,Impressions array
0,1,U80234,N28682-0 N48740-0 N31958-1 N34130-0 N6916-0 N5...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,U60458,N20036-0 N23513-1 N32536-0 N46976-0 N35216-0 N...,"[0, 1, 0, 0, 0, 0, 0]"
2,3,U44190,N36779-0 N62365-0 N58098-0 N5472-0 N13408-0 N5...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,4,U87380,N6950-0 N60215-0 N6074-0 N11930-0 N6916-0 N248...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
4,5,U9444,N5940-1 N23513-0 N49285-0 N23355-0 N19990-0 N3...,"[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"


In [146]:
group_labels = pd.Series(copy.deepcopy(behav_true_df["Impressions array"]))
random_labels = copy.deepcopy(group_labels)

for i in range(len(random_labels)):
    random_labels[i] = random.sample(random_labels[i], len(random_labels[i])) #Random.shuffle() would actually shuffle all copies

----

### **MODEL SCORING**

In [147]:
ndcg_list = [5]
for k in ndcg_list:
    ndcg_temp= np.mean(
        [
            ndcg_score(each_labels, each_preds, k)
            for each_labels, each_preds in zip(group_labels, random_labels)
        ]
    )

In [148]:
print(f'The ndcg@5 for the random model is {ndcg_temp}')

The ndcg@5 for the random model is 0.22431374503374416


In [149]:
ndcg_list = [10]
for k in ndcg_list:
    ndcg_temp= np.mean(
        [
            ndcg_score(each_labels, each_preds, k)
            for each_labels, each_preds in zip(group_labels, random_labels)
        ]
    )

In [150]:
print(f'The ndcg@10 for the random model is {ndcg_temp}')

The ndcg@10 for the random model is 0.28708592842687064


| Model   | group_auc | mean_mrr | ndcg@5 | ndcg@10 |
|----------|-----------|----------|--------|---------|
| Random    |   -  |   -  | 0.2243 |  0.2870 |

-------------