### **Initial Setup**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
route = '/content/drive/MyDrive/IMPACT PROJECT'

%cd {route}/recommenders

/content/drive/MyDrive/IMPACT PROJECT/recommenders


In [3]:
!pip install scrapbook retrying

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scrapbook
  Downloading scrapbook-0.5.0-py3-none-any.whl (34 kB)
Collecting retrying
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Collecting papermill (from scrapbook)
  Downloading papermill-2.4.0-py3-none-any.whl (38 kB)
Collecting jedi>=0.16 (from ipython->scrapbook)
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
Collecting ansiwrap (from papermill->scrapbook)
  Downloading ansiwrap-0.8.4-py2.py3-none-any.whl (8.5 kB)
Collecting textwrap3>=0.9.2 (from ansiwrap->papermill->scrapbook)
  Downloading textwrap3-0.9.2-py2.py3-none-any.whl (12 kB)
Installing collected packages: textwrap3, retrying, jedi, ansiwrap, papermill, scrapbook
Successfully installed ansiwrap-0.8.4 jedi-0.18.2 papermill-2.4.0 retrying-1.3.4 scrapbook-0.5.0 textwrap3-0.9.2


Imports

In [4]:
import sys
import os
import numpy as np
import pandas as pd
import zipfile
from tqdm import tqdm
import scrapbook as sb
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set
from sklearn.metrics import ndcg_score
from recommenders.evaluation.python_evaluation import ndcg_at_k

import warnings
# Avoid printing some FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)

print("System version: {}".format(sys.version))
print("Tensorflow version: {}".format(tf.__version__))

System version: 3.10.12 (main, Jun  7 2023, 12:45:35) [GCC 9.4.0]
Tensorflow version: 2.12.0


**Defining the MIND dataset type**

In [5]:
MIND_type = 'demo'

**Retrieving the necessary data**

In [6]:
tmpdir = TemporaryDirectory()
data_path = tmpdir.name

train_news_file = os.path.join(data_path, 'train', r'news.tsv')
train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')
valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')
valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')
wordEmb_file = os.path.join(data_path, "utils", "embedding.npy")
userDict_file = os.path.join(data_path, "utils", "uid2index.pkl")
wordDict_file = os.path.join(data_path, "utils", "word_dict.pkl")
yaml_file = os.path.join(data_path, "utils", r'nrms.yaml')

mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

if not os.path.exists(train_news_file):
    download_deeprec_resources(mind_url, os.path.join(data_path, 'train'), mind_train_dataset)

if not os.path.exists(valid_news_file):
    download_deeprec_resources(mind_url, \
                               os.path.join(data_path, 'valid'), mind_dev_dataset)
if not os.path.exists(yaml_file):
    download_deeprec_resources(r'https://recodatasets.z20.web.core.windows.net/newsrec/', \
                               os.path.join(data_path, 'utils'), mind_utils)

100%|██████████| 17.0k/17.0k [00:05<00:00, 3.13kKB/s]
100%|██████████| 9.84k/9.84k [00:04<00:00, 2.29kKB/s]
100%|██████████| 95.0k/95.0k [00:13<00:00, 7.29kKB/s]


# **Random model**

In [7]:
# Read the file
behav_df_demo = pd.read_csv(valid_behaviors_file,sep='\t', header=None, names=['Impression_ID', 'User_ID', 'Time', 'History', 'Impressions'])
# Select a subset
behav_true_df = behav_df_demo.loc[:, ["Impression_ID", "User_ID", "Impressions"]]
# Create the Impressions_True
behav_true_df["Impressions_True"] = behav_true_df["Impressions"].str.split().apply(lambda x: [int(item.split("-")[1]) for item in x])
#Display
behav_true_df.head()

Unnamed: 0,Impression_ID,User_ID,Impressions,Impressions_True
0,1,U41827,N23699-0 N21291-0 N1901-0 N27292-0 N17443-0 N1...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,2,U61881,N26916-0 N4641-0 N25522-0 N14893-0 N19035-0 N3...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3,U54180,N13528-0 N27689-0 N10879-0 N11662-0 N14409-0 N...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,4,U41164,N20150-0 N1807-1 N26916-0 N28138-0 N9576-0 N19...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,5,U8588,N21325-0 N5982-0 N19737-1 N9576-0 N20150-0 N25...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"


Random Model Predictions

In [8]:
# Read the file
behav_df_demo = pd.read_csv(valid_behaviors_file,sep='\t', header=None, names=['Impression_ID', 'User_ID', 'Time', 'History', 'Impressions'])
# Select a subset
behav_random_df = behav_df_demo.loc[:, ["Impression_ID", "User_ID", "Impressions"]]
# Creation of Impressions_list
behav_random_df["Impressions_count"] = behav_random_df["Impressions"].str.split().apply(len)
# Creation of the Random_Pred
behav_random_df["Random_Pred"] = behav_random_df["Impressions_count"].apply(lambda x: np.random.permutation(np.arange(1, x+1)))
#Display
behav_random_df.head()

Unnamed: 0,Impression_ID,User_ID,Impressions,Impressions_count,Random_Pred
0,1,U41827,N23699-0 N21291-0 N1901-0 N27292-0 N17443-0 N1...,28,"[24, 23, 19, 25, 27, 4, 22, 20, 2, 1, 26, 17, ..."
1,2,U61881,N26916-0 N4641-0 N25522-0 N14893-0 N19035-0 N3...,61,"[49, 7, 52, 10, 54, 58, 53, 30, 31, 47, 48, 40..."
2,3,U54180,N13528-0 N27689-0 N10879-0 N11662-0 N14409-0 N...,54,"[53, 46, 20, 15, 37, 50, 17, 25, 21, 28, 22, 2..."
3,4,U41164,N20150-0 N1807-1 N26916-0 N28138-0 N9576-0 N19...,13,"[13, 6, 1, 8, 3, 10, 7, 2, 5, 12, 9, 4, 11]"
4,5,U8588,N21325-0 N5982-0 N19737-1 N9576-0 N20150-0 N25...,10,"[2, 9, 4, 10, 7, 3, 6, 5, 1, 8]"


In [9]:
# Basic safety check
len(behav_random_df.iloc[0,:]["Random_Pred"])

28

In [19]:
behav_random_df['Random_Pred'] = behav_random_df['Random_Pred'].apply(lambda x: tuple(x))
behav_true_df['Impressions_True'] = behav_true_df['Impressions_True'].apply(lambda x: tuple(x))

In [26]:
ndgc_score_rand = ndcg_at_k(behav_true_df.loc[:,["Impression_ID", "User_ID", "Impressions_True"]],
                            behav_random_df.loc[:,["Impression_ID", "User_ID", "Random_Pred"]],
                            col_user = "User_ID",
                            col_item = "Impression_ID",
                            col_prediction = "Random_Pred",
                            col_rating = "Impressions_True")


ndgc_score_rand

1.0

In [16]:
#Creation of the predictions_rand.txt
with open(os.path.join("/content/drive/MyDrive/IMPACT PROJECT/models/RANDOM", 'prediction_rand.txt'), 'w') as file:
    for idx, row in behav_random_df.iterrows():
        index = idx + 1  # Adjust index to start from 1
        array = row['Random_Pred']
        line = f"{index} [{','.join(map(str, array))}]\n"  # Format the line as "index array\n"
        file.write(line)

# **Popularity Model**

In [27]:
# Read the file
behav_df_demo = pd.read_csv(valid_behaviors_file,sep='\t', header=None, names=['Impression_ID', 'User_ID', 'Time', 'History', 'Impressions'])
# Select a subset
behav_pop_df = behav_df_demo.loc[:, ["Impression_ID", "User_ID", "Impressions"]]
# Split impressions
behav_pop_df["Code_Impressions_1"] = behav_pop_df["Impressions"].str.split().apply(lambda x: [item.split("-")[0] for item in x if item.split("-")[1] == "1"])
# Display
behav_pop_df

Unnamed: 0,Impression_ID,User_ID,Impressions,Code_Impressions_1
0,1,U41827,N23699-0 N21291-0 N1901-0 N27292-0 N17443-0 N1...,[N8620]
1,2,U61881,N26916-0 N4641-0 N25522-0 N14893-0 N19035-0 N3...,[N19829]
2,3,U54180,N13528-0 N27689-0 N10879-0 N11662-0 N14409-0 N...,[N13530]
3,4,U41164,N20150-0 N1807-1 N26916-0 N28138-0 N9576-0 N19...,"[N1807, N16798]"
4,5,U8588,N21325-0 N5982-0 N19737-1 N9576-0 N20150-0 N25...,[N19737]
...,...,...,...,...
7533,7534,U23841,N26256-0 N28117-0 N2718-0 N16798-0 N27689-0 N6...,"[N227, N14073]"
7534,7535,U28014,N26670-0 N12794-0 N3390-0 N17443-0 N27292-0 N2...,[N1901]
7535,7536,U89684,N17443-0 N16798-0 N24553-0 N26096-0 N15927-0 N...,[N9576]
7536,7537,U92611,N14850-0 N26647-0 N272-0 N22751-0 N21398-0 N26...,[N6782]


In [28]:
# Create a new DataFrame to store unique keys and their counts
unique_keys_df = pd.DataFrame(columns=['news_code', 'count'])

# Iterate over each row in the original DataFrame
for index, row in behav_pop_df.iterrows():
  keys = row['Code_Impressions_1']
  # Iterate over each key in the list
  for key in keys:
      # Check if the key is already in the unique_keys_df
      if key in unique_keys_df['news_code'].values:
          # Increment the count for the existing key
          unique_keys_df.loc[unique_keys_df['news_code'] == key, 'count'] += 1
      else:
          # Add a new row with the key and count = 1
          unique_keys_df = unique_keys_df.append({'news_code': key, 'count': 1}, ignore_index=True)

unique_keys_df.sort_values("count")

Unnamed: 0,news_code,count
948,N14073,1
661,N26073,1
662,N24083,1
664,N2922,1
666,N20990,1
...,...,...
2,N13530,291
15,N20150,388
26,N21325,404
22,N26508,476


In [14]:
# Create an empty dictionary to store unique keys and their counts
unique_keys_dict = {}

# Iterate over each row in the original DataFrame
for index, row in behav_pop_df.iterrows():
    keys = row['Code_Impressions_1']
    # Iterate over each key in the list
    for key in keys:
        # Check if the key is already in the unique_keys_dict
        if key in unique_keys_dict:
            # Increment the count for the existing key
            unique_keys_dict[key] += 1
        else:
            # Add a new key with count = 1 to the dictionary
            unique_keys_dict[key] = 1

In [15]:
# Define a function to generate the new array
def generate_new_array(arr):
    indexed_array = [(value, index) for index, value in enumerate(arr)]
    sorted_array = sorted(indexed_array, key=lambda x: x[0], reverse=True)
    new_array = [item[1] + 1 for item in sorted_array]
    return new_array

In [16]:
# Create a new column with the list of code
behav_pop_df["Codes_Impressions"] = behav_pop_df["Impressions"].str.split().apply(lambda x: [item.split("-")[0] for item in x])
# Creation of the Codes_Count
behav_pop_df['Codes_Count'] = behav_pop_df['Codes_Impressions'].map(lambda x: [unique_keys_dict.get(code, 0) for code in x])
# Creation of Popular_Pred
behav_pop_df['Popular_Pred'] = behav_pop_df['Codes_Count'].apply(generate_new_array)
# Display
behav_pop_df

Unnamed: 0,Impression_ID,User_ID,Impressions,Code_Impressions_1,Codes_Impressions,Codes_Count,Popular_Pred
0,1,U41827,N23699-0 N21291-0 N1901-0 N27292-0 N17443-0 N1...,[N8620],"[N23699, N21291, N1901, N27292, N17443, N18282...","[222, 2, 72, 16, 111, 99, 58, 52, 86, 9, 157, ...","[1, 21, 12, 15, 11, 20, 19, 5, 6, 28, 18, 9, 3..."
1,2,U61881,N26916-0 N4641-0 N25522-0 N14893-0 N19035-0 N3...,[N19829],"[N26916, N4641, N25522, N14893, N19035, N3877,...","[140, 67, 17, 20, 11, 26, 3, 64, 404, 282, 1, ...","[58, 20, 9, 10, 41, 47, 13, 16, 50, 1, 51, 59,..."
2,3,U54180,N13528-0 N27689-0 N10879-0 N11662-0 N14409-0 N...,[N13530],"[N13528, N27689, N10879, N11662, N14409, N6849...","[12, 77, 18, 42, 94, 58, 31, 51, 111, 17, 0, 7...","[20, 49, 36, 42, 25, 22, 9, 5, 2, 6, 21, 33, 4..."
3,4,U41164,N20150-0 N1807-1 N26916-0 N28138-0 N9576-0 N19...,"[N1807, N16798]","[N20150, N1807, N26916, N28138, N9576, N19737,...","[388, 166, 140, 157, 818, 282, 36, 28, 404, 21...","[5, 12, 9, 1, 6, 11, 10, 2, 4, 3, 7, 8, 13]"
4,5,U8588,N21325-0 N5982-0 N19737-1 N9576-0 N20150-0 N25...,[N19737],"[N21325, N5982, N19737, N9576, N20150, N25701,...","[404, 18, 282, 818, 388, 58, 27, 42, 476, 291]","[4, 9, 1, 5, 10, 3, 6, 8, 7, 2]"
...,...,...,...,...,...,...,...
7533,7534,U23841,N26256-0 N28117-0 N2718-0 N16798-0 N27689-0 N6...,"[N227, N14073]","[N26256, N28117, N2718, N16798, N27689, N6280,...","[7, 33, 23, 27, 77, 3, 11, 58, 11, 6, 166, 21,...","[18, 27, 17, 14, 44, 24, 49, 11, 13, 21, 5, 8,..."
7534,7535,U28014,N26670-0 N12794-0 N3390-0 N17443-0 N27292-0 N2...,[N1901],"[N26670, N12794, N3390, N17443, N27292, N21852...","[46, 86, 17, 111, 16, 214, 140, 197, 818, 72, ...","[9, 14, 6, 8, 18, 11, 7, 4, 17, 16, 2, 10, 12,..."
7535,7536,U89684,N17443-0 N16798-0 N24553-0 N26096-0 N15927-0 N...,[N9576],"[N17443, N16798, N24553, N26096, N15927, N2625...","[111, 27, 36, 7, 16, 7, 55, 10, 282, 94, 7, 1,...","[34, 22, 31, 32, 9, 25, 15, 41, 38, 21, 1, 10,..."
7536,7537,U92611,N14850-0 N26647-0 N272-0 N22751-0 N21398-0 N26...,[N6782],"[N14850, N26647, N272, N22751, N21398, N26916,...","[6, 11, 6, 0, 31, 140, 90, 170, 44, 10, 15, 79...","[66, 38, 17, 23, 21, 8, 19, 31, 65, 6, 39, 32,..."
