# Introduction

This notebook joins the prepare_data and pytorch-bst notebooks so that they can be run together in Google Colab.

# Prepare data section

In [None]:
!pip install pytorch_lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import torch
import pytorch_lightning as pl
from tqdm import tqdm
import torchmetrics
import math
from urllib.request import urlretrieve
from zipfile import ZipFile
import os
import torch.nn as nn
import numpy as np
from math import sqrt
from tqdm import tqdm
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import dcg_score
from collections import defaultdict

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

# Index of the perturbed ratings file this run works on; it selects both the
# input CSV and the output folder.
perturb_num = 50

# Every artefact of the run lives under one folder. All paths are derived from
# it (and use the same `MyDrive` spelling) so that the checkpoint, the Lightning
# logs and the evaluation report cannot end up in different places.
path_to_model = "/content/gdrive/MyDrive/data/model/random_ratings_data_{}/".format(perturb_num)
path_to_txt = path_to_model + "cnn_evaluation.txt"
path_to_specific_model = path_to_model + "bst_model.ckpt"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Settings

In [None]:
# NOTE(review): WINDOW_SIZE is not referenced anywhere else in the visible
# notebook; the sliding windows below are sized by `sequence_length` instead.
# Confirm whether this setting is still needed.
WINDOW_SIZE = 20

## Data

In [None]:
# Fetch the MovieLens-1M archive only when it is not already on disk, so that
# re-running the notebook does not download it again. The download goes to a
# temporary name first so an interrupted transfer is never mistaken for a
# complete archive.
if not os.path.exists("movielens.zip"):
    urlretrieve("http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip.part")
    os.replace("movielens.zip.part", "movielens.zip")

# The context manager closes the archive handle once extraction has finished.
with ZipFile("movielens.zip", "r") as archive:
    archive.extractall()

In [None]:
# The .dat files use the two-character separator "::". pandas treats
# multi-character separators as regular expressions, which only the Python
# parser engine supports; requesting it explicitly avoids the ParserWarning
# about falling back from the C engine.
users = pd.read_csv(
    "ml-1m/users.dat",
    sep="::",
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    engine="python",
)

ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    engine="python",
)

movies = pd.read_csv(
    "ml-1m/movies.dat",
    sep="::",
    names=["movie_id", "title", "genres"],
    encoding="ISO-8859-1",
    engine="python",
)

  return func(*args, **kwargs)


In [None]:
## Movies
# The year is taken as characters [-5:-1] of the title (presumably the digits
# inside a trailing "(YYYY)") and then replaced by its integer category code.
movies["year"] = movies["title"].str[-5:-1]
movies["year"] = pd.Categorical(movies["year"]).codes

## Users
# Replace every categorical user attribute by its integer category code.
for column in ("sex", "age_group", "occupation", "zip_code"):
    users[column] = pd.Categorical(users[column]).codes

# Ratings
# Timestamps are stored as seconds since the epoch; convert them to datetimes.
ratings = ratings.assign(
    unix_timestamp=pd.to_datetime(ratings["unix_timestamp"], unit="s")
)


In [None]:
# Save primary csv's
# These files are read back later in the notebook by the model section.
os.makedirs('data', exist_ok=True)

for frame, destination in (
    (users, "data/users.csv"),
    (movies, "data/movies.csv"),
    (ratings, "data/ratings.csv"),
):
    frame.to_csv(destination, index=False)

In [None]:
# Cast the identifier columns of all three frames to strings.
## Movies
movies = movies.astype({"movie_id": str})
## Users
users = users.astype({"user_id": str})
## Ratings
ratings = ratings.astype({"movie_id": str, "user_id": str})

In [None]:
genres = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# Add one 0/1 column per genre: 1 when the genre appears in the movie's
# "|"-separated `genres` string, 0 otherwise.
for genre in genres:
    movies[genre] = movies["genres"].str.split("|").apply(
        lambda parts: int(genre in parts)
    )


### Load the movie ratings data

In [None]:
from ast import literal_eval

# Load and clean
df = pd.read_csv('/content/gdrive/MyDrive/data/random_ratings_data_{}.csv'.format(perturb_num))
# .copy() makes the column subset an independent frame, so the assignments
# below do not raise SettingWithCopyWarning.
df = df[['user_id', 'random_movie_ids', 'random_ratings']].copy()  # , 'sex', 'age_group', 'occupation'
df["user_id"] = df["user_id"].astype(str)
df = df.rename(columns={'random_movie_ids': 'movie_ids', 'random_ratings': 'ratings'})
# The list columns are stored as their Python literal text; parse them back.
df['movie_ids'] = df['movie_ids'].apply(literal_eval)
df['ratings'] = df['ratings'].apply(literal_eval)

# Split every user's history into a training part and a test part.
sequence_length = 8
ground_truth_lenght = 15
# The test slice holds the `sequence_length - 1` items that precede the ground
# truth, followed by the ground-truth items themselves.
test_sequence_lenght = ground_truth_lenght + sequence_length - 1

# Test part: the last `test_sequence_lenght` movies / ratings.
df['movie_ids_test'] = df['movie_ids'].apply(lambda x: x[len(x) - test_sequence_lenght:])
df['ratings_test'] = df['ratings'].apply(lambda x: x[len(x) - test_sequence_lenght:])
# Training part: everything except the last `ground_truth_lenght` movies / ratings.
df['movie_ids'] = df['movie_ids'].apply(lambda x: x[:len(x) - ground_truth_lenght])
df['ratings'] = df['ratings'].apply(lambda x: x[:len(x) - ground_truth_lenght])

# Keep only users whose training part still holds at least `sequence_length`
# items, i.e. at least `sequence_length + ground_truth_lenght` interactions in total.
df['len'] = df['movie_ids'].apply(len)
df = df[df['len'] >= sequence_length].copy()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["user_id"] = df["user_id"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

## Create test data set

In [None]:
# Clean the test data frame
# .copy() gives df_test its own data, so the column assignments below do not
# raise SettingWithCopyWarning or depend on `df`.
df_test = df[['user_id', 'movie_ids_test', 'ratings_test']].copy()  # , 'sex', 'age_group', 'occupation'
df_test = df_test.rename(
    columns={'movie_ids_test': 'sequence_movie_ids', 'ratings_test': 'sequence_ratings'}
)
df = df.drop(columns=['movie_ids_test', 'ratings_test', 'len'])

# Ground truth: everything from position `sequence_length - 1` onwards.
df_test['sequence_movie_ids_ground_truth'] = df_test['sequence_movie_ids'].apply(
    lambda x: x[sequence_length - 1:]
)
# Model input: only the first `sequence_length` movies and ratings. The last of
# them overlaps with the first ground-truth item.
df_test['sequence_movie_ids'] = df_test['sequence_movie_ids'].apply(lambda x: x[:sequence_length])
df_test['sequence_ratings'] = df_test['sequence_ratings'].apply(lambda x: x[:sequence_length])

## Create sequences of length `sequence_length` for the train set (explode the dataset with step size = 1)

In [None]:
step_size = 1


def create_sequences(values, window_size, step_size):
    """Return every full sliding window over ``values``.

    Args:
        values: Sliceable sequence (e.g. a list) to cut into windows.
        window_size: Number of items in each window; must be positive.
        step_size: Offset between the starts of consecutive windows; must be
            positive.

    Returns:
        A list of slices ``values[i:i + window_size]`` for
        ``i = 0, step_size, 2 * step_size, ...``. Only complete windows are
        returned, so the result is empty when ``values`` is shorter than
        ``window_size``.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is not positive; a
            non-positive step would otherwise never advance past the input.
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError(
            "window_size and step_size must be positive, got "
            "window_size={} and step_size={}".format(window_size, step_size)
        )
    # Last start index that still leaves room for a complete window.
    last_start = len(values) - window_size
    return [
        values[start:start + window_size]
        for start in range(0, last_start + 1, step_size)
    ]


# Replace every user's history by the list of its sliding windows.
df.movie_ids = df.movie_ids.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)
df.ratings = df.ratings.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)

# Explode to one row per window. Movies and ratings are exploded separately
# with a fresh index and then put side by side.
ratings_data = df
ratings_data_movies = ratings_data[["user_id", "movie_ids"]].explode(
    "movie_ids", ignore_index=True
)
ratings_data_rating = ratings_data[["ratings"]].explode("ratings", ignore_index=True)
ratings_data_transformed = pd.concat([ratings_data_movies, ratings_data_rating], axis=1)

# User attributes keyed by user_id, shared by the train and test joins.
users_by_id = users.set_index("user_id")

# Join all exploded ratings and movies sequences with the user attributes.
ratings_data_transformed = (
    ratings_data_transformed
    .join(users_by_id, on="user_id")
    .drop(columns=["zip_code"])
    .rename(columns={"movie_ids": "sequence_movie_ids", "ratings": "sequence_ratings"})
)

test_data = df_test.join(users_by_id, on="user_id").drop(columns=["zip_code"])

# Keep a separate ground truth df by user_id. .copy() makes it independent of
# test_data so later column assignments on it are safe.
ground_truth_df = test_data[['sequence_movie_ids_ground_truth', 'user_id']].copy()
test_data = test_data.drop(columns=['sequence_movie_ids_ground_truth'])

# Fixed seed so the train/validation split is the same on every run.
SPLIT_SEED = 42
train_data, val_data = (
    part.copy()
    for part in train_test_split(
        ratings_data_transformed, test_size=0.15, random_state=SPLIT_SEED
    )
)

## Modify the test data so that each test sequence is repeated once per movie, with that movie as its last element, giving a prediction for every movie

In [None]:
# Build one copy of the test frame per candidate movie, with that movie placed
# in the last slot of every sequence (the slot the dataset treats as the target).
candidate_movies = list(movies.movie_id)
# History shared by all copies: each sequence without its last element.
history = test_data['sequence_movie_ids'].apply(lambda x: x[:-1])

df_list = []
# Iterating the list directly lets tqdm infer the total and show real progress.
for movie in tqdm(candidate_movies):
    candidate_frame = test_data.copy()
    candidate_frame['sequence_movie_ids'] = history.apply(lambda x: x + [movie])
    df_list.append(candidate_frame)
test_data = pd.concat(df_list, axis=0)

3883it [00:36, 106.07it/s]


## Some last formatting arrangements that the model requires

In [None]:
# Turn every list-valued column into a comma-separated string before the
# frames are written to CSV. Ratings are converted to str first.
for frame in (test_data, train_data, val_data):
    frame['sequence_movie_ids'] = frame['sequence_movie_ids'].apply(
        lambda movie_seq: ','.join(movie_seq)
    )
    frame['sequence_ratings'] = frame['sequence_ratings'].apply(
        lambda rating_seq: ','.join(map(str, rating_seq))
    )

ground_truth_df['sequence_movie_ids_ground_truth'] = (
    ground_truth_df['sequence_movie_ids_ground_truth'].apply(lambda movie_seq: ','.join(movie_seq))
)

In [None]:
# Persist the three splits; the model's `setup` hook reads these exact paths.
for destination, frame in (
    ("data/train_data.csv", train_data),
    ("data/test_data.csv", test_data),
    ("data/val_data.csv", val_data),
):
    frame.to_csv(destination, index=False, sep=",")

# BST Implementation and training

In [None]:
import pandas as pd
import torch
import pytorch_lightning as pl
from tqdm import tqdm
import torchmetrics
import math
from urllib.request import urlretrieve
from zipfile import ZipFile
import os
import torch.nn as nn
import numpy as np

In [None]:
# Replace the in-memory frames (whose id columns were cast to str earlier in
# the notebook) with the versions that were saved to disk before that cast.
users, ratings, movies = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in ("data/users.csv", "data/ratings.csv", "data/movies.csv")
)

## Pytorch dataset

In [None]:
import pandas as pd
import torch
import torch.utils.data as data
from torchvision import transforms
import ast
from torch.nn.utils.rnn import pad_sequence

class MovieDataset(data.Dataset):
    """Dataset of movie/rating sequences read from a CSV file.

    Each row must provide the columns ``user_id``, ``sequence_movie_ids``,
    ``sequence_ratings``, ``sex``, ``age_group`` and ``occupation``. The two
    sequence columns hold comma-separated numbers; the last entry of each is
    treated as the target and the preceding entries as the history.
    """

    def __init__(
        self, ratings_file,test=False
    ):
        """
        Args:
            ratings_file (string): Path to the comma-delimited csv file.
            test (bool): Stored on the instance; not otherwise used by this class.
        """
        self.ratings_frame = pd.read_csv(
            ratings_file,
            delimiter=",",
            # iterator=True,
        )
        self.test = test

    def __len__(self):
        """Return the number of rows (sequences) in the file."""
        return len(self.ratings_frame)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample.

        Returns:
            Tuple ``(user_id, movie_history, target_movie_id,
            movie_history_ratings, target_movie_rating, sex, age_group,
            occupation)`` where the two history entries are ``LongTensor``s
            holding all but the last element of their sequence.

        Raises:
            ValueError / SyntaxError: If a sequence cell is not a plain
                comma-separated literal.
        """
        row = self.ratings_frame.iloc[idx]
        user_id = row.user_id

        # literal_eval parses "1,2,3" into a tuple but, unlike eval, refuses to
        # execute arbitrary expressions found in the file.
        movie_ids = ast.literal_eval(row.sequence_movie_ids)
        ratings_seq = ast.literal_eval(row.sequence_ratings)

        # Last position is the prediction target, everything before it is history.
        target_movie_id = movie_ids[-1]
        target_movie_rating = ratings_seq[-1]
        movie_history = torch.LongTensor(movie_ids[:-1])
        movie_history_ratings = torch.LongTensor(ratings_seq[:-1])

        sex = row.sex
        age_group = row.age_group
        occupation = row.occupation

        return user_id, movie_history, target_movie_id,  movie_history_ratings, target_movie_rating, sex, age_group, occupation

In [None]:
genres = [
    "Action", "Adventure", "Animation",
    "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy",
    "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi",
    "Thriller", "War", "Western",
]

# `movies` was reloaded from data/movies.csv, which was written before the
# genre indicator columns were first added, so they are rebuilt here:
# 1 when the genre is listed in the "|"-separated `genres` string, else 0.
for genre in genres:
    movies[genre] = movies["genres"].map(
        lambda values, wanted=genre: int(wanted in values.split("|"))
    )
    

In [None]:
class PositionalEmbedding(nn.Module):
    """
    Learned positional embedding.

    Holds one trainable ``d_model``-sized vector per position (an
    ``nn.Embedding`` table, not the fixed sinusoidal encoding) and returns the
    whole table once per batch element.
    """

    def __init__(self, max_len, d_model):
        super().__init__()

        # Trainable lookup table with one row per position.
        self.pe = nn.Embedding(max_len, d_model)

    def forward(self, x):
        # Only the batch size of `x` is used; the result has shape
        # (x.size(0), max_len, d_model).
        table = self.pe.weight.unsqueeze(0)
        return table.repeat(x.size(0), 1, 1)


class BST(pl.LightningModule):
    """Behaviour-sequence transformer that regresses the rating of a target movie.

    The movie history and the target movie are embedded, concatenated with a
    learned positional embedding, passed through one transformer encoder layer,
    flattened, concatenated with the user-feature embeddings and fed to an MLP
    that outputs a single rating.

    Embedding sizes are derived from the module-level ``users`` and ``movies``
    frames. The number of positions (8), the transformer width (72) and the MLP
    input width (661) are hard-coded and therefore have to match those frames
    and the sequences produced by the data-preparation section.
    """

    def __init__(
        self, args=None,
    ):
        super().__init__()

        self.save_hyperparameters()
        self.args = args
        #-------------------
        # Embedding layers
        ##Users
        self.embeddings_user_id = nn.Embedding(
            int(users.user_id.max())+1, int(math.sqrt(users.user_id.max()))+1
        )
        ###Users features embeddings
        self.embeddings_user_sex = nn.Embedding(
            len(users.sex.unique()), int(math.sqrt(len(users.sex.unique())))
        )
        self.embeddings_age_group = nn.Embedding(
            len(users.age_group.unique()), int(math.sqrt(len(users.age_group.unique())))
        )
        self.embeddings_user_occupation = nn.Embedding(
            len(users.occupation.unique()), int(math.sqrt(len(users.occupation.unique())))
        )
        # NOTE(review): the zip-code, genre and year embeddings are created but
        # never used by encode_input, and the zip-code width is derived from the
        # `sex` column. They are left untouched so the parameter names and
        # shapes still match previously saved checkpoints.
        self.embeddings_user_zip_code = nn.Embedding(
            len(users.zip_code.unique()), int(math.sqrt(len(users.sex.unique())))
        )

        ##Movies
        self.embeddings_movie_id = nn.Embedding(
            int(movies.movie_id.max())+1, int(math.sqrt(movies.movie_id.max()))+1
        )

        ###Movies features embeddings
        genre_vectors = movies[genres].to_numpy()
        self.embeddings_movie_genre = nn.Embedding(
            genre_vectors.shape[0], genre_vectors.shape[1]
        )

        self.embeddings_movie_year = nn.Embedding(
            len(movies.year.unique()), int(math.sqrt(len(movies.year.unique())))
        )

        # 8 positions (history + target), 9 positional features per position.
        self.positional_embedding = PositionalEmbedding(8, 9)

        # Network
        # batch_first=True: encode_input produces (batch, sequence, features)
        # tensors. With the default sequence-first layout the layer would attend
        # across the samples of a batch instead of across the positions of each
        # sequence. Parameter shapes are unaffected, but checkpoints trained
        # with the old layout will produce different outputs.
        self.transfomerlayer = nn.TransformerEncoderLayer(72, 3, dropout=0.2, batch_first=True)
        # 661 = flattened transformer output plus concatenated user features;
        # presumably 8 * 72 + 85 for MovieLens-1M - TODO confirm against the data.
        self.linear = nn.Sequential(
            nn.Linear(
                661,
                1024,
            ),
            nn.LeakyReLU(),
            nn.Linear(1024, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 256),
            nn.LeakyReLU(),
            nn.Linear(256, 1),
        )
        self.criterion = torch.nn.MSELoss()
        self.mae = torchmetrics.MeanAbsoluteError()
        self.mse = torchmetrics.MeanSquaredError()

    def encode_input(self,inputs):
        """Embed one batch.

        Args:
            inputs: Tuple in the order returned by ``MovieDataset.__getitem__``.

        Returns:
            ``(transfomer_features, user_features, target_movie_rating)`` where
            the first is the embedded history followed by the embedded target
            movie along dim 1, the second is the concatenated user embeddings
            and the third is the target rating cast to float.
        """
        user_id, movie_history, target_movie_id,  movie_history_ratings, target_movie_rating, sex, age_group, occupation = inputs

        #MOVIES
        movie_history = self.embeddings_movie_id(movie_history)
        target_movie = self.embeddings_movie_id(target_movie_id)

        # Give the target movie a sequence axis and append it after the history.
        target_movie = torch.unsqueeze(target_movie, 1)
        transfomer_features = torch.cat((movie_history, target_movie),dim=1)

        #USERS
        user_id = self.embeddings_user_id(user_id)

        sex = self.embeddings_user_sex(sex)
        age_group = self.embeddings_age_group(age_group)
        occupation = self.embeddings_user_occupation(occupation)
        user_features = torch.cat((user_id, sex, age_group,occupation), 1)

        return transfomer_features, user_features, target_movie_rating.float()

    def forward(self, batch):
        """Return ``(predicted_rating, target_movie_rating)`` for ``batch``."""
        transfomer_features, user_features, target_movie_rating = self.encode_input(batch)
        positional_embedding = self.positional_embedding(transfomer_features)
        transfomer_features = torch.cat((transfomer_features, positional_embedding), dim=2)
        transformer_output = self.transfomerlayer(transfomer_features)
        transformer_output = torch.flatten(transformer_output,start_dim=1)

        #Concat with other features
        features = torch.cat((transformer_output,user_features),dim=1)

        output = self.linear(features)
        return output, target_movie_rating

    def _shared_step(self, batch):
        """Run the model on ``batch`` and return ``(loss, mae, rmse)``."""
        out, target_movie_rating = self(batch)
        out = out.flatten()
        loss = self.criterion(out, target_movie_rating)

        mae = self.mae(out, target_movie_rating)
        mse = self.mse(out, target_movie_rating)
        rmse = torch.sqrt(mse)
        return loss, mae, rmse

    def training_step(self, batch, batch_idx):
        """Compute the MSE loss for one batch and log per-step MAE / RMSE / loss."""
        loss, mae, rmse = self._shared_step(batch)
        self.log(
            "train/mae", mae, on_step=True, on_epoch=False, prog_bar=False
        )

        self.log(
            "train/rmse", rmse, on_step=True, on_epoch=False, prog_bar=False
        )

        self.log("train/step_loss", loss, on_step=True, on_epoch=False, prog_bar=False)
        return loss

    def validation_step(self, batch, batch_idx):
        """Return the loss and detached MAE / RMSE of one validation batch."""
        loss, mae, rmse = self._shared_step(batch)

        return {"val_loss": loss, "mae": mae.detach(), "rmse":rmse.detach()}

    def validation_epoch_end(self, outputs):
        """Log the mean of the per-batch validation loss, MAE and RMSE."""
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        avg_mae = torch.stack([x["mae"] for x in outputs]).mean()
        avg_rmse = torch.stack([x["rmse"] for x in outputs]).mean()

        self.log("val/loss", avg_loss, on_step=False, on_epoch=True, prog_bar=False)
        self.log("val/mae", avg_mae, on_step=False, on_epoch=True, prog_bar=False)
        self.log("val/rmse", avg_rmse, on_step=False, on_epoch=True, prog_bar=False)

    def test_epoch_end(self, outputs):
        """Write the collected ``users`` / ``top14`` outputs to a CSV file.

        NOTE(review): this class defines no ``test_step``, so nothing visible
        here produces the ``users`` / ``top14`` keys this hook expects.
        """
        users = torch.cat([x["users"] for x in outputs])
        y_hat = torch.cat([x["top14"] for x in outputs])
        users = users.tolist()
        y_hat = y_hat.tolist()

        data = {"users": users, "top14": y_hat}
        df = pd.DataFrame.from_dict(data)
        print(len(df))
        # The target folder is not guaranteed to exist in the working directory.
        os.makedirs("lightning_logs", exist_ok=True)
        df.to_csv("lightning_logs/predict.csv", index=False)

    def configure_optimizers(self):
        """Use AdamW over all parameters with a fixed learning rate of 0.0005."""
        return torch.optim.AdamW(self.parameters(), lr=0.0005)

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Return a parser extending ``parent_parser`` with ``--learning_rate``."""
        # Imported locally: the notebook does not import argparse anywhere else.
        from argparse import ArgumentParser

        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument("--learning_rate", type=float, default=0.01)
        return parser

    ####################
    # DATA RELATED HOOKS
    ####################

    def setup(self, stage=None):
        """Load the train / validation / test datasets from the ``data`` folder."""
        print("Loading datasets")
        self.train_dataset = MovieDataset("data/train_data.csv")
        self.val_dataset = MovieDataset("data/val_data.csv")
        self.test_dataset = MovieDataset("data/test_data.csv")
        print("Done")

    def _make_loader(self, dataset):
        """Build the DataLoader configuration shared by all three splits."""
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=128,
            shuffle=False,
            # os.cpu_count() may return None; fall back to in-process loading.
            num_workers=os.cpu_count() or 0,
        )

    def train_dataloader(self):
        # NOTE(review): training batches are served in file order every epoch
        # (shuffle=False); consider per-epoch shuffling.
        return self._make_loader(self.train_dataset)

    def val_dataloader(self):
        return self._make_loader(self.val_dataset)

    def test_dataloader(self):
        return self._make_loader(self.test_dataset)
        
# Train on a single GPU for 50 epochs; Lightning writes its logs and
# checkpoints below `path_to_model`.
trainer_options = dict(
    accelerator='gpu',
    devices=1,
    max_epochs=50,
    default_root_dir=path_to_model,
)
model = BST()
trainer = pl.Trainer(**trainer_options)
trainer.fit(model)

In [None]:
# UNCOMMENT THIS CODE TO LOAD SAVED MODEL FROM CHECKPOINT - edit checkpoint path
# model = BST.load_from_checkpoint(path_to_model + "lightning_logs/version_3/checkpoints/epoch=38-step=249054.ckpt")
#model = BST.load_from_checkpoint(path_to_specific_model)#"fully_trained_model.ckpt"
#model.setup()

Loading datasets
Done


## Generate predictions for test dataset

In [None]:
# One entry per test batch; the next cell reads the predicted ratings from the
# first element of every entry.
pred = trainer.predict(model, dataloaders=model.test_dataloader())

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


Loading datasets
Done


INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [None]:
# Flatten the per-batch predictions into one list of floats, in batch order.
# Element 0 of every batch entry holds the predicted ratings.
rating_pred = [
    float(batch_output[0][row])
    for batch_output in pred
    for row in range(len(batch_output[0]))
]

In [None]:
# Attach the predictions (same row order as test_data) and keep, per user, the
# 15 rows with the highest predicted rating, best first.
test_data['rating_pred'] = rating_pred
ranked = test_data.sort_values('rating_pred', ascending=False)
test_data = ranked.groupby('user_id').head(15).reset_index(drop=True)
unique_user = test_data['user_id'].unique().tolist()

# Index both frames by user so the evaluation loop can look users up.
test_data = test_data.set_index('user_id')
ground_truth_df = ground_truth_df.set_index('user_id')

# Only the candidate movie (last id of each sequence) and its score are needed.
test_data = test_data.drop(columns=['sequence_ratings', 'sex', 'age_group', 'occupation'])
test_data['sequence_movie_ids'] = test_data['sequence_movie_ids'].str.split(',').str[-1]

Unnamed: 0,level_0,index,user_id,sequence_movie_ids,sequence_ratings,sex,age_group,occupation,rating_pred
0,0,0,1,318653152727911029291811971,44545434,0,0,10,3.573877
1,1,1,10,1959103520023347287452734381,55455435,0,3,1,4.541002
2,2,2,100,121013712409292253520834411,43332224,1,3,17,4.852975
3,3,3,1000,20182687213712201196375129471,34343443,0,2,6,3.740940
4,4,4,1001,16691722168324371535254127071,42434234,1,2,4,4.401700
...,...,...,...,...,...,...,...,...,...
22467033,22467033,6035,995,357215353653954321008383952,55454333,0,1,4,4.677387
22467034,22467034,6036,996,16823793191238792167219431763952,45434544,1,2,17,3.320209
22467035,22467035,6037,997,15893623126522782501343952,44443345,1,0,19,2.609436
22467036,22467036,6038,998,16333182166364267127236183952,44444434,1,4,20,4.808477


In [None]:
# https://github.com/dkaterenchuk/ranking_measures/blob/master/measures.py
def find_dcg(element_list):
    """
    Discounted Cumulative Gain (DCG)
    The definition of DCG can be found in this paper:
        Azzah Al-Maskari, Mark Sanderson, and Paul Clough. 2007.
        "The relationship between IR effectiveness measures and user satisfaction."
    Each relevance value is divided by the natural logarithm of its 1-based
    position plus one, and the quotients are summed.
    Parameters:
        element_list - a list of ranks Ex: [5,4,2,2,1]
    Returns:
        score (0.0 for an empty list)
    """
    total = 0.0
    # Start counting at 2 so that the first item is discounted by log(2).
    for discount_arg, relevance in enumerate(element_list, start=2):
        total += float(relevance) / math.log(discount_arg)
    return total


def find_ndcg(reference, hypothesis):
    """
    Normalized Discounted Cumulative Gain (nDCG)
    Normalized version of DCG:
        nDCG = DCG(hypothesis)/DCG(reference)
    Parameters:
        reference   - a gold standard (perfect) ordering Ex: [5,4,3,2,1]
        hypothesis  - a proposed ordering Ex: [5,2,2,3,1]
    Returns:
        ndcg_score  - normalized score; 0.0 when the reference has no gain
                      (empty or all-zero), where the ratio is undefined
    """
    ideal_dcg = find_dcg(reference)
    # Guard against dividing by zero when there is nothing to normalise by.
    if ideal_dcg == 0:
        return 0.0
    return find_dcg(hypothesis) / ideal_dcg
def def_value():
    """Return 0; usable as a ``defaultdict`` factory so missing keys read as zero."""
    return 0

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def compute_precision_recall_ndcg(targets, predictions):
    """
    Compute precision, recall, and NDCG for one user.

    Precision and recall are set-based: a prediction counts as a hit when it
    appears anywhere in ``targets``. (Micro-averaged classification scores on
    the two lists would instead compare them position by position and require
    equal lengths.)

    Parameters:
        targets      - ground-truth items, most relevant first
        predictions  - recommended items, best first
    Returns:
        (precision, recall, ndcg); all 0.0 when either list is empty
    """
    targets = list(targets)
    predictions = list(predictions)
    if not targets or not predictions:
        return 0.0, 0.0, 0.0

    # Precision / recall from the overlap of the two item sets.
    relevant_items = set(targets)
    hits = len(relevant_items.intersection(predictions))
    precision = hits / len(predictions)
    recall = hits / len(relevant_items)

    # Relevance scores in ideal order: len(targets), ..., 2, 1.
    true_relevance = list(range(len(targets), 0, -1))

    # Relevance of each target item by position; a repeated item keeps the
    # score of its last occurrence.
    relevance_by_item = dict(zip(targets, true_relevance))

    # Relevance scores in output order; items outside the targets score 0.
    pred_relevance = [relevance_by_item.get(item, 0) for item in predictions]

    # Normalized DCG score
    ndcg = find_ndcg(true_relevance, pred_relevance)

    return precision, recall, ndcg

In [None]:
# Cut-offs at which the ranking metrics are reported.
k = [5, 10, 15]
# One list of per-user scores for every cut-off, in the same order as `k`.
precisions = [[] for _ in k]
recalls = [[] for _ in k]
NDCGs = [[] for _ in k]

for user in tqdm(unique_user):
    # Rows of this user, already ordered by predicted rating (best first).
    user_rows = test_data[test_data.index.isin([user])]
    predictions = user_rows['sequence_movie_ids'].tolist()

    # Ground-truth movie ids of this user, in their stored order.
    truth_rows = ground_truth_df[ground_truth_df.index.isin([user])]
    targets = truth_rows['sequence_movie_ids_ground_truth'].str.split(',').values[0]

    for slot, cutoff in enumerate(k):
        scores = compute_precision_recall_ndcg(targets[0:cutoff], predictions[:cutoff])
        precisions[slot].append(scores[0])
        recalls[slot].append(scores[1])
        NDCGs[slot].append(scores[2])

# Average over users, leaving one value per cut-off.
precisions = [np.mean(per_user) for per_user in precisions]
recalls = [np.mean(per_user) for per_user in recalls]
NDCGs = [np.mean(per_user) for per_user in NDCGs]

In [None]:
# Write the averaged metrics to the evaluation report, three lines per cut-off.
with open(path_to_txt, "w") as report:
    for cutoff, precision_at_k, recall_at_k, ndcg_at_k in zip(k, precisions, recalls, NDCGs):
        report.write('Precision at {} = {}\n'.format(cutoff, precision_at_k))
        report.write('Recall at {} = {}\n'.format(cutoff, recall_at_k))
        report.write('NDCG at {} = {}\n'.format(cutoff, ndcg_at_k))