# LightGCN - simplified GCN model for recommendation

This notebook serves as an introduction to LightGCN [1], which is an simple, linear and neat Graph Convolution Network (GCN) [3] model for recommendation.

## 0 Global Settings and Imports

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.utils.constants import (
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_PREDICTION_COL,
    DEFAULT_RELEVANCE_COL,
    DEFAULT_SIMILARITY_COL,
    DEFAULT_ITEM_FEATURES_COL,
    DEFAULT_ITEM_SIM_MEASURE,
    DEFAULT_K,
    DEFAULT_THRESHOLD,
)
DEFAULT_RNDCG_MULTIPLIER = 3
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Tensorflow version: {}".format(tf.__version__))

System version: 3.8.15 | packaged by conda-forge | (default, Nov 22 2022, 08:51:59) 
[Clang 14.0.6 ]
Pandas version: 1.5.3
Tensorflow version: 2.9.1


In [2]:
THIS_ENGINE_NAME = "lightgcn_field_7"

# DATA_FILE_NAME = "../Data/20230813T210908_sales_6mo_basic_single_events_removed.csv"
DATA_FILE_NAME = "../Data/20230811T031507_sales_12mo_basic_single_events_removed.csv"
# DATA_FILE_NAME = "../Data/20230809T003134_sales_25mo_basic_single_events_removed.csv"
COL_USER = "location_id"
COL_ITEM = "product"
COL_RATING = "sold_revenue"

# country
COUNTRY = "nigeria"

# top k items to recommend, for train & test
TOP_K_SPLIT_TRAIN_TEST = 10
# top k items to recommend, for final product recommendation output
TOP_K_WHOLE = 10
SAVE_ALL_RECS = False
SAVE_NEW_RECS = False

# fraction of location_skus to include in training dataset
TRAIN_FRAC = 0.75

# top MULTIPLIER * k items are considered relevant for nDCG
RNDCG_MULTIPLIER = 3

# Model parameters
EPOCHS = 50
BATCH_SIZE = 1024
N_LAYERS = 3
LEARNING_RATE = 0.005
EVAL_EPOCH = 5

SEED = DEFAULT_SEED  # Set None for non-deterministic results

yaml_file = THIS_ENGINE_NAME + "_hparams.yaml"
# user_file = "../../tests/resources/deeprec/lightgcn/user_embeddings.csv"
# item_file = "../../tests/resources/deeprec/lightgcn/item_embeddings.csv"

### Revised/copied code from packages

In [3]:
def _get_rating_column(relevancy_method: str, **kwargs) -> str:
    r"""Helper utility to simplify the arguments of eval metrics
    Attemtps to address https://github.com/microsoft/recommenders/issues/1737.

    Args:
        relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
            top k items are directly provided, so there is no need to compute the relevancy operation.

    Returns:
        str: rating column name.
    """
    if relevancy_method != "top_k":
        if "col_rating" not in kwargs:
            raise ValueError("Expected an argument `col_rating` but wasn't found.")
        col_rating = kwargs.get("col_rating")
    else:
        col_rating = kwargs.get("col_rating", DEFAULT_RATING_COL)
    return col_rating


def get_top_k_items(
    dataframe, col_user=DEFAULT_USER_COL, col_rating=DEFAULT_RATING_COL, k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Pandas
    DataFrame, output a Pandas DataFrame in the dense format of top k items
    for each user.

    Note:
        If it is implicit rating, just append a column of constants to be
        ratings.

    Args:
        dataframe (pandas.DataFrame): DataFrame of rating data (in the format
        customerID-itemID-rating)
        col_user (str): column name for user
        col_rating (str): column name for rating
        k (int or None): number of items for each user; None means that the input has already been
        filtered out top k items and sorted by ratings and there is no need to do that again.

    Returns:
        pandas.DataFrame: DataFrame of top k items for each user, sorted by `col_user` and `rank`
    """
    # Sort dataframe by col_user and (top k) col_rating
    if k is None:
        top_k_items = dataframe
    else:
        top_k_items = (
            dataframe.sort_values([col_user, col_rating], ascending=[True, False])
            .groupby(col_user, as_index=False)
            .head(k)
            .reset_index(drop=True)
        )
    # Add ranks
    top_k_items["rank"] = top_k_items.groupby(col_user, sort=False).cumcount() + 1
    return top_k_items


def merge_ranking_true_pred_new(
    rating_true,
    rating_pred,
    col_user,
    col_item,
    col_rating,
    col_prediction,
    relevancy_method,
    k=DEFAULT_K,
    rndcg_multiplier=DEFAULT_RNDCG_MULTIPLIER,
    threshold=DEFAULT_THRESHOLD,
):
    """Filter truth and prediction data frames on common users

    Args:
        rating_true (pandas.DataFrame): True DataFrame
        rating_pred (pandas.DataFrame): Predicted DataFrame
        col_user (str): column name for user
        col_item (str): column name for item
        col_rating (str): column name for rating
        col_prediction (str): column name for prediction
        relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
            top k items are directly provided, so there is no need to compute the relevancy operation.
        k (int): number of top k items per user (optional)
        threshold (float): threshold of top items per user (optional)

    Returns:
        pandas.DataFrame: DataFrame of recommendation hits, sorted by `col_user` and `rank`
    """

    # Make sure the prediction and true data frames have the same set of users
    common_users = set(rating_true[col_user]).intersection(set(rating_pred[col_user]))
    rating_true_common = rating_true[rating_true[col_user].isin(common_users)]
    rating_pred_common = rating_pred[rating_pred[col_user].isin(common_users)]

    # Return hit items in prediction data frame with ranking information. This is used for calculating NDCG and MAP.
    # Use first to generate unique ranking values for each item. This is to align with the implementation in
    # Spark evaluation metrics, where index of each recommended items (the indices are unique to items) is used
    # to calculate penalized precision of the ordered items.
    if relevancy_method == "top_k":
        top_k = k
    elif relevancy_method == "by_threshold":
        top_k = threshold
    elif relevancy_method is None:
        top_k = None
    else:
        raise NotImplementedError("Invalid relevancy_method")
    df_hit = get_top_k_items(
        dataframe=rating_pred_common,
        col_user=col_user,
        col_rating=col_prediction,
        k=top_k,
    )
    rating_true_common_top_mult_k = get_top_k_items(
        dataframe=rating_true_common,
        col_user=col_user,
        col_rating=col_rating,
        k=RNDCG_MULTIPLIER * top_k,
    )[[col_user, col_item]]
    df_hit = pd.merge(df_hit, rating_true_common_top_mult_k, on=[col_user, col_item])[
        [col_user, col_item, "rank"]
    ]

    return df_hit


def rndcg_at_k(
    rating_true,
    rating_pred,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    relevancy_method="top_k",
    k=DEFAULT_K,
    rndcg_multiplier=RNDCG_MULTIPLIER,
    threshold=DEFAULT_THRESHOLD,
    score_type="binary",
    discfun_type="loge",
    **kwargs
):
    """Normalized Discounted Cumulative Gain (nDCG).

    Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain

    Args:
        rating_true (pandas.DataFrame): True DataFrame
        rating_pred (pandas.DataFrame): Predicted DataFrame
        col_user (str): column name for user
        col_item (str): column name for item
        col_rating (str): column name for rating
        col_prediction (str): column name for prediction
        relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
            top k items are directly provided, so there is no need to compute the relevancy operation.
        k (int): number of top k items per user
        threshold (float): threshold of top items per user (optional)
        score_type (str): type of relevance scores ['binary', 'raw', 'exp']. With the default option 'binary', the
            relevance score is reduced to either 1 (hit) or 0 (miss). Option 'raw' uses the raw relevance score.
            Option 'exp' uses (2 ** RAW_RELEVANCE - 1) as the relevance score
        discfun_type (str): type of discount function ['loge', 'log2'] used to calculate DCG.

    Returns:
        float: nDCG at k (min=0, max=1).
    """
    col_rating = _get_rating_column(relevancy_method, **kwargs)
    df_hit = merge_ranking_true_pred_new(
        rating_true=rating_true,
        rating_pred=rating_pred,
        col_user=col_user,
        col_item=col_item,
        col_rating=col_rating,
        col_prediction=col_prediction,
        relevancy_method=relevancy_method,
        k=k,
        rndcg_multiplier=rndcg_multiplier,
        threshold=threshold,
    )

    if df_hit.shape[0] == 0:
        return 0.0

    df_dcg = df_hit.merge(rating_pred, on=[col_user, col_item]).merge(
        rating_true, on=[col_user, col_item], how="outer", suffixes=("_left", None)
    )

    if score_type == "binary":
        df_dcg["rel"] = 1
    elif score_type == "raw":
        df_dcg["rel"] = df_dcg[col_rating]
    elif score_type == "exp":
        df_dcg["rel"] = 2 ** df_dcg[col_rating] - 1
    else:
        raise ValueError("score_type must be one of 'binary', 'raw', 'exp'")

    if discfun_type == "loge":
        discfun = np.log
    elif discfun_type == "log2":
        discfun = np.log2
    else:
        raise ValueError("discfun_type must be one of 'loge', 'log2'")

    # Calculate the actual discounted gain for each record
    df_dcg["dcg"] = df_dcg["rel"] / discfun(1 + df_dcg["rank"])

    # Calculate the ideal discounted gain for each record
    df_idcg = df_dcg.sort_values([col_user, col_rating], ascending=False)
    df_idcg["irank"] = df_idcg.groupby(col_user, as_index=False, sort=False)[
        col_rating
    ].rank("first", ascending=False)
    df_idcg["idcg"] = df_idcg["rel"] / discfun(1 + df_idcg["irank"])

    # Calculate the actual DCG for each user
    df_user = df_dcg.groupby(col_user, as_index=False, sort=False).agg({"dcg": "sum"})

    # Calculate the ideal DCG for each user
    df_user = df_user.merge(
        df_idcg.groupby(col_user, as_index=False, sort=False)
        .head(k)
        .groupby(col_user, as_index=False, sort=False)
        .agg({"idcg": "sum"}),
        on=col_user,
    )

    # DCG over IDCG is the normalized DCG
    df_user["ndcg"] = df_user["dcg"] / df_user["idcg"]
    return df_user["ndcg"].mean()

## 1 Data

### 1.1 Load and split data

We split the full dataset into a `train` and `test` dataset to evaluate performance of the algorithm against a held-out set not seen during training. Because LightGCN generates recommendations based on user preferences, all users that are in the test set must also exist in the training set. For this case, we can use the provided `python_stratified_split` function which holds out a percentage (in this case 25%) of items from each user, but ensures all users are in both `train` and `test` datasets. Other options are available in the `dataset.python_splitters` module which provide more control over how the split occurs.

In [4]:
df_all_cols = pd.read_csv(DATA_FILE_NAME)
df_all_cols = df_all_cols[df_all_cols["country"] == COUNTRY]

df = df_all_cols[[COL_USER, COL_ITEM, "sold_count", "price"]]
# df[df["price"] == 0].head()
df[COL_RATING] = df["sold_count"] * df["price"]
df.drop(labels=["sold_count", "price"], axis=1, inplace=True)
df.rename(columns = {COL_USER: DEFAULT_USER_COL, COL_ITEM: DEFAULT_ITEM_COL, COL_RATING: DEFAULT_RATING_COL}, inplace = True)
df_num_items = df[DEFAULT_ITEM_COL].nunique()

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[COL_RATING] = df["sold_count"] * df["price"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(labels=["sold_count", "price"], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns = {COL_USER: DEFAULT_USER_COL, COL_ITEM: DEFAULT_ITEM_COL, COL_RATING: DEFAULT_RATING_COL}, inplace = True)


Unnamed: 0,userID,itemID,rating
0,"""f3c08845-37cd-420a-819c-653cfad4df7e""",Coartem 80/480mg Tablets x6,144900.0
1,"""d46d529e-e728-4d4e-b1a5-6512c894f43d""",Coartem 80/480mg Tablets x6,16560.0
2,"""67353226-2229-4b3b-b6aa-919fcc6bd7ff""",Coartem 80/480mg Tablets x6,35190.0
3,"""90dbea29-ee13-4aaf-8bda-e578befc20f7""",Coartem 80/480mg Tablets x6,2070.0
4,"""f3686258-8c08-4918-9bbb-545618397f3e""",Coartem 80/480mg Tablets x6,113850.0


In [5]:
train, test = python_stratified_split(df, ratio = TRAIN_FRAC, seed = SEED)

### 1.2 Process data

`ImplicitCF` is a class that intializes and loads data for the training process. During the initialization of this class, user IDs and item IDs are reindexed, ratings greater than zero are converted into implicit positive interaction, and adjacency matrix $R$ of user-item graph is created. Some important methods of `ImplicitCF` are:

`get_norm_adj_mat`, load normalized adjacency matrix of user-item graph if it already exists in `adj_dir`, otherwise call `create_norm_adj_mat` to create the matrix and save the matrix if `adj_dir` is not `None`. This method will be called during the initialization process of LightGCN model.

`create_norm_adj_mat`, create normalized adjacency matrix of user-item graph by calculating $D^{-\frac{1}{2}} A D^{-\frac{1}{2}}$, where $\mathbf{A}=\left(\begin{array}{cc}\mathbf{0} & \mathbf{R} \\ \mathbf{R}^{T} & \mathbf{0}\end{array}\right)$.

`train_loader`, generate a batch of training data — sample a batch of users and then sample one positive item and one negative item for each user. This method will be called before each epoch of the training process.


In [6]:
data = ImplicitCF(train=train, test=test, seed=SEED)

  df = train if test is None else train.append(test)


### 1.3 Prepare hyper-parameters

Important parameters of `LightGCN` model are:

`data`, initialized LightGCNDataset object.

`epochs`, number of epochs for training.

`n_layers`, number of layers of the model.

`eval_epoch`, if it is not None, evaluation metrics will be calculated on test set every "eval_epoch" epochs. In this way, we can observe the effect of the model during the training process.

`top_k`, the number of items to be recommended for each user when calculating ranking metrics.

A complete list of parameters can be found in `yaml_file`. We use `prepare_hparams` to read the yaml file and prepare a full set of parameters for the model. Parameters passed as the function's parameters will overwrite yaml settings.

In [7]:
hparams = prepare_hparams(yaml_file,
                          n_layers=N_LAYERS,
                          batch_size=BATCH_SIZE,
                          epochs=EPOCHS,
                          learning_rate=LEARNING_RATE,
                          eval_epoch=EVAL_EPOCH,
                          top_k=TOP_K_SPLIT_TRAIN_TEST,
                         )

## 2 Train model

With data and parameters prepared, we can create the LightGCN model.

To train the model, we simply need to call the `fit()` method.

In [8]:
model = LightGCN(hparams, data, seed=SEED)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


2023-08-10 21:56:21.639161: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-10 21:56:21.655636: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled


In [9]:
with Timer() as train_time:
    model.fit()

print("Took {} seconds for training.".format(train_time.interval))

Epoch 1 (train)3.8s: train loss = 0.47255 = (mf)0.47232 + (embed)0.00023
Epoch 2 (train)3.1s: train loss = 0.30749 = (mf)0.30692 + (embed)0.00057
Epoch 3 (train)3.2s: train loss = 0.28320 = (mf)0.28253 + (embed)0.00067
Epoch 4 (train)3.2s: train loss = 0.25437 = (mf)0.25353 + (embed)0.00084
Epoch 5 (train)3.1s + (eval)0.1s: train loss = 0.23280 = (mf)0.23176 + (embed)0.00103, 
Epoch 6 (train)3.1s: train loss = 0.22046 = (mf)0.21926 + (embed)0.00120
Epoch 7 (train)3.1s: train loss = 0.20976 = (mf)0.20841 + (embed)0.00136
Epoch 8 (train)3.1s: train loss = 0.20090 = (mf)0.19938 + (embed)0.00152
Epoch 9 (train)3.1s: train loss = 0.18758 = (mf)0.18588 + (embed)0.00169
Epoch 10 (train)3.1s + (eval)0.0s: train loss = 0.17971 = (mf)0.17782 + (embed)0.00189, 
Epoch 11 (train)3.1s: train loss = 0.17075 = (mf)0.16867 + (embed)0.00208
Epoch 12 (train)3.2s: train loss = 0.16143 = (mf)0.15915 + (embed)0.00228
Epoch 13 (train)3.2s: train loss = 0.15539 = (mf)0.15293 + (embed)0.00246
Epoch 14 (train)3

## 3 Prediction/Recommendation

Recommendation and evaluation have been performed on the specified test set during training. After training, we can also use the model to perform recommendation and evalution on other data. Here we still use `test` as test data, but `test` can be replaced by other data with similar data structure.

We can call `recommend_k_items` to recommend k items for each user passed in this function. We set `remove_seen=True` to remove the items already seen by the user. The function returns a dataframe, containing each user and top k items recommended to them and the corresponding ranking scores.

In [10]:
with Timer() as test_time:
    test_predictions = model.recommend_k_items(test, top_k=test[DEFAULT_ITEM_COL].nunique(), remove_seen=True)
print("Took {} seconds for prediction.".format(test_time.interval))

test_predictions.head()

Took 0.7883205410000187 seconds for prediction.


Unnamed: 0,userID,itemID,prediction
0,"""007580e6-ffd9-47f3-8b7b-fe8225d12441""",Passion Powder,8.317402
1,"""007580e6-ffd9-47f3-8b7b-fe8225d12441""",Nelbcam Piroxicam,8.254385
2,"""007580e6-ffd9-47f3-8b7b-fe8225d12441""",Nospamin Drops,8.057611
3,"""007580e6-ffd9-47f3-8b7b-fe8225d12441""",Vamirex Syrup 100ml,8.002269
4,"""007580e6-ffd9-47f3-8b7b-fe8225d12441""",Panadol Extra,7.893112


## 4 Evaluation

With `topk_scores` predicted by the model, we can evaluate how LightGCN performs on this test set.

In [11]:
eval_map = map_at_k(test, test_predictions, k=TOP_K_SPLIT_TRAIN_TEST)
eval_ndcg = ndcg_at_k(test, test_predictions, k=TOP_K_SPLIT_TRAIN_TEST)
eval_rndcg = rndcg_at_k(test, test_predictions, k=TOP_K_SPLIT_TRAIN_TEST, rndcg_multiplier=RNDCG_MULTIPLIER)
eval_precision = precision_at_k(test, test_predictions, k=TOP_K_SPLIT_TRAIN_TEST)
eval_recall = recall_at_k(test, test_predictions, k=TOP_K_SPLIT_TRAIN_TEST)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "RNDCG:\t%f" % eval_rndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.066422
NDCG:	0.407489
RNDCG:	0.315133
Precision@K:	0.398123
Recall@K:	0.104514


## [Skipped] Infer embeddings

With `infer_embedding` method of LightGCN model, we can export the embeddings of users and items in the training set to CSV files for future use.

In [12]:
# model.infer_embedding(user_file, item_file)

## 5 Train, Predict, and Evaluate on Whole Dataset

Earlier, we had split train (for model training) and test (for evaluation). In implementation, we have train = whole dataset, and we can evaluate on test = whole dataset.

In [13]:
## Data
data_whole = ImplicitCF(train=df, test=df, seed=SEED)

## Hyperparameters
hparams = prepare_hparams(yaml_file,
                          n_layers=N_LAYERS,
                          batch_size=BATCH_SIZE,
                          epochs=EPOCHS,
                          learning_rate=LEARNING_RATE,
                          eval_epoch=EVAL_EPOCH,
                          top_k=TOP_K_WHOLE,
                         )

## Train
model_whole = LightGCN(hparams, data, seed=SEED)
with Timer() as train_time:
    model_whole.fit()
print("Took {} seconds for training.".format(train_time.interval))

## Predict
with Timer() as test_time:
    all_predictions_whole = model_whole.recommend_k_items(df, top_k=df_num_items, remove_seen=False)
print("Took {} seconds for prediction.".format(test_time.interval))

  df = train if test is None else train.append(test)


Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.
Epoch 1 (train)3.9s: train loss = 0.47370 = (mf)0.47347 + (embed)0.00023
Epoch 2 (train)3.1s: train loss = 0.30517 = (mf)0.30459 + (embed)0.00058
Epoch 3 (train)3.1s: train loss = 0.27698 = (mf)0.27629 + (embed)0.00069
Epoch 4 (train)3.1s: train loss = 0.24917 = (mf)0.24830 + (embed)0.00086
Epoch 5 (train)3.1s + (eval)0.1s: train loss = 0.22872 = (mf)0.22767 + (embed)0.00106, 
Epoch 6 (train)3.1s: train loss = 0.21639 = (mf)0.21517 + (embed)0.00123
Epoch 7 (train)3.1s: train loss = 0.20507 = (mf)0.20368 + (embed)0.00139
Epoch 8 (train)3.0s: train loss = 0.19531 = (mf)0.19374 + (embed)0.00156
Epoch 9 (train)3.1s: train loss = 0.18221 = (mf)0.18046 + (embed)0.00174
Epoch 10 (train)3.0s + (eval)0.0s: train loss = 0.17575 = (mf)0.17382 + (embed)0.00194, 
Epoch 11 (train)3.0s: train loss = 0.16834 = (mf)0.16623 + (embed)0.00211
Epoch 12 (train)3.0s: train loss = 0.16005 = (mf)0.15776 + (embed)0

In [27]:
# TOP_K_WHOLE = 20
# RNDCG_MULTIPLIER = 2

## Evaluate
eval_map_whole = map_at_k(df, all_predictions_whole, k=TOP_K_WHOLE)
eval_ndcg_whole = ndcg_at_k(df, all_predictions_whole, k=TOP_K_WHOLE)
eval_rndcg_whole = rndcg_at_k(df, all_predictions_whole, k=TOP_K_WHOLE, rndcg_multiplier=RNDCG_MULTIPLIER)
eval_precision_whole = precision_at_k(df, all_predictions_whole, k=TOP_K_WHOLE)
eval_recall_whole = recall_at_k(df, all_predictions_whole, k=TOP_K_WHOLE)

print("MAP:\t%f" % eval_map_whole,
      "NDCG:\t%f" % eval_ndcg_whole,
      "RNDCG:\t%f" % eval_rndcg_whole,
      "Precision@K:\t%f" % eval_precision_whole,
      "Recall@K:\t%f" % eval_recall_whole, sep='\n')

MAP:	0.150734
NDCG:	0.830005
RNDCG:	0.407915
Precision@K:	0.793452
Recall@K:	0.173308


In [26]:
# SAVE_ALL_RECS = True
# SAVE_NEW_RECS = True

# Helper portion: get table of median price of every product
if SAVE_ALL_RECS or SAVE_NEW_RECS:
    prod_prices = df_all_cols[[COL_ITEM, "price"]]
    prod_prices = prod_prices.groupby(COL_ITEM, sort=False).median()
    prod_prices.rename(columns = {"price": "median price"}, inplace=True)
    prod_prices.index.names = [DEFAULT_ITEM_COL]
    df_extra = df_all_cols[[COL_USER, COL_ITEM, "sold_count", "price"]]
    df_extra[COL_RATING] = df_extra["sold_count"] * df_extra["price"]
    df_extra.rename(columns = {COL_USER: DEFAULT_USER_COL, COL_ITEM: DEFAULT_ITEM_COL, COL_RATING: DEFAULT_RATING_COL}, inplace=True)

## Save recommendations
if SAVE_ALL_RECS:
    top_k_predictions_whole = get_top_k_items(all_predictions_whole, col_rating=DEFAULT_PREDICTION_COL, k=TOP_K_WHOLE)
    top_k_predictions_whole.drop(DEFAULT_PREDICTION_COL, axis = 1, inplace = True)
    # add column of true sales, price, and rank of predicted products
    top_k_predictions_whole = top_k_predictions_whole.merge(prod_prices, how = 'left', on = [DEFAULT_ITEM_COL])
    top_all_true = get_top_k_items(df_extra, k=df_num_items)
    top_k_predictions_whole = top_k_predictions_whole.merge(top_all_true.drop("price", axis=1), how = 'left', on = [DEFAULT_USER_COL, DEFAULT_ITEM_COL], suffixes = (None, "_true"))
    top_k_predictions_whole["rank_true"] = top_k_predictions_whole["rank_true"].convert_dtypes()
    top_k_predictions_whole.rename(columns =\
        {DEFAULT_USER_COL: COL_USER, DEFAULT_ITEM_COL: "predicted " + COL_ITEM,\
        DEFAULT_RATING_COL: "predicted " + COL_ITEM + "'s true " + COL_RATING,\
        "rank_true": "predicted " + COL_ITEM + "'s true rank",\
        "sold_count": "predicted " + COL_ITEM + "'s true sold_count",\
        "median price": "predicted " + COL_ITEM + "'s median price"\
        }, inplace = True)
    # add columns of true top-ranked products
    top_all_true.rename(columns =\
        {DEFAULT_USER_COL: COL_USER, DEFAULT_ITEM_COL: "true " + COL_ITEM,\
        DEFAULT_RATING_COL: "true "  + COL_ITEM + "'s " + COL_RATING,\
        "sold_count": "true " + COL_ITEM + "'s sold_count",\
        "price": "true " + COL_ITEM + "'s price"}, inplace = True)
    top_k_predictions_whole = top_k_predictions_whole.merge(top_all_true, how = 'left', on = [COL_USER, 'rank'])
    # reorder columns
    top_k_predictions_whole = top_k_predictions_whole\
        [[COL_USER, "rank", "true " + COL_ITEM, "true " + COL_ITEM + "'s " + COL_RATING,\
        "true " + COL_ITEM + "'s sold_count", "true " + COL_ITEM + "'s price",\
        "predicted " + COL_ITEM, "predicted "  + COL_ITEM + "'s true " + COL_RATING,\
        "predicted " + COL_ITEM + "'s true rank", "predicted " + COL_ITEM + "'s true sold_count",\
        "predicted " + COL_ITEM + "'s median price"]]
    # save to csv
    top_k_predictions_whole.to_csv(THIS_ENGINE_NAME + "_" + COUNTRY + "_top_" + str(TOP_K_WHOLE) + "_all_prod_recs.csv")

if SAVE_NEW_RECS:
    new_predictions_whole = all_predictions_whole.merge(df, on=[DEFAULT_USER_COL,DEFAULT_ITEM_COL], indicator=True, how="left").query('_merge=="left_only"').drop('_merge', axis=1).drop([DEFAULT_RATING_COL], axis=1)
    top_k_predictions_whole = get_top_k_items(new_predictions_whole, col_rating=DEFAULT_PREDICTION_COL, k=TOP_K_WHOLE)
    top_k_predictions_whole.drop(DEFAULT_PREDICTION_COL, axis = 1, inplace = True)
    # add column of true price of predicted products
    top_k_predictions_whole = top_k_predictions_whole.merge(prod_prices, how = 'left', on = [DEFAULT_ITEM_COL])
    top_k_predictions_whole.rename(columns = {DEFAULT_USER_COL: COL_USER, DEFAULT_ITEM_COL: "predicted new " + COL_ITEM, "median price": "predicted new " + COL_ITEM + "'s median price"}, inplace=True)
    # add columns of true top-ranked products
    top_k_true = get_top_k_items(df_extra, k=TOP_K_WHOLE)
    top_k_true.rename(columns =\
        {DEFAULT_USER_COL: COL_USER, DEFAULT_ITEM_COL: "true " + COL_ITEM,\
        DEFAULT_RATING_COL: "true "  + COL_ITEM + "'s " + COL_RATING,\
        "sold_count": "true " + COL_ITEM + "'s sold_count",\
        "price": "true " + COL_ITEM + "'s price"}, inplace = True)
    top_k_predictions_whole = top_k_predictions_whole.merge(top_k_true, on = [COL_USER, 'rank'])
    # reorder columns
    top_k_predictions_whole = top_k_predictions_whole\
        [[COL_USER, "rank", "true " + COL_ITEM, "true " + COL_ITEM + "'s " + COL_RATING,\
        "true " + COL_ITEM + "'s sold_count", "true " + COL_ITEM + "'s price",\
        "predicted new " + COL_ITEM, "predicted new " + COL_ITEM + "'s median price"]]
    # save to csv
    top_k_predictions_whole.to_csv(THIS_ENGINE_NAME + "_" + COUNTRY + "_top_" + str(TOP_K_WHOLE) + "_new_prod_recs.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_extra[COL_RATING] = df_extra["sold_count"] * df_extra["price"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_extra.rename(columns = {COL_USER: DEFAULT_USER_COL, COL_ITEM: DEFAULT_ITEM_COL, COL_RATING: DEFAULT_RATING_COL}, inplace=True)


                                   userID                           itemID  \
0  "007580e6-ffd9-47f3-8b7b-fe8225d12441"                Peace Tonic 200ml   
1  "007580e6-ffd9-47f3-8b7b-fe8225d12441"              Monomin Tonic 200ml   
2  "007580e6-ffd9-47f3-8b7b-fe8225d12441"               Karaole Powder x10   
3  "007580e6-ffd9-47f3-8b7b-fe8225d12441"  Neoskin Triple Action Cream 30g   
4  "007580e6-ffd9-47f3-8b7b-fe8225d12441"    Panadol (Regular) 500mg 10x10   

   rank  
0     1  
1     2  
2     3  
3     4  
4     5  
                              median price
itemID                                    
Coartem 80/480mg Tablets x6        2550.00
Cap Ampiclox 500mg (Beecham)       7375.00
Ampiflux Suspension                1727.00
Dermovate Cream 20g                 865.00
Emzoclox 500mg Capsules x100       2811.19
