# Bilateral Variational Autoencoder (BiVAE)

## 0 Global Settings and Imports

In [1]:
import sys
import os
import torch
import cornac
import pandas as pd
import numpy as np
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED
from recommenders.utils.constants import (
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_PREDICTION_COL,
    DEFAULT_RELEVANCE_COL,
    DEFAULT_SIMILARITY_COL,
    DEFAULT_ITEM_FEATURES_COL,
    DEFAULT_ITEM_SIM_MEASURE,
    DEFAULT_K,
    DEFAULT_THRESHOLD,
)
DEFAULT_RNDCG_MULTIPLIER = 3

print("System version: {}".format(sys.version))
print("PyTorch version: {}".format(torch.__version__))
print("Cornac version: {}".format(cornac.__version__))

  from .autonotebook import tqdm as notebook_tqdm


FM model is only supported on Linux.
Windows executable can be found at http://www.libfm.org.
System version: 3.8.15 | packaged by conda-forge | (default, Nov 22 2022, 08:51:59) 
[Clang 14.0.6 ]
PyTorch version: 1.13.1
Cornac version: 1.15.4


In [2]:
THIS_ENGINE_NAME = "bivae_field_8"

DATA_FILE_NAME = "../Data/20230813T210908_sales_6mo_basic_single_events_removed.csv"
# DATA_FILE_NAME = "../Data/20230811T031507_sales_12mo_basic_single_events_removed.csv"
# DATA_FILE_NAME = "../Data/20230809T003134_sales_25mo_basic_single_events_removed.csv"
COL_USER = "location_id"
COL_ITEM = "product"
COL_RATING = "sold_count"

# country
COUNTRY = "nigeria"

# top k items to recommend, for train & test
TOP_K_SPLIT_TRAIN_TEST = 10
# top k items to recommend, for final product recommendation output
TOP_K_WHOLE = 10
SAVE_ALL_RECS = False
SAVE_NEW_RECS = False

# fraction of location_skus to include in training dataset
TRAIN_FRAC = 0.75

# top MULTIPLIER * k items are considered relevant for nDCG
RNDCG_MULTIPLIER = 3

# Model parameters
LATENT_DIM = 50
ENCODER_DIMS = [100]
ACT_FUNC = "tanh"
LIKELIHOOD = "pois"
NUM_EPOCHS = 500
BATCH_SIZE = 128
LEARNING_RATE = 0.001

### Revised/copied code from packages

In [3]:
def _get_rating_column(relevancy_method: str, **kwargs) -> str:
    r"""Helper utility to simplify the arguments of eval metrics
    Attemtps to address https://github.com/microsoft/recommenders/issues/1737.

    Args:
        relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
            top k items are directly provided, so there is no need to compute the relevancy operation.

    Returns:
        str: rating column name.
    """
    if relevancy_method != "top_k":
        if "col_rating" not in kwargs:
            raise ValueError("Expected an argument `col_rating` but wasn't found.")
        col_rating = kwargs.get("col_rating")
    else:
        col_rating = kwargs.get("col_rating", DEFAULT_RATING_COL)
    return col_rating


def get_top_k_items(
    dataframe, col_user=DEFAULT_USER_COL, col_rating=DEFAULT_RATING_COL, k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Pandas
    DataFrame, output a Pandas DataFrame in the dense format of top k items
    for each user.

    Note:
        If it is implicit rating, just append a column of constants to be
        ratings.

    Args:
        dataframe (pandas.DataFrame): DataFrame of rating data (in the format
        customerID-itemID-rating)
        col_user (str): column name for user
        col_rating (str): column name for rating
        k (int or None): number of items for each user; None means that the input has already been
        filtered out top k items and sorted by ratings and there is no need to do that again.

    Returns:
        pandas.DataFrame: DataFrame of top k items for each user, sorted by `col_user` and `rank`
    """
    # Sort dataframe by col_user and (top k) col_rating
    if k is None:
        top_k_items = dataframe
    else:
        top_k_items = (
            dataframe.sort_values([col_user, col_rating], ascending=[True, False])
            .groupby(col_user, as_index=False)
            .head(k)
            .reset_index(drop=True)
        )
    # Add ranks
    top_k_items["rank"] = top_k_items.groupby(col_user, sort=False).cumcount() + 1
    return top_k_items


def merge_ranking_true_pred_new(
    rating_true,
    rating_pred,
    col_user,
    col_item,
    col_rating,
    col_prediction,
    relevancy_method,
    k=DEFAULT_K,
    rndcg_multiplier=DEFAULT_RNDCG_MULTIPLIER,
    threshold=DEFAULT_THRESHOLD,
):
    """Filter truth and prediction data frames on common users

    Args:
        rating_true (pandas.DataFrame): True DataFrame
        rating_pred (pandas.DataFrame): Predicted DataFrame
        col_user (str): column name for user
        col_item (str): column name for item
        col_rating (str): column name for rating
        col_prediction (str): column name for prediction
        relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
            top k items are directly provided, so there is no need to compute the relevancy operation.
        k (int): number of top k items per user (optional)
        threshold (float): threshold of top items per user (optional)

    Returns:
        pandas.DataFrame: DataFrame of recommendation hits, sorted by `col_user` and `rank`
    """

    # Make sure the prediction and true data frames have the same set of users
    common_users = set(rating_true[col_user]).intersection(set(rating_pred[col_user]))
    rating_true_common = rating_true[rating_true[col_user].isin(common_users)]
    rating_pred_common = rating_pred[rating_pred[col_user].isin(common_users)]

    # Return hit items in prediction data frame with ranking information. This is used for calculating NDCG and MAP.
    # Use first to generate unique ranking values for each item. This is to align with the implementation in
    # Spark evaluation metrics, where index of each recommended items (the indices are unique to items) is used
    # to calculate penalized precision of the ordered items.
    if relevancy_method == "top_k":
        top_k = k
    elif relevancy_method == "by_threshold":
        top_k = threshold
    elif relevancy_method is None:
        top_k = None
    else:
        raise NotImplementedError("Invalid relevancy_method")
    df_hit = get_top_k_items(
        dataframe=rating_pred_common,
        col_user=col_user,
        col_rating=col_prediction,
        k=top_k,
    )
    rating_true_common_top_mult_k = get_top_k_items(
        dataframe=rating_true_common,
        col_user=col_user,
        col_rating=col_rating,
        k=RNDCG_MULTIPLIER * top_k,
    )[[col_user, col_item]]
    df_hit = pd.merge(df_hit, rating_true_common_top_mult_k, on=[col_user, col_item])[
        [col_user, col_item, "rank"]
    ]

    return df_hit


def rndcg_at_k(
    rating_true,
    rating_pred,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    relevancy_method="top_k",
    k=DEFAULT_K,
    rndcg_multiplier=RNDCG_MULTIPLIER,
    threshold=DEFAULT_THRESHOLD,
    score_type="binary",
    discfun_type="loge",
    **kwargs
):
    """Normalized Discounted Cumulative Gain (nDCG).

    Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain

    Args:
        rating_true (pandas.DataFrame): True DataFrame
        rating_pred (pandas.DataFrame): Predicted DataFrame
        col_user (str): column name for user
        col_item (str): column name for item
        col_rating (str): column name for rating
        col_prediction (str): column name for prediction
        relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
            top k items are directly provided, so there is no need to compute the relevancy operation.
        k (int): number of top k items per user
        threshold (float): threshold of top items per user (optional)
        score_type (str): type of relevance scores ['binary', 'raw', 'exp']. With the default option 'binary', the
            relevance score is reduced to either 1 (hit) or 0 (miss). Option 'raw' uses the raw relevance score.
            Option 'exp' uses (2 ** RAW_RELEVANCE - 1) as the relevance score
        discfun_type (str): type of discount function ['loge', 'log2'] used to calculate DCG.

    Returns:
        float: nDCG at k (min=0, max=1).
    """
    col_rating = _get_rating_column(relevancy_method, **kwargs)
    df_hit = merge_ranking_true_pred_new(
        rating_true=rating_true,
        rating_pred=rating_pred,
        col_user=col_user,
        col_item=col_item,
        col_rating=col_rating,
        col_prediction=col_prediction,
        relevancy_method=relevancy_method,
        k=k,
        rndcg_multiplier=rndcg_multiplier,
        threshold=threshold,
    )

    if df_hit.shape[0] == 0:
        return 0.0

    df_dcg = df_hit.merge(rating_pred, on=[col_user, col_item]).merge(
        rating_true, on=[col_user, col_item], how="outer", suffixes=("_left", None)
    )

    if score_type == "binary":
        df_dcg["rel"] = 1
    elif score_type == "raw":
        df_dcg["rel"] = df_dcg[col_rating]
    elif score_type == "exp":
        df_dcg["rel"] = 2 ** df_dcg[col_rating] - 1
    else:
        raise ValueError("score_type must be one of 'binary', 'raw', 'exp'")

    if discfun_type == "loge":
        discfun = np.log
    elif discfun_type == "log2":
        discfun = np.log2
    else:
        raise ValueError("discfun_type must be one of 'loge', 'log2'")

    # Calculate the actual discounted gain for each record
    df_dcg["dcg"] = df_dcg["rel"] / discfun(1 + df_dcg["rank"])

    # Calculate the ideal discounted gain for each record
    df_idcg = df_dcg.sort_values([col_user, col_rating], ascending=False)
    df_idcg["irank"] = df_idcg.groupby(col_user, as_index=False, sort=False)[
        col_rating
    ].rank("first", ascending=False)
    df_idcg["idcg"] = df_idcg["rel"] / discfun(1 + df_idcg["irank"])

    # Calculate the actual DCG for each user
    df_user = df_dcg.groupby(col_user, as_index=False, sort=False).agg({"dcg": "sum"})

    # Calculate the ideal DCG for each user
    df_user = df_user.merge(
        df_idcg.groupby(col_user, as_index=False, sort=False)
        .head(k)
        .groupby(col_user, as_index=False, sort=False)
        .agg({"idcg": "sum"}),
        on=col_user,
    )

    # DCG over IDCG is the normalized DCG
    df_user["ndcg"] = df_user["dcg"] / df_user["idcg"]
    return df_user["ndcg"].mean()

## 1 Data


### 1.1 Load and split data

To evaluate the performance of item recommendation, we adopted the provided `python_random_split` tool for the consistency.  Data is randomly split into training and test sets with the ratio of 75/25.


Note that Cornac also cover different [built-in schemes](https://cornac.readthedocs.io/en/latest/eval_methods.html) for model evaluation.

In [4]:
data_all_cols = pd.read_csv(DATA_FILE_NAME)
data_all_cols = data_all_cols[data_all_cols["country"] == COUNTRY]

data = data_all_cols[[COL_USER, COL_ITEM, COL_RATING]]

data.head()

Unnamed: 0,location_id,product,sold_count
0,"""45caa0e9-32b8-4c6d-9730-6b9241c28ab0""",Coartem 80/480mg Tablets x6,6
1,"""710ba16d-3f48-401e-b318-eef8d71844ab""",Coartem 80/480mg Tablets x6,11
2,"""a613511f-204b-4b4b-ad44-7cb83446c268""",Coartem 80/480mg Tablets x6,2
3,"""ce5ab12b-79fd-4584-8328-63a0e8f5c038""",Coartem 80/480mg Tablets x6,11
4,"""24dd2d85-3dbf-4171-b564-7cc0d8f92465""",Coartem 80/480mg Tablets x6,10


Here, we use quantity sold, not revenue of sales, for each product.

In [5]:
train, test = python_random_split(data, TRAIN_FRAC, seed = SEED)

### 1.2 Cornac Dataset

To work with models implemented in Cornac, we need to construct an object from [Dataset](https://cornac.readthedocs.io/en/latest/data.html#module-cornac.data.dataset) class.

Dataset Class in Cornac serves as the main object that the models will interact with.  In addition to data transformations, Dataset provides a bunch of useful iterators for looping through the data, as well as supporting different negative sampling techniques.

In [6]:
train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)

print('Number of users: {}'.format(train_set.num_users))
print('Number of items: {}'.format(train_set.num_items))

Number of users: 361
Number of items: 1781




## 2 Training

The BiVAE has a few important parameters that we need to consider:

- `k`: dimension of the latent space (i.e. the size of $\bf{\theta}_u$  and  $\bf{\beta}_i$ ).
- `encoder_structure`: dimension(s) of hidden layer(s) of the user and item encoders.
- `act_fn`: non-linear activation function used in the encoders.
- `likelihood`: choice of the likelihood function being optimized.
- `n_epochs`: number of passes through training data.
- `batch_size`: size of mini-batches of data during training.
- `learning_rate`: step size in the gradient update rules.

To train the model, we simply need to call the `fit()` method.

In [7]:
bivae = cornac.models.BiVAECF(
    k=LATENT_DIM,
    encoder_structure=ENCODER_DIMS,
    act_fn=ACT_FUNC,
    likelihood=LIKELIHOOD,
    n_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    seed=SEED,
    use_gpu=torch.cuda.is_available(),
    verbose=True
)

In [8]:
with Timer() as t:
    bivae.fit(train_set)
print("Took {} seconds for training.".format(t))

100%|██████████| 500/500 [02:22<00:00,  3.50it/s, loss_i=0.469, loss_u=2.55]

Took 143.0007 seconds for training.





## 3 Prediction

Now that our model is trained, we can produce the ranked lists for recommendation.  Every recommender models in Cornac provide `rate()` and `rank()` methods for predicting item rated value as well as item ranked list for a given user.  To make use of the current evaluation schemes, we will through `predict()` and `predict_ranking()` functions inside `cornac_utils` to produce the predictions.

Note that BiVAE model is effectively designed for item ranking.  Hence, we only measure the performance using ranking metrics.

In [9]:
with Timer() as t:
    all_predictions = predict_ranking(bivae, train, usercol=COL_USER, itemcol=COL_ITEM, remove_seen=True)
print("Took {} seconds for prediction.".format(t))

Took 0.4034 seconds for prediction.


In [10]:
all_predictions.head()

Unnamed: 0,location_id,product,prediction
34826,"""5de61526-8bb1-4c94-a637-30461e397a25""",Amlosam 5mg Tablet,0.00084
34827,"""5de61526-8bb1-4c94-a637-30461e397a25""",Wellman Original Tablets x30,0.24065
34828,"""5de61526-8bb1-4c94-a637-30461e397a25""",Benylin Dry Cough 100ml Syrup,0.260115
34829,"""5de61526-8bb1-4c94-a637-30461e397a25""",Doxycap 100mg Capsules x100,0.109588
34830,"""5de61526-8bb1-4c94-a637-30461e397a25""",Durex Extra Safe x3[D/C],0.083123


In [11]:
# Explore min
all_predictions.sort_values(by="prediction", ascending=True).head()

Unnamed: 0,location_id,product,prediction
103108,"""72d85136-5b62-48d2-a677-c5b10e091f2a""",Keytifen Eye Drops,7.782439e-07
103678,"""72d85136-5b62-48d2-a677-c5b10e091f2a""",Jawaclox Drops,7.782439e-07
103625,"""72d85136-5b62-48d2-a677-c5b10e091f2a""",Cartol 200mg Tablet x100,7.782439e-07
103506,"""72d85136-5b62-48d2-a677-c5b10e091f2a""",Surexime 60ml Suspension,7.782439e-07
104197,"""72d85136-5b62-48d2-a677-c5b10e091f2a""",Kifaru 100mg Tablet,8.562724e-07


In [12]:
# Explore max
all_predictions.sort_values(by="prediction", ascending=False).head()

Unnamed: 0,location_id,product,prediction
47590,"""53adbd2d-b6ed-4d1f-89be-b21832b4fa21""",Ciprotab 500mg Tablets x10,0.998725
185769,"""88395a11-a04e-437d-ade6-37b12c54c783""",Artequick 62.5/375mg Tablets x4,0.994728
55318,"""96bdd2f2-044d-4060-95d2-e653045b2f6f""",De-Deon's Syrup 280ml,0.99393
47706,"""53adbd2d-b6ed-4d1f-89be-b21832b4fa21""",Vasoprin 75mg Tablets x100,0.992954
244252,"""38b5a6fc-63ae-4a92-812f-7d97d1308d0a""",Amatem 80/480 Softgel x6,0.992365


## 4 Evaluation / Testing

In [13]:
# eval_map = map_at_k(test, all_predictions, col_user=COL_USER, col_item=COL_ITEM, col_rating=COL_RATING, k=TOP_K_SPLIT_TRAIN_TEST)
# eval_ndcg = ndcg_at_k(test, all_predictions, col_user=COL_USER, col_item=COL_ITEM, col_rating=COL_RATING, k=TOP_K_SPLIT_TRAIN_TEST)
# eval_rndcg = rndcg_at_k(test, all_predictions, col_user=COL_USER, col_item=COL_ITEM, col_rating=COL_RATING, k=TOP_K_SPLIT_TRAIN_TEST, rndcg_multiplier=RNDCG_MULTIPLIER)
# eval_precision = precision_at_k(test, all_predictions, col_user=COL_USER, col_item=COL_ITEM, col_rating=COL_RATING, k=TOP_K_SPLIT_TRAIN_TEST)
# eval_recall = recall_at_k(test, all_predictions, col_user=COL_USER, col_item=COL_ITEM, col_rating=COL_RATING, k=TOP_K_SPLIT_TRAIN_TEST)

# print("MAP:\t%f" % eval_map,
#       "NDCG:\t%f" % eval_ndcg,
#       "RNDCG:\t%f" % eval_rndcg,
#       "Precision@K:\t%f" % eval_precision,
#       "Recall@K:\t%f" % eval_recall, sep='\n')

## 5 Train, Predict, and Evaluate on Whole Dataset

Earlier, we had split train (for model training) and test (for evaluation). In implementation, we have train = whole dataset, and we can evaluate on test = whole dataset.

In [14]:
## Data
data_set = cornac.data.Dataset.from_uir(data.itertuples(index=False), seed=SEED)
print('Number of users: {}'.format(data_set.num_users))
print('Number of items: {}'.format(data_set.num_items))

## Train
bivae_whole = cornac.models.BiVAECF(
    k=LATENT_DIM,
    encoder_structure=ENCODER_DIMS,
    act_fn=ACT_FUNC,
    likelihood=LIKELIHOOD,
    n_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    seed=SEED,
    use_gpu=torch.cuda.is_available(),
    verbose=True
)
with Timer() as t:
    bivae_whole.fit(data_set)
print("Took {} seconds for training.".format(t))

## Predict
with Timer() as t:
    all_predictions_whole = predict_ranking(bivae, data, usercol=COL_USER, itemcol=COL_ITEM, remove_seen=False)
print("Took {} seconds for prediction.".format(t))

Number of users: 361
Number of items: 1863


100%|██████████| 500/500 [02:34<00:00,  3.23it/s, loss_i=0.537, loss_u=2.98]


Took 154.8771 seconds for training.
Took 0.1545 seconds for prediction.


In [17]:
# Explore min
all_predictions_whole.sort_values(by="prediction", ascending=True).head()

Unnamed: 0,location_id,product,prediction
75543,"""72d85136-5b62-48d2-a677-c5b10e091f2a""",Surexime 60ml Suspension,7.782439e-07
75716,"""72d85136-5b62-48d2-a677-c5b10e091f2a""",Jawaclox Drops,7.782439e-07
75662,"""72d85136-5b62-48d2-a677-c5b10e091f2a""",Cartol 200mg Tablet x100,7.782439e-07
75123,"""72d85136-5b62-48d2-a677-c5b10e091f2a""",Keytifen Eye Drops,7.782439e-07
76235,"""72d85136-5b62-48d2-a677-c5b10e091f2a""",Kifaru 100mg Tablet,8.562724e-07


In [18]:
# Explore max
all_predictions_whole.sort_values(by="prediction", ascending=False).head()

Unnamed: 0,location_id,product,prediction
14481,"""53adbd2d-b6ed-4d1f-89be-b21832b4fa21""",Ciprotab 500mg Tablets x10,0.998725
23386,"""96bdd2f2-044d-4060-95d2-e653045b2f6f""",Ciprotab 500mg Tablets x10,0.998623
165866,"""88395a11-a04e-437d-ade6-37b12c54c783""",Ciprotab 500mg Tablets x10,0.998487
228201,"""38b5a6fc-63ae-4a92-812f-7d97d1308d0a""",Ciprotab 500mg Tablets x10,0.998449
219296,"""bc1f9ebd-8c5b-48aa-9880-53b635a6deb8""",Ciprotab 500mg Tablets x10,0.997716


In [19]:
# TOP_K_WHOLE = 100
# RNDCG_MULTIPLIER = 3

## Evaluate
# eval_map_whole = map_at_k(data, all_predictions_whole, col_user=COL_USER, col_item=COL_ITEM, col_rating=COL_RATING, k=TOP_K_WHOLE)
# eval_ndcg_whole = ndcg_at_k(data, all_predictions_whole, col_user=COL_USER, col_item=COL_ITEM, col_rating=COL_RATING, k=TOP_K_WHOLE)
# eval_rndcg_whole = rndcg_at_k(data, all_predictions_whole, col_user=COL_USER, col_item=COL_ITEM, col_rating=COL_RATING, k=TOP_K_WHOLE, rndcg_multiplier=RNDCG_MULTIPLIER)
# eval_precision_whole = precision_at_k(data, all_predictions_whole, col_user=COL_USER, col_item=COL_ITEM, col_rating=COL_RATING, k=TOP_K_WHOLE)
# eval_recall_whole = recall_at_k(data, all_predictions_whole, col_user=COL_USER, col_item=COL_ITEM, col_rating=COL_RATING, k=TOP_K_WHOLE)

# print("MAP:\t%f" % eval_map_whole,
#       "NDCG:\t%f" % eval_ndcg_whole,
#       "RNDCG:\t%f" % eval_rndcg_whole,
#       "Precision@K:\t%f" % eval_precision_whole,
#       "Recall@K:\t%f" % eval_recall_whole, sep='\n')

In [20]:
# SAVE_ALL_RECS = True
# SAVE_NEW_RECS = True

## Save recommendations
# if SAVE_ALL_RECS:
#     top_k_predictions_whole = get_top_k_items(all_predictions_whole, col_user=COL_USER, col_rating=DEFAULT_PREDICTION_COL, k=TOP_K_WHOLE)
#     top_k_predictions_whole.drop(DEFAULT_PREDICTION_COL, axis = 1, inplace = True)
#     # add column of true sales and rank of predicted products
#     top_all_true = get_top_k_items(data, col_user=COL_USER, col_rating=COL_RATING, k=data_set.num_items)
#     top_k_predictions_whole = top_k_predictions_whole.merge(top_all_true, how = 'left', on = [COL_USER, COL_ITEM], suffixes = (None, "_true"))
#     top_k_predictions_whole["rank_true"] = top_k_predictions_whole["rank_true"].convert_dtypes()
#     top_k_predictions_whole.rename(columns = {COL_ITEM: "predicted " + COL_ITEM, COL_RATING: "predicted " + COL_ITEM + "'s true " + COL_RATING, "rank_true": "predicted " + COL_ITEM + "'s true rank"}, inplace = True)
#     # add columns of true top-ranked products
#     top_all_true.rename(columns = {COL_ITEM: "true " + COL_ITEM, COL_RATING: "true "  + COL_ITEM + "'s " + COL_RATING}, inplace = True)
#     top_k_predictions_whole = top_k_predictions_whole.merge(top_all_true, how = 'left', on = [COL_USER, 'rank'])
#     # reorder columns
#     top_k_predictions_whole = top_k_predictions_whole[[COL_USER, "rank", "true " + COL_ITEM, "true " + COL_ITEM + "'s " + COL_RATING, "predicted " + COL_ITEM, "predicted "  + COL_ITEM + "'s true " + COL_RATING, "predicted " + COL_ITEM + "'s true rank"]]
#     # save to csv
#     top_k_predictions_whole.to_csv(THIS_ENGINE_NAME + "_" + COUNTRY + "_top_" + str(TOP_K_WHOLE) + "_all_prod_recs.csv")
# if SAVE_NEW_RECS:
#     new_predictions_whole = all_predictions_whole.merge(data, on=[COL_USER,COL_ITEM], indicator=True, how="left").query('_merge=="left_only"').drop('_merge', axis=1).drop([COL_RATING], axis=1)
#     top_k_predictions_whole = get_top_k_items(new_predictions_whole, col_user=COL_USER, col_rating=DEFAULT_PREDICTION_COL, k=TOP_K_WHOLE)
#     top_k_predictions_whole.drop(DEFAULT_PREDICTION_COL, axis = 1, inplace = True)
#     top_k_predictions_whole.rename(columns = {COL_ITEM: "predicted new " + COL_ITEM}, inplace=True)
#     top_k_true = get_top_k_items(data, col_user=COL_USER, col_rating=COL_RATING, k=TOP_K_WHOLE)
#     top_k_true.rename(columns = {COL_ITEM: "true " + COL_ITEM, COL_RATING: "true "  + COL_ITEM + "'s " + COL_RATING}, inplace = True)
#     top_k_predictions_whole = top_k_predictions_whole.merge(top_k_true, on = [COL_USER, 'rank'])
#     top_k_predictions_whole = top_k_predictions_whole[[COL_USER, "rank", "true " + COL_ITEM, "true " + COL_ITEM + "'s " + COL_RATING, "predicted new " + COL_ITEM]]
#     top_k_predictions_whole.to_csv(THIS_ENGINE_NAME + "_" + COUNTRY + "_top_" + str(TOP_K_WHOLE) + "_new_prod_recs.csv")