In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os

import pandas as pd
from tqdm.autonotebook import tqdm

tqdm.pandas()

# How to use this notebook 
- This notebook describes the process of modelling and building the recommendation service
- It contains some EDA and is a playground for my experiments
- This notebook can be executed from top to bottom to arrive at a served service

# Approach to train a Restaurant Recommender

## Choose method
- I came across the <a href="https://arxiv.org/abs/1507.08439">LightFM paper</a> months ago and was fascinated by how intuitive it is. According to that paper, LightFM is not only able to incorporate metadata from users and items, it is also capable of outputting semantic embeddings. Furthermore, LightFM is made to exploit implicit feedback, which is the type of rating we have for this problem.
- Apart from LightFM, a content-based recommendation system comes to the top of my mind, because the interactions are sparse: 75% of our users are one-off.
    
## Define rating
- There are three kinds of rating I have tried:
    - The count number of reservations
    - The weighted count of reservations (based on both the average of that user and the average of that restaurant)
    - A random rating, to estimate impact of rating on our performance metrics (AUC and Precision At K)
- Since the experimental results suggest no significant change in performance w.r.t. the rating calculation, the final model sticks with the basic one (count of reservations).

## Split train test
- As in this scope I don't exploit contextual information to recommend, I try to model the taste of users.
- Given the above objective, I apply a simple ratings drop-out for test set

## Fit model
- LightFM offers a variety of hyper-parameters. Reading the docs gives me enough information to select decent default parameters, including `loss='warp'` and `no_components=50`.
- I try some quick random searches and find no significant improvement.
- I try incorporating the categorical metadata of restaurants as features, but find no significant improvement either.
- Therefore, I believe that the next experiment should be to re-examine the way we feed data into model training. Then to change the input and maybe combine other approaches.

## Serve model
- At the time of this writing, BentoML is a new (2 years old) and promising open-source package to help us ship our model.
- As BentoML has not yet introduced a pre-defined framework for LightFM, I implement two custom artifacts to encapsulate LightFM modeling objects: dataset and model. It is convenient that the LightFM artifacts can be persisted using Python `pickle` package (just like Scikit-Learn).
- In case the input user is not known to the LightFM model, there is a Popular Recommender as a fallback

## Shipping
- As they share many environment requirements, I extend the bentoml docker image to build a new one ready to run our scripts
- The training step writes its artifacts to a shared volume so that later processes can pick them up

# Load data

In [None]:
DATA_RAW_DIR = "../data/datasets/"

In [None]:
# Load the raw reservation log; rez_id is read as str rather than parsed as a number.
rez_df = pd.read_csv(DATA_RAW_DIR + "reservations.csv", dtype={"rez_id": str})

# Both time columns are interpreted as epoch seconds and converted to
# timezone-aware UTC datetimes.
for _time_col in ("reservation_time", "booking_time"):
    rez_df[_time_col] = pd.to_datetime(rez_df[_time_col], unit="s", utc=True)

In [None]:
rez_df

# EDA

## Identify low-frequency restaurants

Assumption: Those low-frequency restaurants are not safe to recommend because we know little about them.

In [None]:
# Per-restaurant activity: number of distinct reservations and of distinct users.
# Resulting columns: rez_id_nunique, hashed_email_nunique.
res_freq = (
    rez_df.groupby("RestaurantUID")
    .agg({"rez_id": "nunique", "hashed_email": "nunique"})
    .add_suffix("_nunique")
)

In [None]:
res_freq.describe(include="all", percentiles=[0.05, 0.1, 0.2, 0.5, 0.75]).T

If we recommend only restaurants that more than 10 users have visited, we drop about 20% of the restaurants.

## Identify low-frequency users

In [None]:
# Per-user activity: number of distinct reservations and of distinct restaurants.
# Resulting columns: rez_id_nunique, RestaurantUID_nunique.
user_freq = (
    rez_df.groupby("hashed_email")
    .agg({"rez_id": "nunique", "RestaurantUID": "nunique"})
    .add_suffix("_nunique")
)

In [None]:
user_freq.describe(include="all", percentiles=[0.25, 0.5, 0.75, 0.8, 0.9]).T

If we apply LightFM to recommend only for users who visited more than 1 restaurant, we will not touch 75% of our users! This finding also indicates that we suffer from the cold-start problem and might need to add a content-based recommender system to handle it.

# Preprocess

## Count reservations as rating

In [None]:
# One row per (user, restaurant) pair with the number of distinct reservations
# stored in the rez_id_nunique column.
rating_df = (
    rez_df.groupby(["hashed_email", "RestaurantUID"])
    .agg({"rez_id": "nunique"})
    .add_suffix("_nunique")
    .reset_index()
)
rating_df

In [None]:
rating_df.describe(include="all", percentiles=[0.25, 0.5, 0.75, 0.8, 0.9, 0.95]).T

Only 5% of user–restaurant pairs have more than one booking, i.e. users rarely book the same restaurant more than once.

The first naive implicit rating is obtained by clipping `rez_id_nunique` to the range [1, 5].

In [None]:
# Implicit rating: the reservation count of the pair, bounded to the interval [1, 5].
rating_df["rating"] = rating_df["rez_id_nunique"].clip(lower=1, upper=5)

## Weighted average rating

In [None]:
# Attach to every row the mean rating of its user and the mean rating of its
# restaurant (group key -> name of the column that receives the group mean).
_mean_rating_cols = {
    "hashed_email": "hashed_email_mean_rating",
    "RestaurantUID": "RestaurantUID_mean_rating",
}
for _group_key, _mean_col in _mean_rating_cols.items():
    rating_df[_mean_col] = rating_df.groupby(_group_key)["rating"].transform("mean")

In [None]:
# Weighted rating: the average of the rating relative to the user's mean rating
# and the rating relative to the restaurant's mean rating.
_rating = rating_df["rating"]
_relative_to_user = _rating.div(rating_df["hashed_email_mean_rating"])
_relative_to_restaurant = _rating.div(rating_df["RestaurantUID_mean_rating"])
rating_df["rating_wa"] = _relative_to_user.add(_relative_to_restaurant).div(2)

In [None]:
rating_df.describe(include="all").T

## Random rating

To test the impact of implementing implicit rating

In [None]:
import numpy as np

In [None]:
# Random baseline rating drawn uniformly from the same 1..5 scale as the clipped
# count-based `rating` column (the upper bound of range() is exclusive, hence 6).
random_range = list(range(1, 6))
# A dedicated, seeded generator keeps this column reproducible across kernel
# restarts without touching NumPy's global random state.
_random_rating_rng = np.random.RandomState(13)
rating_df["rating_random"] = _random_rating_rng.choice(
    random_range, size=rating_df.shape[0]
)

# Getting restaurant metadata

## Load res_cats

In [None]:
# Category tables; every column is read as str (dtype=str), including the id
# columns that the merges further down join on.
cat_df = pd.read_csv(DATA_RAW_DIR + "restaurant_category.csv", dtype=str)
res_cats_rel_df = pd.read_csv(DATA_RAW_DIR + "res_cats_relationship.csv", dtype=str)

In [None]:
cat_df

In [None]:
# Self-join of the category table: look up each category's parent row by
# parent_id to attach the parent's name as cat_parent_name.
_cols = ["id", "name"]
_cat_df = cat_df[_cols].rename(columns={"id": "parent_id", "name": "cat_parent_name"})
# Left join: categories whose parent_id has no match are kept (cat_parent_name is NaN).
cat_cross_df = cat_df.merge(_cat_df, how="left", on="parent_id")
cat_cross_df["cat_parent_name"] = cat_cross_df["cat_parent_name"].str.upper()

In [None]:
# Denormalise: attach the category attributes to every restaurant-category link,
# drop the now redundant category `id` key and prefix the category columns with cat_.
_cat_column_names = {
    "parent_id": "cat_parent_id",
    "name": "cat_name",
    "country_code": "cat_country_code",
}
cat_denom_df = (
    res_cats_rel_df.merge(cat_cross_df, how="left", left_on="cat_id", right_on="id")
    .drop(columns=["id"])
    .rename(columns=_cat_column_names)
)
cat_denom_df["cat_name"] = cat_denom_df["cat_name"].str.lower()

In [None]:
# Collapse to one row per (RestaurantUID, cat_country_code) pair; the count is
# only a vehicle for the grouping and is discarded right away.
res_agg_df = (
    cat_denom_df.groupby(["RestaurantUID", "cat_country_code"], as_index=False)[
        "cat_id"
    ]
    .count()
    .drop(columns=["cat_id"])
)

# Every restaurant must appear exactly once, i.e. belong to a single country code.
assert not res_agg_df["RestaurantUID"].duplicated().any()

In [None]:
res_agg_df

## Format res_cats

In [None]:
# One dict per row of res_agg_df, keyed by column name (RestaurantUID, cat_country_code).
res_features = res_agg_df.to_dict(orient="records")

# Prepare rating format 

In [None]:
# rating_col selects which rating column of rating_df feeds the model; the
# notebook also defines "rating_wa" and "rating_random" as alternatives.
rating_col = "rating"
cols = ["hashed_email", "RestaurantUID", rating_col]
# One dict per (user, restaurant) row, keyed by the three column names above.
rating_dicts = rating_df[cols].to_dict(orient="records")

In [None]:
from lightfm.data import Dataset

# Register the user ids (hashed_email) and item ids (RestaurantUID) seen in the
# ratings with the LightFM Dataset; both are passed as lazy generators.
dataset = Dataset()
dataset.fit(
    (x["hashed_email"] for x in rating_dicts),
    (x["RestaurantUID"] for x in rating_dicts),
)

In [None]:
# Extend the existing dataset with the restaurants listed in the category data
# and register their country codes as item feature names.
# NOTE(review): restaurants that never appear in rating_dicts are added as
# items here too, which enlarges the item dimension — confirm this is intended.
dataset.fit_partial(
    items=(x["RestaurantUID"] for x in res_features),
    item_features=(x["cat_country_code"] for x in res_features),
)

In [None]:
num_users, num_items = dataset.interactions_shape()
print("Num users: {}, Num restaurants: {}.".format(num_users, num_items))

In [None]:
# Feed (user, item, rating) triples to LightFM; the call returns two matrices
# that are unpacked as `interactions` and `weights`.
# NOTE(review): presumably `weights` carries the rating values while
# `interactions` is binary — confirm against the LightFM Dataset docs.
(interactions, weights) = dataset.build_interactions(
    ((x["hashed_email"], x["RestaurantUID"], x[rating_col]) for x in rating_dicts)
)

print(repr(interactions))

In [None]:
# NOTE: despite its name this is the density of the matrix, i.e. the fraction of
# user-item cells that hold a stored value; the sparsity would be 1 - this value.
sparsity = weights.nnz / (weights.shape[0] * weights.shape[1])

In [None]:
sparsity

#### Build item features

In [None]:
# Each restaurant gets a one-element feature list holding its country code.
# NOTE(review): item_features is built here but is not passed to the model fit
# or evaluation calls visible in this notebook.
item_features = dataset.build_item_features(
    ((x["RestaurantUID"], [x["cat_country_code"]]) for x in res_features)
)

In [None]:
print(repr(item_features))

# Split train test

In [None]:
from lightfm.cross_validation import random_train_test_split

In [None]:
# Hold out 20% of the entries of `weights` for testing; random_state fixes the split.
train, test = random_train_test_split(weights, test_percentage=0.2, random_state=13)

# Fit model

In [None]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score

## WARP

In [None]:
# WARP model fitted on the training matrix, which is also used as sample weights.
model = LightFM(
    loss="warp", no_components=50, learning_rate=0.01, learning_schedule="adagrad"
)
model.fit_partial(train, sample_weight=train, epochs=10, verbose=True)

# Metrics on the training data itself.
train_precision = precision_at_k(model, train, k=10).mean()
train_auc = auc_score(model, train).mean()

# Metrics on the held-out data; the training matrix is passed as
# train_interactions for both test metrics.
test_precision = precision_at_k(
    model, test, train_interactions=train, k=10, check_intersections=True
).mean()
test_auc = auc_score(model, test, train_interactions=train).mean()

print("Precision: train %.2f, test %.2f." % (train_precision, test_precision))
print("AUC: train %.2f, test %.2f." % (train_auc, test_auc))

# Hyper-param tuning

In [None]:
import itertools


def sample_hyperparameters():
    """
    Endlessly generate random LightFM hyperparameter configurations.

    Every yielded dict contains the keyword arguments sampled for one trial
    plus a ``num_epochs`` entry. All values are drawn from NumPy's global
    random state, one draw per key, in the order the keys are listed below.
    """
    schedules = ["adagrad", "adadelta"]
    losses = ["bpr", "warp", "warp-kos"]

    while True:
        # The draws are issued in a fixed order so that a seeded global RNG
        # always produces the same stream of configurations.
        candidate = {}
        candidate["no_components"] = np.random.randint(16, 64)
        candidate["learning_schedule"] = np.random.choice(schedules)
        candidate["loss"] = np.random.choice(losses)
        candidate["learning_rate"] = np.random.exponential(0.05)
        candidate["item_alpha"] = np.random.exponential(1e-8)
        candidate["user_alpha"] = np.random.exponential(1e-8)
        candidate["max_sampled"] = np.random.randint(5, 15)
        candidate["num_epochs"] = np.random.randint(5, 50)
        yield candidate


def random_search(train, test, num_samples=10, num_threads=1):
    """
    Sample random hyperparameters, fit a LightFM model, and evaluate it
    on the test set.

    Parameters
    ----------

    train: np.float32 coo_matrix of shape [n_users, n_items]
        Training data.
    test: np.float32 coo_matrix of shape [n_users, n_items]
        Test data.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.
    num_threads: int, optional
        Number of threads used for fitting and for both evaluation metrics.


    Returns
    -------

    generator of (precision_at_10, auc_score, hyperparameter dict, fitted model)
        One tuple per evaluated configuration. The hyperparameter dict still
        contains the ``num_epochs`` entry that was used for fitting.

    """

    candidates = itertools.islice(sample_hyperparameters(), num_samples)
    for i, hyperparams in enumerate(candidates, start=1):
        print(f"{i} - Evaluating {hyperparams}...")

        # num_epochs is an argument of fit(), not of the LightFM constructor,
        # so it is filtered out of the constructor keyword arguments.
        model_kwargs = {
            name: value for name, value in hyperparams.items() if name != "num_epochs"
        }

        model = LightFM(**model_kwargs)
        model.fit(train, epochs=hyperparams["num_epochs"], num_threads=num_threads)

        # Both metrics receive the training matrix as train_interactions and
        # honour num_threads.
        score_auc = auc_score(
            model, test, train_interactions=train, num_threads=num_threads
        ).mean()
        score_pak = precision_at_k(
            model,
            test,
            k=10,
            train_interactions=train,
            check_intersections=True,
            num_threads=num_threads,
        ).mean()

        yield (score_pak, score_auc, hyperparams, model)

In [None]:
# random_search is a generator; extend() consumes it and appends every
# (precision, auc, hyperparams, model) tuple to the list as it is produced.
rs = random_search(train, test, num_samples=20, num_threads=2)
random_search_results = []
random_search_results.extend(rs)

In [None]:
# Each search result is a (precision_at_k, auc, hyperparams, model) tuple, so the
# raw frame has the integer columns 0..3; column 3 (the fitted models) is dropped.
rs_result_df = pd.DataFrame(random_search_results).drop(columns=[3])
# Expand the hyperparameter dicts of column 2 into one column per hyperparameter.
rs_result_df = pd.concat([rs_result_df, rs_result_df[2].apply(pd.Series)], axis=1)
rs_result_df = rs_result_df.drop(columns=[2])
# Give the two metric columns readable names.
rs_result_df = rs_result_df.rename(columns={0: "precision_at_10", 1: "auc"})

In [None]:
rs_result_df.sort_values(["precision_at_10"], ascending=False)

Hyper-parameter tuning does not help increase the model performance.

# Popular recommender

In [None]:
# Popularity of a restaurant = its number of distinct reservations
# (stored in the rez_id_nunique column).
pop_res = (
    rez_df.groupby("RestaurantUID").agg({"rez_id": "nunique"}).add_suffix("_nunique")
)

In [None]:
# Keep the 10 restaurants with the highest number of distinct reservations.
pop_rec_df = pop_res.sort_values(["rez_id_nunique"], ascending=False).iloc[:10]

In [None]:
pop_rec = list(pop_rec_df.index)

In [None]:
pop_rec

# Persist model

In [None]:
import pickle

OUTPUT_MODEL_DIR = "../models/lfm/"
# Pure-Python equivalent of `mkdir -p`: works on every platform and does not
# pass the path through a shell.
os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)

# Persist the fitted model and the dataset (which holds the id mappings) side by side.
with open(OUTPUT_MODEL_DIR + "lfm_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open(OUTPUT_MODEL_DIR + "lfm_dataset.pkl", "wb") as f:
    pickle.dump(dataset, f)

## Load

In [None]:
# Reload the model written by the persist cell. pickle.load can execute arbitrary
# code contained in the file, so only load artifacts from a trusted source.
with open(OUTPUT_MODEL_DIR + "lfm_model.pkl", "rb") as f:
    model = pickle.load(f)

In [None]:
# Reload the dataset written by the persist cell. pickle.load can execute arbitrary
# code contained in the file, so only load artifacts from a trusted source.
with open(OUTPUT_MODEL_DIR + "lfm_dataset.pkl", "rb") as f:
    dataset = pickle.load(f)

# Predict

## Predict individual

In [None]:
# dataset.mapping() is read once; entries 0 and 2 are used as the user-id and
# item-id dictionaries. Their keys, in dict order, become lookup arrays that
# are indexed by position.
_lfm_mappings = dataset.mapping()
users_map = np.array(list(_lfm_mappings[0].keys()))
items_map = np.array(list(_lfm_mappings[2].keys()))

In [None]:
users_map

In [None]:
def sample_recommendation(model, user_ids, users_map, items_map, k: int):
    """Print the k best-scoring items for each of the given users.

    Parameters
    ----------
    model:
        Object exposing ``predict(user_id, item_ids)`` that returns one score
        per item index.
    user_ids:
        Iterable of positional user indices (positions into ``users_map``).
    users_map:
        Array that maps a positional user index to the printed user label.
    items_map:
        Array that maps a positional item index to the printed item label.
    k:
        Number of items printed per user.
    """
    candidate_idx = np.arange(len(items_map))
    for uid in user_ids:
        item_scores = model.predict(uid, candidate_idx)
        # Negating the scores makes argsort return the best-scoring items first.
        ranked_items = items_map[np.argsort(-item_scores)]
        user = users_map[uid]
        print(f"Recommendations for user {user}:")

        for item in ranked_items[:k]:
            print("   %s" % item)

In [None]:
sample_recommendation(model, [10], users_map, items_map, k=10)

# Package model as a BentoML service

In [None]:
%%writefile pop_rec_artifact.py

import os
from bentoml.utils import cloudpickle
from bentoml.service.artifacts import BentoServiceArtifact


class PopRecArtifact(BentoServiceArtifact):
    """BentoML artifact that holds one Python object and persists it with cloudpickle.

    ``save`` writes the object to ``<directory>/<artifact name>.pkl`` and
    ``load`` reads it back from the same file name.
    """

    def __init__(self, name):
        super().__init__(name)
        # Nothing is held until pack() or load() is called.
        self._model = None

    def _file_path(self, base_path):
        """Return the path of this artifact's pickle file inside ``base_path``."""
        pickle_name = self.name + ".pkl"
        return os.path.join(base_path, pickle_name)

    def pack(self, model, metadata=None):
        """Store ``model`` on the artifact and return the artifact itself.

        ``metadata`` is accepted but not used by this method.
        """
        self._model = model
        return self

    def get(self):
        """Return the packed object (``None`` if nothing has been packed)."""
        return self._model

    def save(self, directory):
        """Serialise the packed object into ``directory`` with cloudpickle."""
        with open(self._file_path(directory), "wb") as out_file:
            cloudpickle.dump(self._model, out_file)

    def load(self, path):
        """Read the pickled object found under ``path``, pack it and return ``self``."""
        with open(self._file_path(path), "rb") as in_file:
            restored = cloudpickle.load(in_file)
        return self.pack(restored)

In [None]:
%%writefile bento_lfm_artifacts.py

import os
from bentoml.utils import cloudpickle
from bentoml.service.artifacts import BentoServiceArtifact


class LightFMModelArtifact(BentoServiceArtifact):
    """BentoML artifact that holds one Python object and persists it with cloudpickle.

    ``save`` writes the object to ``<directory>/<artifact name>.pkl`` and
    ``load`` reads it back from the same file name.
    """

    def __init__(self, name):
        super().__init__(name)
        # Nothing is held until pack() or load() is called.
        self._model = None

    def _file_path(self, base_path):
        """Return the path of this artifact's pickle file inside ``base_path``."""
        pickle_name = self.name + ".pkl"
        return os.path.join(base_path, pickle_name)

    def pack(self, model, metadata=None):
        """Store ``model`` on the artifact and return the artifact itself.

        ``metadata`` is accepted but not used by this method.
        """
        self._model = model
        return self

    def get(self):
        """Return the packed object (``None`` if nothing has been packed)."""
        return self._model

    def save(self, directory):
        """Serialise the packed object into ``directory`` with cloudpickle."""
        with open(self._file_path(directory), "wb") as out_file:
            cloudpickle.dump(self._model, out_file)

    def load(self, path):
        """Read the pickled object found under ``path``, pack it and return ``self``."""
        with open(self._file_path(path), "rb") as in_file:
            restored = cloudpickle.load(in_file)
        return self.pack(restored)


class LightFMDatasetArtifact(LightFMModelArtifact):
    """Artifact type for the dataset object.

    It inherits pack/get/save/load unchanged from ``LightFMModelArtifact``;
    only the constructor is redefined.
    """

    def __init__(self, name):
        super().__init__(name)
        # Start empty, exactly like the parent class does.
        self._model = None

In [None]:
%%writefile bento_lfm_service.py

from bento_lfm_artifacts import LightFMModelArtifact, LightFMDatasetArtifact
from pop_rec_artifact import PopRecArtifact
from bentoml import BentoService, env, api, artifacts
from bentoml.adapters import JsonInput, JsonOutput
from bentoml.exceptions import BadInput

import numpy as np
from string import punctuation


@env(infer_pip_packages=True)
@artifacts(
    [
        LightFMModelArtifact(name="model"),
        LightFMDatasetArtifact(name="dataset"),
        PopRecArtifact(name="pop_rec"),
    ]
)
class LightFMRecService(BentoService):
    """Recommendation service backed by three packed artifacts.

    * ``model``   - object exposing ``predict(user_index, item_indices)``.
    * ``dataset`` - object whose ``mapping()`` result holds the user-id map at
      position 0 and the item-id map at position 2.
    * ``pop_rec`` - sequence of item ids used as fallback for unknown users.
    """

    @api(input=JsonInput(), output=JsonOutput(), batch=True)
    def recommend(self, input_data):
        """Return the top-3 recommendations for a single ``hashed_email``.

        Parameters
        ----------
        input_data:
            Either one dict or a list holding exactly one dict of the form
            ``{"hashed_email": "<id>"}``.

        Returns
        -------
        list
            A one-element list. On success the element is
            ``{"meta": <input dict>, "data": {"recommendations": [...]}}``;
            on rejected input it is ``{"message": "<reason>"}``. The reason is
            a plain string so that the payload is always JSON-serialisable
            (exception instances are not).
        """
        model = self.artifacts.model
        pop_rec = self.artifacts.pop_rec
        dataset = self.artifacts.dataset
        mappings = dataset.mapping()
        users_map = mappings[0]
        items_map = mappings[2]

        if isinstance(input_data, dict):
            input_data = [input_data]
        # Guard against an empty batch before indexing into it.
        if not input_data:
            return [{"message": "input is empty"}]
        # NOTE(review): a batch with several elements is rejected with a single
        # message; confirm whether one result per element is expected instead.
        if len(input_data) > 1:
            return [{"message": "input has too many elements"}]
        input_data = input_data[0]
        if not isinstance(input_data, dict):
            return [{"message": "input type is not allowed"}]

        hashed_email = input_data.get("hashed_email")
        # Non-string values are rejected up front; the punctuation scan below
        # is only defined for strings.
        if not isinstance(hashed_email, str) or any(
            p in hashed_email for p in punctuation
        ):
            return [{"message": f"input {hashed_email} is not accepted"}]

        recommendations = self._recommend(
            model, hashed_email, users_map, items_map, k=3, pop_rec=pop_rec
        )
        result = {
            "meta": input_data,
            "data": {"recommendations": recommendations},
        }
        return [result]

    @staticmethod
    def _recommend(model, user_id, users_map, items_map, k: int, pop_rec):
        """Return up to ``k`` item ids for ``user_id``.

        Users missing from ``users_map`` receive the first ``k`` entries of
        ``pop_rec``. Known users receive the ``k`` items with the highest
        predicted score, as a list of plain Python values.
        """
        internal_user_id = users_map.get(user_id)
        if internal_user_id is None:
            # Unknown user: fall back to the popularity list without building
            # the item array at all.
            return list(pop_rec[:k])

        items_arr = np.array(list(items_map.keys()))
        scores = model.predict(internal_user_id, np.arange(len(items_arr)))
        # Negating the scores makes argsort return the best-scoring items first;
        # tolist() converts numpy scalars into native Python values.
        return items_arr[np.argsort(-scores)][:k].tolist()


In [None]:
from bento_lfm_service import LightFMRecService

lfm_service = LightFMRecService()

# The first argument of pack() is the artifact name declared in the @artifacts
# list of the service class ("model", "dataset", "pop_rec").
lfm_service.pack("model", model)
lfm_service.pack("dataset", dataset)
lfm_service.pack("pop_rec", pop_rec)

In [None]:
lfm_service.recommend({'hashed_email': 'a'})

# Save predict service to disk

In [None]:
saved_path = lfm_service.save(version="v0.1")

# REST API Model Serving

In [None]:
!bentoml serve LightFMRecService:v0.1

# Predict for all users

In [None]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of length ``n``.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    slice_starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in slice_starts)

In [None]:
# Number of users scored per prediction batch.
CHUNK_SIZE = 10000

# Positional indices 0..n-1 for all items and all users known to the dataset.
_item_ids = np.arange(dataset.item_features_shape()[0])
# The chunks are materialised into a list so that the prediction loop can be
# re-run without re-executing this cell (a bare generator would be exhausted
# after the first pass) and so that tqdm can show the total number of batches.
_user_ids = list(chunks(np.arange(dataset.user_features_shape()[0]), CHUNK_SIZE))

In [None]:
def lfm_predict(model, user_ids, item_ids, k: int):
    """Score every (user, item) combination and keep the k best items per user.

    Parameters
    ----------
    model:
        Object exposing ``predict(user_ids, item_ids)`` on two aligned arrays.
    user_ids, item_ids:
        Sequences of integer indices; both are cast to int32 before predicting.
    k:
        Number of rows kept per user.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``item_id`` and ``score``, ordered by descending
        score across all users, with at most ``k`` rows per ``user_id``.
    """
    n_users, n_items = len(user_ids), len(item_ids)
    # Cross product: every user index is repeated once per item, while the
    # item indices are tiled once per user.
    user_ids_ = np.repeat(np.int32(user_ids), n_items)
    item_ids_ = np.int32(np.tile(item_ids, n_users))

    scored_pairs = pd.DataFrame(
        {
            'user_id': user_ids_,
            'item_id': item_ids_,
            'score': model.predict(user_ids_, item_ids_),
        }
    )
    best_first = scored_pairs.sort_values('score', ascending=False)
    return best_first.groupby(['user_id']).head(k)

In [None]:
OUTPUT_PRED_DIR = "../data/output"
# Create the output directory up front; to_parquet does not create missing
# parent directories.
os.makedirs(OUTPUT_PRED_DIR, exist_ok=True)

# One parquet file per batch of users, numbered in processing order.
# NOTE(review): user_id / item_id in the written frames are the positional
# indices produced by np.arange, not hashed_email / RestaurantUID values.
for i, _uids in enumerate(tqdm(_user_ids)):
    uids = list(_uids)
    _output = lfm_predict(model, uids, _item_ids, k=10)
    output_fpath = f'{OUTPUT_PRED_DIR}/output_predictions_{str(i).zfill(4)}.parquet'
    print(f"Saving {output_fpath}")
    _output.to_parquet(output_fpath)