# Matrix Factorization

The first model to compare against the baseline is matrix factorization. Since we need to train on implicit feedback data, we use Alternating Least Squares (ALS) as the MF model, which is simple and truly competitive.

In [218]:
import os
import pickle

import numpy as np
import pandas as pd
from implicit.als import AlternatingLeastSquares
from scipy import sparse
from tqdm.auto import tqdm

from utils.metrics import *

## Load data

In [69]:
# Load the train/validation/test splits and the most-common-beers table.
train, valid, test, most_common_beers = map(
    pd.read_parquet,
    (
        "./data/train.parquet",
        "./data/valid.parquet",
        "./data/test.parquet",
        "./data/most_common_beers.parquet",
    ),
)

In [3]:
# Distinct values of the "review_profilename" column in each split.
train_users, valid_users, test_users = (
    split["review_profilename"].unique() for split in (train, valid, test)
)

In [178]:
# "beer_name" values of the first ten rows of the most-common-beers table.
baseline_topk = most_common_beers["beer_name"].iloc[:10].tolist()

In [4]:
# Restore the pickled user and item mappers from the data directory.
with open("./data/user_mapper.dict", "rb") as user_file, open(
    "./data/item_mapper.dict", "rb"
) as item_file:
    user_mapper = pickle.load(user_file)
    item_mapper = pickle.load(item_file)

In [5]:
# One dense all-zero matrix per split, shaped (len(user_mapper), len(item_mapper)).
train_mat, valid_mat, test_mat = (
    np.zeros((len(user_mapper), len(item_mapper))) for _ in range(3)
)

In [6]:
# Mark every observed (user, item) pair with 1; the rating value is ignored.
# The first two columns of each frame hold the row and column indices, so a
# single fancy-indexed assignment per split replaces the row-by-row Python
# loop and produces the same matrices.
train_mat[train.iloc[:, 0].to_numpy(), train.iloc[:, 1].to_numpy()] = 1

valid_mat[valid.iloc[:, 0].to_numpy(), valid.iloc[:, 1].to_numpy()] = 1

test_mat[test.iloc[:, 0].to_numpy(), test.iloc[:, 1].to_numpy()] = 1

  0%|          | 0/1022381 [00:00<?, ?it/s]

  0%|          | 0/220993 [00:00<?, ?it/s]

  0%|          | 0/222265 [00:00<?, ?it/s]

## Modeling

In [None]:
def predict(
    user_ids: np.ndarray,
    train_mat: sparse.csr_matrix,
    model: AlternatingLeastSquares,
    popular_items: list,
    top_k: int,
) -> np.ndarray:
    """Recommend ``top_k`` items per user, falling back to popular items.

    Parameters
    ----------
    user_ids : np.ndarray
        Row indices of the users to recommend for.
    train_mat : sparse.csr_matrix
        Training interactions; the rows selected by ``user_ids`` are handed
        to ``model.recommend``.
    model : AlternatingLeastSquares
        Fitted model whose ``recommend`` returns an ``(items, scores)`` pair
        with one row per requested user.
    popular_items : list
        Fallback recommendations for users whose scores are all zero. Only
        the first ``top_k`` entries are used.
    top_k : int
        Number of recommendations per user.

    Returns
    -------
    np.ndarray
        One row of recommended items per user.
    """
    # Make recommendations based on the model
    rec_ids, rec_scores = model.recommend(
        user_ids, train_mat[user_ids], N=top_k, filter_already_liked_items=True
    )

    # Truncate the fallback so every row has the same length as the model
    # output; otherwise the rows are ragged whenever a cold user is present
    # and len(popular_items) != top_k.
    # NOTE(review): fallback entries must live in the same id space as the
    # model's item indices for downstream metrics to match them -- confirm.
    fallback = list(popular_items)[:top_k]

    # Substitutes for cold users with the most popular items
    rec_items = np.array(
        [
            fallback if np.all(scores == 0) else items
            for items, scores in zip(rec_ids, rec_scores)
        ]
    )

    return rec_items

In [None]:
# CSR copies of the dense interaction matrices.
train_csr, valid_csr, test_csr = [
    sparse.csr_matrix(dense) for dense in (train_mat, valid_mat, test_mat)
]

In [209]:
# Hyper-parameters handed to the ALS model: number of factors and regularization.
factors, regularization = 128, 0.01

In [210]:
# ALS configuration: tunable hyper-parameters plus a fixed iteration count
# and a pinned random_state.
als_settings = {
    "factors": factors,
    "regularization": regularization,
    "iterations": 200,
    "random_state": 22,
}
model = AlternatingLeastSquares(**als_settings)

In [211]:
# Fit the ALS model on the CSR training interactions.
model.fit(train_csr)

  0%|          | 0/200 [00:00<?, ?it/s]

In [212]:
# Ten recommendations for every user id found in the test split.
pred = predict(
    model=model,
    train_mat=train_csr,
    user_ids=test_users,
    top_k=10,
    popular_items=baseline_topk,
)

In [213]:
# Score the recommendations against the test interactions of the same users.
map_at_10 = map_at_k(actual=test_mat[test_users], pred=pred, top_k=10)

# The value comes from the ALS model fitted in this notebook, so label it as
# such rather than as the popularity baseline.
print(f"MAP@10 for ALS: {map_at_10:.6f}")

  0%|          | 0/9085 [00:00<?, ?it/s]

MAP@10 for baseline: 0.053626


## Write scripts

### Prepare the applications

In [219]:
# Directory that receives the generated training script; created if missing.
TRAINING_APP_DIR = "training_app"
os.makedirs(name=TRAINING_APP_DIR, exist_ok=True)

In [None]:
%%writefile {TRAINING_APP_DIR}/train.py

import fire
import hypertune
from implicit.als import AlternatingLeastSquares
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix

# Load matrices and files
# NOTE(review): the parquet and pickle files are read from "./data" while the
# .npy matrices come from "../data/matrices" -- confirm both resolve from the
# directory this script is launched in.
most_common_beers = pd.read_parquet("./data/most_common_beers.parquet")
train_mat = np.load("../data/matrices/train_mat.npy")
valid_mat = np.load("../data/matrices/valid_mat.npy")

# NOTE: pickle.load can execute arbitrary code; only load trusted mapper files.
with open("./data/user_mapper.dict", "rb") as f:
    user_mapper = pickle.load(f)
with open("./data/item_mapper.dict", "rb") as f:
    item_mapper = pickle.load(f)

# Preprocess
baseline_topk = most_common_beers[:10]["beer_name"].tolist()
# Row indices of users with at least one validation interaction.
# np.where(cond) with a single argument returns a *tuple* of arrays, which is
# not a valid list of user ids; flatnonzero yields the 1-D index array.
valid_users = np.flatnonzero(valid_mat.sum(axis=1) > 0)

train_csr = csr_matrix(train_mat)
valid_csr = csr_matrix(valid_mat)

def _topk(arr: np.ndarray, k: int) -> np.ndarray:
    r"""Returns indices of k largest element of the given input matrix along
    the horizontal axis.
    Parameters
    ----------
    input : np.ndarray
        _description_
    k : int
        _description_
    Returns
    -------
    np.ndarray
        _description_
    """
    return np.argsort(arr)[:, -k:][:, ::-1]

def map_at_k(actual: np.ndarray, pred: np.ndarray, top_k: int, is_score=False) -> float:
    r"""Mean average precision at k.

    Parameters
    ----------
    actual : np.ndarray
        A matrix with actual values, one row per user; the non-zero columns
        of a row are that user's relevant items.
    pred : np.ndarray
        A matrix with predictions, one row per user. Holds ranked item
        indices when ``is_score`` is False, or one score per item when
        ``is_score`` is True.
    top_k : int
        Number of leading predictions evaluated per user.
    is_score : bool, optional
        When True, ``pred`` is a score matrix of the same shape as ``actual``
        and the ``top_k`` highest-scoring columns are used as predictions.

    Returns
    -------
    float
        Mean average precision at k; ``0.0`` when there are no users.

    Raises
    ------
    AssertionError
        If the inputs do not have matching shape (``is_score=True``) or
        matching length (``is_score=False``).
    """
    if is_score:
        if not _assert_same_dimension(actual, pred):
            raise AssertionError("Two input matrices should have same dimension.")
    elif len(actual) != len(pred):
        raise AssertionError("Two input matrices should have same length.")

    num_users = len(pred)
    # Nothing to average over; avoids a ZeroDivisionError on empty input.
    if num_users == 0:
        return 0.0

    if is_score:
        top_k_items = _topk(arr=pred, k=top_k)
    else:
        top_k_items = pred[:, :top_k]

    total = sum(
        _ap_at_k(actual=set(actual_row.nonzero()[0]), pred=pred_row, top_k=top_k)
        for actual_row, pred_row in zip(actual, top_k_items)
    )

    return total / num_users


def _ap_at_k(actual: np.array, pred: np.array, top_k: int) -> float:
    r"""Avearge precision at k
    Parameters
    ----------
    actual : np.array
        A list of item are to be predicted
    pred : np.array
        A list of predicted items
    top_k : int
    Returns
    -------
    float
        Average precision at k
    """

    if len(pred) > top_k:
        pred = pred[:top_k]

    p, cnt = 0, 0

    if not actual:
        return 0.0

    for idx, item in enumerate(pred):
        if item in actual:
            cnt += 1
            p += cnt / (idx + 1)

    return 0.0 if cnt == 0 else p / min(cnt, len(actual))

def _assert_same_dimension(actual: np.ndarray, pred: np.ndarray) -> bool:
    r"""Check the actual matrix and the prediction have same dimension.
    Parameters
    ----------
    actual : np.ndarray
        Actual values
    pred : np.ndarray
        Predicted values
    Returns
    -------
    bool
    """
    return actual.shape == pred.shape

def train(job_dir: str, factors: int, regularization: float, iterations:int, is_tune: bool) -> None:
    """Fit an ALS model and, when tuning, report validation MAP@10.

    Parameters
    ----------
    job_dir : str
        Output location for the job. Currently not used by this function.
        TODO(review): persist the fitted model here if that is the intent.
    factors : int
        Number of latent factors passed to the ALS model.
    regularization : float
        Regularization strength passed to the ALS model.
    iterations : int
        Number of ALS iterations.
    is_tune : bool
        When True, evaluate on the validation users and report the metric to
        the hyperparameter tuning service.
    """
    model = AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        random_state=0,
    )
    model.fit(train_csr)

    if is_tune:
        # Accept both a plain index array and the one-element tuple returned
        # by a single-argument np.where.
        user_ids = np.asarray(valid_users).ravel()

        pred = predict(
            user_ids=user_ids,
            train_mat=train_csr,
            model=model,
            popular_items=baseline_topk,
            top_k=10,
        )

        # Without this the validation predictions would be computed and then
        # discarded, so the tuning service would never receive a metric.
        map_at_10 = map_at_k(actual=valid_mat[user_ids], pred=pred, top_k=10)

        # NOTE(review): the tag must match the metric tag configured for the
        # tuning job -- confirm "map_at_10" against that configuration.
        hpt = hypertune.HyperTune()
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag="map_at_10",
            metric_value=map_at_10,
            global_step=iterations,
        )
    

def predict(
    user_ids: np.ndarray,
    train_mat: csr_matrix,
    model: AlternatingLeastSquares,
    popular_items: list,
    top_k: int,
) -> np.ndarray:
    """Recommend ``top_k`` items per user, falling back to popular items.

    Parameters
    ----------
    user_ids : np.ndarray
        Row indices of the users to recommend for.
    train_mat : csr_matrix
        Training interactions; the rows selected by ``user_ids`` are handed
        to ``model.recommend``.
    model : AlternatingLeastSquares
        Fitted model whose ``recommend`` returns an ``(items, scores)`` pair
        with one row per requested user.
    popular_items : list
        Fallback recommendations for users whose scores are all zero. Only
        the first ``top_k`` entries are used.
    top_k : int
        Number of recommendations per user.

    Returns
    -------
    np.ndarray
        One row of recommended items per user.
    """
    # Make recommendations based on the model
    rec_ids, rec_scores = model.recommend(
        user_ids, train_mat[user_ids], N=top_k, filter_already_liked_items=True
    )

    # Truncate the fallback so every row has the same length as the model
    # output; otherwise the rows are ragged whenever a cold user is present
    # and len(popular_items) != top_k.
    # NOTE(review): fallback entries must live in the same id space as the
    # model's item indices for downstream metrics to match them -- confirm.
    fallback = list(popular_items)[:top_k]

    # Substitutes for cold users with the most popular items
    rec_items = np.array(
        [
            fallback if np.all(scores == 0) else items
            for items, scores in zip(rec_ids, rec_scores)
        ]
    )

    return rec_items

