# Matrix Factorization

The first model to compare against the baseline is matrix factorization. Because the training data is implicit feedback, we use Alternating Least Squares (ALS) as the MF model, which is simple and truly competitive.

In [218]:
import os
import pickle

import numpy as np
import pandas as pd
from implicit.als import AlternatingLeastSquares
from scipy import sparse
from tqdm.auto import tqdm

from utils.metrics import *

## Load data

In [69]:
# Read the three interaction splits in one pass; the popularity table is
# loaded separately because it feeds the cold-user fallback, not the model.
train, valid, test = (
    pd.read_parquet(split_path)
    for split_path in (
        "./data/train.parquet",
        "./data/valid.parquet",
        "./data/test.parquet",
    )
)
most_common_beers = pd.read_parquet("./data/most_common_beers.parquet")

In [3]:
# Distinct user ids present in each split (order of first appearance).
train_users, valid_users, test_users = (
    split["review_profilename"].unique() for split in (train, valid, test)
)

In [178]:
# First ten rows of the popularity table (presumably sorted most-common first),
# used as the fallback recommendation for cold users.
# NOTE(review): these are "beer_name" values, while the ALS model recommends
# item indices — confirm both live in the same id space before mixing them.
baseline_topk = most_common_beers["beer_name"].iloc[:10].tolist()

In [4]:
# Load the pickled user/item mappers; only their lengths are used below to
# size the interaction matrices.
# NOTE: pickle.load executes arbitrary code — only load files you produced.
with open("./data/user_mapper.dict", "rb") as user_fh:
    user_mapper = pickle.load(user_fh)
with open("./data/item_mapper.dict", "rb") as item_fh:
    item_mapper = pickle.load(item_fh)

In [5]:
# One dense (users x items) float64 matrix per split, all zero-initialised.
# NOTE(review): dense storage grows as n_users * n_items * 8 bytes per matrix;
# consider building sparse matrices directly if memory becomes a problem.
n_users, n_items = len(user_mapper), len(item_mapper)
train_mat, valid_mat, test_mat = (np.zeros((n_users, n_items)) for _ in range(3))

In [6]:
def fill_interactions(mat: np.ndarray, interactions: pd.DataFrame) -> None:
    """Mark every (user, item) pair of ``interactions`` with 1 in ``mat``.

    The first two columns of ``interactions`` are read positionally as the
    row (user) and column (item) indices, mirroring the column order the
    original per-row unpacking relied on. ``mat`` is modified in place.

    A single NumPy integer-array assignment replaces the Python-level loop
    over ``itertuples()``, which matters for frames with ~1M rows.
    """
    user_idx = interactions.iloc[:, 0].to_numpy(dtype=np.intp)
    item_idx = interactions.iloc[:, 1].to_numpy(dtype=np.intp)
    mat[user_idx, item_idx] = 1


# Implicit feedback: any recorded interaction becomes a 1, the rating value
# itself is ignored.
fill_interactions(train_mat, train)
fill_interactions(valid_mat, valid)
fill_interactions(test_mat, test)

  0%|          | 0/1022381 [00:00<?, ?it/s]

  0%|          | 0/220993 [00:00<?, ?it/s]

  0%|          | 0/222265 [00:00<?, ?it/s]

## Modeling

In [None]:
def predict(
    user_ids: np.ndarray,
    train_mat: sparse.csr_matrix,
    model: AlternatingLeastSquares,
    popular_items: list,
    top_k: int,
) -> np.ndarray:
    """Recommend ``top_k`` items per user, falling back to popular items.

    Args:
        user_ids: Row indices (into ``train_mat``) of the users to score.
        train_mat: User-by-item interaction matrix; the rows for ``user_ids``
            are passed to the model so already-liked items are filtered out.
        model: Fitted model exposing ``recommend(userids, user_items, N=...,
            filter_already_liked_items=...)`` returning ``(ids, scores)``.
        popular_items: Fallback recommendations for cold users. Only the
            first ``top_k`` entries are used.
        top_k: Number of recommendations per user.

    Returns:
        Array with one row of recommended items per entry of ``user_ids``.
    """
    # Make recommendations based on the model
    rec_ids, rec_scores = model.recommend(
        user_ids, train_mat[user_ids], N=top_k, filter_already_liked_items=True
    )

    # Truncate the fallback so every row has the same length as the model's
    # output; otherwise a longer popularity list yields a ragged array.
    fallback = list(popular_items)[:top_k]

    # A user whose scores are all exactly zero is treated as cold and gets
    # the popular items instead of the model's output.
    rec_items = np.array(
        [
            fallback if np.all(scores == 0) else items
            for items, scores in zip(rec_ids, rec_scores)
        ]
    )

    return rec_items

In [None]:
# Compress each dense interaction matrix into CSR form for the model.
train_csr, valid_csr, test_csr = map(
    sparse.csr_matrix, (train_mat, valid_mat, test_mat)
)

In [209]:
# ALS hyper-parameters: latent dimension and regularisation strength.
factors, regularization = 128, 0.01

In [210]:
# Collect the constructor arguments first so the full configuration is
# visible in one place; random_state is fixed for reproducible factors.
als_params = {
    "factors": factors,
    "regularization": regularization,
    "iterations": 200,
    "random_state": 22,
}
model = AlternatingLeastSquares(**als_params)

In [211]:
# Train ALS on the training interactions (rows = users, columns = items).
# NOTE(review): the expected orientation of this matrix differs between
# `implicit` releases — confirm the installed version takes user-by-item.
model.fit(train_csr)

  0%|          | 0/200 [00:00<?, ?it/s]

In [234]:
# Inspect the user ids that will be scored on the test split.
test_users

array([31260,  4865, 23128, ..., 21243, 22791, 17589])

In [212]:
# Top-10 recommendations for every test user; users the model cannot score
# receive the popularity baseline instead.
pred = predict(
    model=model,
    user_ids=test_users,
    train_mat=train_csr,
    top_k=10,
    popular_items=baseline_topk,
)

In [213]:
# Score the ALS recommendations against the held-out test interactions.
map_at_10 = map_at_k(actual=test_mat[test_users], pred=pred, top_k=10)

# The label names the model being evaluated here (ALS), not the popularity
# baseline it is being compared with.
print(f"MAP@10 for ALS: {map_at_10:.6f}")

  0%|          | 0/9085 [00:00<?, ?it/s]

MAP@10 for baseline: 0.053626


## Prepare the hyperparameter tuning job

In [312]:
# GCP region; reused below for the bucket location and the API host name.
REGION = "us-east1"

# IPython shell capture: `!(...)` yields the command's output lines as a
# list, so the project id is taken from the first line.
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

# Per-project bucket that holds both the uploaded data and the job outputs.
ARTIFACT_STORE = f"gs://{PROJECT_ID}-beer-artifact-store"

# Sub-paths inside the bucket and the regional aiplatform API host.
DATA_ROOT = f"{ARTIFACT_STORE}/data"
JOB_DIR_ROOT = f"{ARTIFACT_STORE}/jobs"
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

In [313]:
# Export the job settings so shell cells and subprocesses can read them.
os.environ.update(
    {
        "JOB_DIR_ROOT": JOB_DIR_ROOT,
        "PROJECT_ID": PROJECT_ID,
        "REGION": REGION,
    }
)

In [316]:
# Create the artifact bucket in REGION only if `gsutil ls` does not already
# list it: when grep finds no match, the `||` branch runs `gsutil mb`.
!gsutil ls | grep ^{ARTIFACT_STORE}/$ || gsutil mb -l {REGION} {ARTIFACT_STORE}

Creating gs://qwiklabs-asl-04-5e165f533cac-beer-artifact-store/...


In [317]:
# Copy every file under ./data to the bucket's data folder; `-m` runs the
# copies in parallel.
!gsutil -m cp ./data/* {DATA_ROOT}

Copying file://./data/item_mapper.dict [Content-Type=application/octet-stream]...
Copying file://./data/item_mapper.parquet [Content-Type=application/octet-stream]...
Copying file://./data/most_common_beers.parquet [Content-Type=application/octet-stream]...
Copying file://./data/test.parquet [Content-Type=application/octet-stream]...   
Copying file://./data/train.parquet [Content-Type=application/octet-stream]...  
Copying file://./data/user_mapper.dict [Content-Type=application/octet-stream]...
Copying file://./data/user_mapper.parquet [Content-Type=application/octet-stream]...
Copying file://./data/valid.parquet [Content-Type=application/octet-stream]...  
- [8/8 files][ 16.7 MiB/ 16.7 MiB] 100% Done                                    
Operation completed over 8 objects/16.7 MiB.                                     
