# Matrix Factorization

The first model to compare against the baseline is matrix factorization. Since we need to train on implicit feedback data, we use Alternating Least Squares (ALS) as the MF model, which is simple and truly competitive.

In [20]:
import pickle

import numpy as np
import pandas as pd

from implicit.als import AlternatingLeastSquares
from scipy import sparse
from tqdm.auto import tqdm

## Load data

In [4]:
# Read the train / validation / test interaction splits from parquet.
train, valid, test = (
    pd.read_parquet("./data/train.parquet"),
    pd.read_parquet("./data/valid.parquet"),
    pd.read_parquet("./data/test.parquet"),
)

In [12]:
# Restore the pickled user and item mappers.
# NOTE(review): pickle.load can execute arbitrary code, so these files must
# come from a trusted source.
with open("./data/user_mapper.dict", "rb") as user_fh:
    user_mapper = pickle.load(user_fh)

with open("./data/item_mapper.dict", "rb") as item_fh:
    item_mapper = pickle.load(item_fh)

In [14]:
# Interaction matrices with one row per user and one column per item.
#
# They are allocated as sparse LIL matrices rather than dense ``np.zeros``
# arrays: a dense float64 array needs 8 bytes for every (user, item) pair,
# while only the observed interactions ever become non-zero.  LIL matrices
# accept the same ``mat[user, item] = 1`` assignments used to fill them and
# convert directly with ``sparse.csr_matrix(mat)``.
n_users, n_items = len(user_mapper), len(item_mapper)

train_mat = sparse.lil_matrix((n_users, n_items), dtype=np.float64)
valid_mat = sparse.lil_matrix((n_users, n_items), dtype=np.float64)
test_mat = sparse.lil_matrix((n_users, n_items), dtype=np.float64)

In [18]:
def _mark_interactions(mat, interactions):
    """Set ``mat[user, item] = 1`` for every row of ``interactions``.

    The first column of ``interactions`` is used as the row (user) index and
    the second as the column (item) index.  Any further column (the rating)
    is ignored, so every observed interaction is stored as a binary 1.

    The assignment is done in one vectorized indexing operation instead of a
    Python-level loop over every row.
    """
    if len(interactions) == 0:
        return
    rows = interactions.iloc[:, 0].to_numpy(dtype=np.intp)
    cols = interactions.iloc[:, 1].to_numpy(dtype=np.intp)
    mat[rows, cols] = 1


_mark_interactions(train_mat, train)
_mark_interactions(valid_mat, valid)
_mark_interactions(test_mat, test)

  0%|          | 0/1022381 [00:00<?, ?it/s]

  0%|          | 0/220993 [00:00<?, ?it/s]

  0%|          | 0/222265 [00:00<?, ?it/s]

In [22]:
# Convert each interaction matrix to CSR format.
train_csr, valid_csr, test_csr = (
    sparse.csr_matrix(split_mat)
    for split_mat in (train_mat, valid_mat, test_mat)
)

## Modeling

In [23]:
# Weight applied to every observed interaction before fitting.
alpha = 40.

# The interaction matrices are binary (every stored entry was set to 1), so
# scaling the element-wise sign gives the same result as ``train_csr * alpha``
# on the first run.  Unlike the plain self-multiplication, re-running this
# cell does not compound the weight (alpha, alpha**2, ...).
train_csr = train_csr.sign() * alpha
valid_csr = valid_csr.sign() * alpha

In [30]:
# ALS hyper-parameters, collected in one place so they are easy to tune.
# NOTE(review): iterations=1 looks like a quick smoke-test setting - confirm
# before using the resulting model for comparisons.
als_params = {
    "factors": 64,
    "regularization": 0.01,
    "iterations": 1,
    "random_state": 0,
}

model = AlternatingLeastSquares(**als_params)

In [31]:
# Fit the ALS model on the alpha-weighted training interactions.
model.fit(train_csr)

  0%|          | 0/1 [00:00<?, ?it/s]

In [35]:
# Example query: the 32 best-scoring items for user 0, excluding items the
# user already interacted with in the training matrix.
query_user = 0
model.recommend(
    query_user,
    train_csr[query_user],
    N=32,
    filter_already_liked_items=True,
)

(array([33277, 46691, 12087, 33035, 15644, 22092, 36867, 40038, 25845,
        40202, 44728, 48637,  7141, 29597, 55921, 21488, 16609, 22056,
         3583, 23543, 29736, 15391,  3133, 21049, 25645, 45841, 37218,
        20363, 46455, 44057, 41039, 43984], dtype=int32),
 array([1.3365017, 1.2689261, 1.2597394, 1.2145134, 1.2069377, 1.1848722,
        1.1714611, 1.1572101, 1.1547256, 1.148337 , 1.1452827, 1.121942 ,
        1.1141772, 1.1061432, 1.0975329, 1.0939169, 1.0913292, 1.0862386,
        1.0818   , 1.0556585, 1.053215 , 1.0524387, 1.0522645, 1.0451792,
        1.0431015, 1.0397195, 1.0386037, 1.0368847, 1.0258074, 1.0253582,
        1.024171 , 1.0189586], dtype=float32))

In [29]:
# Display the number of ALS iterations the model is configured with.
model.iterations

1

In [28]:
# NOTE(review): this repeats the earlier `model.fit(train_csr)` cell. Its
# execution count (28) is lower than that of the model-construction cell (30),
# so it looks like a leftover from an earlier run - confirm whether it can be
# removed so the notebook fits the model only once on a full re-run.
model.fit(train_csr)

  0%|          | 0/1 [00:00<?, ?it/s]