# BEMB on the Movie Lens Dataset
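**Note**: since the creators of the MovieLens dataset do not permit public redistribution, you need to download it manually from [GroupLens](https://grouplens.org/datasets/movielens/). This notebook reads the small `ml-latest-small` release (see `DATA_PATH` below); the larger 25M release is available [here](https://grouplens.org/datasets/movielens/25m/).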

In [38]:
import os
import numpy as np
import pandas as pd
import torch

from sklearn.preprocessing import LabelEncoder

from torch_choice.data import ChoiceDataset
from bemb.model import LitBEMBFlex

## Load the Movie Lens Dataset

Please set `DATA_PATH` to the directory where you placed the downloaded dataset.

In [3]:
# Directory holding the extracted MovieLens files; `ratings.csv` is read from here.
DATA_PATH = './ml-latest-small/'

In [16]:
# Read the raw ratings table from the downloaded dataset directory.
ratings_csv = os.path.join(DATA_PATH, 'ratings.csv')
df = pd.read_csv(ratings_csv)

In [17]:
# Preview the ratings table (one row per user/movie rating with its timestamp).
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [18]:
# Fit one encoder per ID column so raw user / movie IDs can be mapped to integer codes.
user_encoder = LabelEncoder()
user_encoder.fit(df['userId'].values)

item_encoder = LabelEncoder()
item_encoder.fit(df['movieId'].values)

In [27]:
# Each encoder learned one class per distinct ID, so the class count is the
# number of distinct users / movies in the ratings table.
num_users, num_items = (
    len(encoder.classes_) for encoder in (user_encoder, item_encoder)
)

print(f'{num_users=:}', f'{num_items=:}', sep='\n')

num_users=610
num_items=9724


In [28]:
# Encode the raw IDs of every rating row, then wrap the integer codes as long tensors.
user_codes = user_encoder.transform(df['userId'].values)
item_codes = item_encoder.transform(df['movieId'].values)

user_index = torch.LongTensor(user_codes)
item_index = torch.LongTensor(item_codes)

In [29]:
# Median rating; the same value (3.5) is used as the cut-off for the binary label below.
df['rating'].median()

3.5

In [30]:
# Binary label: 1 when the rating is strictly above the cut-off, 0 otherwise.
# 3.5 is the median rating of this dataset (computed in the previous cell).
RATING_THRESHOLD = 3.5

# Convert the boolean column to an int64 NumPy array explicitly rather than
# handing a pandas Series to the tensor constructor; this matches how
# user_index / item_index are built from NumPy arrays.
is_high_rating = (df['rating'] > RATING_THRESHOLD).to_numpy()
label = torch.LongTensor(is_high_rating.astype(np.int64))

In [31]:
# Bundle the per-rating tensors into one dataset; no session_index is supplied here.
dataset = ChoiceDataset(
    user_index=user_index,
    item_index=item_index,
    label=label,
)

No `session_index` is provided, assume each choice instance is in its own session.


## Split Train/Validation/Test Datasets

In [39]:
# Shuffle the row indices once, then carve out 80% / 10% / remainder for
# train / validation / test.
# A seeded generator keeps the split identical across "Restart & Run All".
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)
idx = rng.permutation(len(dataset))

train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
train_idx = idx[:train_size]
val_idx = idx[train_size: train_size + val_size]
# The test split takes whatever is left, so rounding never drops a row.
test_idx = idx[train_size + val_size:]

# Order of the list is [train, validation, test].
dataset_list = [dataset[train_idx], dataset[val_idx], dataset[test_idx]]
print(dataset_list)

[ChoiceDataset(label=[80668], item_index=[80668], user_index=[80668], session_index=[80668], item_availability=[], device=cpu), ChoiceDataset(label=[10083], item_index=[10083], user_index=[10083], session_index=[10083], item_availability=[], device=cpu), ChoiceDataset(label=[10085], item_index=[10085], user_index=[10085], session_index=[10085], item_availability=[], device=cpu)]


In [50]:
LATENT_DIM = 5  # shared dimension of the theta_user and alpha_item latents.

# obs2prior is switched off for every coefficient.
OBS2PRIOR = {'lambda_item': False, 'theta_user': False, 'alpha_item': False}

# theta_user and alpha_item are multiplied together in the utility (an inner
# product), so they must share the same dimension; lambda_item has dimension 1.
COEF_DIMS = {'lambda_item': 1, 'theta_user': LATENT_DIM, 'alpha_item': LATENT_DIM}

bemb = LitBEMBFlex(
    learning_rate=0.01,  # feel free to experiment with other learning rates.
    pred_item=False,  # NOTE(review): presumably predicts the binary label rather than the item - confirm against the bemb docs.
    num_seeds=4,  # number of Monte Carlo samples for estimating the ELBO.
    utility_formula='lambda_item + theta_user * alpha_item',  # the utility formula.
    num_users=num_users,
    num_items=num_items,
    # num_user_obs / num_item_obs are not passed: the dataset above was built
    # without user or item observables.
    obs2prior_dict=OBS2PRIOR,
    coef_dim_dict=COEF_DIMS,
)

BEMB: utility formula parsed:
[{'coefficient': ['lambda_item'], 'observable': None},
 {'coefficient': ['theta_user', 'alpha_item'], 'observable': None}]


In [51]:
# Train for 50 epochs with a batch size of 1/20 of the full dataset length;
# the value returned by fit_model replaces `bemb`.
bemb = bemb.fit_model(
    dataset_list,
    batch_size=len(dataset) // 20,
    num_epochs=50,
    num_workers=0,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type     | Params
-----------------------------------
0 | model | BEMBFlex | 122 K 
-----------------------------------
122 K     Trainable params
0         Non-trainable params
122 K     Total params
0.491     Total estimated model params size (MB)


Bayesian EMBedding Model with U[user, item, session] = lambda_item + theta_user * alpha_item
Total number of parameters: 122788.
With the following coefficients:
ModuleDict(
  (lambda_item): BayesianCoefficient(num_classes=9724, dimension=1, prior=N(0, I))
  (theta_user): BayesianCoefficient(num_classes=610, dimension=5, prior=N(0, I))
  (alpha_item): BayesianCoefficient(num_classes=9724, dimension=5, prior=N(0, I))
)
[]
[Training dataset] ChoiceDataset(label=[80668], item_index=[80668], user_index=[80668], session_index=[80668], item_availability=[], device=cpu)
[Validation dataset] ChoiceDataset(label=[10083], item_index=[10083], user_index=[10083], session_index=[10083], item_availability=[], device=cpu)
[Testing dataset] ChoiceDataset(label=[10085], item_index=[10085], user_index=[10085], session_index=[10085], item_availability=[], device=cpu)
                                                                           

  rank_zero_warn(
  rank_zero_warn(


Epoch 49: 100%|██████████| 20/20 [00:00<00:00, 40.13it/s, loss=6.25e+04, v_num=5, val_acc=0.652, val_ll=-.657]
time taken: 25.43624210357666


  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 21/21 [00:00<00:00, 158.58it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.6351016360932077
         test_ll            -0.6604216664423728
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
