# Collaborative filtering

An attempt to match fastai's performance on [MovieLens 100K](https://grouplens.org/datasets/movielens/100k/), but with pure pytorch, not using fastai's library.

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
import fastai
import fastai.collab
import fastai.datasets
import fastai.tabular.transform
import math
import numpy
import os
import pandas
import time
import torch
from torch import nn
import typing
import matplotlib.pyplot as plt

In [2]:
dev = torch.device("cpu") # Seems to be much faster than GPU for this application

## Prepare data ##

In [3]:
import zipfile
zip_path = fastai.datasets.download_data("http://files.grouplens.org/datasets/movielens/ml-100k.zip", ext="")
dest_dir = zip_path.parent
data_dir = os.path.splitext(zip_path)[0]
if not os.path.exists(data_dir):
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(dest_dir)

In [4]:
col_names = ("user", "item", "rating", "timestamp")
train_df = pandas.read_csv(os.path.join(data_dir, "ua.base"), sep="\t", names=col_names)
test_df = pandas.read_csv(os.path.join(data_dir, "ua.test"), sep="\t", names=col_names)
concat_df = pandas.concat((train_df, test_df))
n_item = concat_df["item"].max()
n_user = concat_df["user"].max()
print(f"n_item: {n_item}, n_user: {n_user}")

class MovieLensDataset(torch.utils.data.Dataset):
    def __init__(self, df: pandas.DataFrame, device: torch.device):
        # Indices into embeddings need to have dtype "long".
        self.ids_tensor = torch.tensor(df[["user", "item"]].to_numpy(), dtype=torch.long, device=device)
        self.ratings_tensor = torch.tensor(df[["rating"]].to_numpy(), dtype=torch.float, device=device)
        
    def __len__(self):
        return len(self.ids_tensor)

    def __getitem__(self, idx):
        return self.ids_tensor[idx], self.ratings_tensor[idx]
        

train_dataset = MovieLensDataset(train_df, dev)
test_dataset = MovieLensDataset(test_df, dev)

n_item: 1682, n_user: 943


In [5]:
batch_size = 64
num_epochs = 10

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
test_inputs, test_labels = test_dataset[:]

## fastai benchmark
fastai claims state of the art performance, so let's start with that and see how it does.
This is based on <https://github.com/fastai/course-v3/blob/master/nbs/dl1/lesson4-collab.ipynb>.

In [6]:
# Hacked up copy of CollabDataBunch.from_df because I want to use test_df as the validation set.
user_name   = concat_df.columns[0]
item_name   = concat_df.columns[1]
rating_name = concat_df.columns[2]
cat_names = [user_name,item_name]
num_train = len(train_df)
src = (fastai.collab.CollabList.from_df(concat_df, cat_names=cat_names, procs=fastai.tabular.transform.Categorify)
        .split_by_idxs(train_idx=numpy.arange(num_train), valid_idx=numpy.arange(num_train, num_train + len(test_df)))
        .label_from_df(cols=rating_name))
data_bunch = src.databunch(path=".", bs=batch_size, val_bs=batch_size, device=dev)
assert len(data_bunch.dl(fastai.basic_data.DatasetType.Train).dl.dataset.x) == num_train
data_bunch.show_batch()

user,item,target
592,405,4.0
659,319,3.0
561,510,3.0
519,350,5.0
194,89,3.0


In [22]:
fastai_learn = fastai.collab.collab_learner(data_bunch, n_factors=40, y_range=[0,5.5], wd=1e-1)
fastai_learn.fit_one_cycle(num_epochs, 1e-2)

epoch,train_loss,valid_loss,time
0,0.930389,1.043625,00:08
1,0.881159,0.995233,00:08
2,0.870824,0.959754,00:08
3,0.841655,0.946414,00:08
4,0.787724,0.941334,00:08
5,0.717227,0.917258,00:08
6,0.632812,0.90219,00:08
7,0.522469,0.895359,00:08
8,0.416748,0.898977,00:08
9,0.333631,0.900505,00:08


In [23]:
fastai_pred = fastai_learn.get_preds(ds_type=fastai.data_block.DatasetType.Valid)
torch.nn.functional.mse_loss(*fastai_pred).item()

## My own implementation

In [28]:
class Fitter:
    def __init__(self, model: nn.Module, loss_func: nn.Module, optim: torch.optim.Optimizer):
        self.model = model
        self.loss_func = loss_func
        self.optim = optim
        self.losses = []

    def fit(self, num_epochs: int):
        print("epoch | train_loss | test_loss | time")
        for epoch in range(num_epochs):
            start = time.time()
            train_loss = torch.tensor([0.0], dtype=float, device=dev)
            self.model.train()
            for batch_idx, (inputs, targets) in enumerate(train_loader, 0):
                train_loss += self._one_batch(inputs, targets)

            self.model.eval()
            with torch.no_grad():
                # Convert from 1-based to 0-based index.
                pred = self.model(test_inputs[:, 0] - 1, test_inputs[:, 1] - 1)
                test_loss = self.loss_func(pred, test_labels).item()

            num_batches = batch_idx + 1
            self.losses.append((train_loss / num_batches, test_loss))
            print("%5d |      %.3f |     %.3f |   %ds |" % (
                epoch,
                self.losses[-1][0],
                self.losses[-1][1],
                int(time.time() - start)))

    def _one_batch(self, inputs: torch.tensor, targets: torch.tensor) -> torch.tensor:
        # Convert from 1-based to 0-based index.
        users, items = inputs[:, 0] - 1, inputs[:, 1] - 1
        pred = self.model(users, items)
        loss = self.loss_func(pred, targets)
        loss.backward()
        self.optim.step()
        self.optim.zero_grad()
        return loss

class FitterOneCycle(Fitter):
    def __init__(self, *args):
        super().__init__(*args)
        self.scheduler = None
    
    def fit(self, num_epochs: int):
        self.scheduler = torch.optim.lr_scheduler.OneCycleLR(
            self.optim, max_lr=1e-2,
            final_div_factor=25e4, # Copying fastai
            epochs=num_epochs,
            steps_per_epoch=math.ceil(len(train_df) / batch_size))
        super().fit(num_epochs)

    def _one_batch(self, inputs: torch.tensor, targets: torch.tensor) -> torch.tensor:
        loss = super()._one_batch(inputs, targets)
        self.scheduler.step()
        return loss

In [29]:
def trunc_normal_(x: torch.tensor, mean: float=0., std: float=1.) -> torch.tensor:
    "Truncated normal initialization."
    # From https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/12
    return x.normal_().fmod_(2).mul_(std).add_(mean)

In [51]:
class ScaledDotProdBias(nn.Module):
    """Same as DotProdBias, but scale the output to be within y_range."""
    def __init__(self, n_user: int, n_item: int, embedding_dim: int, y_range: typing.Tuple[int, int], trunc_normal: bool=False):
        super().__init__()
        self.user_emb = nn.Embedding(num_embeddings=n_user, embedding_dim=embedding_dim)
        self.item_emb = nn.Embedding(num_embeddings=n_item, embedding_dim=embedding_dim)
        self.user_bias = nn.Embedding(num_embeddings=n_user, embedding_dim=1)
        self.item_bias = nn.Embedding(num_embeddings=n_item, embedding_dim=1)
        if trunc_normal:
            # Based on
            # https://github.com/fastai/fastai1/blob/6a5102ef7bdefa9058d0481ab311f48b21cbc6fc/fastai/layers.py#L285
            for e in (self.user_emb, self.item_emb, self.user_bias, self.item_bias):
                with torch.no_grad(): trunc_normal_(e.weight, std=0.01)
        self.y_min, self.y_max = y_range
    
    def forward(self, users: torch.LongTensor, items: torch.LongTensor) -> torch.FloatTensor:
        dot_prods = (self.user_emb(users) * self.item_emb(items)).sum(dim=1)
        u_bias = self.user_bias(users)
        u_bias_view = u_bias.view(u_bias.size(0))
        i_bias = self.item_bias(items)
        i_bias_view = i_bias.view(i_bias.size(0))
        biased = dot_prods + u_bias_view + i_bias_view
        res = self.y_min + (self.y_max - self.y_min) * nn.functional.sigmoid(biased)
        import pdb; pdb.set_trace()
        return res

In [52]:
model = ScaledDotProdBias(n_user, n_item, 40, (0.5, 5.5), trunc_normal=True).to(dev)
fitter_one_cycle = FitterOneCycle(
    model,
    nn.MSELoss(),
    torch.optim.AdamW(model.parameters(), betas=(0.9, 0.99)))
fitter_one_cycle.fit(num_epochs)

epoch | train_loss | test_loss | time
> [0;32m<ipython-input-51-df6831001428>[0m(25)[0;36mforward[0;34m()[0m
[0;32m     21 [0;31m        [0mi_bias_view[0m [0;34m=[0m [0mi_bias[0m[0;34m.[0m[0mview[0m[0;34m([0m[0mi_bias[0m[0;34m.[0m[0msize[0m[0;34m([0m[0;36m0[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     22 [0;31m        [0mbiased[0m [0;34m=[0m [0mdot_prods[0m [0;34m+[0m [0mu_bias_view[0m [0;34m+[0m [0mi_bias_view[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     23 [0;31m        [0mres[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0my_min[0m [0;34m+[0m [0;34m([0m[0mself[0m[0;34m.[0m[0my_max[0m [0;34m-[0m [0mself[0m[0;34m.[0m[0my_min[0m[0;34m)[0m [0;34m*[0m [0mnn[0m[0;34m.[0m[0mfunctional[0m[0;34m.[0m[0msigmoid[0m[0;34m([0m[0mbiased[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     24 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;

BdbQuit: 

In [56]:
best_prev = 0.831, 0.920
prev = fitter_one_cycle.losses

## TODO ##
 * look into OneCycleLR parameters vs fastai's implementation
    * learning rates are the same
    * fastai's opt.mom is the same as pytorch's betas[0]
    * what is betas[1]?
 * But something is likely different either in OneCycleLR and/or in AdamW, because I think those are the main differences.
