# Collaborative filtering

An attempt to match fastai's performance on [MovieLens 100K](https://grouplens.org/datasets/movielens/100k/), but with pure pytorch, not using fastai's library.

In [2]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
import fastai
import fastai.collab
import fastai.datasets
import fastai.tabular.transform
import math
import numpy
import os
import pandas
import time
import torch
from torch import nn
import typing
import matplotlib.pyplot as plt

In [3]:
dev = torch.device("cpu") # Seems to be much faster than GPU for this application

## Prepare data ##

In [4]:
import zipfile
zip_path = fastai.datasets.download_data("http://files.grouplens.org/datasets/movielens/ml-100k.zip", ext="")
dest_dir = zip_path.parent
data_dir = os.path.splitext(zip_path)[0]
if not os.path.exists(data_dir):
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(dest_dir)

In [5]:
col_names = ("user", "item", "rating", "timestamp")
train_df = pandas.read_csv(os.path.join(data_dir, "ua.base"), sep="\t", names=col_names)
test_df = pandas.read_csv(os.path.join(data_dir, "ua.test"), sep="\t", names=col_names)
concat_df = pandas.concat((train_df, test_df))
n_item = concat_df["item"].max()
n_user = concat_df["user"].max()
print(f"n_item: {n_item}, n_user: {n_user}")

class MovieLensDataset(torch.utils.data.Dataset):
    def __init__(self, df: pandas.DataFrame, device: torch.device):
        # Indices into embeddings need to have dtype "long".
        self.ids_tensor = torch.tensor(df[["user", "item"]].to_numpy(), dtype=torch.long, device=device)
        self.ratings_tensor = torch.tensor(df[["rating"]].to_numpy(), dtype=torch.float, device=device)
        
    def __len__(self):
        return len(self.ids_tensor)

    def __getitem__(self, idx):
        return self.ids_tensor[idx], self.ratings_tensor[idx]
        

train_dataset = MovieLensDataset(train_df, dev)
test_dataset = MovieLensDataset(test_df, dev)

n_item: 1682, n_user: 943


In [6]:
batch_size = 64
num_epochs = 10

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
test_inputs, test_labels = test_dataset[:]

## fastai benchmark
fastai claims state of the art performance, so let's start with that and see how it does.
This is based on <https://github.com/fastai/course-v3/blob/master/nbs/dl1/lesson4-collab.ipynb>.

In [7]:
# Hacked up copy of CollabDataBunch.from_df because I want to use test_df as the validation set.
user_name   = concat_df.columns[0]
item_name   = concat_df.columns[1]
rating_name = concat_df.columns[2]
cat_names = [user_name,item_name]
num_train = len(train_df)
src = (fastai.collab.CollabList.from_df(concat_df, cat_names=cat_names, procs=fastai.tabular.transform.Categorify)
        .split_by_idxs(train_idx=numpy.arange(num_train), valid_idx=numpy.arange(num_train, num_train + len(test_df)))
        .label_from_df(cols=rating_name))
data_bunch = src.databunch(path=".", bs=batch_size, val_bs=batch_size, device=dev)
assert len(data_bunch.dl(fastai.basic_data.DatasetType.Train).dl.dataset.x) == num_train
data_bunch.show_batch()

user,item,target
778,69,2.0
270,815,4.0
632,1183,2.0
825,25,4.0
141,15,5.0


In [9]:
fastai_learn = fastai.collab.collab_learner(data_bunch, n_factors=40, y_range=[0,5.5], wd=1e-1)
fastai_learn.fit_one_cycle(num_epochs, 1e-2)

epoch,train_loss,valid_loss,time
0,0.956673,1.053949,00:07
1,0.913251,0.989121,00:07
2,0.896868,0.958328,00:08
3,0.83453,0.954619,00:07
4,0.776227,0.926298,00:07
5,0.714198,0.914857,00:07
6,0.622492,0.903829,00:07
7,0.51837,0.899892,00:08
8,0.412492,0.89885,00:07
9,0.35179,0.899691,00:07


self.opt.lr = 0.003, self.opt.mom = 0.925
self.opt.lr = 0.008, self.opt.mom = 0.875
self.opt.lr = 0.010, self.opt.mom = 0.850
self.opt.lr = 0.010, self.opt.mom = 0.855
self.opt.lr = 0.008, self.opt.mom = 0.869
self.opt.lr = 0.006, self.opt.mom = 0.889
self.opt.lr = 0.004, self.opt.mom = 0.911
self.opt.lr = 0.002, self.opt.mom = 0.931
self.opt.lr = 0.000, self.opt.mom = 0.945
self.opt.lr = 0.000, self.opt.mom = 0.950


In [11]:
fastai_pred = fastai_learn.get_preds(ds_type=fastai.data_block.DatasetType.Valid)
torch.nn.functional.mse_loss(*fastai_pred).item()

## My own implementation

In [12]:
class Fitter:
    def __init__(self, model: nn.Module, loss_func: nn.Module, optim: torch.optim.Optimizer):
        self.model = model
        self.loss_func = loss_func
        self.optim = optim

    def fit(self, num_epochs: int):
        print("epoch | train_loss | test_loss | time")
        for epoch in range(num_epochs):
            start = time.time()
            train_loss = torch.tensor([0.0], dtype=float, device=dev)
            for batch_idx, (inputs, targets) in enumerate(train_loader, 0):
                train_loss += self._one_batch(inputs, targets)

            with torch.no_grad():
                pred = self.model(test_inputs[:, 0], test_inputs[:, 1])
                test_loss = self.loss_func(pred, test_labels).item()

            num_batches = batch_idx + 1
            print("%5d |      %.3f |     %.3f |   %ds |" % (
                epoch,
                train_loss / num_batches,
                test_loss,
                int(time.time() - start)))

    def _one_batch(self, inputs: torch.tensor, targets: torch.tensor) -> torch.tensor:
        self.optim.zero_grad()
        print(inputs)
        print(inputs.shape)
        try:
            pred = self.model(inputs[:, 0], inputs[:, 1])
        except IndexError:
            print("inputs.shape: %s" % inputs.shape)
            print("inputs: %s" % inputs)
            raise
        loss = self.loss_func(pred, targets)
        return loss

class FitterOneCycle(Fitter):
    def __init__(self, *args):
        super().__init__(*args)
        self.scheduler = None
    
    def fit(self, num_epochs: int):
        self.scheduler = torch.optim.lr_scheduler.OneCycleLR(
            self.optim, max_lr=self.optim.defaults["lr"], epochs=num_epochs,
            steps_per_epoch=math.ceil(len(train_df) / batch_size))
        super().fit(num_epochs)

    def _one_batch(self, inputs: torch.tensor, targets: torch.tensor) -> torch.tensor:
        loss = super()._one_batch(inputs, targets)
        loss.backward()
        self.scheduler.step()
        self.optim.step()
        return loss

In [9]:
def trunc_normal_(x: torch.tensor, mean: float=0., std: float=1.) -> torch.tensor:
    "Truncated normal initialization."
    # From https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/12
    return x.normal_().fmod_(2).mul_(std).add_(mean)

In [10]:
class ScaledDotProdBias(nn.Module):
    """Same as DotProdBias, but scale the output to be within y_range."""
    def __init__(self, n_user: int, n_item: int, embedding_dim: int, y_range: typing.Tuple[int, int], trunc_normal: bool=False):
        super().__init__()
        self.user_emb = nn.Embedding(num_embeddings=n_user, embedding_dim=embedding_dim)
        self.item_emb = nn.Embedding(num_embeddings=n_item, embedding_dim=embedding_dim)
        self.user_bias = nn.Embedding(num_embeddings=n_user, embedding_dim=1)
        self.item_bias = nn.Embedding(num_embeddings=n_item, embedding_dim=1)
        if trunc_normal:
            # Based on
            # https://github.com/fastai/fastai1/blob/6a5102ef7bdefa9058d0481ab311f48b21cbc6fc/fastai/layers.py#L285
            for e in (self.user_emb, self.item_emb, self.user_bias, self.item_bias):
                with torch.no_grad(): trunc_normal_(e.weight, std=0.01)
        self.y_min, self.y_max = y_range
    
    def forward(self, users: torch.LongTensor, items: torch.LongTensor) -> torch.FloatTensor:
        # Convert from 1-based to 0-based index.
        users, items = users - 1, items - 1
        dot_prods = (self.user_emb(users) * self.item_emb(items)).sum(dim=1)
        biased = dot_prods + self.user_bias(users) + self.item_bias(items)
        return self.y_min + (self.y_max - self.y_min) * nn.functional.sigmoid(biased)

In [53]:
model_scaled_dot_prod_bias = ScaledDotProdBias(n_user, n_item, 40, (-0.5, 5.5)).to(dev)
fitter_scaled_dot_prod_bias = Fitter(model_scaled_dot_prod_bias, nn.MSELoss(), torch.optim.Adam(model_scaled_dot_prod_bias.parameters(), lr=5e-3, betas=(0.9, 0.99)))
fitter_scaled_dot_prod_bias.fit(num_epochs)

epoch | train_loss | test_loss | time
    0 |      7.755 |     6.754 | 4s
    1 |      5.117 |     5.289 | 3s
    2 |      3.834 |     4.599 | 3s
    3 |      2.908 |     3.990 | 3s
    4 |      2.060 |     3.375 | 3s
    5 |      1.512 |     2.918 | 3s
    6 |      1.257 |     2.601 | 3s
    7 |      1.138 |     2.368 | 3s
    8 |      1.079 |     2.189 | 3s
    9 |      1.045 |     2.043 | 3s


In [54]:
model_scaled_dot_prod_bias_path = "mdoels/model_scaled_dot_prod_bias.pth"
torch.save(model_scaled_dot_prod_bias.state_dict(), model_scaled_dot_prod_bias_path)

In [77]:
model_scaled_dot_prod_bias_one_cycle = ScaledDotProdBias(n_user, n_item, 40, (-0.5, 5.5)).to(dev)
fitter_one_cycle = FitterOneCycle(model_scaled_dot_prod_bias_one_cycle, nn.MSELoss(), torch.optim.Adam(model_scaled_dot_prod_bias_one_cycle.parameters(), lr=5e-3, betas=(0.9, 0.99)))
fitter_one_cycle.fit(num_epochs)

epoch | train_loss | test_loss | time
    0 |      9.008 |     8.912 |   2s |
    1 |      8.019 |     7.623 |   3s |
    2 |      5.932 |     5.857 |   3s |
    3 |      4.281 |     4.870 |   3s |
    4 |      3.307 |     4.285 |   3s |
    5 |      2.513 |     3.843 |   3s |
    6 |      1.925 |     3.539 |   4s |
    7 |      1.580 |     3.375 |   3s |
    8 |      1.414 |     3.313 |   3s |
    9 |      1.355 |     3.304 |   3s |


In [10]:
# Let's tweak y-range to start at 0.5
model_scaled_dot_prod_bias_one_cycle = ScaledDotProdBias(n_user, n_item, 40, (0.5, 5.5)).to(dev)
fitter_one_cycle = FitterOneCycle(model_scaled_dot_prod_bias_one_cycle, nn.MSELoss(), torch.optim.Adam(model_scaled_dot_prod_bias_one_cycle.parameters(), lr=1e-2, betas=(0.9, 0.99)))
fitter_one_cycle.fit(num_epochs*2)

epoch | train_loss | test_loss | time
    0 |      6.203 |     6.216 |   3s |
    1 |      5.811 |     5.814 |   3s |
    2 |      4.914 |     5.001 |   3s |
    3 |      3.687 |     3.921 |   3s |
    4 |      2.425 |     2.863 |   3s |
    5 |      1.561 |     2.172 |   3s |
    6 |      1.259 |     1.759 |   3s |
    7 |      1.114 |     1.512 |   3s |
    8 |      1.028 |     1.323 |   3s |
    9 |      0.974 |     1.204 |   3s |
   10 |      0.937 |     1.125 |   3s |
   11 |      0.910 |     1.077 |   3s |
   12 |      0.890 |     1.044 |   3s |
   13 |      0.874 |     1.018 |   3s |
   14 |      0.862 |     1.003 |   3s |
   15 |      0.851 |     0.994 |   3s |
   16 |      0.843 |     0.988 |   3s |
   17 |      0.837 |     0.987 |   3s |
   18 |      0.832 |     0.986 |   3s |
   19 |      0.830 |     0.986 |   3s |


In [11]:
# Let's try weight decay
model_scaled_dot_prod_bias_one_cycle = ScaledDotProdBias(n_user, n_item, 40, (0.5, 5.5)).to(dev)
fitter_one_cycle = FitterOneCycle(model_scaled_dot_prod_bias_one_cycle, nn.MSELoss(), torch.optim.Adam(model_scaled_dot_prod_bias_one_cycle.parameters(), lr=1e-2, betas=(0.9, 0.99), weight_decay=1e-2))
fitter_one_cycle.fit(num_epochs*2)

epoch | train_loss | test_loss | time
    0 |      5.271 |     3.430 |   2s |
    1 |      1.727 |     1.257 |   3s |
    2 |      1.129 |     1.227 |   3s |
    3 |      1.124 |     1.225 |   3s |
    4 |      1.126 |     1.232 |   3s |
    5 |      1.127 |     1.227 |   3s |
    6 |      1.127 |     1.231 |   3s |
    7 |      1.127 |     1.228 |   3s |
    8 |      1.127 |     1.230 |   4s |
    9 |      1.127 |     1.228 |   3s |
   10 |      1.125 |     1.233 |   3s |
   11 |      1.125 |     1.227 |   3s |
   12 |      1.124 |     1.226 |   3s |
   13 |      1.123 |     1.228 |   3s |
   14 |      1.122 |     1.226 |   3s |
   15 |      1.122 |     1.226 |   3s |
   16 |      1.121 |     1.226 |   3s |
   17 |      1.118 |     1.228 |   3s |
   18 |      1.117 |     1.228 |   3s |
   19 |      1.116 |     1.228 |   2s |


In [13]:
# Let's try trunc_normal initialization
model_scaled_dot_prod_bias_one_cycle = ScaledDotProdBias(n_user, n_item, 40, (0.5, 5.5), trunc_normal=True).to(dev)
fitter_one_cycle = FitterOneCycle(model_scaled_dot_prod_bias_one_cycle, nn.MSELoss(), torch.optim.Adam(model_scaled_dot_prod_bias_one_cycle.parameters(), lr=1e-2, betas=(0.9, 0.99)))
fitter_one_cycle.fit(num_epochs*2)

epoch | train_loss | test_loss | time
    0 |      1.317 |     1.161 |   3s |
    1 |      1.009 |     0.997 |   2s |
    2 |      0.900 |     0.944 |   3s |
    3 |      0.878 |     0.944 |   3s |
    4 |      0.885 |     0.955 |   3s |
    5 |      0.894 |     0.970 |   3s |
    6 |      0.895 |     0.971 |   3s |
    7 |      0.896 |     0.972 |   3s |
    8 |      0.891 |     0.965 |   3s |
    9 |      0.887 |     0.962 |   4s |
   10 |      0.879 |     0.959 |   3s |
   11 |      0.873 |     0.954 |   3s |
   12 |      0.865 |     0.946 |   3s |
   13 |      0.858 |     0.942 |   3s |
   14 |      0.851 |     0.937 |   3s |
   15 |      0.845 |     0.935 |   3s |
   16 |      0.839 |     0.935 |   3s |
   17 |      0.834 |     0.934 |   3s |
   18 |      0.831 |     0.934 |   3s |
   19 |      0.829 |     0.934 |   3s |


In [14]:
# Wow, that really sped things up! Seems we're over-fitting after 5 epochs now. Let's try weight_decay again.
model_scaled_dot_prod_bias_one_cycle = ScaledDotProdBias(n_user, n_item, 40, (0.5, 5.5), trunc_normal=True).to(dev)
fitter_one_cycle = FitterOneCycle(model_scaled_dot_prod_bias_one_cycle, nn.MSELoss(), torch.optim.Adam(model_scaled_dot_prod_bias_one_cycle.parameters(), lr=1e-2, betas=(0.9, 0.99), weight_decay=1e-2))
fitter_one_cycle.fit(num_epochs*2)

epoch | train_loss | test_loss | time
    0 |      1.392 |     1.336 |   3s |
    1 |      1.164 |     1.230 |   3s |
    2 |      1.122 |     1.226 |   3s |
    3 |      1.125 |     1.227 |   3s |
    4 |      1.126 |     1.230 |   3s |
    5 |      1.127 |     1.226 |   3s |
    6 |      1.127 |     1.235 |   3s |
    7 |      1.128 |     1.229 |   3s |
    8 |      1.127 |     1.227 |   3s |
    9 |      1.126 |     1.230 |   3s |
   10 |      1.125 |     1.230 |   3s |
   11 |      1.125 |     1.233 |   3s |
   12 |      1.125 |     1.222 |   3s |
   13 |      1.124 |     1.228 |   3s |
   14 |      1.122 |     1.223 |   3s |
   15 |      1.122 |     1.227 |   3s |
   16 |      1.121 |     1.227 |   3s |
   17 |      1.119 |     1.227 |   3s |
   18 |      1.117 |     1.228 |   3s |
   19 |      1.115 |     1.228 |   3s |


In [33]:
# Hmm, under-fitting now :-(
# Let's try using fastai's "true_wd" algorithm.
# I think that's been added to torch as AdamW, so let's use that instead of Adam.
model_scaled_dot_prod_bias_one_cycle = ScaledDotProdBias(n_user, n_item, 40, (0.5, 5.5), trunc_normal=True).to(dev)
fitter_one_cycle = FitterOneCycle(
    model_scaled_dot_prod_bias_one_cycle, nn.MSELoss(),
    torch.optim.AdamW(model_scaled_dot_prod_bias_one_cycle.parameters(), lr=1e-2, betas=(0.9, 0.99)))
fitter_one_cycle.fit(num_epochs)

epoch | train_loss | test_loss | time
    0 |      1.264 |     1.072 |   3s |
    1 |      0.935 |     0.953 |   4s |
    2 |      0.896 |     0.958 |   4s |
    3 |      0.893 |     0.959 |   4s |
    4 |      0.886 |     0.944 |   6s |
    5 |      0.876 |     0.938 |   5s |
    6 |      0.863 |     0.929 |   5s |
    7 |      0.850 |     0.924 |   4s |
    8 |      0.839 |     0.923 |   4s |
    9 |      0.831 |     0.923 |   4s |


In [22]:
# try increasing weight_decay to what fastai uses.
model_scaled_dot_prod_bias_one_cycle = ScaledDotProdBias(n_user, n_item, 40, (0.5, 5.5), trunc_normal=True).to(dev)
print(model_scaled_dot_prod_bias_one_cycle)
fitter_one_cycle = FitterOneCycle(
    model_scaled_dot_prod_bias_one_cycle, nn.MSELoss(),
    torch.optim.AdamW(model_scaled_dot_prod_bias_one_cycle.parameters(), lr=1e-2, betas=(0.9, 0.99),
    weight_decay=1e-1))
fitter_one_cycle.fit(num_epochs)

ScaledDotProdBias(
  (user_emb): Embedding(943, 40)
  (item_emb): Embedding(1682, 40)
  (user_bias): Embedding(943, 1)
  (item_bias): Embedding(1682, 1)
)
epoch | train_loss | test_loss | time
    0 |      1.268 |     1.083 |   3s |
    1 |      0.956 |     0.979 |   3s |
    2 |      0.920 |     0.974 |   3s |
    3 |      0.919 |     0.969 |   3s |
    4 |      0.913 |     0.965 |   3s |
    5 |      0.906 |     0.959 |   3s |
    6 |      0.897 |     0.950 |   3s |
    7 |      0.886 |     0.947 |   3s |
    8 |      0.877 |     0.945 |   3s |
    9 |      0.869 |     0.945 |   3s |


In [13]:
# Let's try using fastai's model with my fitter.
model_fastai = fastai.collab.EmbeddingDotBias(40, n_user, n_item, (0.5, 5.5))
fitter_one_cycle = FitterOneCycle(
    model_fastai, nn.MSELoss(),
    torch.optim.AdamW(model_fastai.parameters(), lr=1e-2, betas=(0.9, 0.99), weight_decay=1e-1))
fitter_one_cycle.fit(num_epochs)

epoch | train_loss | test_loss | time
tensor([[ 488,  136],
        [ 843,  582],
        [  75,  235],
        [ 275,  496],
        [ 157,  476],
        [  90,   86],
        [ 378,  313],
        [ 585, 1021],
        [ 770,  473],
        [ 416,  174],
        [  16,  160],
        [ 326,  427],
        [ 537,  515],
        [ 537,  509],
        [ 567,  657],
        [ 474,  651],
        [ 655,  607],
        [ 270,   90],
        [ 821,  427],
        [ 268,  235],
        [ 747,   30],
        [ 294,  240],
        [  13,  669],
        [ 452,   66],
        [ 682,  801],
        [ 385, 1135],
        [ 895,  885],
        [ 605,  873],
        [ 524,  402],
        [ 365,  846],
        [ 806,  588],
        [ 525,  147],
        [ 452,  430],
        [ 436,   38],
        [ 406,  274],
        [ 311, 1041],
        [ 573,   50],
        [ 183,  375],
        [ 197,  385],
        [  42,  282],
        [  85,  414],
        [ 703,  300],
        [  21,  262],
        [  13,  

NameError: name 'pred' is not defined

In [None]:
# That's basically identical to the performance of my model, so the difference is not
# in the model architecture or parameter initialization but in the implementation of
# fastai's fit_one_cycle vs FitterOneCycle

In [None]:
torch.optim.AdamW(model_scaled_dot_prod_bias_one_cycle.parameters(), lr=1e-2, betas=(0.9, 0.99),
    weight_decay=1e-1)

## TODO ##
 * look into OneCycleLR parameters vs fastai's implementation
    * max_lr is the same (1e-2).
    * fastai has wd=1e-1, I'm using the default in torch.optim.AdamW which is 1e-2.
    * moms vs base_momentum / max_momentum is the same.
    * div_factor is the same.
    * pct_start is the same.
    * Not entirely sure, but I think final_div vs final_div_factor is the same.
 * But something is likely different either in OneCycleLR and/or in AdamW, because I think those are the main differences.
