In [12]:
# hide
# skip
! [ -e /content ] && pip install -Uqq fastai # upgrade fastai on colab

In [13]:
# default_exp collab
# default_class_lvl 3

In [14]:
# export
from fastai.tabular.all import *
from fastai.collab import *

In [15]:
# hide
from nbdev.showdoc import *

# Collaborative filtering
> Tools to add to the [fastai collab](https://docs.fast.ai/collab.html) to make the learning transferable

## Loading `users`/`items` embeddings from a pretrained model

In a collab model, to load a pretrained vocabulary, we need to adapt the embeddings of the  vocabulary used for the pre-training to the vocabulary of our current collab corpus.

In [16]:
# export 
def match_embeds(
    old_wgts:dict, # Embedding weights of the pretrained model
    old_vocab:dict, # Vocabulary (tokens and labels) of the corpus used for pretraining
    new_vocab:list # Current collab corpus vocabulary (`items` and `users`)
) -> 
    pass

SyntaxError: invalid syntax (2806520920.py, line 5)

## Create a `Learner`

In [7]:
# export
class CollabLearner(Learner):
    "Basic class for a `Learner` in Collab."
    def save(self, file, **kwargs):
        "Save model and optimizer state (if `with_opt`) to `self.path/self.model_dir/file`"
        file = join_path_file(file, self.path/self.model_dir, ext='.pth')
        vocab_file = join_path_file('collab_vocab', self.path/self.model_dir, ext='.pkl')
        save_model(file, self.model, getattr(self,'opt', None), **kwargs)
        save_pickle(vocab_file, self.dls.classes)
        return file
    
    def load_vocab(self,
        wgts_fname:str, #Filename of the saved weights
        vocab_fname:str, # Saved vocabulary filename in pickle format
        model=None # Model to load parameters from, deafults to `learner.model`
    ):
        "Load the vocabulary (`users` and/or `items`) from a pretrained model and adapt it to the collab vocabulary."
        pass

In [10]:
show_doc(CollabLearner)

<h2 id="CollabLearner" class="doc_header"><code>class</code> <code>CollabLearner</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>CollabLearner</code>(**`dls`**, **`model`**, **`loss_func`**=*`None`*, **`opt_func`**=*`Adam`*, **`lr`**=*`0.001`*, **`splitter`**=*`trainable_params`*, **`cbs`**=*`None`*, **`metrics`**=*`None`*, **`path`**=*`None`*, **`model_dir`**=*`'models'`*, **`wd`**=*`None`*, **`wd_bn_bias`**=*`False`*, **`train_bn`**=*`True`*, **`moms`**=*`(0.95, 0.85, 0.95)`*) :: `Learner`

Basic class for a `Learner` in Collab.

It works exactly as a normal `learner`, the only difference is that it also saves the `items` vocabulary used by `self.model`

The following function lets us quickly create a `Learner` for collaborative filtering from the data.

In [64]:
# export
@delegates(Learner.__init__)
def collab_learner(dls, n_factors=50, use_nn=False, emb_szs=None, layers=None, config=None, y_range=None, loss_func=None, **kwargs):
    "Create a Learner for collaborative filtering on `dls`."
    emb_szs = get_emb_sz(dls, ifnone(emb_szs, {}))
    if loss_func is None: loss_func = MSELossFlat()
    if config is None: config = tabular_config()
    if y_range is not None: config['y_range'] = y_range
    if layers is None: layers = [n_factors]
    if use_nn: model = EmbeddingNN(emb_szs=emb_szs, layers=layers, **config)
    else:      model = EmbeddingDotBias.from_classes(n_factors, dls.classes, y_range=y_range)
    return CollabLearner(dls, model, loss_func=loss_func, **kwargs)

If `use_nn=False`, the model used is an `EmbeddingDotBias` with `n_factors` and `y_range`. Otherwise, it's a `EmbeddingNN` for which you can pass `emb_szs` (will be inferred from the `dls` with `get_emb_sz` if you don't provide any), `layers` (defaults to `[n_factors]`) `y_range`, and a `config` that you can create with `tabular_config` to customize your model. 

`loss_func` will default to `MSELossFlat` and all the other arguments are passed to `Learner`.

In [65]:
path = untar_data(URLs.ML_SAMPLE)
ratings = pd.read_csv(path/'ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,73,1097,4.0,1255504951
1,561,924,3.5,1172695223
2,157,260,3.5,1291598691
3,358,1210,5.0,957481884
4,130,316,2.0,1138999234


In [66]:
dls = CollabDataLoaders.from_df(ratings, bs=64)
dls.show_batch()

Unnamed: 0,userId,movieId,rating
0,654,2396,4.5
1,150,733,3.5
2,262,1073,3.5
3,150,316,4.0
4,564,2858,5.0
5,518,1097,5.0
6,119,1923,5.0
7,608,1193,5.0
8,439,593,5.0
9,423,592,2.5


In [68]:
with tempfile.TemporaryDirectory() as d:
    learn = collab_learner(dls, y_range=(0,5), path=d)
    learn.fit(1)
    
    # Test save created a file
    learn.save('tmp')
    assert (Path(d)/'models/tmp.pth').exists()
    assert (Path(d)/'models/collab_vocab.pkl').exists()

epoch,train_loss,valid_loss,time
0,2.483533,2.210738,00:00


In [76]:
# hide
from nbdev.export import notebook2script; notebook2script()

Converted 00_utils.ipynb.
Converted 01_layers.ipynb.
Converted 02_text.models.core.ipynb.
Converted 03_text.learner.ipynb.
Converted 04_metrics.ipynb.
Converted 05_collab.ipynb.
Converted index.ipynb.
