In [None]:
#default_exp collab

In [None]:
#export
from fastai2.basics import *
from fastai2.data.all import *
from fastai2.tabular.all import *

# Collaborative filtering

> Tools to quickly get the data and train models suitable for collaborative filtering

In [None]:
#export
class TabularCollab(TabularPandas):
    "`TabularPandas` subclass used for collaborative filtering data, with `with_cont` switched off."
    # Class-level flag; presumably tells the tabular pipeline that there are no
    # continuous columns -- TODO confirm against `TabularPandas`.
    with_cont = False

In [None]:
class CollabDataBunch(DataBunch):
    "`DataBunch` subclass providing a constructor tailored to collaborative filtering."
    @delegates(DataBunch.from_dblock)
    @classmethod
    def from_df(cls, ratings, valid_pct=0.2, user_name=None, item_name=None, rating_name=None, seed=None, path='.', **kwargs):
        """Build a collaborative-filtering `DataBunch` from the `ratings` DataFrame.

        `user_name`, `item_name` and `rating_name` default to the first, second and
        third column of `ratings` respectively. The user and item columns are
        processed with `Categorify`; the rating column is the target. Rows are split
        at random, with `valid_pct` and `seed` passed to `RandomSplitter`. `path` and
        any extra `kwargs` are forwarded to `TabularCollab.databunch`.

        Note: `cls` is not used; the returned object is whatever `databunch` produces.
        """
        columns = ratings.columns
        # Explicit names win; otherwise fall back positionally to columns 0, 1 and 2.
        user_col, item_col, rating_col = [
            ifnone(given, columns[pos])
            for pos,given in enumerate((user_name, item_name, rating_name))
        ]
        splitter = RandomSplitter(valid_pct=valid_pct, seed=seed)
        train_valid_idxs = splitter(range_of(ratings))
        tabular = TabularCollab(ratings, [Categorify], [user_col, item_col], y_names=[rating_col],
                                block_y=TransformBlock(), splits=train_valid_idxs)
        return tabular.databunch(path=path, **kwargs)

In [None]:
# Fetch the sample dataset referenced by `URLs.ML_SAMPLE` and load its `ratings.csv`.
path = untar_data(URLs.ML_SAMPLE)
ratings = pd.read_csv(path/'ratings.csv')
# Preview the first rows (columns: userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,73,1097,4.0,1255504951
1,561,924,3.5,1172695223
2,157,260,3.5,1291598691
3,358,1210,5.0,957481884
4,130,316,2.0,1138999234


In [None]:
# No column names are given, so `from_df` uses the first three columns of `ratings`
# as user, item and rating; `bs=64` travels through `**kwargs` to `databunch`.
dbunch = CollabDataBunch.from_df(ratings, bs=64)
# Display a sample batch to check the data.
dbunch.show_batch()

Unnamed: 0,userId,movieId,rating
0,19,296,5.0
1,355,4963,3.5
2,472,1580,5.0
3,472,3578,4.0
4,220,1200,4.0
5,452,4306,4.0
6,102,2858,5.0
7,243,1923,3.0
8,88,296,3.5
9,380,3114,5.0


In [None]:
#export
class EmbeddingDotBias(Module):
    "Dot-product model for collaborative filtering, with one bias term per user and per item."
    def __init__(self, n_factors, n_users, n_items, y_range=None):
        """Create the four embedding tables.

        `n_factors` is the width of the user and item factor embeddings; `n_users` and
        `n_items` are the number of rows in the respective tables. If `y_range` is a
        `(low, high)` pair, `forward` squashes its output into that interval.
        """
        self.y_range = y_range
        # Same creation order as before: user factors, item factors, user bias, item bias.
        self.u_weight = Embedding(n_users, n_factors)
        self.i_weight = Embedding(n_items, n_factors)
        self.u_bias = Embedding(n_users, 1)
        self.i_bias = Embedding(n_items, 1)

    def forward(self, x):
        "Score each row of `x`, whose column 0 holds the user index and column 1 the item index."
        user_idx, item_idx = x[:,0], x[:,1]
        # Element-wise product of the factor vectors, summed over dim 1, is the dot product.
        interaction = (self.u_weight(user_idx) * self.i_weight(item_idx)).sum(1)
        score = interaction + self.u_bias(user_idx).squeeze() + self.i_bias(item_idx).squeeze()
        if self.y_range is not None:
            low, high = self.y_range[0], self.y_range[1]
            # Sigmoid maps to (0,1); rescale and shift into (low, high).
            return torch.sigmoid(score) * (high-low) + low
        return score

In [None]:
# Inspect the category vocabularies, keyed by column name ('userId', 'movieId').
dbunch.classes

{'userId': (#101) [#na#,15,17,19,23,30,48,56,73,77...],
 'movieId': (#101) [#na#,1,10,32,34,39,47,50,110,150...]}

In [None]:
# 50 factors per embedding; table sizes come from the vocabulary lengths, and
# predictions are squashed into the (0,5) range by the model's sigmoid.
model = EmbeddingDotBias(50, len(dbunch.classes['userId']), len(dbunch.classes['movieId']), y_range=(0,5)) 

In [None]:
from fastai2.callback.all import *

In [None]:
# Wrap the data and the dot-product model in a `Learner`, using `MSELossFlat` as the loss.
learn = Learner(dbunch, model, loss_func=MSELossFlat())

In [None]:
# Train for one epoch as a quick smoke test.
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,time
0,2.271334,2.193583,00:01


In [None]:
#export 
@delegates()
class EmbeddingNN(TabularModel):
    "`TabularModel` specialised for collaborative filtering: no continuous inputs, one output."
    def __init__(self, emb_szs, layers, **kwargs):
        "Build the model from `emb_szs` and `layers`; other `kwargs` go to `TabularModel`."
        # `n_cont=0` and `out_sz=1` are fixed here and are not caller-configurable.
        super().__init__(
            emb_szs=emb_szs,
            layers=layers,
            n_cont=0,
            out_sz=1,
            **kwargs,
        )

    def forward(self, x):
        "Run the parent forward pass with `x` as the categorical input and `None` for the continuous one."
        return super().forward(x, None)

In [None]:
# Compute embedding sizes for the training set; the empty dict supplies no overrides.
emb_szs = get_emb_sz(dbunch.train_ds, {})
# `[50]` is passed as `layers` to `TabularModel` (presumably one hidden layer of 50 units -- confirm).
model = EmbeddingNN(emb_szs, [50])

In [None]:
# Same training setup as for the dot-product model, now with the `EmbeddingNN` model.
learn = Learner(dbunch, model, loss_func=MSELossFlat())
# Train for one epoch as a quick smoke test.
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,time
0,3.351461,1.609919,00:01


In [None]:
# To port
class CollabLearner(Learner):
    "`Learner` suitable for collaborative filtering."
    # NOTE(review): this cell is marked "To port"; `self.data.train_ds.x.classes` is
    # presumably the pre-port accessor for the vocabularies -- confirm before exporting.
    def get_idx(self, arr:Collection, is_item:bool=True):
        """Fetch the embedding index of every item or user (based on `is_item`) in `arr`.

        Side effect: the model is put in eval mode, moved to the CPU and has gradients
        disabled on its parameters.

        Returns a tensor with one index per element of `arr`.
        Raises `KeyError` if an element of `arr` is not among the training classes.
        """
        m = self.model.eval().cpu()
        requires_grad(m,False)
        # The classes mapping is expected to hold exactly two entries: users, then items.
        u_class,i_class = self.data.train_ds.x.classes.values()
        classes = i_class if is_item else u_class
        c2i = {v:k for k,v in enumerate(classes)}
        try: return tensor([c2i[o] for o in arr])
        except KeyError as e:
            # Fail loudly here: printing and returning `None` only deferred the failure
            # to an unrelated error when `bias`/`weight` passed `None` to the embedding.
            kind = 'an item' if is_item else 'a user'
            raise KeyError(
                f"You're trying to access {kind} that isn't in the training data. "
                "If it was in your original data, it may have been split such that "
                "it's only in the validation set now."
            ) from e

    def bias(self, arr:Collection, is_item:bool=True):
        "Bias for item or user (based on `is_item`) for all in `arr`. (Set model to `cpu` and no grad.)"
        idx = self.get_idx(arr, is_item)
        m = self.model
        # Requires a model exposing `i_bias`/`u_bias`, such as `EmbeddingDotBias`.
        layer = m.i_bias if is_item else m.u_bias
        return layer(idx).squeeze()

    def weight(self, arr:Collection, is_item:bool=True):
        "Weight for item or user (based on `is_item`) for all in `arr`. (Set model to `cpu` and no grad.)"
        idx = self.get_idx(arr, is_item)
        m = self.model
        # Requires a model exposing `i_weight`/`u_weight`, such as `EmbeddingDotBias`.
        layer = m.i_weight if is_item else m.u_weight
        return layer(idx)

def collab_learner(data, n_factors:int=None, use_nn:bool=False, emb_szs:Dict[str,int]=None, layers:Collection[int]=None, 
                   ps:Collection[float]=None, emb_drop:float=0., y_range:OptRange=None, use_bn:bool=True, 
                   bn_final:bool=False, **learn_kwargs)->Learner:
    """Create a Learner for collaborative filtering on `data`.

    With `use_nn=True` the model is an `EmbeddingNN` built from `emb_szs`, `layers`,
    `ps`, `emb_drop`, `y_range`, `use_bn` and `bn_final`. Otherwise it is an
    `EmbeddingDotBias` with `n_factors` factors and `y_range`, sized from the user
    and item vocabularies of the training set.

    `learn_kwargs` are passed to `CollabLearner` only.
    """
    emb_szs = data.get_emb_szs(ifnone(emb_szs, {}))
    u,m = data.train_ds.x.classes.values()
    # `learn_kwargs` are learner options and must not reach the model constructor:
    # forwarding them to both meant any keyword had to be valid for the model and the
    # learner at once.
    if use_nn: model = EmbeddingNN(emb_szs=emb_szs, layers=layers, ps=ps, emb_drop=emb_drop, y_range=y_range, 
                                   use_bn=use_bn, bn_final=bn_final)
    else:      model = EmbeddingDotBias(n_factors, len(u), len(m), y_range=y_range)
    return CollabLearner(data, model, **learn_kwargs)