In [1]:
from fastai.collab import *
from fastai.tabular.all import *

In [2]:
path = untar_data(URLs.ML_100k)

In [3]:
# file: (path/'u.data')
ratings = pd.read_csv(path/'u.data', delimiter = '\t', header = None,
                     names = ['user','movie','rating','timestamp'])

In [4]:
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
path

Path('/jet/.fastai/data/ml-100k')

In [6]:
movies = pd.read_csv(path/'u.item',  delimiter='|', encoding='latin-1',
                     usecols=(0,1), names=('movie','title'), header=None)
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


We can then build a DataLoaders object from this table. By default, it takes the first column for the user, the second column for the item (here our movies), and the third column for the ratings. We need to change the value of item_name in our case to use the titles instead of the IDs:

In [8]:
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)
dls.show_batch()

Unnamed: 0,user,title,rating
0,541,Multiplicity (1996),2
1,655,Dead Man Walking (1995),3
2,823,Sabrina (1995),4
3,26,Brassed Off (1996),4
4,908,"Professional, The (1994)",3
5,709,True Romance (1993),4
6,416,Speed (1994),4
7,416,Dead Presidents (1995),2
8,206,"Sweet Hereafter, The (1997)",4
9,248,Lost Highway (1997),4


In [9]:
dls.classes

{'user': (#944) ['#na#',1,2,3,4,5,6,7,8,9...],
 'title': (#1665) ['#na#',"'Til There Was You (1997)",'1-900 (1994)','101 Dalmatians (1996)','12 Angry Men (1957)','187 (1997)','2 Days in the Valley (1996)','20,000 Leagues Under the Sea (1954)','2001: A Space Odyssey (1968)','3 Ninjas: High Noon At Mega Mountain (1998)'...]}

In [10]:
n_users  = len(dls.classes['user'])
n_movies = len(dls.classes['title'])
n_factors = 5

user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)

In [11]:
one_hot_3 = one_hot(3, n_users).float()
one_hot_3.shape

torch.Size([944])

In [12]:
user_factors.t() @ one_hot_3

tensor([ 0.7756,  0.2625,  1.2275, -0.6782, -0.8173])

In [13]:
user_factors.shape

torch.Size([944, 5])

In [14]:
user_factors.t()

tensor([[-0.3800, -0.7628,  0.4724,  ..., -0.3376,  0.7041,  0.8372],
        [-1.7024, -0.6571, -1.6362,  ..., -2.0544,  0.2906, -1.1918],
        [-1.1707, -0.0225, -0.6392,  ..., -0.3854, -2.5266, -1.0807],
        [ 1.4767,  0.1466, -0.7109,  ...,  1.1526, -0.0102,  0.1481],
        [-0.9508,  1.1271,  1.1835,  ..., -0.8675, -0.1681,  1.9589]])

In [15]:
x,y = dls.one_batch()
x.shape

torch.Size([64, 2])

In [16]:
x

tensor([[ 295, 1258],
        [ 712, 1261],
        [ 101,  346],
        [ 458, 1020],
        [  90, 1502],
        [ 141,  413],
        [ 478, 1496],
        [ 291, 1058],
        [ 346, 1561],
        [ 655, 1318],
        [ 524,  230],
        [ 892,  143],
        [ 222,  892],
        [ 622, 1353],
        [  64,   41],
        [ 605, 1525],
        [ 496,  357],
        [ 405,  446],
        [ 749,  657],
        [ 393,   41],
        [ 521,   44],
        [  10, 1157],
        [ 727, 1517],
        [ 235,  383],
        [ 666, 1285],
        [ 716,  381],
        [  82,  716],
        [ 606,  747],
        [ 407,  224],
        [ 189,  172],
        [ 766, 1330],
        [ 666,  514],
        [ 815,  318],
        [ 539,  208],
        [ 648,  495],
        [ 640,  313],
        [ 708,  236],
        [ 881, 1391],
        [  72,  390],
        [ 618,  444],
        [ 756, 1271],
        [ 918,  208],
        [ 642,  754],
        [   6,   32],
        [ 588, 1330],
        [ 

In [17]:
class DotProduct(Module):
    def __init__(self, n_users, n_movies, n_factors):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        
    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        return (users * movies).sum(dim=1)

In [18]:
model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [19]:
# get the ratings (dot product of Embeddings)
model.forward(x)

tensor([-6.3593e-04,  6.2995e-04, -5.4226e-04, -3.1168e-04, -6.0699e-05,
         2.4764e-04, -3.9330e-04,  1.9839e-04, -4.8185e-05,  6.5504e-04,
         1.7797e-04,  1.4673e-04,  1.2680e-04, -5.1123e-05, -7.7283e-04,
         7.5981e-04, -5.2227e-04,  5.1376e-04,  3.3687e-04, -4.8362e-05,
        -9.2234e-04, -9.1463e-04,  3.1629e-04,  5.4318e-04,  4.2958e-05,
         4.3836e-04,  3.6945e-04, -2.6159e-04,  4.2337e-04,  7.3019e-04,
        -1.8177e-04,  6.0052e-05, -7.4559e-04, -2.2332e-04,  6.7353e-04,
         1.4248e-04, -7.8639e-05, -4.6588e-04, -1.2821e-04,  2.3686e-04,
         5.2923e-04,  8.8596e-05,  1.3002e-05,  6.8206e-04, -8.3276e-05,
         3.2076e-04, -1.2537e-04, -4.7495e-04, -1.1106e-04,  1.7017e-03,
        -5.5158e-05, -3.3646e-04, -4.7152e-04,  3.5287e-04,  6.6172e-04,
         1.7742e-04,  2.1854e-04,  7.0050e-05,  6.8948e-04,  6.7303e-04,
         3.4983e-04,  7.1142e-04, -5.3112e-04, -4.0505e-04],
       grad_fn=<SumBackward1>)

The first thing we can do to make this model a little bit better is to force those predictions to be between 0 and 5. For this, we just need to use sigmoid_range, like in <

#### def sigmoid_range(x, lo, hi): return torch.sigmoid(x) * (hi-lo) + lo

In [20]:
class DotProduct(Module):
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.y_range = y_range
        
    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        return sigmoid_range((users * movies).sum(dim=1), *self.y_range)

In [21]:
model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [22]:
model.forward(x)

tensor([2.7483, 2.7507, 2.7495, 2.7502, 2.7502, 2.7505, 2.7501, 2.7500, 2.7486,
        2.7504, 2.7504, 2.7500, 2.7487, 2.7497, 2.7501, 2.7521, 2.7506, 2.7510,
        2.7493, 2.7499, 2.7511, 2.7508, 2.7504, 2.7490, 2.7496, 2.7505, 2.7510,
        2.7495, 2.7505, 2.7497, 2.7488, 2.7504, 2.7500, 2.7512, 2.7495, 2.7509,
        2.7502, 2.7504, 2.7500, 2.7504, 2.7507, 2.7489, 2.7505, 2.7498, 2.7494,
        2.7512, 2.7498, 2.7513, 2.7507, 2.7489, 2.7504, 2.7504, 2.7501, 2.7494,
        2.7497, 2.7497, 2.7509, 2.7507, 2.7500, 2.7506, 2.7496, 2.7507, 2.7514,
        2.7498], grad_fn=<AddBackward0>)

In [23]:
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.95611,0.994699,00:09
1,0.869753,0.895843,00:09
2,0.667565,0.860896,00:09
3,0.484944,0.865341,00:10
4,0.374164,0.869615,00:10


This is a reasonable start, but we can do better. One obvious missing piece is that some users are just more positive or negative in their recommendations than others, and some movies are just plain better or worse than others. But in our dot product representation we do not have any way to encode either of these things. If all you can say about a movie is, for instance, that it is very sci-fi, very action-oriented, and very not old, then you don't really have any way to say whether most people like it.

In [24]:
class DotProductBias(Module):
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.movie_bias = Embedding(n_movies, 1)
        self.y_range = y_range
        
    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        res = (users * movies).sum(dim=1, keepdim=True)
        res += self.user_bias(x[:,0]) + self.movie_bias(x[:,1])
        return sigmoid_range(res, *self.y_range)

In [25]:
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.920003,0.934255,00:10
1,0.802547,0.859081,00:10
2,0.628734,0.856559,00:11
3,0.404553,0.882457,00:10
4,0.280303,0.889669,00:10


### Add Weight decay, or L2 regularization, 

loss_with_wd = loss + wd * (parameters**2).sum()

parameters.grad += wd * 2 * parameters

In [26]:
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.942179,0.952204,00:11
1,0.8622,0.864449,00:10
2,0.729734,0.824194,00:10
3,0.59064,0.813272,00:10
4,0.506663,0.813304,00:10


In [27]:
learn.model

DotProductBias(
  (user_factors): Embedding(944, 50)
  (user_bias): Embedding(944, 1)
  (movie_factors): Embedding(1665, 50)
  (movie_bias): Embedding(1665, 1)
)

In [28]:
learn.model.movie_bias.weight.shape

torch.Size([1665, 1])

In [29]:
learn.model.movie_bias.weight.squeeze().shape

torch.Size([1665])

In [30]:
learn.model.movie_bias.weight.shape

torch.Size([1665, 1])

In [31]:
# find the movies with highest bias
movie_bias = learn.model.movie_bias.weight.squeeze()
idxs = movie_bias.argsort(descending=True)[:5]
[dls.classes['title'][i] for i in idxs]

['Titanic (1997)',
 'Shawshank Redemption, The (1994)',
 'Good Will Hunting (1997)',
 "Schindler's List (1993)",
 'Star Wars (1977)']

### Interpreting Embeddings and Biases

In [49]:
movie_bias = learn.model.movie_bias
idxs = movie_bias.argsort()[:5]
[dls.classes['title'][i] for i in idxs]

ModuleAttributeError: 'Embedding' object has no attribute 'squeeze'

In [50]:
movie_bias = model.movie_bias

In [51]:
movie_bias 

Embedding(1665, 1)

## Using fastai.collab

In [32]:
learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5))

In [33]:
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.922446,0.935214,00:10
1,0.868629,0.868332,00:11
2,0.731269,0.824124,00:11
3,0.589961,0.811589,00:10
4,0.475455,0.813132,00:10


In [34]:
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(944, 50)
  (i_weight): Embedding(1665, 50)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1665, 1)
)

In [35]:
movie_bias = learn.model.i_bias.weight.squeeze()
idxs = movie_bias.argsort(descending=True)[:5]
[dls.classes['title'][i] for i in idxs]

['Titanic (1997)',
 'Shawshank Redemption, The (1994)',
 'Star Wars (1977)',
 'Good Will Hunting (1997)',
 'L.A. Confidential (1997)']

## Deep Learning for Collaborative Filtering

In [52]:
embs = get_emb_sz(dls)
embs

[(944, 74), (1665, 102)]

In [53]:
class CollabNN(Module):
    def __init__(self, user_sz, item_sz, y_range=(0,5.5), n_act=100):
        self.user_factors = Embedding(*user_sz)
        self.item_factors = Embedding(*item_sz)
        self.layers = nn.Sequential(
            nn.Linear(user_sz[1]+item_sz[1], n_act),
            nn.ReLU(),
            nn.Linear(n_act, 1))
        self.y_range = y_range
        
    def forward(self, x):
        embs = self.user_factors(x[:,0]),self.item_factors(x[:,1])
        x = self.layers(torch.cat(embs, dim=1))
        return sigmoid_range(x, *self.y_range)

In [54]:
model = CollabNN(*embs)

In [55]:
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.01)

epoch,train_loss,valid_loss,time
0,0.967487,0.938092,00:11
1,0.889664,0.891634,00:11
2,0.866985,0.882334,00:11
3,0.802268,0.858387,00:11
4,0.762489,0.862003,00:11


In [56]:
learn = collab_learner(dls, use_nn=True, y_range=(0, 5.5), layers=[100,50])  # two hidden layers
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.9968,1.006378,00:14
1,0.935323,0.908709,00:14
2,0.897688,0.888921,00:14
3,0.831427,0.863199,00:14
4,0.768769,0.862612,00:14


###  The same two hidden layers as abobe but with Class

In [58]:
class CollabNN(Module):
    def __init__(self, user_sz, item_sz, y_range=(0,5.5), n_act=100, n_act2=50):
        self.user_factors = Embedding(*user_sz)
        self.item_factors = Embedding(*item_sz)
        self.layers = nn.Sequential(
            nn.Linear(user_sz[1]+item_sz[1], n_act),
            nn.ReLU(),
            nn.Linear(n_act, n_act2),
            nn.ReLU(),
            nn.Linear(n_act2, 1))       
        self.y_range = y_range
        
    def forward(self, x):
        embs = self.user_factors(x[:,0]),self.item_factors(x[:,1])
        x = self.layers(torch.cat(embs, dim=1))
        return sigmoid_range(x, *self.y_range)

In [59]:
model = CollabNN(*embs)

In [60]:
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.01)

epoch,train_loss,valid_loss,time
0,0.929816,0.946848,00:12
1,0.90625,0.900861,00:11
2,0.817738,0.876707,00:12
3,0.80173,0.862204,00:12
4,0.76477,0.864544,00:12
