In [1]:
from fastai.collab import *
from fastai.tabular.all import *

In [2]:
# Fetch the MovieLens 100k archive referenced by URLs.ML_100k; `path` is the local dataset folder.
path = untar_data(URLs.ML_100k)

In [3]:
# List the files inside the dataset folder.
path.ls()

(#23) [Path('/root/.fastai/data/ml-100k/u2.base'),Path('/root/.fastai/data/ml-100k/ua.base'),Path('/root/.fastai/data/ml-100k/u4.base'),Path('/root/.fastai/data/ml-100k/u.item'),Path('/root/.fastai/data/ml-100k/ub.base'),Path('/root/.fastai/data/ml-100k/u.occupation'),Path('/root/.fastai/data/ml-100k/u.data'),Path('/root/.fastai/data/ml-100k/u5.test'),Path('/root/.fastai/data/ml-100k/ua.test'),Path('/root/.fastai/data/ml-100k/u3.base')...]

In [4]:
# Load the main ratings table. The file is tab-separated and has no header row,
# so the four column names are supplied here.
ratings = pd.read_csv(
    path/'u.data',
    delimiter='\t',
    header=None,
    names=['user','movie','rating', 'timestamp'],
)

In [5]:
# Preview the first 10 rows of the raw ratings table.
ratings.head(10)

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [6]:
# Movie lookup table: the file is pipe-separated, latin-1 encoded and headerless.
# Only the first two fields are kept and named `movie` (id) and `title`.
movies = pd.read_csv(
    path/'u.item',
    delimiter='|',
    encoding='latin-1',
    header=None,
    usecols=(0,1),
    names=('movie','title'),
)
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
# Attach each rating's movie title. No join key is passed, so pandas joins on the
# column names the two frames have in common.
ratings = pd.merge(ratings, movies)
ratings.head(10)

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)
5,296,242,4,884196057,Kolya (1996)
6,34,242,5,888601628,Kolya (1996)
7,271,242,4,885844495,Kolya (1996)
8,201,242,4,884110598,Kolya (1996)
9,209,242,4,883589606,Kolya (1996)


In [8]:
# Build the collaborative-filtering DataLoaders from the merged ratings frame.
# - The user and rating columns are named explicitly instead of being inferred
#   from column position, so reordering the frame cannot silently change them.
# - `seed` fixes the random train/validation split so the validation set is the
#   same on every run (batch shuffling is still random).
dls = CollabDataLoaders.from_df(
    ratings,
    user_name='user',
    item_name='title',
    rating_name='rating',
    seed=42,
    bs=64,
)
dls.show_batch()

Unnamed: 0,user,title,rating
0,130,Conspiracy Theory (1997),4
1,868,"Day the Earth Stood Still, The (1951)",2
2,437,Heavenly Creatures (1994),3
3,640,Shallow Grave (1994),5
4,454,Ben-Hur (1959),4
5,13,Star Trek: First Contact (1996),3
6,75,Jerry Maguire (1996),2
7,387,All Over Me (1997),3
8,178,Leaving Las Vegas (1995),3
9,346,Weekend at Bernie's (1989),2


In [9]:
# Number of distinct categories fastai built for each column (used as embedding row counts).
n_users = len(dls.classes['user'])
n_movies = len(dls.classes['title'])
# Only used for the standalone random tensors in the next cell; the models trained
# later in this notebook pass 50 factors explicitly.
n_factors=5

In [10]:
# Randomly initialised latent-factor matrices, one row per user / movie class.
# These are plain tensors, not attached to any model below; `movie_factors` is
# rebound later to the trained item-embedding weights.
user_factors = torch.randn(n_users,n_factors)
movie_factors = torch.randn(n_movies,n_factors)

In [11]:
class DotProduct(Module):
  "Score a (user, movie) pair as the dot product of their embedding vectors."
  def __init__(self,n_users,n_movies,n_factors):
    # One lookup table per entity, each row holding `n_factors` values.
    self.user_factors = Embedding(n_users,n_factors)
    self.movie_factors = Embedding(n_movies,n_factors)

  def forward(self,x):
    # Column 0 of `x` indexes the user table, column 1 the movie table.
    user_ids,movie_ids = x[:,0],x[:,1]
    interaction = self.user_factors(user_ids) * self.movie_factors(movie_ids)
    # Summing over the factor axis yields one score per row of `x`.
    return interaction.sum(dim=1)


In [12]:
# Show another sample batch of (user, title, rating) rows from the DataLoaders.
dls.show_batch()

Unnamed: 0,user,title,rating
0,308,Copycat (1995),4
1,200,"20,000 Leagues Under the Sea (1954)",4
2,518,"People vs. Larry Flynt, The (1996)",3
3,823,True Romance (1993),5
4,600,Natural Born Killers (1994),4
5,624,"English Patient, The (1996)",5
6,931,"Lost World: Jurassic Park, The (1997)",3
7,456,"Crossing Guard, The (1995)",3
8,796,GoldenEye (1995),5
9,896,"Unbearable Lightness of Being, The (1988)",2


In [13]:
# Grab one batch to inspect tensor shapes: `x` has two columns (the models below
# read column 0 as the user index and column 1 as the movie index), `y` has one.
x,y = dls.one_batch()
x.shape,y.shape

(torch.Size([64, 2]), torch.Size([64, 1]))

In [14]:
# Dot-product model with 50 latent factors (not the n_factors=5 defined earlier).
model = DotProduct(n_users,n_movies,50)
# Train against the ratings with fastai's flattened mean-squared-error loss.
learn = Learner(dls,model,loss_func=MSELossFlat())

In [15]:
# Train for 5 epochs with the one-cycle schedule, peak learning rate 5e-3.
learn.fit_one_cycle(5,5e-3)

epoch,train_loss,valid_loss,time
0,1.335066,1.305501,00:26
1,1.058252,1.0934,00:16
2,0.960589,0.985399,00:22
3,0.842882,0.894245,00:15
4,0.803654,0.877237,00:17


In [16]:
class DotProduct(Module):
  "Dot-product model whose raw score is squashed into `y_range` by `sigmoid_range`."
  # NOTE: this deliberately reuses the name of the earlier `DotProduct` cell and
  # replaces it for every cell that runs afterwards.
  def __init__(self,n_users,n_movies,n_factors,y_range=(0,5.5)):
    self.user_factors = Embedding(n_users,n_factors)
    self.movie_factors = Embedding(n_movies,n_factors)
    # (low, high) bounds forwarded to `sigmoid_range` in `forward`.
    self.y_range = y_range

  def forward(self,x):
    # Column 0 of `x` indexes users, column 1 indexes movies.
    user_vecs = self.user_factors(x[:,0])
    movie_vecs = self.movie_factors(x[:,1])
    raw_score = (user_vecs*movie_vecs).sum(dim=1)
    return sigmoid_range(raw_score,*self.y_range)

In [17]:
# `DotProduct` now refers to the redefinition above, so the default y_range=(0,5.5) applies.
model = DotProduct(n_users,n_movies,50)
learn = Learner(dls,model,loss_func=MSELossFlat())


In [18]:
# Same schedule as the first run: 5 epochs, one-cycle, peak learning rate 5e-3.
learn.fit_one_cycle(5,5e-3)

epoch,train_loss,valid_loss,time
0,0.968429,0.985178,00:12
1,0.888689,0.90449,00:11
2,0.673195,0.875013,00:11
3,0.481638,0.879793,00:10
4,0.369773,0.885031,00:11


In [19]:
class DotProductBias(Module):
  "Dot-product model with an extra scalar bias per user and per movie, squashed into `y_range`."
  def __init__(self,n_users,n_movies,n_factors,y_range=(0,5.5)):
    # Factor tables hold `n_factors` values per row; bias tables hold a single value per row.
    self.user_factors = Embedding(n_users,n_factors)
    self.user_bias = Embedding(n_users,1)
    self.movie_factors = Embedding(n_movies,n_factors)
    self.movie_bias = Embedding(n_movies,1)
    self.y_range=y_range

  def forward(self,x):
    # Column 0 of `x` indexes users, column 1 indexes movies.
    user_ids,movie_ids = x[:,0],x[:,1]
    user_vecs = self.user_factors(user_ids)
    movie_vecs = self.movie_factors(movie_ids)
    # keepdim=True keeps a trailing axis of size 1 so the score lines up with the
    # single-column bias lookups.
    dot = (user_vecs*movie_vecs).sum(dim=1,keepdim=True)
    biased = dot + (self.user_bias(user_ids) + self.movie_bias(movie_ids))
    return sigmoid_range(biased,*self.y_range)

In [20]:
# Train the bias-augmented model with the same settings as before
# (50 factors, flattened MSE loss, 5 one-cycle epochs at peak lr 5e-3).
model = DotProductBias(n_users,n_movies,50)
learn = Learner(dls,model,loss_func=MSELossFlat())
learn.fit_one_cycle(5,5e-3)

epoch,train_loss,valid_loss,time
0,0.939246,0.924928,00:11
1,0.817479,0.857489,00:11
2,0.599988,0.863303,00:11
3,0.396745,0.89013,00:11
4,0.289002,0.89755,00:11


In [21]:
# Use fastai's ready-made collaborative-filtering learner instead of the hand-written
# model: 50 factors, predictions bounded to (0, 5.5).
learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5))
# Same schedule as the earlier runs, now with weight decay 0.1.
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.95198,0.939069,00:12
1,0.862762,0.876827,00:13
2,0.736368,0.83442,00:12
3,0.593399,0.822709,00:12
4,0.488651,0.823298,00:11


In [22]:
# Display the module built by collab_learner to see its embedding layers.
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(944, 50)
  (i_weight): Embedding(1665, 50)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1665, 1)
)

In [23]:
# Item-bias table of the trained model, with its size-1 axis squeezed away.
movie_bias = learn.model.i_bias.weight.squeeze()
# Positions of the five largest biases, highest first.
idxs = torch.argsort(movie_bias, descending=True)[:5]
# Translate those positions back into movie titles.
[dls.classes['title'][i] for i in idxs]

['Titanic (1997)',
 'Shawshank Redemption, The (1994)',
 'L.A. Confidential (1997)',
 'Good Will Hunting (1997)',
 'Star Wars (1977)']

In [25]:
# Trained item-embedding matrix (rebinds the random `movie_factors` tensor created earlier).
movie_factors = learn.model.i_weight.weight
# Row index of the reference title in the item vocabulary.
idx = dls.classes['title'].o2i['Shawshank Redemption, The (1994)']
# NOTE: despite the name, `distances` holds cosine *similarities* between every
# movie vector and the reference vector, so larger means closer.
distances = nn.CosineSimilarity(dim=1)(movie_factors,movie_factors[idx][None])
# Take the second-highest entry; the highest is expected to be the reference movie itself.
idx = distances.argsort(descending=True)[1]
dls.classes['title'][idx]

'Everest (1998)'

In [26]:
# Ask fastai for suggested embedding sizes for the categorical columns in `dls`.
embs = get_emb_sz(dls)

In [27]:
# Show the suggested sizes: one (rows, width) tuple per categorical column.
embs

[(944, 74), (1665, 102)]

In [28]:
class CollabNN(Module):
  "Concatenate user and item embeddings and feed them through a small MLP, squashed into `y_range`."
  def __init__(self,user_sz,item_sz,y_range=(0,5.5),n_act=100):
    # `user_sz` and `item_sz` are unpacked straight into `Embedding`; element [1]
    # of each is used as that embedding's width when sizing the first linear layer.
    self.user_factors = Embedding(*user_sz)
    self.item_factors = Embedding(*item_sz)
    n_in = user_sz[1]+item_sz[1]
    self.layers = nn.Sequential(
        nn.Linear(n_in,n_act),
        nn.ReLU(),
        nn.Linear(n_act,1),
    )
    self.y_range = y_range

  def forward(self,x):
    # Column 0 of `x` indexes users, column 1 indexes items.
    user_vecs = self.user_factors(x[:,0])
    item_vecs = self.item_factors(x[:,1])
    # Join the two vectors along the feature axis before the MLP.
    out = self.layers(torch.cat((user_vecs,item_vecs),dim=1))
    return sigmoid_range(out,*self.y_range)

In [29]:
# Unpack the two suggested size tuples as `user_sz` and `item_sz`.
model = CollabNN(*embs)

In [30]:
# Train the neural-net model with flattened MSE loss.
learn = Learner(dls,model,loss_func=MSELossFlat())
# 5 one-cycle epochs at peak lr 5e-3, with weight decay 0.01.
learn.fit_one_cycle(5,5e-3,wd=0.01)

epoch,train_loss,valid_loss,time
0,0.953489,0.944369,00:16
1,0.895578,0.904125,00:15
2,0.85823,0.879013,00:15
3,0.821459,0.870891,00:15
4,0.759142,0.873342,00:15


In [31]:
# fastai's built-in neural-net variant of the collaborative-filtering learner,
# configured with layers=[100,50] and predictions bounded to (0, 5.5).
learn = collab_learner(dls,use_nn=True,y_range=(0,5.5),layers=[100,50])
# Same schedule and weight decay as the hand-written CollabNN run.
learn.fit_one_cycle(5,5e-3,wd=0.01)

epoch,train_loss,valid_loss,time
0,0.951279,0.960649,00:17
1,0.939261,0.902173,00:16
2,0.88666,0.87658,00:17
3,0.804502,0.861563,00:17
4,0.713297,0.866521,00:16
