In [1]:
import torch
from torch import nn
import torch.optim as optim
import pandas as pd
from pathlib import Path
from torch.utils.data import DataLoader, Dataset
import numpy as np
from functools import partial

In [2]:
path = Path('/root/.fastai/data/movie_lens_sample')
path

PosixPath('/root/.fastai/data/movie_lens_sample')

In [3]:
ratings = pd.read_csv(path/'ratings.csv')
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,73,1097,4.0,1255504951
1,561,924,3.5,1172695223
2,157,260,3.5,1291598691
3,358,1210,5.0,957481884
4,130,316,2.0,1138999234
5,580,1196,4.0,1220561546
6,544,2918,5.0,1435787004
7,213,1200,3.0,1462634054
8,176,2571,4.5,1340714691
9,481,4886,4.5,1437002227


In [4]:
torch.tensor(ratings.values)[0][2]

tensor(4., dtype=torch.float64)

In [5]:
train_len = int(ratings.shape[0] * 0.9)
train_len

5427

In [6]:
def createIndex(df, field):
    idx = ratings[field].unique()
    idx.sort()
    return {pos_id[1]:pos_id[0] for pos_id in enumerate(idx)}, {pos_id[0]:pos_id[1] for pos_id in enumerate(idx)}

userIdx, invertedUserIdx = createIndex(ratings, 'userId')
movieIdx, invertedMovieIdx = createIndex(ratings, 'movieId')

In [7]:
userIndexed = ratings['userId'].apply(lambda x: userIdx[x])
movieIndexed = ratings['movieId'].apply(lambda x: movieIdx[x])
ratingsIndexed = pd.concat([userIndexed, movieIndexed, ratings['rating']], axis=1)

In [8]:
[ui for ui in userIndexed[:10]]

[7, 80, 20, 47, 17, 85, 78, 27, 22, 70]

In [9]:
[invertedUserIdx[ui] for ui in userIndexed[:10]]

[73, 561, 157, 358, 130, 580, 544, 213, 176, 481]

In [10]:
class DatasetCollab(Dataset):
    def __init__(self, df):
        self.tensor = torch.tensor(ratingsIndexed.values).long()
        
    def __len__(self):
        return ratings.shape[0]
    
    def __getitem__(self, index):
        return self.tensor[index][:2], self.tensor[index][2]

In [11]:
train_ds = DatasetCollab(ratings[:train_len])
valid_ds = DatasetCollab(ratings[train_len:])

In [12]:
batch_size = 32
num_workers = 0
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, 
                                           num_workers=num_workers, shuffle=False)
valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=batch_size, 
                                          num_workers=num_workers, shuffle=False)

In [15]:
n_factors = 50
y_range = [0,5.5]

In [16]:
def trunc_normal_(x:torch.Tensor, mean:float=0., std:float=1.) -> torch.Tensor:
    "Truncated normal initialization."
    # From https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/12
    return x.normal_().fmod_(2).mul_(std).add_(mean)

class EmbeddingDotBias(nn.Module):
    "Base dot model for collaborative filtering."
    def __init__(self, n_factors:int, n_users:int, n_items:int, y_range=None):
        super().__init__()
        self.y_range = y_range
        (self.u_weight, self.i_weight, self.u_bias, self.i_bias) = [embedding(*o) for o in [
            (n_users, n_factors), (n_items, n_factors), (n_users,1), (n_items,1)
        ]]
        
    def forward(self, users:torch.LongTensor, items:torch.LongTensor) -> torch.Tensor:
        dot = self.u_weight(users)* self.i_weight(items)
        res = dot.sum(1) + self.u_bias(users).squeeze() + self.i_bias(items).squeeze()
        if self.y_range is None: return res
        return torch.sigmoid(res) * (self.y_range[1]-self.y_range[0]) + self.y_range[0]
        
def embedding(ni:int,nf:int) -> nn.Module:
    "Create an embedding layer."
    emb = nn.Embedding(ni, nf)
    # See https://arxiv.org/abs/1711.09160
    with torch.no_grad(): trunc_normal_(emb.weight, std=0.01)
    return emb

In [17]:
embeddingDotBias = EmbeddingDotBias(n_factors, n_users, n_items, y_range)

In [18]:
embeddingDotBias.cuda()

EmbeddingDotBias(
  (u_weight): Embedding(101, 50)
  (i_weight): Embedding(101, 50)
  (u_bias): Embedding(101, 1)
  (i_bias): Embedding(101, 1)
)

In [19]:
train_iter = iter(train_loader)

In [20]:
user_movies, rating = train_iter.next()

In [21]:
rating

tensor([4, 3, 3, 5, 2, 4, 5, 3, 4, 4, 3, 3, 3, 3, 4, 5, 4, 2, 5, 4, 4, 5, 3, 5,
        4, 3, 4, 3, 1, 1, 5, 4])

In [22]:
user_ids = user_movies[:,0]
movie_ids = user_movies[:,1]
user_ids = user_ids.cuda()
movie_ids = movie_ids.cuda()

In [23]:
embeddingDotBias.forward(user_ids, movie_ids)

tensor([2.7568, 2.7426, 2.7431, 2.7212, 2.7458, 2.7447, 2.7535, 2.7664, 2.7607,
        2.7521, 2.7559, 2.7619, 2.7399, 2.7291, 2.7426, 2.7742, 2.7315, 2.7533,
        2.7544, 2.7382, 2.7159, 2.7315, 2.7324, 2.7518, 2.7550, 2.7561, 2.7809,
        2.7400, 2.7457, 2.7744, 2.7099, 2.7666], device='cuda:0',
       grad_fn=<AddBackward0>)

In [24]:
def train(n_epochs:int, train_loader, valid_loader, model, optimizer, criterion, save_path=None):
    valid_loss_min = np.Inf
    for epoch in range(1, n_epochs+1):
        train_loss = 0.0
        valid_loss = 0.0
        
        model.train()
        steps_counter = 0
        for batch_idx, (user_movies, rating) in enumerate(train_loader):
            user_movies, rating = user_movies.cuda(), rating.float().cuda()
            optimizer.zero_grad()
            user_ids, movie_ids = user_movies[:,0], user_movies[:,1]
            output = model.forward(user_ids, movie_ids)
            loss = criterion(output, rating)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * user_movies.size(0)
            
            steps_counter += 1
            if steps_counter % 100 == 0:
                print(f'\rProcessed {steps_counter * batch_size} out of {len(train_loader.dataset)}')

        print(f'\rProcessed {steps_counter * batch_size} out of {len(train_loader.dataset)}')
        train_loss = train_loss/len(train_loader.dataset)
        model.eval()
        for (user_movies, rating) in valid_loader:
            user_movies, rating = user_movies.cuda(), rating.float().cuda()
            user_ids, movie_ids = user_movies[:,0], user_movies[:,1]
            output = model.forward(user_ids, movie_ids)
            loss = criterion(output, rating)
            valid_loss += loss.item() * user_movies.size(0)
            
        valid_loss = valid_loss/len(valid_loader.dataset)
        
        print(f'Epoch: {epoch} \tTraining Loss: {train_loss:.6f} \tValidation Loss: {valid_loss:.6f}')

In [25]:
n_epochs = 10
criterion = nn.MSELoss()
AdamW = partial(optim.Adam, betas=(0.9,0.99))
optimizer = AdamW(lr=1e-3, params=embeddingDotBias.parameters())

In [26]:
train(n_epochs, train_loader, valid_loader, embeddingDotBias, optimizer, criterion)

Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 1 	Training Loss: 1.646273 	Validation Loss: 1.284431
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 2 	Training Loss: 0.963038 	Validation Loss: 0.756314
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 3 	Training Loss: 0.732750 	Validation Loss: 0.692384
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 4 	Training Loss: 0.697069 	Validation Loss: 0.669210
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 5 	Training Loss: 0.676098 	Validation Loss: 0.648701
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 6 	Training Loss: 0.655396 	Validation Loss: 0.627149
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 7 	Training Loss: 0.633522 	Validation Loss: 0.604397
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 8 	Training Loss: 0.610629 	Validation Loss: 0.580717
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 9 	Training Loss: 0

In [27]:
train(n_epochs, train_loader, valid_loader, embeddingDotBias, optimizer, criterion)

Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 1 	Training Loss: 0.536772 	Validation Loss: 0.504598
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 2 	Training Loss: 0.510484 	Validation Loss: 0.477767
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 3 	Training Loss: 0.483630 	Validation Loss: 0.450568
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 4 	Training Loss: 0.456490 	Validation Loss: 0.423301
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 5 	Training Loss: 0.429376 	Validation Loss: 0.396290
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 6 	Training Loss: 0.402616 	Validation Loss: 0.369856
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 7 	Training Loss: 0.376508 	Validation Loss: 0.344264
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 8 	Training Loss: 0.351276 	Validation Loss: 0.319690
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 9 	Training Loss: 0

In [32]:
train(n_epochs, train_loader, valid_loader, embeddingDotBias, optimizer, criterion)

Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 1 	Training Loss: 0.281794 	Validation Loss: 0.252670
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 2 	Training Loss: 0.260811 	Validation Loss: 0.232637
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 3 	Training Loss: 0.240969 	Validation Loss: 0.213808
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 4 	Training Loss: 0.222310 	Validation Loss: 0.196216
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 5 	Training Loss: 0.204868 	Validation Loss: 0.179881
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 6 	Training Loss: 0.188656 	Validation Loss: 0.164792
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 7 	Training Loss: 0.173653 	Validation Loss: 0.150910
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 8 	Training Loss: 0.159816 	Validation Loss: 0.138173
Processed 3200 out of 6031
Processed 6048 out of 6031
Epoch: 9 	Training Loss: 0

# Predict

In [28]:
def predict(userId: int, movieId: int):
    userInput = torch.tensor([userIdx[userId]]).cuda().clone().detach()
    movieInput = torch.tensor([movieIdx[movieId]]).cuda().clone().detach()
    return embeddingDotBias.forward(userInput, movieInput).item()

In [29]:
predict(73, 1097)

3.9654407501220703

In [30]:
predict(561, 924)

3.501561164855957

In [34]:
for index, row in ratings[:100].iterrows():
    print(predict(int(row['userId']), int(row['movieId'])), row['rating'])

3.838888645172119 4.0
3.2899436950683594 3.5
2.838738441467285 3.5
4.705658912658691 5.0
2.3013768196105957 2.0
3.730295181274414 4.0
4.646974563598633 5.0
2.690315008163452 3.0
4.1812920570373535 4.5
3.790100336074829 4.5
2.8735952377319336 3.0
3.5290980339050293 3.0
2.928689479827881 3.0
3.100550651550293 3.0
3.948193073272705 4.0
4.284323692321777 5.0
3.9360194206237793 4.5
1.9483803510665894 2.5
4.218658447265625 5.0
4.161929130554199 4.0
4.089113235473633 4.5
4.821419715881348 5.0
2.5397820472717285 3.0
4.918349266052246 5.0
3.912986993789673 4.5
2.878478527069092 3.0
3.8700222969055176 4.5
3.651827812194824 3.5
0.6894132494926453 1.0
1.3104835748672485 1.0
5.028079509735107 5.0
4.1239848136901855 4.0
4.386385917663574 5.0
3.8329362869262695 3.0
2.9416356086730957 3.5
3.307978391647339 4.0
3.5512702465057373 3.0
4.625848770141602 5.0
4.148618221282959 4.0
3.195012331008911 4.5
4.5724287033081055 5.0
3.288559913635254 3.0
4.352843284606934 4.0
4.712601184844971 5.0
3.43427228927612