In [1]:
import pandas as pd
import matplotlib.pyplot as pltf
import numpy as np
import os
import torch
from torch.utils import data
from fastaiv07.learner import *
from fastaiv07.column_data import *


In [2]:
# Root directory of the MovieLens files; the loading cells below read from it.
path = 'data/ml-latest-small/'
# List the directory contents as a quick check that the data is in place.
os.listdir(path)

['ratings.csv',
 'README.txt',
 'tags.csv',
 'movies.csv',
 'models',
 'tmp',
 'links.csv']

In [3]:
# Load the raw ratings table. `path` already ends with a separator, so the file
# name is joined with os.path.join instead of concatenating a second '/'.
ratings = pd.read_csv(os.path.join(path, 'ratings.csv'))
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# fastai collaborative filtering learner

In [4]:
# Row indices held out for validation, drawn over all rating rows.
# NOTE(review): presumably a random subset chosen by fastai's get_cv_idxs — confirm the default fraction/seed.
val_idxs = get_cv_idxs(len(ratings))
wd=2e-4  # weight decay handed to the fit call below
n_factors = 50  # embedding width used for both users and movies

In [5]:
# Build fastai's collaborative-filtering dataset directly from ratings.csv,
# naming the user, item and rating columns.
cf = CollabFilterDataset.from_csv(path, 'ratings.csv', 'userId', 'movieId', 'rating')
# NOTE(review): 126 is presumably the batch size — confirm against get_learner's signature.
learn = cf.get_learner(n_factors, val_idxs, 126, opt_fn=optim.Adam)

In [6]:
# Peek at the first sample through the indexing protocol.
cf[0]

[0, 0, 4.0]

In [28]:
# Four cycles with cycle_len=1 and cycle_mult=2; the progress bar reports 15 epochs in total,
# consistent with cycle lengths 1 + 2 + 4 + 8.
learn.fit(1e-2, 4, wds=wd, cycle_len=1, cycle_mult=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=15, style=ProgressStyle(description_width='initia…

 71%|███████   | 453/641 [00:03<00:01, 130.17it/s, loss=0.795]


KeyboardInterrupt: 

In [10]:
# Square root of a squared-error loss, i.e. an RMSE.
# NOTE(review): the literal is hard-coded — presumably a validation loss copied from a run
# that is not shown here; prefer reading it from the fit result.
np.sqrt(0.7344228231647393)

0.8569847274979522

In [11]:
# Number of samples in the collaborative-filtering dataset.
len(cf)

100836

## Building our own collaborative filtering model

In [12]:
# The base class is referenced through the full module path because the name
# `data` is rebound to a model-data object later in this notebook.
class TestData(torch.utils.data.Dataset):
    """Minimal map-style dataset exposing (user, item, rating) triples from a DataFrame.

    Args:
        df: table containing the three columns named below.
        users_key: name of the user-id column.
        items_key: name of the item-id column.
        ratings_key: name of the rating column; its values are cast to float32.

    Indexing with an integer position returns ``[user, item, rating]`` for that row.
    """

    def __init__(self, df, users_key, items_key, ratings_key):
        # Every row is used as-is; no filtering on the rating value is applied.
        ratings = df[ratings_key].values.astype(np.float32)
        users = df[users_key].values
        items = df[items_key].values
        # Column-wise storage: one array per field, all of equal length.
        self.cols = [users, items, ratings]
        self.N = len(ratings)

    def __len__(self):
        """Return the number of rating rows."""
        return self.N

    def __getitem__(self, index):
        """Return ``[user, item, rating]`` at position ``index``."""
        return [val[index] for val in self.cols]

In [13]:
# Replace raw user ids with contiguous indices, numbered in order of first appearance.
u_uniq = ratings.userId.unique()
user2idx = dict(zip(u_uniq, range(len(u_uniq))))
ratings.userId = ratings.userId.map(user2idx)

# Same re-indexing for movie ids.
m_uniq = ratings.movieId.unique()
movie2idx = dict(zip(m_uniq, range(len(m_uniq))))
ratings.movieId = ratings.movieId.map(movie2idx)

# Cardinalities of the two id columns.
n_users = int(ratings.userId.nunique())
n_movies = int(ratings.movieId.nunique())

In [13]:
print_every = 10  # log and evaluate once every this many mini-batches


def train(model, opt, epochs=1):
    """Train ``model`` with ``opt`` on the module-level ``loader`` using MSE loss.

    Each batch from ``loader`` must unpack into three tensors. All three are
    moved to the CUDA device; the first two are passed to the model as a list
    and the third is the regression target.

    Args:
        model: module invoked as ``model([x, y])``.
        opt: optimizer exposing ``zero_grad()`` and ``step()``.
        epochs: number of passes over ``loader``.

    Returns:
        list of float: the loss recorded at every ``print_every``-th batch.
    """
    device = torch.device("cuda")
    best_acc = -1  # best evaluation score seen so far (tracked locally, not returned)
    losses = []

    for e in range(epochs):
        for c, sample in enumerate(loader):
            # check_accuracy_part34 puts the model in eval mode, so switch back every batch.
            model.train()
            x, y, z = (t.to(device) for t in sample)

            pred = model([x, y])
            loss = F.mse_loss(pred, z)

            # Clear gradients left over from the previous step; without this,
            # backward() keeps adding to them and every update uses stale gradients.
            opt.zero_grad()
            loss.backward()
            opt.step()

            # Throttle on the batch index. Gating on the epoch index would log and
            # run a full evaluation pass on every single batch of epoch 0.
            if c % print_every == 0:
                losses.append(loss.item())
                print('Iteration %d, loss = %.4f' % (c, loss.item()))
                acc = check_accuracy_part34(loader, model)
                best_acc = acc if acc > best_acc else best_acc

    return losses

In [14]:
def check_accuracy_part34(loader, model):
    """Return the fraction of samples whose arg-max score equals the integer target.

    Each batch from ``loader`` must unpack into three tensors. All are moved to
    the CUDA device; the first two are passed to the model as a list and the
    third, truncated to an integer, is compared with the arg-max over dimension 1
    of the model output.

    NOTE(review): arg-max accuracy is a classification metric. If the model emits
    a single regression column the arg-max is always 0 — confirm this is the
    intended measure.

    Args:
        loader: iterable of 3-tensor batches.
        model: module invoked as ``model([x, y])``; it is switched to eval mode.

    Returns:
        float: correct / total, or 0.0 when ``loader`` yields no samples.
    """
    device = torch.device("cuda")
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode

    with torch.no_grad():
        for sample in loader:
            x, y, z = (t.to(device) for t in sample)
            scores = model([x, y])
            _, preds = scores.max(1)
            num_correct += (preds == z.long()).sum().item()
            num_samples += preds.size(0)

    # An empty loader would otherwise divide by zero.
    if num_samples == 0:
        return 0.0
    return float(num_correct) / num_samples

In [18]:
# `ratings_df` is not created by any other cell, so load an untouched copy of
# the ratings here; this keeps the cell runnable on a fresh kernel.
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'))
dataset = TestData(ratings_df, 'userId', 'movieId', 'rating')
dataset[0]  # smoke-test indexing
# Largest raw movie id in the table.
ratings_df['movieId'].max()

193609

In [19]:
# DataLoader settings: mini-batches of 64, reshuffled each epoch, 6 worker processes.
params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 6}
# Use the full module path: the name `data` is rebound to a ColumnarModelData
# object further down, after which `data.DataLoader` no longer resolves.
loader = torch.utils.data.DataLoader(cf, **params)

In [None]:
# NOTE(review): this cell has no execution count, and `model` is only assigned in later
# cells — on a top-to-bottom run it fails with NameError.
check_accuracy_part34(loader, model)

## Run the fastai example

In [16]:
# Replace raw user ids with contiguous indices, numbered in order of first appearance.
u_uniq = ratings.userId.unique()
user2idx = dict(zip(u_uniq, range(len(u_uniq))))
ratings.userId = ratings.userId.map(user2idx)

# Same re-indexing for movie ids.
m_uniq = ratings.movieId.unique()
movie2idx = dict(zip(m_uniq, range(len(m_uniq))))
ratings.movieId = ratings.movieId.map(movie2idx)

# Cardinalities of the two id columns.
n_users = int(ratings.userId.nunique())
n_movies = int(ratings.movieId.nunique())

In [17]:
class EmbeddingDot(nn.Module):
    """Collaborative filter that scores a (user, movie) pair by an embedding dot product.

    Both embedding tables have width ``n_factors`` (read from module scope) and
    are initialised uniformly in the range 0 to 0.05.
    """

    def __init__(self, n_users, n_movies):
        super().__init__()
        self.u = nn.Embedding(n_users, n_factors)
        self.m = nn.Embedding(n_movies, n_factors)
        for table in (self.u, self.m):
            table.weight.data.uniform_(0,0.05)

    def forward(self, cats, conts):
        """Return a column of scores, one per row of ``cats``.

        ``cats[:, 0]`` holds user indices and ``cats[:, 1]`` movie indices;
        ``conts`` is accepted but not used.
        """
        user_vecs = self.u(cats[:, 0])
        movie_vecs = self.m(cats[:, 1])
        scores = (user_vecs * movie_vecs).sum(1)
        return scores.view(-1, 1)

In [18]:
# Target: the rating as float32. Inputs: every column except rating and timestamp.
y = ratings.rating.astype(np.float32)
x = ratings.drop(columns=['rating', 'timestamp'])

In [19]:
# Wrap the frame and target in fastai's ColumnarModelData, declaring both id columns as
# categorical fields (the trailing 64 is presumably the batch size — confirm).
# NOTE(review): this rebinds `data`, shadowing the `torch.utils.data` module imported at the top
# of the notebook; earlier cells that use `data.Dataset` / `data.DataLoader` break if re-run afterwards.
data = ColumnarModelData.from_data_frame(path, val_idxs, x, y, ['userId', 'movieId'], 64)

In [20]:
# Plain dot-product model on the GPU, trained with SGD + momentum and weight decay.
wd = 1e-5
model = EmbeddingDot(n_users, n_movies).cuda()
opt = optim.SGD(model.parameters(), lr=1e-1, momentum=0.9, weight_decay=wd)

In [21]:
# Train for 3 epochs with MSE loss; the log below reports training and validation loss per epoch.
fit(model, data, 3, opt, F.mse_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss   
    0      1.550708   1.59935   
    1      1.080255   1.295432  
    2      0.88643    1.223699  



[1.2236994280483866]

# With bias

In [13]:
# Replace raw user ids with contiguous indices, numbered in order of first appearance.
u_uniq = ratings.userId.unique()
user2idx = dict(zip(u_uniq, range(len(u_uniq))))
ratings.userId = ratings.userId.map(user2idx)

# Same re-indexing for movie ids.
m_uniq = ratings.movieId.unique()
movie2idx = dict(zip(m_uniq, range(len(m_uniq))))
ratings.movieId = ratings.movieId.map(movie2idx)

# Cardinalities of the two id columns.
n_users = int(ratings.userId.nunique())
n_movies = int(ratings.movieId.nunique())

In [14]:
# Target: the rating as float32. Inputs: every column except rating and timestamp.
y = ratings.rating.astype(np.float32)
x = ratings.drop(columns=['rating', 'timestamp'])

In [15]:
# Wrap the frame and target in fastai's ColumnarModelData, declaring both id columns as
# categorical fields (the trailing 64 is presumably the batch size — confirm).
# NOTE(review): this rebinds `data`, shadowing the `torch.utils.data` module imported at the top
# of the notebook.
data = ColumnarModelData.from_data_frame(path, val_idxs, x, y, ['userId', 'movieId'], 64)

In [16]:
# Smallest and largest rating present in the data.
min_rating = ratings.rating.min()
max_rating = ratings.rating.max()
min_rating, max_rating

(0.5, 5.0)

In [17]:
def get_emb(ni,nf):
    """Create an ``nn.Embedding(ni, nf)`` with weights re-drawn from U(-0.01, 0.01)."""
    emb = nn.Embedding(ni, nf)
    emb.weight.data.uniform_(-0.01,0.01)
    return emb


class EmbeddingDotBias(nn.Module):
    """Dot-product collaborative filter with per-user and per-movie bias terms.

    The raw score is squashed with a sigmoid and rescaled to the interval
    ``[min_rating, max_rating]``. ``n_factors``, ``min_rating`` and
    ``max_rating`` are read from module scope.
    """

    def __init__(self, n_users, n_movies):
        super().__init__()
        # Factor tables first, then the single-column bias tables.
        self.u = get_emb(n_users, n_factors)
        self.m = get_emb(n_movies, n_factors)
        self.ub = get_emb(n_users, 1)
        self.mb = get_emb(n_movies, 1)

    def forward(self, cats, conts):
        """Return a column of predicted ratings, one per row of ``cats``.

        ``cats[:, 0]`` holds user indices and ``cats[:, 1]`` movie indices;
        ``conts`` is accepted but not used.
        """
        users = cats[:, 0]
        movies = cats[:, 1]
        dot = (self.u(users) * self.m(movies)).sum(1)
        user_bias = self.ub(users).squeeze()
        movie_bias = self.mb(movies).squeeze()
        raw = dot + user_bias + movie_bias
        span = max_rating - min_rating
        scaled = torch.sigmoid(raw) * span + min_rating
        return scaled.view(-1, 1)

In [18]:
# Bias model on the GPU, sized from the fastai dataset's user/item counts.
wd = 2e-4
model = EmbeddingDotBias(cf.n_users, cf.n_items).cuda()
opt = optim.SGD(model.parameters(), lr=1e-1, momentum=0.9, weight_decay=wd)

In [19]:
# Train the bias model for 3 epochs with MSE loss; per-epoch losses are logged below.
fit(model, data, 3, opt, F.mse_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss   
    0      0.801465   0.815462  
    1      0.74528    0.796562  
    2      0.727433   0.787233  



[0.7872327693915423]

In [20]:
# RMSE for the bias model: square root of the final validation MSE returned by fit above.
np.sqrt(0.7872327693915423)

0.8872613872988852

In [None]:
# Switch the model to evaluation mode before inference.
model.eval()
# NOTE(review): unfinished cell — forward() takes (cats, conts), so calling the model with no
# arguments raises TypeError. Pass a batch of [user, movie] indices here.
model()