In [1]:
import pandas as pd
from torch.utils import data
from fastaiv07.learner import *
import torch.nn.functional as F
import torch

In [2]:
USE_GPU = True

dtype = torch.float32 # we will be using float throughout this tutorial

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Constant to control how frequently we print train loss
print_every = 100

print('using device:', device)

using device: cuda


In [3]:
'''
Make random data for testing
'''
def make_colab_data(n, m, max_mIJ = 5):
    ratings = np.random.randint(max_mIJ, size=(n,m))
    
    data_dict = {'row':[], 'col':[], 'm_IJ':[]}
    
    for row in range(n):
        for col in range(m):
            data_dict['row'].append(row)
            data_dict['col'].append(col)
            data_dict['m_IJ'].append(ratings[row,col])
    
    return pd.DataFrame(data_dict)

In [4]:
class TestData(data.Dataset):

    def __init__(self, df):
        nonzero_entries = df.loc[df['m_IJ'] > 0].values
            
    def __len__(self):
        return len(self.vals)
    
    def __getitem__(self, index):
        X = self.index_pairs[index]
        y = self.vals[index]
        
        return X, y

In [5]:
class EmbeddingDot(torch.nn.Module):
    def __init__(self, n_users, n_movies, n_factors=3):
        super().__init__()
        self.u = nn.Embedding(n_users, n_factors)
        self.m = nn.Embedding(n_movies, n_factors)
        self.u.weight.data.uniform_(0,0.05)
        self.m.weight.data.uniform_(0,0.05)
        
    def forward(self, indices):
        row,col = indices[0],indices[1]
        u,m = self.u(Variable(row)),self.m(Variable(col))
        return (u*m).sum(1).view(-1, 1)

In [6]:
def get_emb(ni,nf):
    e = nn.Embedding(ni, nf)
    e.weight.data.uniform_(-0.01,0.01)
    return e

class EmbeddingDotBias(nn.Module):
    def __init__(self, n_users, n_movies, n_factors=10):
        super().__init__()
        (self.u, self.m, self.ub, self.mb) = [get_emb(*o) for o in [
            (n_users, n_factors), (n_movies, n_factors), (n_users,1), (n_movies,1)
        ]]
        
    def forward(self, indices):
        users,movies = indices[0],indices[1]
        um = (self.u(users)* self.m(movies)).sum(1)
        res = um + self.ub(users).squeeze() + self.mb(movies).squeeze()
        res = torch.sigmoid(res) * (5-1) + 1
        return res.view(-1, 1)

In [7]:
class EmbeddingNet(nn.Module):
    def __init__(self, n_users, n_movies, n_factors=15, nh=10, p1=0.05, p2=0.5):
        super().__init__()
        (self.u, self.m) = [get_emb(*o) for o in [
            (n_users, n_factors), (n_movies, n_factors)]]
        self.lin1 = nn.Linear(n_factors*2, nh)
        self.lin2 = nn.Linear(nh, 1)
        self.drop1 = nn.Dropout(p1)
        self.drop2 = nn.Dropout(p2)
        
    def forward(self, indices):
        users,movies = indices[0],indices[1]
        x = self.drop1(torch.cat([self.u(users),self.m(movies)], dim=1))
        x = self.drop2(F.relu(self.lin1(x)))
        return torch.sigmoid(self.lin2(x)) * (5-0+1) + 0-0.5

In [8]:
print_every = 10
def train(model, opt, epochs=1):
    best_acc = -1

    
    for e in range(epochs):
        for c, (x, y) in enumerate(loader):
            model.train()

            pred = model(x)

#             criterion = nn.MSELoss()
            loss = F.mse_loss(pred, y.float())
#             opt.zero_grad()
            loss.backward()

            opt.step()
            

            if e % print_every == 0:
                print('Iteration %d, loss = %.4f' % (e, loss.item()))
                acc = check_accuracy_part34(loader, model)
                best_acc = acc if acc > best_acc else best_acc
            
    return best_acc

In [9]:
def check_accuracy_part34(loader, model):

    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    
    with torch.no_grad():
        for x, y in loader:
#             x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
#             y = y.to(device=device, dtype=torch.long)
            scores = model(x)
#             print(x, 'pred: ', scores, 'gt: ', y)
            _, preds = scores.max(1)
            num_correct += (preds == y.long()).sum()
            num_samples += preds.size(0)
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))
        return acc

In [10]:
params = {'batch_size': 1,
          'shuffle': True,
          'num_workers': 6}
epochs = 10
N = 25
df = make_colab_data(N,N)
testdata = TestData(df)
loader = data.DataLoader(testdata, **params)
# print(df.loc[df['m_IJ'] > 0].values)

AttributeError: 'TestData' object has no attribute 'vals'

In [None]:
wd=1e-5
model = EmbeddingDotBias(N, N, 20)
# model = EmbeddingNet(N, N, n_factors=15)
opt = optim.SGD(model.parameters(), 1e-3, weight_decay=wd, momentum=0.9)
train(model, opt, epochs=100)

In [11]:
, N, n_factors=15)
opt = optim.SGD(model.parameters(), 1e-3, weight_decay=wd, momentum=0.9)
train(model, opt, epochs=100)check_accuracy_part34(loader, model)

SyntaxError: invalid syntax (<ipython-input-11-139859670a0a>, line 3)

In [12]:
print(df.loc[df['m_IJ'] > 0].values)

[[ 0  1  2]
 [ 0  2  2]
 [ 0  3  1]
 ...
 [24 19  1]
 [24 21  2]
 [24 24  4]]
