## Logistic Matrix Factorization with PyTorch

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
def proc_col(col, train_col=None):
    """Map the values of a pandas column to contiguous integer ids.

    The ids are built from the unique values of ``train_col`` when it is
    given, otherwise from the unique values of ``col`` itself, numbered in
    the order ``unique()`` returns them. Values of ``col`` that have no id
    are encoded as -1.

    Returns a tuple ``(name2idx, encoded, n_unique)``: the value -> id
    dictionary, a numpy array holding the encoded ``col``, and the number
    of unique values the mapping was built from.
    """
    reference = col if train_col is None else train_col
    distinct = reference.unique()
    name2idx = dict(zip(distinct, range(len(distinct))))
    encoded = np.array([name2idx.get(value, -1) for value in col])
    return name2idx, encoded, len(distinct)

def encode_data(df, train=None):
    """Encode the "user" and "item" columns of rating data with contiguous ids.

    If ``train`` is provided, ``df`` is encoded with the id mapping built
    from ``train``, and rows whose user or item does not occur in ``train``
    are dropped. The input frame is left untouched; an encoded copy is
    returned.
    """
    df = df.copy()
    for col_name in ["user", "item"]:
        train_col = None if train is None else train[col_name]
        _, encoded, _ = proc_col(df[col_name], train_col)
        df[col_name] = encoded
        # proc_col marks values without an id as -1. Copy the filtered rows
        # explicitly so the column assignment in the next iteration writes to
        # an independent frame instead of a slice (which makes pandas emit
        # SettingWithCopyWarning).
        df = df[df[col_name] >= 0].copy()
    return df


In [3]:
class MF(nn.Module):
    """Matrix factorization model with per-user and per-item bias terms.

    The score of a (user, item) pair is the dot product of the two
    embedding vectors plus one scalar bias for the user and one for the
    item. The raw score is returned as is; no sigmoid is applied here.
    """

    def __init__(self, num_users, num_items, emb_size=100, seed=23):
        super().__init__()
        # Seed the global torch RNG so the initial weights are reproducible.
        torch.manual_seed(seed)
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # Replace the default initialisation: small non-negative factors and
        # biases spread symmetrically around zero.
        nn.init.uniform_(self.user_emb.weight, 0, 0.05)
        nn.init.uniform_(self.item_emb.weight, 0, 0.05)
        nn.init.uniform_(self.user_bias.weight, -0.01, 0.01)
        nn.init.uniform_(self.item_bias.weight, -0.01, 0.01)

    def forward(self, u, v):
        """Return one score per (u[i], v[i]) pair of user and item ids."""
        ### BEGIN SOLUTION
        user_term = self.user_bias(u).squeeze()
        item_term = self.item_bias(v).squeeze()
        # Row-wise dot product of the user and item factors.
        interaction = torch.sum(self.user_emb(u) * self.item_emb(v), dim=1)
        return interaction + user_term + item_term
        ### END SOLUTION

In [20]:
def train_one_epoch(model, train_df, optimizer):
    """Run one full-batch optimisation step over ``train_df``.

    The whole frame is passed to the model as a single batch, so one
    "epoch" corresponds to one optimizer update. Returns the mean squared
    error between the model output and the ``rating`` column, measured
    before the update, as a Python float.
    """
    model.train()
    ### BEGIN SOLUTION
    users = torch.LongTensor(train_df["user"].values)
    items = torch.LongTensor(train_df["item"].values)
    targets = torch.FloatTensor(train_df["rating"].values)
    # Drop gradients left over from the previous step before the new backward pass.
    optimizer.zero_grad()
    predictions = model(users, items)
    batch_loss = F.mse_loss(predictions, targets)
    batch_loss.backward()
    optimizer.step()
    train_loss = batch_loss.item()
    ### END SOLUTION
    return train_loss

def valid_metrics(model, valid_df):
    """Compute the validation loss and accuracy of ``model`` on ``valid_df``.

    The model is switched to eval mode and run on the whole frame as one
    batch with gradient tracking disabled.

    Returns a tuple ``(valid_loss, valid_acc)`` of Python floats:
    ``valid_loss`` is the mean squared error between the model output and
    the ``rating`` column; ``valid_acc`` is the fraction of rows whose
    output, thresholded at 0.5, equals the rating.
    """
    model.eval()
    ### BEGIN SOLUTION
    # Bracket access keeps working even if a column name clashes with a
    # DataFrame attribute (``item`` is such a name on Series).
    u = torch.LongTensor(valid_df["user"].values)
    v = torch.LongTensor(valid_df["item"].values)
    y = torch.FloatTensor(valid_df["rating"].values)
    # Evaluation never calls backward(), so skip building the autograd graph.
    with torch.no_grad():
        y_hat = model(u, v)
        valid_loss = F.mse_loss(y_hat, y).item()
        # Cast the boolean predictions to float so the comparison with the
        # float labels does not rely on implicit bool/float promotion.
        predicted = (y_hat > 0.5).float()
        valid_acc = (predicted == y).float().mean().item()
    ### END SOLUTION
    return valid_loss, valid_acc

In [21]:
def training(model, train_df, valid_df, epochs=10, lr=0.01, wd=0.0):
    """Fit ``model`` with Adam and print the metrics after every epoch.

    A new Adam optimizer (learning rate ``lr``, weight decay ``wd``) is
    created on each call. For each of the ``epochs`` iterations the model
    is trained once on ``train_df``, evaluated on ``valid_df`` and one line
    with the train loss, validation loss and validation accuracy is
    printed. Nothing is returned.
    """
    adam = torch.optim.Adam(model.parameters(), weight_decay=wd, lr=lr)
    report = "train loss %.3f valid loss %.3f valid acc %.3f"
    for _ in range(epochs):
        train_loss = train_one_epoch(model, train_df, adam)
        valid_loss, valid_acc = valid_metrics(model, valid_df)
        print(report % (train_loss, valid_loss, valid_acc))

In [22]:
# Star-import every public name of the local `logistic_mf` module.
# NOTE(review): if that module defines MF, training, etc., this rebinds the
# versions defined in the cells above -- confirm which ones are meant to run.
from logistic_mf import * 

In [23]:
# Load the tiny training and validation splits of the book ratings from CSV
# into pandas DataFrames.
train = pd.read_csv("tiny_data/train_books_ratings_tiny.csv")
valid = pd.read_csv("tiny_data/valid_books_ratings_tiny.csv")

In [24]:
# Peek at the first rows of the raw (not yet encoded) validation data.
valid.head()

Unnamed: 0,user,item,rating,timestamp
0,A9KTKY6BUR8U6,0000013714,0,1357516800
1,A35OP02LIXZ84E,0000477141,0,1399939200
2,A9WX8DK93SN5,000100039X,0,1385683200
3,A36JQ1WC5JQPFQ,000100039X,0,1391990400
4,A1TYDYZO9JV5LX,000100039X,0,1403913600


In [25]:
# Encode the training set; with train=None the ids are built from its own
# user and item columns.
train_df = encode_data(train, train=None)

In [26]:
# Encode the validation set with the id mapping of the training set; rows
# whose user or item does not appear in `train` are dropped by encode_data.
valid_df = encode_data(valid, train=train)

In [27]:
# Inspect the last rows of the encoded validation data.
valid_df.tail()

Unnamed: 0,user,item,rating,timestamp
3,4,3,0,1391990400
4,5,3,0,1403913600
5,6,3,0,1367452800
6,7,3,0,1394668800
7,8,3,0,1219708800


In [28]:
# Count the distinct users and items in the encoded training set; these
# counts are passed to MF below as the sizes of its embedding tables.
num_users = len(train_df.user.unique())
num_items = len(train_df.item.unique())
print(num_users, num_items) 

99 20


In [29]:
# Build the model with the default embedding size and seed.
model = MF(num_users, num_items)

In [30]:
# Take the first five (user, item) id pairs of the training set as a small
# batch for a quick check of the forward pass.
u = torch.LongTensor(train_df.user.values[:5])
v = torch.LongTensor(train_df.item.values[:5]) 

In [35]:
# Forward pass on the small batch: one score per (user, item) pair.
model(u, v)

tensor([-0.0648,  0.0111, -0.0763, -0.0335, -0.0367], grad_fn=<AddBackward0>)

In [36]:
# Re-create the model so that training starts from freshly initialised weights.
model = MF(num_users, num_items, emb_size=100) 

In [37]:
# First run: 10 epochs with a large learning rate (0.1) and a small weight decay.
training(model, train_df, valid_df, epochs=10, lr=0.1, wd=1e-5)

train loss 0.005 valid loss 0.124 valid acc 1.000
train loss 0.134 valid loss 0.144 valid acc 1.000
train loss 0.137 valid loss 0.020 valid acc 1.000
train loss 0.026 valid loss 0.518 valid acc 0.000
train loss 0.448 valid loss 0.004 valid acc 1.000
train loss 0.004 valid loss 0.119 valid acc 1.000
train loss 0.107 valid loss 0.219 valid acc 1.000
train loss 0.202 valid loss 0.210 valid acc 1.000
train loss 0.199 valid loss 0.140 valid acc 1.000
train loss 0.146 valid loss 0.051 valid acc 1.000


In [38]:
# Second run: keep training the same model for 15 more epochs with a 10x
# smaller learning rate (0.01).
training(model, train_df, valid_df, epochs=15, lr=0.01, wd=1e-5)

train loss 0.080 valid loss 0.023 valid acc 1.000
train loss 0.055 valid loss 0.005 valid acc 1.000
train loss 0.030 valid loss 0.004 valid acc 1.000
train loss 0.011 valid loss 0.013 valid acc 1.000
train loss 0.004 valid loss 0.014 valid acc 1.000
train loss 0.008 valid loss 0.009 valid acc 1.000
train loss 0.015 valid loss 0.004 valid acc 1.000
train loss 0.015 valid loss 0.001 valid acc 1.000
train loss 0.009 valid loss 0.001 valid acc 1.000
train loss 0.004 valid loss 0.003 valid acc 1.000
train loss 0.001 valid loss 0.005 valid acc 1.000
train loss 0.002 valid loss 0.005 valid acc 1.000
train loss 0.004 valid loss 0.005 valid acc 1.000
train loss 0.006 valid loss 0.003 valid acc 1.000
train loss 0.006 valid loss 0.002 valid acc 1.000
