# Collaborative Filtering with Neural Networks

In this notebook we will write a matrix factorization model in pytorch to solve a recommendation problem. Then we will write a more general neural model for the same problem.

The MovieLens dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100004 ratings and 1296 tag applications across 9125 movies. https://grouplens.org/datasets/movielens/. To get the data:

`wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip`

## MovieLens dataset

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
PATH = Path("ml-25m/")
#PATH = Path("/data2/yinterian/ml-latest-small/")
list(PATH.iterdir())

[PosixPath('ml-25m/genome-scores.csv'),
 PosixPath('ml-25m/movies.csv'),
 PosixPath('ml-25m/genome-tags.csv'),
 PosixPath('ml-25m/tags.csv'),
 PosixPath('ml-25m/README.txt'),
 PosixPath('ml-25m/links.csv'),
 PosixPath('ml-25m/ratings.csv')]

In [3]:
! head $PATH/ratings.csv

userId,movieId,rating,timestamp
1,296,5.0,1147880044
1,306,3.5,1147868817
1,307,5.0,1147868828
1,665,5.0,1147878820
1,899,3.5,1147868510
1,1088,4.0,1147868495
1,1175,3.5,1147868826
1,1217,3.5,1147878326
1,1237,5.0,1147868839


In [4]:
data = pd.read_csv(PATH/"ratings.csv")

In [5]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


### Encoding data
We enconde the data to have contiguous ids for users and movies. You can think about this as a categorical encoding of our two categorical variables userId and movieId.

In [6]:
# split train and validation before encoding
np.random.seed(3)
msk = np.random.rand(len(data)) < 0.8
train = data[msk].copy()
val = data[~msk].copy()

In [7]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. 
    """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

In [8]:
def encode_data(df, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

In [9]:
# to check my new implementation
LOCAL_PATH = Path("images/")
df_t = pd.read_csv(LOCAL_PATH/"tiny_training2.csv")
df_v = pd.read_csv(LOCAL_PATH/"tiny_val2.csv")
print(df_t)
df_t_e = encode_data(df_t)
df_v_e = encode_data(df_v, df_t)
df_v_e
print(df_t_e)

    userId  movieId  rating
0       11        1       4
1       11       23       5
2        2       23       5
3        2        4       3
4       31        1       4
5       31       23       4
6        4        1       5
7        4        3       2
8       52        1       1
9       52        3       4
10      61        3       5
11       7       23       1
12       7        3       3
    userId  movieId  rating
0        0        0       4
1        0        1       5
2        1        1       5
3        1        2       3
4        2        0       4
5        2        1       4
6        3        0       5
7        3        3       2
8        4        0       1
9        4        3       4
10       5        3       5
11       6        1       1
12       6        3       3


In [10]:
# encoding the train and validation data
df_train = encode_data(train)
df_val = encode_data(val, train)

## Embedding layer

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [12]:
# an Embedding module containing 10 user or item embedding size 3
# embedding will be initialized at random
embed = nn.Embedding(10, 3)

In [13]:
# given a list of ids we can "look up" the embedding corresponing to each id
a = torch.LongTensor([[1,2,0,4,5,1]])
embed(a)

tensor([[[ 0.1818,  1.0356,  0.7028],
         [-0.2836,  1.5921, -0.5638],
         [-0.5413,  0.5567, -1.2315],
         [ 1.2857, -1.1825,  1.7503],
         [ 1.3675, -0.2378,  2.3989],
         [ 0.1818,  1.0356,  0.7028]]], grad_fn=<EmbeddingBackward0>)

## Matrix factorization model

In [14]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)
        
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u*v).sum(1)   

## Debugging MF model

In [15]:
df_t_e

Unnamed: 0,userId,movieId,rating
0,0,0,4
1,0,1,5
2,1,1,5
3,1,2,3
4,2,0,4
5,2,1,4
6,3,0,5
7,3,3,2
8,4,0,1
9,4,3,4


In [16]:
num_users = 7
num_items = 4
emb_size = 3

user_emb = nn.Embedding(num_users, emb_size)
item_emb = nn.Embedding(num_items, emb_size)
users = torch.LongTensor(df_t_e.userId.values)
items = torch.LongTensor(df_t_e.movieId.values)

In [17]:
U = user_emb(users)
V = item_emb(items)

In [18]:
U

tensor([[-0.2383,  1.3224, -0.7119],
        [-0.2383,  1.3224, -0.7119],
        [ 0.2146,  0.7906,  0.7640],
        [ 0.2146,  0.7906,  0.7640],
        [ 1.4617,  2.2317, -0.0868],
        [ 1.4617,  2.2317, -0.0868],
        [-0.7242,  1.9817,  1.5818],
        [-0.7242,  1.9817,  1.5818],
        [-0.8624,  0.1225,  0.0449],
        [-0.8624,  0.1225,  0.0449],
        [ 1.1392, -1.2843, -0.2102],
        [ 0.3526, -0.0684,  0.4495],
        [ 0.3526, -0.0684,  0.4495]], grad_fn=<EmbeddingBackward0>)

In [19]:
# element wise multiplication
U*V 

tensor([[-0.2829, -1.8138,  0.3364],
        [ 0.1788,  1.0181,  0.3623],
        [-0.1610,  0.6087, -0.3889],
        [-0.0571,  0.7538, -0.0839],
        [ 1.7352, -3.0611,  0.0410],
        [-1.0967,  1.7181,  0.0442],
        [-0.8597, -2.7181, -0.7474],
        [ 0.2909,  2.3561,  1.0506],
        [-1.0238, -0.1680, -0.0212],
        [ 0.3464,  0.1456,  0.0298],
        [-0.4575, -1.5270, -0.1396],
        [-0.2646, -0.0526, -0.2288],
        [-0.1416, -0.0813,  0.2985]], grad_fn=<MulBackward0>)

In [20]:
# what we want is a dot product per row
(U*V).sum(1) 

tensor([-1.7604,  1.5592,  0.0588,  0.6129, -1.2849,  0.6656, -4.3252,  3.6975,
        -1.2130,  0.5218, -2.1242, -0.5460,  0.0756], grad_fn=<SumBackward1>)

In [39]:
df_train.userId

0                0
1                0
2                0
3                0
6                0
             ...  
25000088    162540
25000089    162540
25000091    162540
25000093    162540
25000094    162540
Name: userId, Length: 19999967, dtype: int64

## Training MF model

In [21]:
num_users = len(df_train.userId.unique())
num_items = len(df_train.movieId.unique())
print(num_users, num_items) 

162541 56642


In [22]:
model = MF(num_users, num_items, emb_size=100) # .cuda() if you have a GPU

In [23]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for i in range(epochs):
        users = torch.LongTensor(df_train.userId.values) # .cuda()
        items = torch.LongTensor(df_train.movieId.values) #.cuda()
        print("items : ",items)
        ratings = torch.FloatTensor(df_train.rating.values) #.cuda()
        if unsqueeze:
            ratings = ratings.unsqueeze(1)
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item()) 
    test_loss(model, unsqueeze)

In [38]:
df_train.movieId.values

array([   0,    1,    2, ..., 4172,  720,  502])

In [24]:
# Here is what unsqueeze does
ratings = torch.FloatTensor(df_train.rating.values)
print(ratings.shape)
ratings = ratings.unsqueeze(1) # .cuda()
print(ratings.shape)

torch.Size([19999967])
torch.Size([19999967, 1])


In [25]:
def test_loss(model, unsqueeze=False):
    model.eval()
    users = torch.LongTensor(df_val.userId.values) #.cuda()
    items = torch.LongTensor(df_val.movieId.values) #.cuda()
    ratings = torch.FloatTensor(df_val.rating.values) #.cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    y_hat = model(users, items)
    print("users : ", users)
    print("items : ", items)
    print("y_hat : ", y_hat)
    print("rating : ", ratings)
    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [26]:
#train_epocs(model, epochs=10, lr=0.1)

In [27]:
#train_epocs(model, epochs=15, lr=0.01)

In [28]:
#train_epocs(model, epochs=15, lr=0.01)

## MF with bias

In [29]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()
        return (U*V).sum(1) +  b_u  + b_v

In [30]:
model = MF_bias(num_users, num_items, emb_size=100) #.cuda()

In [31]:
#train_epocs(model, epochs=10, lr=0.05, wd=1e-5)

In [32]:
#train_epocs(model, epochs=10, lr=0.01, wd=1e-5)

In [33]:
#train_epocs(model, epochs=10, lr=0.001, wd=1e-5)

Note that these models are susceptible to weight initialization, optimization algorithm and regularization.

## Neural Network Model

In [34]:
# Note here there is no matrix multiplication, we could potentially make the embeddings of different sizes.
# Here we could get better results by keep playing with regularization.
    
class CollabFNet(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super(CollabFNet, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.lin1 = nn.Linear(emb_size*2, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        #print("U : ", U)
        #print("V : ", V)
        x = F.relu(torch.cat([U, V], dim=1))
        x = self.drop1(x)
        x = F.relu(self.lin1(x))
        x = self.lin2(x)
        return x

In [35]:
model = CollabFNet(num_users, num_items, emb_size=100) #.cuda()

In [36]:
train_epocs(model, epochs=3, lr=0.05, wd=1e-6, unsqueeze=True) 

items :  tensor([   0,    1,    2,  ..., 4172,  720,  502])
13.050689697265625
items :  tensor([   0,    1,    2,  ..., 4172,  720,  502])
2.0301215648651123
items :  tensor([   0,    1,    2,  ..., 4172,  720,  502])
8.788979530334473
users :  tensor([     0,      0,      0,  ..., 162540, 162540, 162540])
items :  tensor([2018, 1119, 1567,  ..., 1593,  717, 6542])
y_hat :  tensor([[4.2392],
        [4.3116],
        [4.4827],
        ...,
        [5.0648],
        [4.6602],
        [4.7395]], grad_fn=<AddmmBackward0>)
rating :  tensor([[3.5000],
        [4.0000],
        [4.5000],
        ...,
        [4.5000],
        [4.5000],
        [2.0000]])
test loss 2.600 


In [37]:
train_epocs(model, epochs=10, lr=0.01, wd=1e-6, unsqueeze=True)

items :  tensor([   0,    1,    2,  ..., 4172,  720,  502])
2.6375694274902344
items :  tensor([   0,    1,    2,  ..., 4172,  720,  502])
1.387457251548767
items :  tensor([   0,    1,    2,  ..., 4172,  720,  502])
1.282362937927246
items :  tensor([   0,    1,    2,  ..., 4172,  720,  502])


KeyboardInterrupt: 

In [None]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

0.7163267731666565
0.7032808065414429
0.695513904094696
0.6967512369155884
0.6998187303543091
0.700666606426239
0.7004959583282471
0.6982167959213257
0.6955875158309937
0.694402813911438
test loss 0.796 


In [None]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

0.6919353008270264
0.6934647560119629
0.6922585368156433
0.6942275762557983
0.6926798224449158
0.6916202902793884
0.6911264061927795
0.6923496127128601
0.6922929286956787
0.6904215812683105
test loss 0.795 


# References
* This notebook is based on [lesson 5 of Jeremy Howard's Deep Learning Course](https://github.com/fastai/fastai/blob/master/courses/dl1/lesson5-movielens.ipynb)