In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import pairwise_distances 

In [3]:
# All three files are headerless, tab-separated tables read with latin-1
# decoding; the three column names are attached after loading.
read_opts = {'header': None, 'sep': '\t', 'encoding': 'latin-1'}
rating = pd.read_csv('/content/drive/MyDrive/totaldata.txt', **read_opts)
train_df = pd.read_csv('/content/drive/MyDrive/training.txt', **read_opts)
test_df = pd.read_csv('/content/drive/MyDrive/test.txt', **read_opts)
for frame in (rating, train_df, test_df):
    frame.columns = ['userID', 'itemID', 'rating']
rating.head()

Unnamed: 0,userID,itemID,rating
0,A24FQNZ2ZCP9UH,B004DK0UDA,5.0
1,A9MYCYZT8EMMX,B00AWCNF9O,1.0
2,A1T5GAE8KIMVTF,B001V3TVEQ,5.0
3,A12KFVKK4UXYBH,B000U0CA0I,5.0
4,A9MYCYZT8EMMX,B001B7MAU4,5.0


In [4]:
# Use one encoder per column and assign by column label. A single shared
# encoder is refit on itemID and loses the user mapping, and positional
# `.iloc[:, i] = ...` assignment can keep the original object dtype on recent
# pandas instead of producing an integer column.
user_le = LabelEncoder()
rating['userID'] = user_le.fit_transform(rating['userID'])
le = LabelEncoder()  # `le` still ends up fitted on itemID, as before
rating['itemID'] = le.fit_transform(rating['itemID'])

In [6]:
# Seeded random split of `rating`: a row goes to `train` when its uniform draw
# is below 0.8, otherwise to `val`.
np.random.seed(3)
msk = np.random.rand(len(rating)) < 0.8
train, val = rating.loc[msk].copy(), rating.loc[~msk].copy()

In [7]:
def proc_col(col, train_col=None):
    """Map the values of a pandas column onto contiguous integer ids.

    The vocabulary is taken from ``train_col`` when it is given, otherwise
    from ``col`` itself. Values of ``col`` that are missing from the
    vocabulary are encoded as -1.

    Returns a ``(value -> id dict, encoded numpy array, vocabulary size)``
    tuple.
    """
    vocab_source = col if train_col is None else train_col
    uniq = vocab_source.unique()
    name2idx = dict(zip(uniq, range(len(uniq))))
    encoded = np.array([name2idx.get(value, -1) for value in col])
    return name2idx, encoded, len(uniq)

In [10]:
def encode_data(df, train=None):
    """Encode the userID and itemID columns of rating data as contiguous ids.

    If ``train`` is provided, ``df`` is encoded with the ids derived from
    ``train``, and rows whose user or item does not occur in ``train`` are
    dropped. The input frame is not modified; an encoded copy is returned.
    """
    df = df.copy()
    for col_name in ["userID", "itemID"]:
        train_col = None if train is None else train[col_name]
        _, col, _ = proc_col(df[col_name], train_col)
        df[col_name] = col
        # proc_col marks unseen values with -1. Take an explicit copy of the
        # filtered frame so the next iteration's column assignment writes to
        # an independent frame instead of triggering SettingWithCopyWarning.
        df = df[df[col_name] >= 0].copy()
    return df

In [11]:
# Encode the training set with its own ids, then the test set with the
# training ids (test rows whose user or item is unseen in training are dropped).
# The stray bare `df_v_e` expression was removed: only the last expression of
# a cell is displayed, so it had no effect.
df_t_e = encode_data(train_df)
df_v_e = encode_data(test_df, train_df)
print(df_t_e)

       userID  itemID  rating
0           0       0     5.0
1           1       1     1.0
2           2       2     5.0
3           3       3     5.0
4           1       4     5.0
...       ...     ...     ...
82210    8731   11341     5.0
82211   13374   11342     5.0
82212   13136   11343     4.0
82213    1599   11344     4.0
82214    1524   11345     5.0

[82215 rows x 3 columns]


In [12]:
# Frames read by train_epocs (df_train) and test_loss (df_val). These repeat
# the encode_data calls that produced df_t_e / df_v_e in the previous cell.
df_train = encode_data(train_df)
df_val = encode_data(test_df, train_df)

**Embedding Layer**

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [14]:
# Toy embedding table with 10 rows of 3-dimensional vectors.
embed = nn.Embedding(10, 3)

In [15]:
# Index the table with a (1, 6) tensor of row ids; id 1 appears twice, so the
# first and last rows of the result are identical.
a = torch.LongTensor([[1,2,0,4,5,1]])
embed(a)

tensor([[[ 1.2002,  0.7472, -0.7497],
         [ 1.7924, -0.5355,  1.1031],
         [-0.5283,  0.3882, -0.3147],
         [-0.2655, -1.9845, -1.5578],
         [-0.6085, -0.4083, -1.2003],
         [ 1.2002,  0.7472, -0.7497]]], grad_fn=<EmbeddingBackward>)

**Matrix factorization model**

In [16]:
class MF(nn.Module):
    """Plain matrix factorization: the score of a (user, item) pair is the dot
    product of the user's embedding and the item's embedding."""

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # Replace the default initialisation with small non-negative values
        # drawn uniformly from [0, 0.05).
        for table in (self.user_emb, self.item_emb):
            nn.init.uniform_(table.weight, 0, 0.05)

    def forward(self, u, v):
        """Return one score per (user id, item id) pair in ``u`` / ``v``."""
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)
        return (user_vecs * item_vecs).sum(dim=1)

**Training MF model**

In [17]:
# Display the encoded training frame (integer userID / itemID columns).
df_t_e

Unnamed: 0,userID,itemID,rating
0,0,0,5.0
1,1,1,1.0
2,2,2,5.0
3,3,3,5.0
4,1,4,5.0
...,...,...,...
82210,8731,11341,5.0
82211,13374,11342,5.0
82212,13136,11343,4.0
82213,1599,11344,4.0


In [22]:
# Embedding-table sizes: the number of distinct encoded ids in each id column
# of the training frame.
num_users, num_items = (
    len(df_train[id_col].unique()) for id_col in ("userID", "itemID")
)
print(num_users, num_items)

15006 11346


In [23]:
# Plain matrix-factorization model sized to the training id counts.
model = MF(num_users, num_items, emb_size=100)

In [30]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """Train ``model`` on the whole of ``df_train`` as a single batch.

    Runs ``epochs`` Adam steps (learning rate ``lr``, weight decay ``wd``)
    minimising the MSE between the model output and the ratings, printing the
    training loss at every step, then reports the loss on the validation data
    by calling ``test_loss``.

    Pass ``unsqueeze=True`` for models whose output has shape (N, 1) so that
    the targets are reshaped to match.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    # The training tensors are the same for every epoch, so build them once
    # instead of re-converting the DataFrame columns on each iteration.
    users = torch.LongTensor(df_train.userID.values) # .cuda()
    items = torch.LongTensor(df_train.itemID.values) #.cuda()
    ratings = torch.FloatTensor(df_train.rating.values) #.cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item())
    test_loss(model, unsqueeze)

In [31]:
# Shape check: unsqueeze(1) turns the (N,) rating vector into an (N, 1) column,
# as the two printed sizes show.
ratings = torch.FloatTensor(df_train.rating.values)
print(ratings.shape)
ratings = ratings.unsqueeze(1) # .cuda()
print(ratings.shape)

torch.Size([82215])
torch.Size([82215, 1])


In [32]:
def test_loss(model, unsqueeze=False):
    """Print the MSE of ``model`` on the validation frame ``df_val``.

    The model is switched to eval mode and is left in eval mode afterwards.
    Pass ``unsqueeze=True`` for models whose output has shape (N, 1) so that
    the targets are reshaped to match.
    """
    model.eval()
    users = torch.LongTensor(df_val.userID.values) #.cuda()
    items = torch.LongTensor(df_val.itemID.values) #.cuda()
    ratings = torch.FloatTensor(df_val.rating.values) #.cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    # Evaluation needs no gradients, so skip building the autograd graph.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [33]:
# First pass at a high learning rate; the recorded losses below jump around
# rather than decreasing steadily.
train_epocs(model, epochs=10, lr=0.1)

18.51939582824707
8.356239318847656
1.5677030086517334
8.205473899841309
2.299245595932007
1.400342583656311
3.536900520324707
4.510695457458496
3.705566167831421
2.058432102203369
test loss 2.544 


In [34]:
# Keep training the same model at a 10x smaller learning rate.
train_epocs(model, epochs=15, lr=0.01)

1.1593294143676758
0.8554468750953674
0.7289605140686035
0.7024853229522705
0.7106793522834778
0.7161605954170227
0.7055453062057495
0.6808405518531799
0.6494582891464233
0.6176015734672546
0.5883939862251282
0.5624181628227234
0.5387393236160278
0.515783965587616
0.49205508828163147
test loss 1.803 






**MF with bias**

In [35]:
class MF_bias(nn.Module):
    """Matrix factorization with an additional learned scalar bias for every
    user and every item."""

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # Factors start in [0, 0.05); biases start in [-0.01, 0.01).
        nn.init.uniform_(self.user_emb.weight, 0, 0.05)
        nn.init.uniform_(self.item_emb.weight, 0, 0.05)
        nn.init.uniform_(self.user_bias.weight, -0.01, 0.01)
        nn.init.uniform_(self.item_bias.weight, -0.01, 0.01)

    def forward(self, u, v):
        """Return dot(user factors, item factors) + user bias + item bias."""
        interaction = (self.user_emb(u) * self.item_emb(v)).sum(dim=1)
        user_offset = self.user_bias(u).squeeze()
        item_offset = self.item_bias(v).squeeze()
        return interaction + user_offset + item_offset

In [36]:
# Start over with the bias variant; this rebinds `model`, replacing the MF
# instance trained above.
model = MF_bias(num_users, num_items, emb_size=100) #.cuda()

In [37]:
# Initial pass: higher learning rate plus a small weight decay.
train_epocs(model, epochs=10, lr=0.05, wd=1e-5)

18.521564483642578
13.923646926879883
7.7541422843933105
2.4235916137695312
1.6923866271972656
4.558290004730225
4.361111164093018
2.3268373012542725
1.0350311994552612
1.0915905237197876
test loss 2.964 


In [38]:
# Second pass on the same model at a lower learning rate.
train_epocs(model, epochs=10, lr=0.01, wd=1e-5)

1.8485078811645508
1.2639573812484741
0.9274323582649231
0.8150718808174133
0.8146262168884277
0.8259437680244446
0.813369870185852
0.7848475575447083
0.7609520554542542
0.7547149062156677
test loss 1.502 


**Neural Network Model**

In [39]:
# Note: there is no matrix multiplication here, so the user and item embeddings could have different sizes.
# We could probably get better results by continuing to tune the regularization.
    
class CollabFNet(nn.Module):
    """Feed-forward recommender: concatenated user/item embeddings -> ReLU ->
    dropout -> hidden linear layer with ReLU -> single linear output."""

    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # The first linear layer consumes both embeddings side by side.
        self.lin1 = nn.Linear(2 * emb_size, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(p=0.1)

    def forward(self, u, v):
        """Return one single-column score row per (user id, item id) pair."""
        pair = torch.cat((self.user_emb(u), self.item_emb(v)), dim=1)
        hidden = self.lin1(self.drop1(F.relu(pair)))
        return self.lin2(F.relu(hidden))

In [40]:
# Switch to the neural-network model; `model` is rebound again.
model = CollabFNet(num_users, num_items, emb_size=100) #.cuda()

In [41]:
# unsqueeze=True because CollabFNet's last layer emits one column per row, so
# the targets must be (N, 1) as well.
train_epocs(model, epochs=15, lr=0.05, wd=1e-6, unsqueeze=True)

16.375076293945312
18.092815399169922
1.6344852447509766
6.681528091430664
7.418051719665527
4.987536430358887
2.3338215351104736
1.4019933938980103
2.48726749420166
3.4484105110168457
2.7645647525787354
1.5787795782089233
1.1267696619033813
1.4273324012756348
1.9013453722000122
test loss 2.356 


In [42]:
# Continue training the same network at a lower learning rate.
train_epocs(model, epochs=10, lr=0.01, wd=1e-6, unsqueeze=True)

2.0937299728393555
1.1531654596328735
1.0286725759506226
1.3508445024490356
1.357296109199524
1.131130337715149
0.951019823551178
0.9266494512557983
1.006837248802185
1.0771616697311401
test loss 1.468 


**Reference:**
Lesson 5 of Jeremy Howard's Deep Learning Course