<center><h1>HW2: Collaborative Filtering</h1></center>
<hr>

Name: **Firas Jolha**

Email: **f.jolha@innopolis.university**

# Install & Import Libs

In [None]:
import pandas as pd
from scipy import sparse
from torch import nn
import torch.nn.functional as f
import torch
from torch.utils.data import TensorDataset, DataLoader

from os.path import join as path_join
import numpy as np

# Data Preparation & Preprocessing

In [None]:
# Set Data Path
DATA_PATH = "data"
DATA_PATH = "."
 
# Read Training and Test Data
train_df = pd.read_csv(path_join(DATA_PATH, "train.csv"))
test_df = pd.read_csv(path_join(DATA_PATH, "test.csv"))
train_df.head()

Unnamed: 0,userId,movieId,rating
0,1,32,3.5
1,1,47,3.5
2,1,50,3.5
3,1,253,4.0
4,1,260,4.0


In [None]:
train_df.shape, test_df.shape

((761972, 3), (190819, 3))

In [None]:
# Data Exploration and Preprocessing
 
user_ids = train_df['userId']
movie_ids = train_df['movieId']
ratings = train_df['rating']
 
def map_ids(series):
  '''Resets ids of both users and items.
 
  Args:
    series (pd.Series): The series of ids to be converted
 
  Returns:
    pd.Series: a series of the same type and attributes after resetting the ids.
 
  '''
 
  uq = series.unique()
 
  return series.map(pd.Series(range(uq.size), index = uq))
 
def unmap_ids(series, original_series):  
  '''Returns back the original ids of series being converted by map_ids function.
 
  Args:
    series (pd.Series): The series of ids to be returned back.
    original_series (pd.Series): The series of ids from the original data frame.
 
  Returns:
    pd.Series: a series of the same type and attributes after returning the ids to the original values.
 
  '''
  
  uq = original_series.unique()
 
  return series.map(pd.Series(uq, index = range(uq.size)))
 
# Resetting the ids of training data
user_ids = map_ids(user_ids)
movie_ids = map_ids(movie_ids)
 
# Resetting the ids of test data
test_user_ids = map_ids(test_df['userId']) 
test_movie_ids = map_ids(test_df['movieId']) 
test_ratings = test_df['rating']
 
# Statistics of training data
n_users = np.max(user_ids) + 1
n_movies = np.max(movie_ids) + 1
 
# Statistics of test data
test_n_users = np.max(test_user_ids) + 1
test_n_movies = np.max(test_movie_ids) + 1
 
# Returning the indices back can be done using unmap_ids function
# unmap_ids(movie_ids, train_df['movieId'])
# unmap_ids(user_ids, train_df['userId'])

# Basic Collaborative Filtering

## Sparse Rating Matrix

In [None]:
# Define the training rating matrix as sparse matrix
 
R = sparse.coo_matrix(
    (ratings, (user_ids, movie_ids)),
    shape=(n_users, n_movies), 
    dtype=np.float
 )
 
R

<6687x5064 sparse matrix of type '<class 'numpy.float64'>'
	with 761972 stored elements in COOrdinate format>

In [None]:
# Define the rating matrix for test data as sparse matrix
 
R2 = sparse.coo_matrix(
    (test_ratings, (test_user_ids, test_movie_ids)),
    shape=(test_n_users, test_n_movies), 
    dtype=np.float
)
 
R2

<6674x5059 sparse matrix of type '<class 'numpy.float64'>'
	with 190819 stored elements in COOrdinate format>

## Matrix Factorization

## ALS

In [None]:
 
def update_P(P, Q, R, alpha = 0.001, lam = 0.001):
  '''Updates the values of matrix P
 
  Args:
    P: 
 
  Returns:
    np.ndarray: P itself after adjusting the values
 
  '''
 
  # assert R.shape == R_hat.shape, "R shoud equal to R_hat"
  assert P.shape[1] == Q.shape[1], "P and Q should have proper dimensions for matrix multiplication"
 
  M = np.zeros(R.shape)
 
  x1, x2 = R.nonzero()
  
  M[x1, x2] = 1
 
 
 # P_tau = P[x1, :]
 # Q_tau = Q[x2, :]
 
  # Inner Product
  #prod = np.sum((P_tau * Q_tau), axis = 1)
 
 
  #R_hat[x1, x2] = prod
 
  #res = np.multiply(R - R_hat, M)
 
  R_hat = P @ Q.T
 
 
  res = np.multiply(R_hat - R, M)
 
  gradient = alpha * (lam * P - (res @ Q))
  P += gradient
 
  return P
 
 
def update_Q(P, Q, R, alpha = 0.001, lam = 0.001):
  
  # assert R.shape == R_hat.shape, "R shoud equal to R_hat"
  assert P.shape[1] == Q.shape[1], "P and Q should have proper dimensions for matrix multiplication"
 
  M = np.zeros(R.shape)
  x1, x2 = R.nonzero()
  
  M[x1, x2] = 1
 
 # P_tau = P[x1, :]
 # Q_tau = Q[x2, :]
 
  # Inner Product
  #prod = np.sum((P_tau * Q_tau), axis = 1)
 
 
  #R_hat[x1, x2] = prod
 
  #res = np.multiply(R_hat - R, M)
 
  R_hat = P @ Q.T
 
  res = np.multiply(R_hat - R, M)
 
  gradient = alpha * (lam * Q - (res.T @ P))
  
  Q += gradient
  
 
  return Q

In [None]:
def calculate_loss(P, Q, R, lam):
 
  R_prod = (P @ Q.T)
  
  x1, x2 = R.nonzero()
  R_hat = R_prod[x1, x2]
  R_tau = R.data
  
  mu = np.mean(R_hat) # Overall average rating
  b_u = np.mean(R_prod, axis = 1)
  b_i = np.mean(R_prod, axis = 0)
  b_u = b_u[x1] - mu
  b_i = b_i[x2] - mu
 
  res = R_tau - (R_hat + mu + b_u + b_i)
 
  res = np.square(res)
 
  res = np.mean(res)
  res += lam *(np.linalg.norm(P) + np.linalg.norm(Q)) # Regularization Term 
  return res
 
# Set the dimension of latent space
k = 7# 7 
 
# Initialize the matrices P and Q randomly
P = np.random.random(size = (n_users, k))
Q = np.random.random(size = (n_movies, k))
 
# P = np.zeros((n_users, k))
# Q = np.zeros((n_movies, k))
 
# b_u = np.zeros(P.shape[0])
# b_i = np.zeros(Q.shape[0])
 
 
# R_hat = np.zeros(R.shape)
 
 
# Learning Rate
alpha = 0.0006
alpha = 1e-6 #0.000001 #40 or 50 or .00045
 
# Regularization rate
lam = 0.00001
 
# Variables to keep the best P and Q according to the lowest test loss 
best_P = P
best_Q = Q
last_loss = None
 
# Run for epochs
for iter in range(50):
 
  # Update Steps for P then Q by following ALS algorithm
  P = update_P(P, Q, R, alpha=alpha, lam=lam)
  Q = update_Q(P, Q, R, alpha=alpha, lam=lam)
 
  # Regularization rate lam = 0 for calculating the loss of test data R2
  # Regularization term is not included in test error
  test_loss = calculate_loss(P, Q, R2, lam=0)
 
  # Calculating the loss of training data R
  loss = calculate_loss(P, Q, R, lam = lam)
 
  print(f" epoch {iter}, training error {loss}, test error {test_loss} ") 
 
  # A control block for keeping the best P and Q for the lowest test loss during the training
  if last_loss:
    if last_loss > test_loss:
      best_P = P.copy()
      best_Q = Q.copy()
      last_loss = test_loss
    else:
      pass
 
  else:
    last_loss = test_loss

 epoch 0, training error 2.208494831012457, test error 2.2602729473608325 
 epoch 1, training error 2.206067454318581, test error 2.2599594238116456 
 epoch 2, training error 2.203704138913264, test error 2.259673797827935 
 epoch 3, training error 2.2014044873311094, test error 2.259415903712117 
 epoch 4, training error 2.19916810377244, test error 2.2591855764303936 
 epoch 5, training error 2.1969945940960063, test error 2.258982651609955 
 epoch 6, training error 2.194883565811776, test error 2.2588069655361784 
 epoch 7, training error 2.192834628073828, test error 2.258658355149891 
 epoch 8, training error 2.190847391673324, test error 2.258536658044651 
 epoch 9, training error 2.188921469031574, test error 2.25844171246407 
 epoch 10, training error 2.1870564741931804, test error 2.2583733572991647 
 epoch 11, training error 2.1852520228192707, test error 2.2583314320857433 
 epoch 12, training error 2.183507732180825, test error 2.258315777001822 
 epoch 13, training error 2

In [None]:
# Calculate the test loss using the best values in P and Q
calculate_loss(best_P, best_Q, R2, lam=0)

2.274492146337732

In [None]:
 
MODELS_PATH = "models"
MODELS_PATH = "."
with open(path_join(MODELS_PATH, "P_ARRAY_CF.npy"), "wb") as f:
  np.save(f, P)
with open(path_join(MODELS_PATH, "Q_ARRAY_CF.npy"), "wb") as f:
  np.save(f, Q)

In [None]:
def do_recommendation(user, P=best_P, Q=best_Q, n=5):
  '''Returns a list of top n recommendations (movies or items) given id of the user
  Args:
    user (number): The user id
    P (np.ndarray): The First matrix P in factorization equation R = P @ Q.T
    Q (np.ndarray): The Second matrix Q in factorization equation R = P @ Q.T
    n (number): Number of retrieved elements 
 
  Returns:
    list: a list of ids of top n recommendations
    list: a list of ratings of top n recommendations
 
  '''
 
  R_hat = P @ Q.T # Calculate predicted ratings
 
  ratings = R_hat[user, :] # Select the ratings of the specific user
 
  ids = np.argsort(ratings)[-n:][::-1] # Returns the top n ratings 
 
 
  return ids, ratings[ids]
 
 
def recommend(user_id, P, Q, top_n, original_item_ids):
  '''Returns a data frame consists of two columns, the first column
  involves the ids of top n recommendations (items or movies), given the user id
 
  Args:
    user_id (number): The user id
    P (np.ndarray): The First matrix P in factorization equation R = P @ Q.T
    Q (np.ndarray): The Second matrix Q in factorization equation R = P @ Q.T
    top_n (number): Number of retrieved elements 
    original_item_ids: a list of original ids of the items before mappingthem to basic indexing
 
  Returns:
    pd.DataFrame: a data frame consists of two columns, the first column
  involves the ids of top n recommendations (items or movies), given the user id
 
  '''
 
  # Do recommendation
  ids, ratings= do_recommendation(user_id, P, Q, top_n)
 
  # Remaps the ids of movies to its original index
  result = unmap_ids(pd.Series(ids), original_item_ids)
 
  result = pd.DataFrame({'Item':result.values, 'Rating':ratings})
 
  return result
  
 
# Usage
 
recommend(0, best_P, best_Q, 10, train_df['movieId'])

Unnamed: 0,Item,Rating
0,8831,2.950981
1,45950,2.949545
2,164,2.939739
3,106100,2.91601
4,1665,2.898494
5,5637,2.897347
6,4429,2.893091
7,8914,2.885189
8,60037,2.869712
9,5949,2.869378


# Neural Collabrative Filtering Model

In [None]:
class NCA(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.config = config
    self.n_users = config['n_users']
    self.n_items = config['n_items']
    self.k = config['k']
 
    self.embed_user = nn.Embedding(self.n_users, self.k)
    self.embed_item = nn.Embedding(self.n_items, self.k)
 
    self.fc_layers = nn.ModuleList()
    for idx, (in_size, out_size) in enumerate(zip(config['layers'][:-1], config['layers'][1:])):
        self.fc_layers.append(torch.nn.Linear(in_size, out_size))
        
    
    self.dropout = nn.Dropout(0.2)
 
    self.output = nn.Linear(config['layers'][-1],  1)
    self.output_f = nn.Sigmoid()
 
  def forward(self, users, items):
    
    # users, items = x
    users_x = self.embed_user(users)
    items_x = self.embed_item(items)
 
    x = torch.cat([users_x, items_x], dim = 1) # Concatenate along the second axis
 
    for i in range(len(self.fc_layers)):
      x = self.fc_layers[i](x)
      x = nn.ReLU()(x)
      x = self.dropout(x)
 
    x = self.output(x)
    x = self.output_f(x) * config['rating_range'] + config['lowest_rating']
    return x

In [None]:
model(users, movies).shape

torch.Size([761972, 1])

In [None]:
R.data.shape

(761972,)

## Build the Model

In [None]:
# Latent Space Dimension
k = 7
 
config = {
    'n_users':n_users, # Number of Users
    'n_items': n_movies, # Number of Items
    'k': k, # Latent Space Dimension
    'layers':[k * 2, 64, 16, 8],  # sizes of fully connected layers
    'rating_range': 4,  # Range of rating (5 - 1 = 4)
    'lowest_rating':1 # The lowest rating (1)
    }
 
# Input Data
users = torch.Tensor(user_ids).int()
movies = torch.Tensor(movie_ids).int()
ratings = torch.Tensor(R.data)
 
 
# Try to use GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
# Model
model = NCA(config).to(device)
 
# Do one-hot encoding
model

NCA(
  (embed_user): Embedding(6687, 7)
  (embed_item): Embedding(5064, 7)
  (fc_layers): ModuleList(
    (0): Linear(in_features=14, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=16, bias=True)
    (2): Linear(in_features=16, out_features=8, bias=True)
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (output): Linear(in_features=8, out_features=1, bias=True)
  (output_f): Sigmoid()
)

In [None]:
 
learning_rate = 0.001
critertion = nn.MSELoss()
batch_size = 100
epochs = range(40)
 
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters())
 
data_loader = DataLoader(TensorDataset(users, movies, ratings), batch_size = batch_size)  
losses = []
for epoch in epochs:
  epoch_loss = []
  for batch_users, batch_movies, batch_ratings in data_loader:
 
    users = batch_users.to(device)
    movies = batch_movies.to(device)
    ratings = batch_ratings.to(device)
 
    optimizer.zero_grad()
 
    output = model(users, movies)[:, 0]
 
    loss = critertion(output, ratings)
 
    loss.backward()
    optimizer.step()
    
    epoch_loss.append(loss.item())
 
  avg_epoch_loss = np.mean(epoch_loss)
  losses.append(avg_epoch_loss)
  print(f"epoch {epoch}, loss = {avg_epoch_loss}")

epoch 0, loss = 1.0592916649001165
epoch 1, loss = 0.9406692212491523
epoch 2, loss = 0.9122387819745018
epoch 3, loss = 0.897270367641657
epoch 4, loss = 0.8870583937168512
epoch 5, loss = 0.8783762987204424
epoch 6, loss = 0.8677011451050834
epoch 7, loss = 0.8565393227418968
epoch 8, loss = 0.8476709221291729
epoch 9, loss = 0.8362611738705807
epoch 10, loss = 0.8246295854869712
epoch 11, loss = 0.8158248318647697
epoch 12, loss = 0.8061579541173663
epoch 13, loss = 0.7987824181392984
epoch 14, loss = 0.790384490298122
epoch 15, loss = 0.7837509512265758
epoch 16, loss = 0.7776246184087175
epoch 17, loss = 0.7718029411700298
epoch 18, loss = 0.7663959458147681
epoch 19, loss = 0.7614975130200151
epoch 20, loss = 0.7574817955924145
epoch 21, loss = 0.7539870116077462
epoch 22, loss = 0.7498597814811496
epoch 23, loss = 0.7468343569889782
epoch 24, loss = 0.7444679163846131
epoch 25, loss = 0.7414363801137122
epoch 26, loss = 0.7385940946892804
epoch 27, loss = 0.7366433689414751
epoc

In [None]:
critertion(model(users, movies)[:,0], ratings)

tensor(0.3075, grad_fn=<MseLossBackward>)

In [None]:
 
path = path_join(MODELS_PATH, "acf.pth")
 
torch.save(model.state_dict, path)