In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import sparse
from tqdm import tqdm

In [2]:
# some important variables
SEED = 42

In [3]:
# read dataset
rating_df = pd.read_table('../data/raw/ml-100k/u.data', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

rating_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
n_users = rating_df.user_id.max()
n_items = rating_df.item_id.max()

n_users, n_items

(943, 1682)

In [5]:
from sklearn.model_selection import train_test_split

rating_prep = rating_df.copy()
rating_prep.user_id -= 1
rating_prep.item_id -= 1

X_train, X_test, y_train, y_test = train_test_split(rating_prep[['user_id', 'item_id']], rating_prep.rating, train_size=0.8, random_state=SEED)

df_train = pd.concat([X_train, y_train], axis=1)
df_train = df_train.reset_index()


rank_matrix = df_train.pivot(index="user_id", columns="item_id", values="rating").fillna(0)

# add columns (items) that was skipped
for skipped_item in set(range(n_items)) - set(rank_matrix.columns):
    rank_matrix[skipped_item] = 0

# rearrange columns to transform to numpy
rank_matrix = rank_matrix[list(range(n_items))].to_numpy()

rank_matrix.shape

(943, 1682)

In [6]:
def initialize_emb(n_users: int, n_items: int, emb_dim: int = 16) -> tuple[np.ndarray, np.ndarray]:
    user_emb = np.random.rand(n_users, emb_dim)
    item_emb = np.random.rand(n_items, emb_dim)
    return user_emb, item_emb


def update_parameters(rank_matrix: np.ndarray, user_emb: np.ndarray, item_emb: np.ndarray, lr: float, reg: float = 0.02):
    emb_dim = user_emb.shape[1]
    
    for u in range(user_emb.shape[0]):
        for i in range(item_emb.shape[0]):
            if rank_matrix[u, i] > 0:
                err = rank_matrix[u, i] - np.dot(user_emb[u], item_emb[i])
                for dim in range(emb_dim):
                    user_emb[u, dim] += lr * (2 * err * item_emb[i, dim] - reg * user_emb[u, dim])
                    item_emb[i, dim] += lr * (2 * err * user_emb[u, dim] - reg * item_emb[i, dim])
    
    return user_emb, item_emb



def predict(pairs: np.ndarray, user_emb: np.ndarray, item_emb: np.ndarray):
    predictions = np.zeros(shape=pairs.shape[0])
    for idx, (u, i) in enumerate(pairs):
        predictions[idx] = np.dot(user_emb[u], item_emb[i])
    return predictions

In [7]:
# some important variables for training step
lr = 6e-4
emb_dim = 8
n_epochs = 40

# define user and item embeddings
user_emb, item_emb = initialize_emb(n_users, n_items, emb_dim=emb_dim)

In [256]:
from sklearn import metrics

for epoch in range(n_epochs):
    user_emb, item_emb = update_parameters(rank_matrix, user_emb, item_emb, lr=lr)
    predictions = predict(X_test.to_numpy(), user_emb, item_emb)
    print(f'============ Epoch {epoch + 1}/{n_epochs} ============')
    print('RMSE:', metrics.mean_squared_error(predictions, y_test.to_numpy(), squared=False))
    print('MAE:', metrics.median_absolute_error(predictions, y_test.to_numpy()))
    print()

RMSE: 1.3699763184631282
MAE: 1.017203906678891

RMSE: 1.1639339237230129
MAE: 0.8045170378797886

RMSE: 1.0898501043133253
MAE: 0.7318435924744278

RMSE: 1.051140987035558
MAE: 0.6964467143603306

RMSE: 1.0272557112898806
MAE: 0.677429727258815

RMSE: 1.0111904644599743
MAE: 0.6671102422382991

RMSE: 0.9997561251937677
MAE: 0.6614156455571976

RMSE: 0.991283258146907
MAE: 0.6543921617920914

RMSE: 0.9848126957637273
MAE: 0.6524032830684493

RMSE: 0.9797538792448303
MAE: 0.6504360361729993

RMSE: 0.9757234850615621
MAE: 0.647486196065808

RMSE: 0.9724622180996465
MAE: 0.6447641173582539

RMSE: 0.9697887764681458
MAE: 0.6421579373884305

RMSE: 0.9675728937377633
MAE: 0.6400802260483525

RMSE: 0.9657188072394702
MAE: 0.6401423092199823

RMSE: 0.964154727658077
MAE: 0.6389169916770836

RMSE: 0.9628259150920593
MAE: 0.6380349054031802

RMSE: 0.9616900021497227
MAE: 0.6360840583681231

RMSE: 0.9607137606328144
MAE: 0.6351495323587779

RMSE: 0.9598708203525872
MAE: 0.635327428114649

RMSE: 0