In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from loguru import logger

import pandas as pd
import numpy as np
import plotly.express as px

sys.path.insert(0, '..')

from src.viz import blueq_colors

# Implement

In [3]:
from src.train_utils import mse_loss, train, MetricLogCallback
from src.model import User2UserCollaborativeFiltering

# Test implementation

In [4]:
# Mock data: three parallel lists; position k holds one (user, item, rating) observation.
user_ids = [0, 0, 1, 1, 2, 2, 2]
item_ids = [0, 1, 1, 2, 3, 1, 2]
ratings = [1, 4, 4, 5, 3, 2, 4]
# Counting distinct IDs gives the right sizes here only because the mock IDs are
# contiguous and 0-based (users 0-2, items 0-3).
n_users = len(set(user_ids))
n_items = len(set(item_ids))

# Mock validation triples; they are not used anywhere else in this cell.
val_user_ids = [0, 1, 2]
val_item_ids = [2, 1, 2]
val_ratings = [2, 4, 5]

print("Mock User IDs:", user_ids)
print("Mock Item IDs:", item_ids)
print("Ratings:", ratings)

model = User2UserCollaborativeFiltering(n_users, n_items)

# Query pairs (user k, item k). predict() is called here BEFORE fit(), so this shows
# what the untrained model returns.
users = [0, 1, 2]
items = [2, 2, 0]
predictions = model.predict(users, items)
print(predictions)

Mock User IDs: [0, 0, 1, 1, 2, 2, 2]
Mock Item IDs: [0, 1, 1, 2, 3, 1, 2]
Ratings: [1, 4, 4, 5, 3, 2, 4]
[3 3 3]


In [5]:
# Fit on the mock observations, then score the same (users, items) pairs again so the
# result can be compared with the pre-fit predictions.
model.fit(user_ids, item_ids, ratings)
predictions = model.predict(users, items)
print(predictions)

[4.62714989 4.         1.        ]


In [6]:
model.user_item_matrix

array([[1., 4., 0., 0.],
       [0., 4., 5., 0.],
       [0., 2., 4., 3.]])

In [7]:
model.user_similarity

array([[0.        , 0.60604322, 0.36030188],
       [0.60604322, 0.        , 0.81202071],
       [0.36030188, 0.81202071, 0.        ]])

In [8]:
# Reproduce by hand the prediction for one (user, item) pair of the mock data.
user = 0
item = 2

# Compute prediction using weighted average of ratings from similar users
# Start from row `user` of the fitted model's similarity matrix.
sim_scores = model.user_similarity[user]
print(f"{sim_scores=}")

sim_scores=array([0.        , 0.60604322, 0.36030188])


In [9]:
# Only consider users who have rated the item
# (a 0 entry in the user-item matrix is treated as "not rated").
user_ratings = model.user_item_matrix[:, item]
print(f"{user_ratings=}")
# Build the mask once from the unfiltered column and apply it to a fresh copy of the
# similarity row. Filtering `sim_scores` in place made this cell non-idempotent:
# re-running it applied a full-length mask to an already shortened array.
rated_mask = user_ratings != 0
sim_scores = model.user_similarity[user][rated_mask]
print(f"{sim_scores=}")
user_ratings = user_ratings[rated_mask]
print(f"{user_ratings=}")

user_ratings=array([0., 5., 4.])
sim_scores=array([0.60604322, 0.36030188])
user_ratings=array([5., 4.])


In [10]:
# Weighted average of ratings
# Numerator and denominator are computed once and reused by the three prints.
# Note: the first printed value is the similarity-weighted SUM; dividing it by the
# sum of similarities gives the predicted rating.
weighted_sum = np.dot(sim_scores, user_ratings)
similarity_total = np.sum(sim_scores)
print(f"Weighted average: {weighted_sum}")
print(f"Normalization factor: {similarity_total}")
print(f"Predicted rating: {weighted_sum / similarity_total}")

Weighted average: 4.471423593469625
Normalization factor: 0.9663450945516922
Predicted rating: 4.627149885356445


# Prep data

In [11]:
train_df = pd.read_parquet("../data/train.parquet")
val_df = pd.read_parquet("../data/val.parquet")

In [12]:
from src.id_mapper import IDMapper

In [13]:
# Raw ID columns of the training interactions (one entry per row of train_df).
user_ids = train_df['user_id'].values
item_ids = train_df['parent_asin'].values
# Sort the distinct IDs. Iterating a bare set of strings follows hash order, which
# changes between interpreter runs (string hash randomization), so anything derived
# from that order - presumably the ID-to-index mapping fitted next - would differ
# across kernel restarts.
unique_user_ids = sorted(set(user_ids))
unique_item_ids = sorted(set(item_ids))
n_users = len(unique_user_ids)
n_items = len(unique_item_ids)

logger.info(f"{len(unique_user_ids)=:,.0f}, {len(unique_item_ids)=:,.0f}")

[32m2024-09-12 13:05:04.089[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mlen(unique_user_ids)=5,223, len(unique_item_ids)=2,653[0m


In [14]:
idm = IDMapper()
idm.fit(unique_user_ids, unique_item_ids)

In [15]:
# Translate every training row's raw IDs into the mapper's indices, keeping row order.
user_indices = list(map(idm.get_user_index, user_ids))
item_indices = list(map(idm.get_item_index, item_ids))
# Ratings as a plain Python list, aligned position-by-position with the indices above.
ratings = train_df['rating'].values.tolist()

In [16]:
# Same translation for the validation rows, using the mapper fitted on training IDs.
# NOTE(review): how the mapper treats IDs absent from training is not visible here - confirm.
val_user_indices = list(map(idm.get_user_index, val_df['user_id']))
val_item_indices = list(map(idm.get_item_index, val_df['parent_asin']))
val_ratings = val_df['rating'].values.tolist()

# Train

In [17]:
model = User2UserCollaborativeFiltering(n_users, n_items)

#### Predict before training

In [18]:
# Pick one user and display that user's rows in the validation set.
user_id = 'AEHW2B54HDLZ3APBEWXHYLZ6SSYQ'
val_df[val_df['user_id'] == user_id]

Unnamed: 0,user_id,parent_asin,rating,timestamp
34367,AEHW2B54HDLZ3APBEWXHYLZ6SSYQ,B07MYVF61Y,4.0,1654225907045


In [19]:
# Item from the selected user's validation row; `user_id` is set in the previous cell.
item_id = 'B07MYVF61Y'
# Convert the raw IDs to the indices the model works with.
user_indice = idm.get_user_index(user_id)
item_indice = idm.get_item_index(item_id)

# The model has been constructed but not fitted yet, so this is the pre-training prediction.
model.predict([user_indice], [item_indice])

array([3])

#### Fit the model

In [20]:
model.fit(user_indices, item_indices, ratings)

# Evaluate

In [21]:
from tqdm.notebook import tqdm

In [22]:
# Score the validation pairs one at a time, in order, collecting plain Python floats.
val_predictions = []
for user_indice, item_indice in tqdm(
    zip(val_user_indices, val_item_indices), total=len(val_ratings)
):
    # predict() takes lists and returns an array; unwrap the single element.
    prediction = float(model.predict([user_indice], [item_indice])[0])
    val_predictions.append(prediction)

  0%|          | 0/4259 [00:00<?, ?it/s]

In [23]:
def mse(predictions, ratings):
    """Return the mean squared error between predictions and true ratings.

    Args:
        predictions: Array-like of predicted ratings.
        ratings: Array-like of true ratings, with the same shape as ``predictions``.

    Returns:
        The mean of the element-wise squared differences, as a NumPy float.

    Raises:
        ValueError: If ``predictions`` and ``ratings`` have different shapes.
    """
    predictions = np.asarray(predictions, dtype=float)
    ratings = np.asarray(ratings, dtype=float)
    # Without this check NumPy would broadcast e.g. (n,) against (n, 1) or (1,)
    # and silently return a meaningless number instead of failing.
    if predictions.shape != ratings.shape:
        raise ValueError(
            f"Shape mismatch: predictions {predictions.shape} vs ratings {ratings.shape}"
        )
    return np.mean((predictions - ratings) ** 2)

mse(val_predictions, val_ratings)

np.float64(2.9705283669091798)

In [24]:
val_predictions[:5]

[3.0, 3.0, 3.8216000860564616, 3.0, 3.0]

In [25]:
val_ratings[:5]

[5.0, 5.0, 5.0, 5.0, 2.0]

# Predict

In [26]:
val_predictions[2]

3.8216000860564616

In [27]:
val_df.iloc[[2]]

Unnamed: 0,user_id,parent_asin,rating,timestamp
6758,AFQAPWVESEJYTNZC23LDPQOH7QBA,B09GM4283G,5.0,1630119475785


In [28]:
user_id = 'AFQAPWVESEJYTNZC23LDPQOH7QBA'

In [29]:
# Item from the validation row displayed above; `user_id` is set in the previous cell.
item_id = 'B09GM4283G'
# Convert the raw IDs to the indices the model works with.
user_indice = idm.get_user_index(user_id)
item_indice = idm.get_item_index(item_id)

# Single-pair prediction from the fitted model.
model.predict([user_indice], [item_indice])

array([3.82160009])