In [1]:
import pandas as pd
import glob
import json
from pathlib import Path

import torch
from torch.utils.data import Dataset
from torch import nn
import torch.optim as optim
import lightning as L
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [2]:
BASE_DIR = Path("../data/lastfm-dataset-360K/")

user_profiles = pd.read_csv(BASE_DIR / "usersha1-profile.tsv", sep="\t", header=None, on_bad_lines='skip')
data = pd.read_csv(BASE_DIR / "usersha1-artmbid-artname-plays.tsv", sep="\t", header=None, on_bad_lines='skip')

In [3]:
data.columns = ["user", "artist_id", "artist_name", "plays"]

In [4]:
data.user.apply(lambda x: len(x)).value_counts()

user
40    17535569
12          86
Name: count, dtype: int64

In [5]:
data = data[data.user.apply(lambda x: len(x) == 40)]

In [6]:
# noramlize user listening counts
data["plays"] = data["plays"] / data.groupby("user")["plays"].transform("sum")

In [7]:
data = data.drop(columns=["artist_id"])

In [8]:
data = data.dropna()
data = data.drop_duplicates()

In [9]:
data = data.sample(frac=0.1)

In [10]:
# drop users with less than 2 artists
data = data[data.groupby("user")["artist_name"].transform("count") > 9]

In [11]:
# remap artist names to integers
data.loc[:, "artist_id"] = data["artist_name"].astype("category").cat.codes

# remap user names to integers
data.loc[:, "user_id"] = data["user"].astype("category").cat.codes

In [12]:
class MusicDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.num_users = self.df["user_id"].nunique()
        self.num_items = self.df["artist_id"].nunique()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        user_id = self.df.iloc[idx]["user_id"]
        artist_name = self.df.iloc[idx]["artist_id"]
        rating = self.df.iloc[idx, 2]
        return (
            torch.tensor(user_id, dtype=torch.long),
            torch.tensor(artist_name, dtype=torch.long),
            torch.tensor(rating, dtype=torch.float)
        )

In [13]:
class MatrixFactorization(L.LightningModule):
    def __init__(self, num_users, num_items, embedding_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

    def forward(self, user_id, item_id):
        user_vector = self.user_embedding(user_id)
        item_vector = self.item_embedding(item_id)
        return (user_vector * item_vector).sum(1)

    def training_step(self, batch, batch_idx):
        user_id, item_id, rating = batch
        prediction = self(user_id, item_id)
        loss = nn.functional.mse_loss(prediction, rating)
        self.log('train_loss', loss, prog_bar=True, on_step=True, on_epoch=True)
        return loss

    def configure_optimizers(self):
        return optim.SGD(self.parameters(), lr=0.01)

In [14]:
train_df, test_df = train_test_split(data, test_size=0.2, stratify=data["user_id"], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df["user_id"], random_state=42)

In [15]:
train_df.user_id.nunique(), val_df.user_id.nunique(), test_df.user_id.nunique()

(10942, 10942, 10942)

In [20]:
train_ds = MusicDataset(train_df)
train_loader = DataLoader(train_ds, batch_size=2056, shuffle=True)

val_ds = MusicDataset(val_df)
val_loader = DataLoader(val_ds, batch_size=512)

model = MatrixFactorization(train_ds.num_users, train_ds.num_items, 10)
trainer = L.Trainer(max_epochs=10)
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name           | Type      | Params | Mode 
-----------------------------------------------------
0 | user_embedding | Embedding | 109 K  | train
1 | item_embedding | Embedding | 202 K  | train
-----------------------------------------------------
311 K     Trainable params
0         Non-trainable params
311 K     Total params
1.247     Total estimated model params size (MB)
2         Modules in train mode
0         Modules in eval mode


Epoch 4:  76%|███████▌  | 28/37 [00:29<00:09,  0.94it/s, v_num=67, train_loss_step=37.60, train_loss_epoch=38.10]
Epoch 9: 100%|██████████| 37/37 [00:03<00:00, 10.47it/s, v_num=69, train_loss_step=7.460, train_loss_epoch=7.600]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 37/37 [00:03<00:00, 10.45it/s, v_num=69, train_loss_step=7.460, train_loss_epoch=7.600]


In [25]:
# evaluate the model

def predict_for_user(model, user_id, num_items, top_nb=10):
    model.eval()  # Set model to evaluation mode
    user_id_tensor = torch.tensor([user_id] * num_items, dtype=torch.long)
    item_id_tensor = torch.tensor(range(num_items), dtype=torch.long)

    with torch.no_grad():  # Disable gradient calculation
        predictions = model(user_id_tensor, item_id_tensor)

    top = sorted(enumerate(predictions), reverse=True, key=lambda i: i[1])
    return top[:top_nb]

def evaluate_model(model, test_df, top_nb=10):
    user_ids = test_df["user_id"].unique()
    num_items = test_df["artist_id"].nunique()

    hits = 0
    for user_id in tqdm(user_ids):
        top_predictions = predict_for_user(model, user_id, num_items, top_nb)
        top_predictions = [i for i, _ in top_predictions]
        user_df = test_df[test_df["user_id"] == user_id]
        user_artists = user_df["artist_id"].values
        hits += len(set(top_predictions) & set(user_artists))

    return hits / len(test_df)

evaluate_model(model, test_df)

 13%|█▎        | 1475/10942 [02:41<17:19,  9.11it/s]


KeyboardInterrupt: 

In [37]:
test_df.loc[test_df["user_id"] == 10, "artist_id"].unique()

array([11290, 21015], dtype=int16)

In [28]:
predict_for_user(model, 0, train_ds.num_items, 10)

[(5623, tensor(7.6248)),
 (9520, tensor(7.3199)),
 (13497, tensor(7.2188)),
 (14679, tensor(7.1997)),
 (149, tensor(7.0136)),
 (10313, tensor(6.9292)),
 (15969, tensor(6.7154)),
 (17050, tensor(6.6870)),
 (10974, tensor(6.6659)),
 (10465, tensor(6.6653))]

In [27]:
def predict_for_users(model, user_ids, item_ids, top_nb=10):
    """
    Compute top-N recommendations for a batch of users in one pass.

    Args:
        model: Trained PyTorch model with a forward(user_tensor, item_tensor) method.
        user_ids: List (or tensor) of user IDs for which to get predictions.
        item_ids: List (or tensor) of item IDs for all items in the dataset.
        top_nb: Number of top recommendations to retrieve per user.

    Returns:
        A dictionary { user_id: [top recommended item IDs] } for each user in user_ids.
    """
    model.eval()  # Set model to evaluation mode

    user_ids = torch.tensor(user_ids, dtype=torch.long)
    item_ids = torch.tensor(item_ids, dtype=torch.long)

    # Repeat users and tile items so model can make a single prediction pass
    repeated_user_ids = user_ids.repeat_interleave(len(item_ids))
    tiled_item_ids = item_ids.repeat(len(user_ids))

    with torch.no_grad():
        all_predictions = model(repeated_user_ids, tiled_item_ids)

    # Reshape predictions to [num_users, num_items]
    all_predictions = all_predictions.view(len(user_ids), len(item_ids))

    # Find top-N items for each user
    _, top_indices = torch.topk(all_predictions, k=top_nb, dim=1)

    # Build the result as a dictionary { user_id: [top recommended items] }
    results = {}
    for i, u in enumerate(user_ids.tolist()):
        user_top_indices = top_indices[i].tolist()
        user_top_items = item_ids[user_top_indices].tolist()
        results[u] = user_top_items

    return results


def evaluate_model_in_batches(model, test_df, top_nb=10):
    """
    Evaluate the model by computing how many times the ground truth interactions
    appear among the top-N predictions, while operating on all users in one pass.

    Args:
        model: Trained PyTorch model with a forward(user_tensor, item_tensor) method.
        test_df: A DataFrame that at least contains 'user_id' and 'artist_id' columns.
        top_nb: Number of top recommendations to retrieve per user.

    Returns:
        The hit ratio of correct recommendations in the top-N predictions.
    """
    user_ids = test_df["user_id"].unique()
    item_ids = range(test_df["artist_id"].nunique())  # You can replace 'artist_id' with your item ID column
    recommendations = predict_for_users(model, user_ids, item_ids, top_nb=top_nb)

    acs = []
    total_interactions = len(test_df)
    for user_id in user_ids:
        user_top_items = set(recommendations[user_id])
        user_df = test_df[test_df["user_id"] == user_id]
        user_artists = set(user_df["artist_id"].values)
        ac = len(user_top_items & user_artists) / min(len(user_artists), top_nb)
        print(ac)
        acs.append(len(user_top_items & user_artists) / min(len(user_artists), top_nb))
    return sum(acs) / len(acs)

# Example usage:
hit_ratio = evaluate_model_in_batches(model, test_df, top_nb=10)
print(f"Hit Ratio: {hit_ratio}")

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
