In [1]:
from torch.utils.data import Dataset
from pathlib import Path
import requests
import zipfile
import tempfile
import pandas as pd

import torch
from sklearn import preprocessing

from torch import nn
from sklearn import model_selection
from torch.utils.data import DataLoader
import sys

from collections import defaultdict
from sklearn.metrics import root_mean_squared_error
import numpy as np

In [2]:
torch.manual_seed(0)

<torch._C.Generator at 0x106a878f0>

In [3]:
class MovieLensDataset(Dataset):
    def __init__(self,  root: str = "data", data: pd.DataFrame | None = None, download: bool = False):
        self._root = Path(root)
        self._path = self._root / "ml-100k"

        self._data = data if data is not None else self._load_data(download)

        self._genre_lbl_enc = preprocessing.LabelEncoder()
        self._genre_lbl_enc.fit(self._data["genre"])

        self.n_groups = self._genre_lbl_enc.classes_.shape[0]
        self._group_counts = torch.tensor(self._data["genre"].value_counts().sort_index().values)

        self.group_str = {i: genre for i, genre in enumerate(self._genre_lbl_enc.classes_)}

    def group_counts(self):
        return self._group_counts

    def _load_data(self, download: bool) -> pd.DataFrame:
        if not self._path.exists():
            if not download:
                raise FileNotFoundError(f"{self._path} not found")
            else:
                self._download()

        data = pd.read_csv(
            self._path / "u.data",
            sep="\t",
            header=None,
            names=["user_id", "item_id", "rating", "timestamp"],
        )
        data.drop(columns=["timestamp"], inplace=True)

        item_data = pd.read_csv(
            self._path / "u.item",
            sep="|",
            header=None,
            encoding="ISO-8859-1",
            names=["item_id", "title", "release_date", "video_release_date", "IMDb_URL", "unknown", "Action", "Adventure", "Animation", "Children", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"],
        )

        def get_random_genre(row):
            genres = row[6:].index[row[6:] == 1].tolist()  # Get list of genres where value is 1
            if genres: #Check if the list is not empty
                return np.random.choice(genres) #Return a random genre from the list
            else:
                return "Unknown" # Or handle the case where no genre is found

        item_data["genre"] = item_data.apply(get_random_genre, axis=1)

        item_data = item_data[["item_id", "genre"]]
        data = data.merge(item_data, on="item_id")

        return data


    def _download(self):
        _URL = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"

        with tempfile.TemporaryDirectory() as tmpdirname:
            with requests.get(_URL, stream=True) as r:
                r.raise_for_status()
                with open(tmpdirname + "/ml-100k.zip", "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

            with zipfile.ZipFile(tmpdirname + "/ml-100k.zip", "r") as zip_ref:
                zip_ref.extractall(self._root)

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        row = self._data.iloc[idx]

        genre_id = self._genre_lbl_enc.transform([row["genre"]])

        return {
            "users": torch.tensor(row["user_id"], dtype=torch.long),
            "items": torch.tensor(row["item_id"], dtype=torch.long),
            "ratings": torch.tensor(row["rating"], dtype=torch.float),
            "genre_mask": nn.functional.one_hot(torch.tensor(genre_id, dtype=torch.long), num_classes=len(self._genre_lbl_enc.classes_))
        }

In [4]:
class RecommendationSystemModel(nn.Module):
    def __init__(
        self,
        num_users,
        num_movies,
        embedding_size=256,
        hidden_dim=256,
        dropout_rate=0.2,
    ):
        super(RecommendationSystemModel, self).__init__()
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        self.hidden_dim = hidden_dim

        # Embedding layers
        self.user_embedding = nn.Embedding(
            num_embeddings=self.num_users, embedding_dim=self.embedding_size
        )
        self.movie_embedding = nn.Embedding(
            num_embeddings=self.num_movies, embedding_dim=self.embedding_size
        )

        # Hidden layers
        self.fc1 = nn.Linear(2 * self.embedding_size, self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim, 1)

        # Dropout layer
        self.dropout = nn.Dropout(p=dropout_rate)

        # Activation function
        self.relu = nn.ReLU()

    def forward(self, users, movies):
        # Embeddings
        user_embedded = self.user_embedding(users)
        movie_embedded = self.movie_embedding(movies)

        # Concatenate user and movie embeddings
        combined = torch.cat([user_embedded, movie_embedded], dim=1)

        # Pass through hidden layers with ReLU activation and dropout
        x = self.relu(self.fc1(combined))
        x = self.dropout(x)
        output = self.fc2(x)

        return output

In [6]:
df = MovieLensDataset(download=True)._data
# remove Unknown genre
df = df[df.genre != "Unknown"]

lbl_user = preprocessing.LabelEncoder()
lbl_movie = preprocessing.LabelEncoder()

df.user_id = lbl_user.fit_transform(df.user_id.values)
df.item_id = lbl_movie.fit_transform(df.item_id.values)

In [72]:
df_train, df_val = model_selection.train_test_split(
    df, test_size=0.1, random_state=3, stratify=df.rating.values
)

In [73]:
# Sampling down all genres except for Drama
df_train = pd.concat([df_train[df_train.genre == "Drama"], df_train[df_train.genre != "Drama"].sample(frac=0.1)])

In [63]:
df_train.genre.value_counts()

genre
Drama          23264
Comedy          1595
Thriller         808
Romance          697
Adventure        648
Action           648
Sci-Fi           417
Crime            296
Children         295
War              279
Horror           255
Musical          184
Western          151
Mystery          123
Animation        104
Documentary       67
Film-Noir         63
Fantasy           43
Name: count, dtype: int64

In [79]:
BATCH_SIZE = 32

train_dataset = MovieLensDataset(data=df_train)
valid_dataset = MovieLensDataset(data=df_val)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [87]:
def loss_per_group(sq_err, mask, risk_groups: list[int] | None = None) -> torch.Tensor:
    mask_means = mask.float().mean(dim=0)
    if risk_groups is not None:
        mask_means = mask_means * torch.tensor([1 if i in risk_groups else 0 for i in range(mask_means.shape[0])], device=mask_means.device)

    mask = mask * mask_means
    mask = mask.detach()

    loss = sq_err + 1e-8
    loss = loss.view(-1, 1) * mask

    # remove columns with all zeros
    # loss = loss[:, mask.sum(dim=0) > 0]
    loss = loss ** 2

    return loss.sum(dim=0).mean()

sq_err_ls = []
lpg_ls = []


def loss_func(output, target, mask, lmbd: float, risk_groups: list[int] | None = None) -> torch.Tensor:
    sq_err = torch.pow(output - target, 2)
    lpg = loss_per_group(sq_err, mask, risk_groups)

    sq_err_ls.append(sq_err.mean().item())
    lpg_ls.append(lpg.item())

    f = sq_err.mean() * (1 - lmbd) + lpg * lmbd
    return f

#loss_func = nn.MSELoss()

In [88]:
recommendation_model = RecommendationSystemModel(
    num_users=len(lbl_user.classes_),
    num_movies=len(lbl_movie.classes_),
    embedding_size=128,
    hidden_dim=256,
    dropout_rate=0.1,
).to(device)

optimizer = torch.optim.Adam(recommendation_model.parameters(), lr=1e-3)

EPOCHS = 2

# Function to log progress
def log_progress(epoch, step, total_loss, log_progress_step, data_size, losses):
    avg_loss = total_loss / log_progress_step
    sys.stderr.write(
        f"\r{epoch+1:02d}/{EPOCHS:02d} | Step: {step}/{data_size} | Avg Loss: {avg_loss:<6.9f}"
    )
    sys.stderr.flush()
    losses.append(avg_loss)

total_loss = 0
log_progress_step = 100
losses = []
train_dataset_size = len(train_dataset)
print(f"Training on {train_dataset_size} samples...")

recommendation_model.train()
for e in range(EPOCHS):
    step_count = 0  # Reset step count at the beginning of each epoch
    for i, train_data in enumerate(train_loader):
        output = recommendation_model(
            train_data["users"].to(device), train_data["items"].to(device)
        )
        # Reshape the model output to match the target's shape
        output = output.squeeze()  # Removes the singleton dimension
        ratings = (
            train_data["ratings"].to(torch.float32).to(device)
        )  # Assuming ratings is already 1D

        mask = train_data["genre_mask"].to(device)

        loss = loss_func(output, ratings, mask.squeeze(), 0, None)
        total_loss += loss.sum().item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


        # Increment step count by the actual size of the batch
        step_count += len(train_data["users"])

        # Check if it's time to log progress
        if (
            step_count % log_progress_step == 0 or i == len(train_loader) - 1
        ):  # Log at the end of each epoch
            log_progress(
                e, step_count, total_loss, log_progress_step, train_dataset_size, losses
            )
            total_loss = 0

01/02 | Step: 1600/29251 | Avg Loss: 0.537972047

Training on 29251 samples...


02/02 | Step: 29251/29251 | Avg Loss: 0.134126921

In [40]:
y_pred = []
y_true = []
cats = []

recommendation_model.eval()

with torch.no_grad():
    for i, valid_data in enumerate(val_loader):
        cats.extend(valid_data["genre_mask"].cpu().numpy())
        output = recommendation_model(
            valid_data["users"].to(device), valid_data["items"].to(device)
        )
        ratings = valid_data["ratings"].to(device)
        y_pred.extend(output.cpu().numpy())
        y_true.extend(ratings.cpu().numpy())

# Calculate RMSE
rms = root_mean_squared_error(y_true, y_pred)
print(f"RMSE: {rms:.4f}")

# Get RMSE for each genre
genre_rmses = defaultdict(list)
for cat, pred, true in zip(cats, y_pred, y_true):
    genre_rmses[cat.argmax()].append((pred - true) ** 2)

for genre, rmses in genre_rmses.items():
    genre_rmses[genre] = np.sqrt(np.mean(rmses))

print("RMSE per genre:")
for genre, rmse in genre_rmses.items():
    print(f"{train_dataset._genre_lbl_enc.inverse_transform([genre])[0]:<15}: {rmse:.4f}")

# compute mean difference between genres
diff = []
for genre1 in genre_rmses:
    for genre2 in genre_rmses:
        if genre1 != genre2:
            diff.append(abs(genre_rmses[genre1] - genre_rmses[genre2]))

sum(diff) / len(diff)

RMSE: 1.0354
RMSE per genre:
Action         : 1.0343
Drama          : 0.9656
Comedy         : 1.1048
Thriller       : 1.0040
Children       : 1.1943
Adventure      : 1.0414
Musical        : 1.1092
Romance        : 1.0462
War            : 1.0040
Crime          : 1.0520
Sci-Fi         : 1.0445
Horror         : 1.0401
Mystery        : 0.9910
Documentary    : 1.0013
Western        : 1.1606
Animation      : 1.0675
Film-Noir      : 0.9796
Fantasy        : 0.8469


np.float32(0.084586136)

In [22]:
def calculate_precision_recall(user_ratings, k, threshold):
    user_ratings.sort(key=lambda x: x[0], reverse=True)
    n_rel = sum(true_r >= threshold for _, true_r in user_ratings)
    n_rec_k = sum(est >= threshold for est, _ in user_ratings[:k])
    n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold) for est, true_r in user_ratings[:k])

    precision = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
    recall = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
    return precision, recall

user_ratings_comparison = defaultdict(list)

with torch.no_grad():
    for valid_data in val_loader:
        users = valid_data["users"].to(device)
        movies = valid_data["items"].to(device)
        ratings = valid_data["ratings"].to(device)
        output = recommendation_model(users, movies)

        for user, pred, true in zip(users, output, ratings):
            user_ratings_comparison[user.item()].append((pred[0].item(), true.item()))

user_precisions = dict()
user_based_recalls = dict()

k = 50
threshold = 3

for user_id, user_ratings in user_ratings_comparison.items():
    precision, recall = calculate_precision_recall(user_ratings, k, threshold)
    user_precisions[user_id] = precision
    user_based_recalls[user_id] = recall


average_precision = sum(prec for prec in user_precisions.values()) / len(user_precisions)
average_recall = sum(rec for rec in user_based_recalls.values()) / len(user_based_recalls)

print(f"precision @ {k}: {average_precision:.4f}")
print(f"recall @ {k}: {average_recall:.4f}")

precision @ 50: 0.8738
recall @ 50: 0.8639
