# Rating BCE loss with Users Split dataset


In [1]:
import copy
import os
import warnings
from ast import literal_eval
from typing import Any

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torcheval.metrics.functional.ranking import retrieval_precision
from tqdm import tqdm

In [2]:
MANUAL_SEED = 42
torch.manual_seed(MANUAL_SEED)

warnings.filterwarnings("ignore")

## Data loading and preprocessing


In [3]:
def load_dataset(path: str) -> pd.DataFrame:
    loaded_dfs = [
        pd.read_csv(os.path.join(path, file_name)) for file_name in os.listdir(path)
    ]
    return pd.concat(loaded_dfs)


def load_datasets(path: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    return load_dataset(os.path.join(path, "train/")), load_dataset(
        os.path.join(path, "test/")
    )

In [4]:
train_df, val_df = load_datasets("../data/interim/users_split/")

print(f"{len(train_df)=}")
print(f"{len(val_df)=}")

len(train_df)=22896
len(val_df)=2565


In [5]:
NUM_MOVIES = 1682
BASIC_USER_FEATURES = 3

TOTAL_USER_FEATURES = BASIC_USER_FEATURES + 19

In [6]:
class RecommendationDataset(torch.utils.data.Dataset):
    def __init__(self, df: pd.DataFrame):
        self.df = df.drop(columns=["user_id"])
        features = []
        inputs = []
        targets = []
        for _, row in tqdm(df.iterrows(), total=len(df)):
            features.append(
                row[:BASIC_USER_FEATURES].tolist() + literal_eval(row["genres"])
            )
            inputs.append(literal_eval(row["input"]))
            targets.append(literal_eval(row["output"]))

        self.features = np.array(features)

        # normalize ratings
        self.inputs = np.array(inputs) / 5
        self.targets = np.array(targets) / 5

    def __getitem__(self, idx: int) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        input_ratings = self.inputs[idx]
        input_data = np.concatenate([self.features[idx], input_ratings])
        mask = input_ratings == 0
        return input_data, mask, self.targets[idx]

    def __len__(self) -> int:
        return len(self.df)

In [7]:
train_dataset, val_dataset = (
    RecommendationDataset(train_df),
    RecommendationDataset(val_df),
)
print(f"{len(train_dataset)=}")
print(f"{len(val_dataset)=}")

  0%|          | 0/22896 [00:00<?, ?it/s]

100%|██████████| 22896/22896 [03:54<00:00, 97.64it/s] 
100%|██████████| 2565/2565 [00:22<00:00, 113.81it/s]


len(train_dataset)=22896
len(val_dataset)=2565


In [8]:
BATCH_SIZE = 32
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

DEVICE

device(type='cuda')

In [9]:
def collate_batch(batch: list) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    input_data_batch, mask_batch, target_batch = [], [], []
    for input_data, mask, target in batch:
        input_data_batch.append(input_data)
        mask_batch.append(mask)
        target_batch.append(target)

    return (
        torch.Tensor(input_data_batch),
        torch.Tensor(mask_batch).bool(),
        torch.Tensor(target_batch),
    )


train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
val_dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

In [10]:
it = train_dataloader._get_iterator()
inp, mask, out = it._next_data()
print(inp.shape)
print(mask.shape)
print(out.shape)

torch.Size([32, 1704])
torch.Size([32, 1682])
torch.Size([32, 1682])


## Creating the network


In [11]:
INPUT_SIZE = TOTAL_USER_FEATURES + NUM_MOVIES


class RecSys(nn.Module):
    def __init__(
        self,
        hidden_dim1: int = 1024,
        hidden_dim2: int = 1024,
    ):
        super(RecSys, self).__init__()

        self.d1 = nn.Dropout(0.1)

        self.fc1 = nn.Linear(INPUT_SIZE, hidden_dim1)

        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)

        self.fc3 = nn.Linear(hidden_dim2, NUM_MOVIES)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.d1(x)

        x = F.relu(self.fc2(x))

        return F.sigmoid(self.fc3(x))

In [12]:
torch.manual_seed(MANUAL_SEED)


def create_model() -> tuple[nn.Module, Any]:
    model = RecSys()

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    model = model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    return model, optimizer


loss_fn = torch.nn.BCELoss()

## Train model


In [13]:
def train_one_epoch(
    model: nn.Module, loader, optimizer, loss_fn, epoch, use_mask: bool = True
):
    model.train()
    train_loss = 0.0
    total = 0

    loop = tqdm(
        loader,
        total=len(loader),
        desc=f"Epoch {epoch}: train",
        leave=True,
    )
    for batch in loop:
        input_data, mask, target = batch
        input_data, target, mask = (
            input_data.to(DEVICE),
            target.to(DEVICE),
            mask.to(DEVICE),
        )

        # forward pass and loss calculation
        outputs = model(input_data)

        # zero the parameter gradients
        optimizer.zero_grad()

        positive_targets = (target > 0).float()
        if use_mask:
            loss = loss_fn(
                torch.masked_select(outputs, mask),
                torch.masked_select(positive_targets, mask),
            )
        else:
            loss = loss_fn(outputs, positive_targets)

        # backward pass
        loss.backward()
        total += target.shape[1]

        # optimizer run
        optimizer.step()

        train_loss += loss.item()
        loop.set_postfix({"loss": train_loss / total})


def val_one_epoch(model: nn.Module, loader, loss_fn, epoch, use_mask: bool = True):
    loop = tqdm(
        loader,
        total=len(loader),
        desc=f"Epoch {epoch}: val",
        leave=True,
    )
    val_loss = 0.0
    total = 0
    with torch.no_grad():
        model.eval()  # evaluation mode
        for batch in loop:
            input_data, mask, target = batch
            input_data, target, mask = (
                input_data.to(DEVICE),
                target.to(DEVICE),
                mask.to(DEVICE),
            )

            outputs = model(input_data)

            positive_targets = (target > 0).float()
            if use_mask:
                loss = loss_fn(
                    torch.masked_select(outputs, mask),
                    torch.masked_select(positive_targets, mask),
                )
            else:
                loss = loss_fn(outputs, positive_targets)

            val_loss += loss.item()
            total += target.shape[1]
            loop.set_postfix({"loss": val_loss / total})
    return val_loss / total

In [14]:
NUM_EPOCHS = 5


def train_model(
    model: nn.Module,
    optimizer,
    loss_fn,
    train_dataloader,
    val_dataloader,
    save_path: str,
    use_mask: bool = True,
) -> nn.Module:
    best_loss = 1e10

    for epoch in range(1, NUM_EPOCHS + 1):
        train_one_epoch(
            model, train_dataloader, optimizer, loss_fn, epoch, use_mask=use_mask
        )
        val_loss = val_one_epoch(model, val_dataloader, loss_fn, epoch, use_mask=use_mask)
        if val_loss <= best_loss:
            val_loss = best_loss
            torch.save(model, save_path)

    return copy.deepcopy(model)

In [15]:
model, optimizer = create_model()
model_mask, optimizer_mask = create_model()

In [16]:
best = train_model(
    model,
    optimizer,
    loss_fn,
    train_dataloader,
    val_dataloader,
    "../models/rating_bce_users_split",
    use_mask=False,
)

Epoch 1: train: 100%|██████████| 716/716 [00:59<00:00, 11.99it/s, loss=9.32e-5] 
Epoch 1: val: 100%|██████████| 81/81 [00:02<00:00, 27.19it/s, loss=8.62e-5]
Epoch 2: train: 100%|██████████| 716/716 [00:34<00:00, 20.78it/s, loss=5.84e-5]
Epoch 2: val: 100%|██████████| 81/81 [00:03<00:00, 23.27it/s, loss=8.78e-5] 
Epoch 3: train: 100%|██████████| 716/716 [00:39<00:00, 18.01it/s, loss=4.08e-5]
Epoch 3: val: 100%|██████████| 81/81 [00:04<00:00, 19.00it/s, loss=9.71e-5] 
Epoch 4: train: 100%|██████████| 716/716 [00:37<00:00, 18.86it/s, loss=2.79e-5]
Epoch 4: val: 100%|██████████| 81/81 [00:04<00:00, 20.19it/s, loss=0.000109]
Epoch 5: train: 100%|██████████| 716/716 [00:43<00:00, 16.33it/s, loss=1.97e-5]
Epoch 5: val: 100%|██████████| 81/81 [00:04<00:00, 19.53it/s, loss=0.000129]


In [17]:
best_mask = train_model(
    model_mask,
    optimizer_mask,
    loss_fn,
    train_dataloader,
    val_dataloader,
    "../models/rating_bce_users_split_mask",
    use_mask=True,
)

Epoch 1: train: 100%|██████████| 716/716 [00:38<00:00, 18.48it/s, loss=6.58e-5] 
Epoch 1: val: 100%|██████████| 81/81 [00:04<00:00, 18.24it/s, loss=6.49e-5]
Epoch 2: train: 100%|██████████| 716/716 [00:37<00:00, 19.20it/s, loss=4.65e-5]
Epoch 2: val: 100%|██████████| 81/81 [00:03<00:00, 20.57it/s, loss=5.99e-5]
Epoch 3: train: 100%|██████████| 716/716 [00:38<00:00, 18.46it/s, loss=3.82e-5]
Epoch 3: val: 100%|██████████| 81/81 [00:03<00:00, 20.62it/s, loss=6.62e-5]
Epoch 4: train: 100%|██████████| 716/716 [00:38<00:00, 18.69it/s, loss=3.08e-5]
Epoch 4: val: 100%|██████████| 81/81 [00:03<00:00, 21.04it/s, loss=6.41e-5]
Epoch 5: train: 100%|██████████| 716/716 [00:41<00:00, 17.06it/s, loss=2.5e-5] 
Epoch 5: val: 100%|██████████| 81/81 [00:03<00:00, 20.86it/s, loss=6.86e-5]


## Test models


In [18]:
model = torch.load("../models/rating_bce_users_split")
model.eval()

RecSys(
  (d1): Dropout(p=0.1, inplace=False)
  (fc1): Linear(in_features=1704, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=1682, bias=True)
)

In [19]:
model_mask = torch.load("../models/rating_bce_users_split_mask")
model_mask.eval()

RecSys(
  (d1): Dropout(p=0.1, inplace=False)
  (fc1): Linear(in_features=1704, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=1682, bias=True)
)

In [20]:
def get_single_output(
    model: nn.Module,
    input_data: np.ndarray,
):
    with torch.no_grad():
        model.eval()
        input_tensor = torch.Tensor([input_data]).to(DEVICE)
        model_out = model(input_tensor)

    return model_out[0].cpu().numpy()

In [21]:
def load_genres(path: str) -> list[str]:
    return pd.read_csv(
        os.path.join(path, "u.genre"),
        sep="|",
        header=None,
        names=["name", "genre_idx"],
        encoding="ISO-8859-1",
    )["name"].tolist()


def load_items(path: str, genres: list[str]) -> pd.DataFrame:
    return pd.read_csv(
        os.path.join(path, "u.item"),
        sep="|",
        header=None,
        names=[
            "movie_id",
            "movie_title",
            "release_date",
            "video_release_date",
            "IMDb_URL",
            *genres,
        ],
        encoding="ISO-8859-1",
    )


genres = load_genres("../data/raw/ml-100k/")
movies_df = load_items("../data/raw/ml-100k/", genres)

In [24]:
def get_unseen_on_input_data(
    input_rating: np.ndarray, movie_ratings: np.ndarray
) -> np.ndarray:
    unseen_ratings = movie_ratings.copy()
    seen_indices = np.nonzero(input_rating > 0)[0]
    unseen_ratings[seen_indices] = 0
    return unseen_ratings


def calculate_genre_ratios(
    movie_indices: np.ndarray, items_df: pd.DataFrame
) -> np.ndarray:
    genres_sum = (
        items_df[items_df["movie_id"].isin(movie_indices + 1)]
        .iloc[:, 5:]
        .sum(axis=0)
        .to_numpy()
    )
    return genres_sum / genres_sum.sum()


def get_recommendations(
    model: nn.Module,
    encoded_age: float,
    encoded_gender: int,
    encoded_occupation: int,
    movie_indices: list[int],
    movies_df: pd.DataFrame,
    predicted_threshold: float,
    num_recs: int = 5,
) -> np.ndarray:
    movie_indices_shifted = np.array(movie_indices) - 1  # starting from 0

    movies_ratings = np.zeros(NUM_MOVIES)
    movies_ratings[movie_indices_shifted] = 1.0  # rating = 5
    input_vector = np.array(
        [
            encoded_age,
            encoded_gender,
            encoded_occupation,
            *calculate_genre_ratios(np.array(movie_indices_shifted), movies_df),
            *movies_ratings,
        ]
    )

    predictions = get_single_output(model, input_vector)
    predictions[predictions < predicted_threshold] = 0.0
    unseen_predictions = get_unseen_on_input_data(movies_ratings, predictions)

    movie_ids = np.argsort(-unseen_predictions) + 1

    unknown_idx = 267  # actual idx (from 1)
    movie_ids = np.delete(movie_ids, np.where(movie_ids == unknown_idx))

    return movie_ids[:num_recs]


def get_movie_titles(
    recommended_movies: np.ndarray, movies_df: pd.DataFrame
) -> list[str]:
    return [
        movies_df[movies_df["movie_id"] == movie_id]["movie_title"].to_list()[0]
        for movie_id in recommended_movies
    ]


def show_recommendations(
    models_set: list[tuple[str, nn.Module]],
    movies_set: list[tuple[str, list[int]]],
    predicted_threshold: float = 0.0,
):
    for movies_name, movies in movies_set:
        print(movies_name)
        for model_name, model in models_set:
            recommended_movies = get_recommendations(
                model, 0.21, 1, 19, movies, movies_df, predicted_threshold
            )
            print(f"{model_name:10}: {get_movie_titles(recommended_movies, movies_df)}")
        print()

In [25]:
models_set = [
    ("No mask", model),
    ("Mask", model_mask),
]

movies_set = [
    (
        "SCI-FI",
        [50, 257, 204, 181],
    ),  # Star Wars, MIB, Back to The Future, Return of the Jedi
    ("CARTOONS", [1, 225, 465, 501]),  # Toy Story, 101 Dalmatians, Jungle Book, Dumbo
    ("STAR TRACK", [222, 228, 380, 449]),  # Star Tracks
    ("PULP FICTION", [56]),  # Pulp Fiction
]

show_recommendations(models_set, movies_set)

SCI-FI
No mask   : ['Godfather, The (1972)', 'Evil Dead II (1987)', 'Crumb (1994)', 'Terminator, The (1984)', 'Scream (1996)']
Mask      : ['Perfect World, A (1993)', 'Raging Bull (1980)', 'Terminator, The (1984)', 'Crumb (1994)', 'Star Trek: The Wrath of Khan (1982)']

CARTOONS
No mask   : ['Star Trek IV: The Voyage Home (1986)', 'True Lies (1994)', 'Star Trek VI: The Undiscovered Country (1991)', 'Star Trek: The Wrath of Khan (1982)', 'Batman Returns (1992)']
Mask      : ['Blade Runner (1982)', 'Crumb (1994)', 'Alien (1979)', 'Casablanca (1942)', '2001: A Space Odyssey (1968)']

STAR TRACK
No mask   : ['Henry V (1989)', 'Pulp Fiction (1994)', 'Batman Forever (1995)', 'Fugitive, The (1993)', 'Contact (1997)']
Mask      : ['GoodFellas (1990)', 'Crumb (1994)', 'Event Horizon (1997)', 'Contact (1997)', 'Bananas (1971)']

PULP FICTION
No mask   : ['Star Wars (1977)', 'Star Trek: First Contact (1996)', 'Crumb (1994)', 'L.A. Confidential (1997)', 'Empire Strikes Back, The (1980)']
Mask     

In [26]:
show_recommendations(models_set, movies_set, predicted_threshold=0.8)  # rating >= 4

SCI-FI
No mask   : ['Chasing Amy (1997)', 'Crumb (1994)', 'Indiana Jones and the Last Crusade (1989)', 'Groundhog Day (1993)', 'Evil Dead II (1987)']
Mask      : ['Henry V (1989)', 'Jackie Brown (1997)', 'Streetcar Named Desire, A (1951)', 'Crumb (1994)', 'Bridge on the River Kwai, The (1957)']

CARTOONS
No mask   : ['Independence Day (ID4) (1996)', 'Mother (1996)', 'Mimic (1997)', 'Pulp Fiction (1994)', 'Blues Brothers, The (1980)']
Mask      : ['Die Hard (1988)', 'Star Trek: First Contact (1996)', 'Henry V (1989)', 'Blade Runner (1982)', 'Mother (1996)']

STAR TRACK
No mask   : ['Fugitive, The (1993)', 'Event Horizon (1997)', 'Batman Forever (1995)', 'Star Trek VI: The Undiscovered Country (1991)', 'Contact (1997)']
Mask      : ['Starship Troopers (1997)', 'Raging Bull (1980)', 'Terminator, The (1984)', 'Bridge on the River Kwai, The (1957)', 'Patton (1970)']

PULP FICTION
No mask   : ['Heat (1995)', 'Scream (1996)', 'Liar Liar (1997)', 'Star Wars (1977)', 'L.A. Confidential (1997)']

## Metrics

In [27]:
def generate_test_data(
    model: nn.Module, dataset: RecommendationDataset
) -> list[tuple[np.ndarray, np.ndarray]]:
    test_data = []

    for input_data, _, target in tqdm(dataset):
        predicted = get_single_output(model, input_data)

        input_ratings = input_data[TOTAL_USER_FEATURES:]
        unseen_predicted = get_unseen_on_input_data(input_ratings, predicted)
        unseen_target = get_unseen_on_input_data(input_ratings, target)
        test_data.append((unseen_target, unseen_predicted))

    return test_data

In [28]:
def get_top_args(x: np.ndarray, n: int) -> np.ndarray:
    return np.argsort(-x)[:n]


def top_intersection(target: np.ndarray, predicted: np.ndarray, top_n: int = 20):
    return list(
        set(get_top_args(target, top_n)).intersection(get_top_args(predicted, top_n))
    )


def top_k_intersections(
    data: list[tuple[np.ndarray, np.ndarray]], k: int, threshold: float = 0.0
) -> list[int]:
    intersections = []
    for unseen_target, unseen_predicted in data:
        nonzero_targets = unseen_target[unseen_target > threshold]
        relevant_predicted = unseen_predicted[unseen_predicted > threshold]
        intersections.append(
            len(top_intersection(nonzero_targets, relevant_predicted, k))
        )

    return intersections


def retrieval_precisions_on_k(
    data: list[tuple[np.ndarray, np.ndarray]], k: int
) -> list[int]:
    retrieval_precisions = []
    for unseen_target, unseen_predicted in data:
        nonzero_targets = unseen_target > 0
        relevant_predicted = unseen_predicted

        retrieval_precisions.append(
            retrieval_precision(
                torch.Tensor(relevant_predicted), torch.Tensor(nonzero_targets), k
            )
        )

    return retrieval_precisions


def average_precision_on_k(target: np.ndarray, predicted: np.ndarray, k: int) -> float:
    relevant_predicted = predicted.copy()
    if len(relevant_predicted) > k:
        relevant_predicted = relevant_predicted[:k]

    score = 0.0
    hits = 0

    for idx, x in enumerate(relevant_predicted):
        if x in target and x not in relevant_predicted[:idx]:
            hits += 1
            score += hits / (idx + 1.0)

    return score / min(len(target), k)


def map_on_k(targets: list[np.ndarray], predictions: list[np.ndarray], k: int) -> float:
    return np.mean(
        [
            average_precision_on_k(target, predicted, k)
            for target, predicted in zip(targets, predictions)
        ]
    )


def generate_total_data_lists(
    data: list[tuple[np.ndarray, np.ndarray]]
) -> tuple[list[np.ndarray], list[np.ndarray]]:
    all_targets = []
    all_predictions = []
    for unseen_target, unseen_predicted in data:
        nonzero_targets = unseen_target > 0
        all_targets.append(
            np.argsort(nonzero_targets)[len(nonzero_targets) - sum(nonzero_targets) :]
        )
        all_predictions.append(np.argsort(-unseen_predicted))

    return all_targets, all_predictions

In [29]:
def show_metrics(data: list[tuple[np.ndarray, np.ndarray]], ks: list[int]):
    all_targets, all_predictions = generate_total_data_lists(data)
    for k in ks:
        print(f"K={k}")
        intersections = top_k_intersections(data, k)
        retrieval_precisions = retrieval_precisions_on_k(data, k)
        map_score = map_on_k(all_targets, all_predictions, k)

        print(f"Mean top intersections: {np.mean(intersections)}")
        print(f"Mean retrieval precision: {np.mean(retrieval_precisions)}")
        print(f"MAP: {map_score}")
        print()

In [30]:
ks = [5, 10, 20, 50]

test_data = generate_test_data(model, val_dataset)
test_data_mask = generate_test_data(model_mask, val_dataset)

100%|██████████| 2565/2565 [00:03<00:00, 726.85it/s] 
100%|██████████| 2565/2565 [00:02<00:00, 951.36it/s] 


In [31]:
show_metrics(test_data, ks)

K=5
Mean top intersections: 0.14035087719298245
Mean retrieval precision: 0.5110331773757935
MAP: 0.43053822828676636

K=10
Mean top intersections: 0.4280701754385965
Mean retrieval precision: 0.4589473605155945
MAP: 0.3593875801963266

K=20
Mean top intersections: 1.1606237816764133
Mean retrieval precision: 0.3972514569759369
MAP: 0.30376922329838046

K=50
Mean top intersections: 3.9485380116959066
Mean retrieval precision: 0.3097154200077057
MAP: 0.2682276276495463



In [32]:
show_metrics(test_data_mask, ks)

K=5
Mean top intersections: 0.23235867446393763
Mean retrieval precision: 0.5186744928359985
MAP: 0.4439377301277886

K=10
Mean top intersections: 0.5220272904483431
Mean retrieval precision: 0.46627679467201233
MAP: 0.3738384316292397

K=20
Mean top intersections: 1.280701754385965
Mean retrieval precision: 0.4095321595668793
MAP: 0.32047008088875356

K=50
Mean top intersections: 4.221832358674464
Mean retrieval precision: 0.318401575088501
MAP: 0.28206564705192194

