# Rating loss 2.0

In [1]:
import copy
import os
import warnings
from ast import literal_eval

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

In [2]:
MANUAL_SEED = 42
torch.manual_seed(MANUAL_SEED)

warnings.filterwarnings("ignore")

## Data loading and preprocessing

In [3]:
path = os.path.join(".", "../data/raw/user_masks/")
loaded_dfs = [
    pd.read_csv(os.path.join(path, file_name)) for file_name in os.listdir(path)
]
df = pd.concat(loaded_dfs).drop(columns=["user_id"]).reset_index()
df.head()

Unnamed: 0,index,age,gender,occupation,input,output,genres
0,0,0.24,1.0,19.0,"[5.0, 3.0, 4.0, 3.0, 3.0, 5.0, 4.0, 1.0, 5.0, ...","[5.0, 3.0, 4.0, 3.0, 3.0, 5.0, 4.0, 1.0, 5.0, ...","[0.0019047619047619048, 0.1219047619047619, 0...."
1,1,0.53,0.0,13.0,"[4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.08333333333333333, 0.02777777777777777..."
2,2,0.23,1.0,20.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.10810810810810811, 0.03603603603603603..."
3,3,0.24,1.0,19.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.14814814814814814, 0.07407407407407407..."
4,4,0.33,0.0,13.0,"[4.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0028735632183908046, 0.14942528735632185, 0..."


In [4]:
len(literal_eval(df["genres"][0]))

19

In [5]:
TRAIN_RATIO = 0.9

train_df = df.sample(frac=TRAIN_RATIO, random_state=MANUAL_SEED)
val_df = df.drop(train_df.index)

train_df.drop(columns=["index"], inplace=True)
val_df.drop(columns=["index"], inplace=True)

print(f"{len(train_df)=}")
print(f"{len(val_df)=}")

len(train_df)=22915
len(val_df)=2546


In [6]:
NUM_MOVIES = 1682
USER_FEATURES = 3 + 19

In [7]:
class RecommendationDataset(torch.utils.data.Dataset):
    def __init__(self, df: pd.DataFrame):
        self.df = df
        features = []
        inputs = []
        targets = []
        for _, row in tqdm(df.iterrows(), total=len(df)):
            features.append(row[:3].tolist() + literal_eval(row["genres"]))
            inputs.append(literal_eval(row["input"]))
            targets.append(literal_eval(row["output"]))

        self.features = np.array(features)
        self.inputs = np.array(inputs) / 5
        self.targets = np.array(targets) / 5

    def __getitem__(self, idx: int) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        input_ratings = self.inputs[idx]
        # print(input_ratings)
        input_data = np.concatenate([self.features[idx], input_ratings])
        mask = input_ratings == 0
        return input_data, mask, self.targets[idx]

    def __len__(self) -> int:
        return len(self.df)

In [8]:
train_dataset, val_dataset = (
    # RecommendationDataset(train_df.iloc[:100, :]),
    # RecommendationDataset(val_df.iloc[:100, :]),
    RecommendationDataset(train_df),
    RecommendationDataset(val_df),
)
print(f"{len(train_dataset)=}")
print(f"{len(val_dataset)=}")

  0%|          | 21/22915 [00:00<05:38, 67.65it/s]

100%|██████████| 22915/22915 [03:22<00:00, 113.11it/s]
100%|██████████| 2546/2546 [00:17<00:00, 144.93it/s]


len(train_dataset)=22915
len(val_dataset)=2546


In [9]:
BATCH_SIZE = 32

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

DEVICE

device(type='cuda')

In [10]:
def collate_batch(batch: list) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    input_data_batch, mask_batch, target_batch = [], [], []
    for input_data, mask, target in batch:
        input_data_batch.append(input_data)
        mask_batch.append(mask)
        target_batch.append(target)

    return (
        torch.Tensor(input_data_batch),
        torch.Tensor(mask_batch).bool(),
        torch.Tensor(target_batch),
    )


train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
val_dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

In [11]:
for batch in train_dataloader:
    inp, mask, out = batch
    print(inp.shape)
    print(mask.shape)
    print(out.shape)
    break

torch.Size([32, 1704])
torch.Size([32, 1682])
torch.Size([32, 1682])


In [12]:
it = train_dataloader._get_iterator()

it._next_data()

(tensor([[ 0.1700,  1.0000, 18.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.1900,  0.0000, 18.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.4200,  1.0000, 14.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.2300,  1.0000, 13.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.2700,  0.0000, 18.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.2500,  1.0000, 14.0000,  ...,  0.0000,  0.0000,  0.0000]]),
 tensor([[False,  True,  True,  ...,  True,  True,  True],
         [False,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True],
         ...,
         [False,  True,  True,  ...,  True,  True,  True],
         [ True,  True, False,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True]]),
 tensor([[0.6000, 0.6000, 1.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.8000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.

## Creating the network

In [13]:
INPUT_SIZE = USER_FEATURES + NUM_MOVIES

In [14]:
class RecSys(nn.Module):
    def __init__(
        self,
        hidden_dim1: int = 1024,
        hidden_dim2: int = 1024,
        hidden_dim3: int = 1024,
    ):
        super(RecSys, self).__init__()
        self.d1 = nn.Dropout(0.1)
        self.d2 = nn.Dropout(0.2)
        self.fc1 = nn.Linear(INPUT_SIZE, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, hidden_dim3)
        self.fc4 = nn.Linear(hidden_dim3, NUM_MOVIES)

    def forward(self, x):
        x = self.d1(x)
        x = F.relu(self.fc1(x))
        x = self.d2(x)
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return F.sigmoid(self.fc4(x))
        # return F.relu(self.fc4(x))

In [53]:
torch.manual_seed(MANUAL_SEED)

model = RecSys()

for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

model = model.to(DEVICE)

loss_fn = torch.nn.BCELoss()
# loss_fn = torch.nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

## Train model

In [54]:
def train_one_epoch(
    model,
    loader,
    optimizer,
    loss_fn,
    epoch,
):
    model.train()
    train_loss = 0.0
    total = 0

    loop = tqdm(
        loader,
        total=len(loader),
        desc=f"Epoch {epoch}: train",
        leave=True,
    )
    for batch in loop:
        input_data, mask, target = batch
        input_data, target, mask = (
            input_data.to(DEVICE),
            target.to(DEVICE),
            mask.to(DEVICE),
        )

        # forward pass and loss calculation
        outputs = model(input_data)

        # zero the parameter gradients
        optimizer.zero_grad()

        # loss = loss_fn(
        #     torch.masked_select(outputs, mask), torch.masked_select(target, mask)
        # )
        # loss = loss_fn(outputs, target)
        loss = loss_fn(outputs, (target > 0).float())

        # backward pass
        loss.backward()
        total += target.shape[1]

        # optimizer run
        optimizer.step()

        train_loss += loss.item()
        loop.set_postfix({"loss": train_loss / total})


def val_one_epoch(
    model,
    loader,
    loss_fn,
    epoch,
):
    loop = tqdm(
        loader,
        total=len(loader),
        desc=f"Epoch {epoch}: val",
        leave=True,
    )
    val_loss = 0.0
    total = 0
    with torch.no_grad():
        model.eval()  # evaluation mode
        for batch in loop:
            input_data, mask, target = batch
            input_data, target, mask = (
                input_data.to(DEVICE),
                target.to(DEVICE),
                mask.to(DEVICE),
            )

            outputs = model(input_data)

            # loss = loss_fn(
            #     torch.masked_select(outputs, mask), torch.masked_select(target, mask)
            # )
            # loss = loss_fn(outputs, target)

            loss = loss_fn(outputs, (target > 0).float())

            val_loss += loss.item()
            total += target.shape[1]
            loop.set_postfix({"loss": val_loss / total})
    return val_loss / total

In [55]:
NUM_EPOCHS = 10

best_loss = 1e10

for epoch in range(1, NUM_EPOCHS + 1):
    train_one_epoch(model, train_dataloader, optimizer, loss_fn, epoch)
    val_loss = val_one_epoch(model, val_dataloader, loss_fn, epoch)
    if val_loss <= best_loss:
        val_loss = best_loss
        torch.save(model, "../models/rating_loss_2")


best = copy.deepcopy(model)

Epoch 1: train: 100%|██████████| 717/717 [00:34<00:00, 21.01it/s, loss=8.2e-5]  
Epoch 1: val: 100%|██████████| 80/80 [00:03<00:00, 25.33it/s, loss=6.34e-5]
Epoch 2: train: 100%|██████████| 717/717 [00:36<00:00, 19.84it/s, loss=5.35e-5]
Epoch 2: val: 100%|██████████| 80/80 [00:03<00:00, 24.98it/s, loss=4.04e-5]
Epoch 3: train: 100%|██████████| 717/717 [00:34<00:00, 20.73it/s, loss=3.02e-5]
Epoch 3: val: 100%|██████████| 80/80 [00:03<00:00, 24.81it/s, loss=2.15e-5]
Epoch 4: train: 100%|██████████| 717/717 [00:34<00:00, 20.65it/s, loss=1.63e-5]
Epoch 4: val: 100%|██████████| 80/80 [00:03<00:00, 25.65it/s, loss=1.37e-5]
Epoch 5: train: 100%|██████████| 717/717 [00:36<00:00, 19.87it/s, loss=1.02e-5]
Epoch 5: val: 100%|██████████| 80/80 [00:03<00:00, 22.87it/s, loss=1.05e-5]
Epoch 6: train:  14%|█▍        | 99/717 [00:05<00:31, 19.46it/s, loss=7.2e-6] 


KeyboardInterrupt: 

## Test model

In [56]:
model = torch.load("../models/rating_loss_2")
model.eval()

RecSys(
  (d1): Dropout(p=0.1, inplace=False)
  (d2): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=1704, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=1024, bias=True)
  (fc4): Linear(in_features=1024, out_features=1682, bias=True)
)

In [57]:
def greedy_test(
    model,
    input_data: torch.Tensor,
):
    with torch.no_grad():
        model.eval()

        input_data = input_data.to(DEVICE)

        model_out = model(input_data)

    return model_out

In [58]:
def get_new_data(input_data, target, predicted):
    input_ratings = input_data[USER_FEATURES:]
    remove_indices = np.nonzero(input_ratings > 0)[0]
    new_target = np.delete(target, remove_indices)
    new_predicted = np.delete(predicted, remove_indices)

    return new_target, new_predicted


def sort_args(x, n):
    return np.argsort(-x)[:n]


def top_intersection(target, predicted, top_n=20):
    return list(set(sort_args(target, top_n)).intersection(sort_args(predicted, top_n)))

In [66]:
from torcheval.metrics.functional.ranking import retrieval_precision

k = 10
retrieval_precisions = []

for input_data, _, masked_target in val_dataset:
    predictions = greedy_test(model, torch.Tensor([input_data]))
    target = masked_target
    predicted = predictions[0].cpu().numpy()

    new_target, new_predicted = get_new_data(input_data, target, predicted)

    nonzero_targets = new_target > 0
    relevant_predicted = new_predicted

    retrieval_precisions.append(
        retrieval_precision(
            torch.Tensor(relevant_predicted), torch.Tensor(nonzero_targets), k
        )
    )

print(f"{len(retrieval_precisions)=}")
print(f"{np.mean(retrieval_precisions)=}")
print(f"{np.max(retrieval_precisions)=}")
print(f"{np.min(retrieval_precisions)=}")

len(retrieval_precisions)=2546
np.mean(retrieval_precisions)=0.86771405
np.max(retrieval_precisions)=1.0
np.min(retrieval_precisions)=0.0


In [102]:
k = 10

intersections = []
for input_data, _, target in val_dataset:
    predictions = greedy_test(model, torch.Tensor([input_data]))
    predicted = predictions[0].cpu().numpy()

    new_target, new_predicted = get_new_data(input_data, target, predicted)

    nonzero_targets = new_target[new_target > 0]
    relevant_predicted = new_predicted[new_predicted > 0.2]

    intersections.append(len(top_intersection(nonzero_targets, relevant_predicted, k)))
print(f"{len(intersections)=}")
print(f"{np.mean(intersections)=}")
print(f"{np.max(intersections)=}")
print(f"{np.min(intersections)=}")

len(intersections)=2546
np.mean(intersections)=3.10840534171249
np.max(intersections)=10
np.min(intersections)=0


In [None]:
t = target > 0
np.argsort(t)[len(t) - sum(t) :]

array([ 215,  214,  260,   30,  264,  603,  309,  123,  209, 1203,  477,
        478,  413,  258,  356,  606,  322,  968,  134,  233, 1049,  317,
        434,  327,  257,  614,  315,  704, 1220,  314,  426,  312,  479,
        422,  130,  877,  268, 1027,  688,  281,  299,  510,  182,   78,
        513,  583,  527,  519,  677,  361,  173,   98,  749,  171,   94,
        346,   70,  302,  538,  483,   96,   49,  486,  116,  271,  199,
        944,  878,  891,  196,  303,  495,  660,  497,  661,  499,  192,
        658,  321], dtype=int64)

In [103]:
all_targets = []
all_predictions = []
for input_data, _, target in val_dataset:
    predictions = greedy_test(model, torch.Tensor([input_data]))
    predicted = predictions[0].cpu().numpy()

    new_target, new_predicted = get_new_data(input_data, target, predicted)

    t = new_target > 0
    all_targets.append(np.argsort(t)[len(t) - sum(t) :])
    all_predictions.append(np.argsort(-new_predicted))

    t = target > 0
    # all_targets.append(np.argsort(t)[len(t)-sum(t):])
    # all_predictions.append(np.argsort(-predicted))

In [96]:
def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

In [107]:
mapk(all_targets, all_predictions, 5)

0.9379484158156585