# Recommendation system solution

In [1]:
import copy
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torcheval.metrics.functional.ranking import retrieval_precision
from tqdm import tqdm

In [2]:
MANUAL_SEED = 42
torch.manual_seed(MANUAL_SEED)

warnings.filterwarnings("ignore")

## Data loading and preprocessing

In [3]:
df = pd.read_csv("../data/raw/users_with_masks.csv", sep="\t")
print(f"{len(df)=}")
df.head()

len(df)=4715


Unnamed: 0,age,gender,occupation,zip_code,genre_unknown,genre_Action,genre_Adventure,genre_Animation,genre_Children's,genre_Comedy,...,rating_1672,rating_1673,rating_1674,rating_1675,rating_1676,rating_1677,rating_1678,rating_1679,rating_1680,rating_1681
0,0.24,1.0,0.904762,0.569647,0.001704,0.127768,0.07155,0.020443,0.042589,0.155026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.24,1.0,0.904762,0.569647,0.001704,0.127768,0.07155,0.020443,0.042589,0.155026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.24,1.0,0.904762,0.569647,0.001704,0.127768,0.07155,0.020443,0.042589,0.155026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.24,1.0,0.904762,0.569647,0.001704,0.127768,0.07155,0.020443,0.042589,0.155026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.24,1.0,0.904762,0.569647,0.001704,0.127768,0.07155,0.020443,0.042589,0.155026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
TRAIN_RATIO = 0.9

train_df = df.sample(frac=TRAIN_RATIO, random_state=MANUAL_SEED)
val_df = df.drop(train_df.index)

print(f"{len(train_df)=}")
print(f"{len(val_df)=}")

len(train_df)=4244
len(val_df)=471


In [5]:
NUM_MOVIES = 1682
USER_FEATURES = 23

In [6]:
class RecommendationDataset(torch.utils.data.Dataset):
    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.inputs = self.df.iloc[:, : USER_FEATURES + NUM_MOVIES].to_numpy()
        self.targets = self.df.iloc[
            :, USER_FEATURES + NUM_MOVIES : USER_FEATURES + NUM_MOVIES * 2
        ].to_numpy()

    def _mask(self, value: np.ndarray, mask: np.ndarray) -> np.ndarray:
        return value[mask.nonzero()]

    def __getitem__(self, idx: int) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        input_data = self.inputs[idx]
        input_ratings = input_data[USER_FEATURES:]
        mask = input_ratings == 0
        target = self.targets[idx]
        return input_data, mask, target

    def __len__(self) -> int:
        return len(self.df)

In [7]:
dataset = RecommendationDataset(df)

In [8]:
train_dataset, val_dataset = (
    RecommendationDataset(train_df),
    RecommendationDataset(val_df),
)
print(f"{len(train_dataset)=}")
print(f"{len(val_dataset)=}")

len(train_dataset)=4244
len(val_dataset)=471


In [9]:
BATCH_SIZE = 128

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

DEVICE

device(type='cuda')

In [10]:
def collate_batch(batch: list) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    input_data_batch, mask_batch, target_batch = [], [], []
    for input_data, mask, target in batch:
        input_data_batch.append(input_data)
        mask_batch.append(mask)
        target_batch.append(target)

    return (
        torch.Tensor(input_data_batch),
        torch.Tensor(mask_batch).bool(),
        torch.Tensor(target_batch),
    )


train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
val_dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

In [11]:
for batch in train_dataloader:
    inp, mask, out = batch
    print(inp.shape)
    print(mask.shape)
    print(out.shape)
    break

torch.Size([128, 1705])
torch.Size([128, 1682])
torch.Size([128, 1682])


In [12]:
it = train_dataloader._get_iterator()

it._next_data()

(tensor([[0.4800, 0.0000, 0.9524,  ..., 0.0000, 0.0000, 0.0000],
         [0.4400, 1.0000, 0.0476,  ..., 0.0000, 0.0000, 0.0000],
         [0.2200, 1.0000, 0.8571,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.3000, 1.0000, 0.4286,  ..., 0.0000, 0.0000, 0.0000],
         [0.3100, 1.0000, 0.1429,  ..., 0.0000, 0.0000, 0.0000],
         [0.6000, 1.0000, 0.9524,  ..., 0.0000, 0.0000, 0.0000]]),
 tensor([[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]]),
 tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [1.0000, 0.4000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.8000, 0.0000, 0.0000,  ..., 0.0000, 0.0

## Creating the network

In [13]:
INPUT_SIZE = USER_FEATURES + NUM_MOVIES

In [14]:
class RecSys(nn.Module):
    def __init__(
        self,
        hidden_dim1: int = 1024,
        hidden_dim2: int = 1024,
        hidden_dim3: int = 1024,
    ):
        super(RecSys, self).__init__()
        self.d1 = nn.Dropout(0.1)
        self.d2 = nn.Dropout(0.2)
        self.fc1 = nn.Linear(INPUT_SIZE, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, hidden_dim3)
        self.fc4 = nn.Linear(hidden_dim3, NUM_MOVIES)

    def forward(self, x):
        x = self.d1(x)
        x = F.relu(self.fc1(x))
        x = self.d2(x)
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return F.sigmoid(self.fc4(x))
        # return F.relu(self.fc4(x))

In [15]:
torch.manual_seed(MANUAL_SEED)

model = RecSys()

for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

model = model.to(DEVICE)

# loss_fn = torch.nn.MSELoss(reduction='sum')
loss_fn = torch.nn.MSELoss()
# loss_fn = torch.nn.L1Loss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

## Train model

In [16]:
def train_one_epoch(
    model,
    loader,
    optimizer,
    loss_fn,
    epoch,
):
    model.train()
    train_loss = 0.0
    total = 0

    loop = tqdm(
        loader,
        total=len(loader),
        desc=f"Epoch {epoch}: train",
        leave=True,
    )
    for batch in loop:
        input_data, mask, target = batch
        input_data, target, mask = (
            input_data.to(DEVICE),
            target.to(DEVICE),
            mask.to(DEVICE),
        )

        # forward pass and loss calculation
        outputs = model(input_data)

        # zero the parameter gradients
        optimizer.zero_grad()

        # loss = loss_fn(torch.masked_select(outputs, mask), torch.masked_select(target, mask))
        loss = loss_fn(outputs, target)

        # backward pass
        loss.backward()
        total += target.shape[1]

        # optimizer run
        optimizer.step()

        train_loss += loss.item()
        loop.set_postfix({"loss": train_loss / total})


def val_one_epoch(
    model,
    loader,
    loss_fn,
    epoch,
):
    loop = tqdm(
        loader,
        total=len(loader),
        desc=f"Epoch {epoch}: val",
        leave=True,
    )
    val_loss = 0.0
    total = 0
    with torch.no_grad():
        model.eval()  # evaluation mode
        for batch in loop:
            input_data, mask, target = batch
            input_data, target, mask = (
                input_data.to(DEVICE),
                target.to(DEVICE),
                mask.to(DEVICE),
            )

            outputs = model(input_data)

            # loss = loss_fn(torch.masked_select(outputs, mask), torch.masked_select(target, mask))
            loss = loss_fn(outputs, target)

            val_loss += loss.item()
            total += target.shape[1]
            loop.set_postfix({"loss": val_loss / total})
    return val_loss / total

In [17]:
NUM_EPOCHS = 10

best_loss = 1e10

for epoch in range(1, NUM_EPOCHS + 1):
    train_one_epoch(model, train_dataloader, optimizer, loss_fn, epoch)
    val_loss = val_one_epoch(model, val_dataloader, loss_fn, epoch)
    if val_loss <= best_loss:
        val_loss = best_loss
        torch.save(model, "../models/rating_loss_without_mask")


best = copy.deepcopy(model)

Epoch 1: train:   0%|          | 0/34 [00:00<?, ?it/s]

Epoch 1: train: 100%|██████████| 34/34 [00:37<00:00,  1.11s/it, loss=0.000107]
Epoch 1: val: 100%|██████████| 4/4 [00:00<00:00,  6.93it/s, loss=3.36e-5]
Epoch 2: train: 100%|██████████| 34/34 [00:07<00:00,  4.61it/s, loss=2.13e-5]
Epoch 2: val: 100%|██████████| 4/4 [00:00<00:00,  6.88it/s, loss=1.77e-5]
Epoch 3: train: 100%|██████████| 34/34 [00:09<00:00,  3.55it/s, loss=1.94e-5]
Epoch 3: val: 100%|██████████| 4/4 [00:00<00:00,  4.94it/s, loss=1.76e-5]
Epoch 4: train: 100%|██████████| 34/34 [00:08<00:00,  3.88it/s, loss=1.92e-5]
Epoch 4: val: 100%|██████████| 4/4 [00:01<00:00,  3.70it/s, loss=1.74e-5]
Epoch 5: train: 100%|██████████| 34/34 [00:09<00:00,  3.52it/s, loss=1.91e-5]
Epoch 5: val: 100%|██████████| 4/4 [00:00<00:00,  5.62it/s, loss=1.72e-5]
Epoch 6: train: 100%|██████████| 34/34 [00:08<00:00,  3.80it/s, loss=1.86e-5]
Epoch 6: val: 100%|██████████| 4/4 [00:00<00:00,  4.50it/s, loss=1.64e-5]
Epoch 7: train: 100%|██████████| 34/34 [00:08<00:00,  4.22it/s, loss=1.74e-5]
Epoch 7: 

## Test model

In [18]:
model = torch.load("../models/rating_loss_without_mask")
model.eval()

RecSys(
  (d1): Dropout(p=0.1, inplace=False)
  (d2): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=1705, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=1024, bias=True)
  (fc4): Linear(in_features=1024, out_features=1682, bias=True)
)

In [19]:
def greedy_test(
    model,
    input_data: torch.Tensor,
):
    with torch.no_grad():
        model.eval()

        input_data = input_data.to(DEVICE)

        model_out = model(input_data)

    return model_out

In [20]:
class TestDataset:
    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.inputs = self.df.iloc[:, : USER_FEATURES + NUM_MOVIES].to_numpy()
        self.targets = self.df.iloc[
            :, USER_FEATURES + NUM_MOVIES : USER_FEATURES + NUM_MOVIES * 2
        ].to_numpy()

    def __getitem__(self, idx: int) -> tuple[np.ndarray, np.ndarray]:
        input_data = self.inputs[idx]
        return input_data, self.targets[idx]

    def __len__(self) -> int:
        return len(self.df)

In [21]:
test_dataset = TestDataset(val_df)

## Metrics

In [22]:
def get_new_data(input_data, target, predicted):
    input_ratings = input_data[USER_FEATURES:]
    remove_indices = np.nonzero(input_ratings > 0)[0]
    new_target = np.delete(target, remove_indices)
    new_predicted = np.delete(predicted, remove_indices)

    return new_target, new_predicted


def sort_args(x, n):
    return np.argsort(-x)[:n]


def top_intersection(target, predicted, top_n=20):
    return list(set(sort_args(target, top_n)).intersection(sort_args(predicted, top_n)))

In [23]:
k = 10

intersections = []
for input_data, masked_target in test_dataset:
    predictions = greedy_test(model, torch.Tensor([input_data]))
    target = masked_target
    predicted = predictions[0].cpu().numpy()

    new_target, new_predicted = get_new_data(input_data, target, predicted)

    nonzero_targets = new_target[new_target > 0]
    relevant_predicted = new_predicted[new_predicted > 0.2]

    intersections.append(len(top_intersection(nonzero_targets, relevant_predicted, k)))
print(f"{len(intersections)=}")
print(f"{np.mean(intersections)=}")
print(f"{np.max(intersections)=}")
print(f"{np.min(intersections)=}")

len(intersections)=471
np.mean(intersections)=2.4416135881104033
np.max(intersections)=9
np.min(intersections)=0


In [36]:
k = 10
retrieval_precisions = []

for input_data, masked_target in test_dataset:
    predictions = greedy_test(model, torch.Tensor([input_data]))
    target = masked_target
    predicted = predictions[0].cpu().numpy()

    new_target, new_predicted = get_new_data(input_data, target, predicted)

    nonzero_targets = new_target > 0
    relevant_predicted = new_predicted

    retrieval_precisions.append(
        retrieval_precision(
            torch.Tensor(relevant_predicted), torch.Tensor(nonzero_targets), k
        )
    )

print(f"{len(retrieval_precisions)=}")
print(f"{np.mean(retrieval_precisions)=}")
print(f"{np.max(retrieval_precisions)=}")
print(f"{np.min(retrieval_precisions)=}")

len(retrieval_precisions)=471
np.mean(retrieval_precisions)=0.5292994
np.max(retrieval_precisions)=1.0
np.min(retrieval_precisions)=0.0
