# Recommendation system solution

In [228]:
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import random_split
from tqdm import tqdm

In [229]:
MANUAL_SEED = 42
torch.manual_seed(MANUAL_SEED)

warnings.filterwarnings("ignore")

## Data loading and preprocessing

In [230]:
df = pd.read_csv("../data/raw/users_with_masks.csv", sep="\t")
print(f"{len(df)=}")
df.head()

len(df)=2829


Unnamed: 0,user_id,age,gender,occupation,zip_code,genre_unknown,genre_Action,genre_Adventure,genre_Animation,genre_Children's,...,rating_1672,rating_1673,rating_1674,rating_1675,rating_1676,rating_1677,rating_1678,rating_1679,rating_1680,rating_1681
0,1.0,0.24,1.0,0.904762,0.569647,0.001704,0.127768,0.07155,0.020443,0.042589,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.24,1.0,0.904762,0.569647,0.001704,0.127768,0.07155,0.020443,0.042589,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.24,1.0,0.904762,0.569647,0.001704,0.127768,0.07155,0.020443,0.042589,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,0.53,0.0,0.619048,0.59313,0.0,0.081301,0.02439,0.00813,0.03252,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,0.53,0.0,0.619048,0.59313,0.0,0.081301,0.02439,0.00813,0.03252,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [231]:
NUM_MOVIES = 1682

In [232]:
class RecommendationDataset(torch.utils.data.Dataset):
    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.inputs = self.df.iloc[:, : 24 + NUM_MOVIES].to_numpy()
        self.masks = self.df.iloc[:, 24 + NUM_MOVIES : 24 + NUM_MOVIES * 2].to_numpy()
        self.targets = self.df.iloc[
            :, 24 + NUM_MOVIES * 2 : 24 + NUM_MOVIES * 3
        ].to_numpy()

    def _mask(self, value: np.ndarray, mask: np.ndarray) -> np.ndarray:
        return value[mask.nonzero()]

    def __getitem__(self, idx: int) -> tuple[np.ndarray, int, np.ndarray]:
        mask = self.masks[idx]
        input_data = self.inputs[idx]
        masked_target = self.targets[idx][mask.nonzero()]
        return input_data, len(mask.nonzero()[0]), masked_target

    def __len__(self) -> int:
        return len(self.df)

In [233]:
dataset = RecommendationDataset(df)

In [234]:
train_dataset, val_dataset = random_split(
    dataset, [0.9, 0.1], generator=torch.Generator().manual_seed(MANUAL_SEED)
)
print(f"{len(train_dataset)=}")
print(f"{len(val_dataset)=}")

len(train_dataset)=2547
len(val_dataset)=282


In [235]:
BATCH_SIZE = 32

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [236]:
DEVICE

device(type='cuda')

In [237]:
def collate_batch(batch: list) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    input_data_batch, mask_length_batch, masked_target_batch = [], [], []
    for input_data, mask_length, masked_target in batch:
        input_data_batch.append(torch.Tensor(input_data))
        mask_length_batch.append(mask_length)
        masked_target_batch.append(torch.Tensor(masked_target))

    # return pad_sequence(input_data_batch).long(), torch.Tensor(mask_length_batch), pad_sequence(masked_target_batch).long()
    return (
        torch.swapaxes(pad_sequence(input_data_batch), 0, 1),
        torch.Tensor(mask_length_batch),
        torch.swapaxes(pad_sequence(masked_target_batch), 0, 1),
    )


train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
val_dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

In [238]:
for batch in train_dataloader:
    inp, mask_size, out = batch
    print(inp.shape)
    print(mask_size.shape)
    print(out.shape)
    break

torch.Size([32, 1706])
torch.Size([32])
torch.Size([32, 154])


In [239]:
it = train_dataloader._get_iterator()

it._next_data()

(tensor([[2.4200e+02, 3.3000e-01, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [2.8800e+02, 3.4000e-01, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [1.4600e+02, 4.5000e-01, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         ...,
         [7.2900e+02, 1.9000e-01, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [4.8300e+02, 2.9000e-01, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [4.6000e+01, 2.7000e-01, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00]]),
 tensor([  8.,  30.,  10.,   8.,  56.,   8.,  14., 150.,   8.,  22., 142.,  78.,
          20.,  14.,  42.,  18.,   8.,  88., 132.,  20.,  28.,   8.,  16., 146.,
          18.,  38.,   8., 108.,  20.,   8.,  22.,  10.]),
 tensor([[0.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.8000, 0.0000, 0.4000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.8000, 0.6000,  ...,

## Creating the network

In [240]:
INPUT_SIZE = 24 + NUM_MOVIES

In [241]:
class RecSys(nn.Module):
    def __init__(
        self,
        hidden_dim1: int = 1024,
        hidden_dim2: int = 1024,
    ):
        super(RecSys, self).__init__()
        self.fc1 = nn.Linear(INPUT_SIZE, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, NUM_MOVIES)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return F.sigmoid(self.fc3(x))

In [242]:
torch.manual_seed(MANUAL_SEED)

model = RecSys()

for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

model = model.to(DEVICE)

loss_fn = torch.nn.MSELoss(reduction="sum")

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

## Train model

In [243]:
def train_one_epoch(
    model,
    loader,
    optimizer,
    loss_fn,
    epoch,
):
    model.train()
    train_loss = 0.0
    total = 0

    loop = tqdm(
        loader,
        total=len(loader),
        desc=f"Epoch {epoch}: train",
        leave=True,
    )
    for batch in loop:
        input_data, mask_sizes, masked_target = batch
        input_data, masked_target = input_data.to(DEVICE), masked_target.to(DEVICE)

        # forward pass and loss calculation
        model_out = model(input_data)
        outputs = model_out[:, : masked_target.shape[1]]

        # zero the parameter gradients
        optimizer.zero_grad()

        loss = loss_fn(outputs.reshape(-1), masked_target.reshape(-1))
        # loss = loss_fn(outputs, masked_target)

        # backward pass
        loss.backward()
        total += masked_target.shape[1]

        # optimizer run
        optimizer.step()

        train_loss += loss.item()
        loop.set_postfix({"loss": train_loss / total})


def val_one_epoch(
    model,
    loader,
    loss_fn,
    epoch,
):
    loop = tqdm(
        loader,
        total=len(loader),
        desc=f"Epoch {epoch}: val",
        leave=True,
    )
    val_loss = 0.0
    total = 0
    with torch.no_grad():
        model.eval()  # evaluation mode
        for batch in loop:
            input_data, mask_sizes, masked_target = batch
            input_data, masked_target = input_data.to(DEVICE), masked_target.to(DEVICE)

            model_out = model(input_data)
            outputs = model_out[:, : masked_target.shape[1]]

            loss = loss_fn(outputs.reshape(-1), masked_target.reshape(-1))

            val_loss += loss.item()
            total += masked_target.shape[1]
            loop.set_postfix({"loss": val_loss / total})
    return val_loss / total

In [244]:
import copy

NUM_EPOCHS = 10

best_loss = 1e10

for epoch in range(1, NUM_EPOCHS + 1):
    train_one_epoch(model, train_dataloader, optimizer, loss_fn, epoch)
    val_loss = val_one_epoch(model, val_dataloader, loss_fn, epoch)
    if val_loss <= best_loss:
        val_loss = best_loss
        torch.save(model, "../models/solution_model")


best = copy.deepcopy(model)

Epoch 1: train: 100%|██████████| 80/80 [00:01<00:00, 44.94it/s, loss=3.31]
Epoch 1: val: 100%|██████████| 9/9 [00:00<00:00, 123.28it/s, loss=1.89]
Epoch 2: train: 100%|██████████| 80/80 [00:01<00:00, 62.31it/s, loss=1.67]
Epoch 2: val: 100%|██████████| 9/9 [00:00<00:00, 94.74it/s, loss=1.6]
Epoch 3: train: 100%|██████████| 80/80 [00:01<00:00, 61.78it/s, loss=1.52]
Epoch 3: val: 100%|██████████| 9/9 [00:00<00:00, 107.15it/s, loss=1.47]
Epoch 4: train: 100%|██████████| 80/80 [00:01<00:00, 64.67it/s, loss=1.44]
Epoch 4: val: 100%|██████████| 9/9 [00:00<00:00, 116.89it/s, loss=1.34]
Epoch 5: train: 100%|██████████| 80/80 [00:01<00:00, 66.39it/s, loss=1.37]
Epoch 5: val: 100%|██████████| 9/9 [00:00<00:00, 118.42it/s, loss=1.27]
Epoch 6: train: 100%|██████████| 80/80 [00:01<00:00, 63.54it/s, loss=1.29]
Epoch 6: val: 100%|██████████| 9/9 [00:00<00:00, 118.44it/s, loss=1.23]
Epoch 7: train: 100%|██████████| 80/80 [00:01<00:00, 59.08it/s, loss=1.24]
Epoch 7: val: 100%|██████████| 9/9 [00:00<00:

## Test model

In [245]:
raise NotImplementedError

TypeError: exceptions must derive from BaseException

In [None]:
model = torch.load("../models/solution_model")
model.eval()

DetoxTransformer(
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (input_embeddings): TokenEmbedding(
    (embedding): Embedding(14747, 320)
  )
  (output_embeddings): TokenEmbedding(
    (embedding): Embedding(14747, 320)
  )
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-3): 4 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=320, out_features=320, bias=True)
          )
          (linear1): Linear(in_features=320, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=320, bias=True)
          (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.

In [None]:
import re

TOKENIZER = get_tokenizer("spacy", language="en_core_web_sm")
BOS_IDX, EOS_IDX, PAD_IDX = vocab(["<bos>", "<eos>", "<pad>"])


def preprocess_text(text: str, vocab=vocab) -> torch.Tensor:
    return torch.tensor([BOS_IDX] + vocab(TOKENIZER(text.lower())) + [EOS_IDX])


def decode_tokens(tokens: torch.Tensor, vocab=vocab) -> str:
    text = (
        " ".join(vocab.lookup_tokens(list(tokens.cpu().numpy())))
        .replace("<bos>", "")
        .replace("<eos>", "")
        .strip()
    )
    return re.sub(" +", " ", re.sub(r'\s([?.!"](?:\s|$))', r"\1", text))

In [None]:
def greedy_decode(
    model: torch.nn.Module,
    src: torch.Tensor,
    src_mask: torch.Tensor,
    max_size: int,
    start_symbol: int,
) -> torch.Tensor:
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    answer = torch.ones(1, 1).fill_(start_symbol).long().to(DEVICE)
    for _ in range(max_size - 1):
        memory = memory.to(DEVICE)

        trg_mask = (generate_square_subsequent_mask(answer.size(0)).bool()).to(DEVICE)
        outputs = model.decode(answer, memory, trg_mask)
        outputs = outputs.transpose(0, 1)

        probabilities = model.generator(outputs[:, -1])
        _, next_word = torch.max(probabilities, dim=1)
        next_word = next_word.item()

        answer = torch.cat(
            [answer, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0
        )
        if next_word == EOS_IDX:
            break
    return answer


def detoxify(model: torch.nn.Module, src_sentence: str) -> str:
    src = preprocess_text(src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    output_tokens = greedy_decode(
        model, src, src_mask, max_size=num_tokens + 5, start_symbol=BOS_IDX
    ).flatten()
    return decode_tokens(output_tokens)

In [None]:
model_answers = []
for i, r in tqdm(test_df.iterrows(), total=len(test_df)):
    model_answers.append(detoxify(model, r["toxic"][:MAX_SIZE]))


test_df["generated"] = model_answers

100%|██████████| 500/500 [00:42<00:00, 11.75it/s]


In [None]:
test_df.head()

Unnamed: 0,toxic,nontoxic,generated
0,It's feeding time at the fucking zoo!,it's time to eat at the zoo!,it 's about it and the floor.
1,Everyone here bet on the hero and lost their a...,they all took a hero and lost everything.,everyone 's here on your hero and all those th...
2,Then I got to come home to Melvin and his bull...,then I'm going home and Melvin's there.,then i have to come home and left his voice.
3,Sara here was hoping to pick your brains.,Sara was hoping you could handle her.,there was here i was hoping i 'd pick you your...
4,"Oh, that's stupid. If anyone wants to tell me ...","if anyone wants to tell me what's going on, I'...",if anyone wants me to tell me what 's going on...


In [None]:
test_df.to_csv("../data/generated/custom_transformer.csv", index=False)