# MTRec:

Given $I$ number historical clicked news of a user $ N^h = [n_1^h , n_2^h, ..., n^h_I ]$ and a set of $J$ candidate news $ N^c = [n^c_1, n^c_2, ..., n^c_J ] $, our goal is to calculate the user interest score $s_j$ of each candidate news according to the historical behavior of the user, then the candidate news with the highest interest score is recommended to the user. 

For each news, we have its title text T , category label $p^c$, and entity set E. 

## 2.1 News Recommendation Framework

As shown in Figure 2, there are three main components in news recommendation framework, i.e., a news encoder, a user encoder, and a click predictor. 
### News Encoder
For each news n, we encode its title with pre-trained BRET (Devlin et al., 2019). Specifically, we feed the tokenized text T into the BERT model and **adopt the embedding of [CLS] token as the news representation r**. 

We denote the encoded vectors of historical clicked news $N^h$ and candidate news $N^c$ as $R^h = [r^h_1 , r^h_2 , ..., r^h_I ]$ and $R^c = [r^c_1, r^c_2, ..., r^c_J ]$, respectively. 

### Imports Loading

In [1]:
# Data manipulation & Maths imports
import os.path
import polars as pl
import numpy as np
from math import ceil
from pprint import pprint

# Torch & Transformer imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModel, 
    AutoTokenizer, 
    RobertaModel, 
    RobertaTokenizer, 
    XLMRobertaModel, 
    XLMRobertaTokenizer
)

# ebrec constants
from ebrec.utils._constants import (
    DEFAULT_ARTICLE_ID_COL,         # article_id
    DEFAULT_TITLE_COL,              # title
    DEFAULT_BODY_COL,               # body
    DEFAULT_SUBTITLE_COL,           # subtitle
    DEFAULT_TOPICS_COL,             # topics
    DEFAULT_CATEGORY_STR_COL,       # category_str
    DEFAULT_LABELS_COL,             # labels
    DEFAULT_USER_COL,               # user_id
    DEFAULT_HISTORY_ARTICLE_ID_COL, # article_id_fixed
    DEFAULT_INVIEW_ARTICLES_COL,    # article_ids_inview
    DEFAULT_CLICKED_ARTICLES_COL,   # article_ids_clicked
    DEFAULT_IMPRESSION_ID_COL       # impression_id
)

# ebrec utils
from ebrec.utils._articles_behaviors import map_list_article_id_to_value
from ebrec.utils._behaviors import truncate_history, create_binary_labels_column
from ebrec.utils._polars import slice_join_dataframes
from ebrec.utils._python import (
    generate_unique_name,
    repeat_by_list_values_from_matrix,
    create_lookup_objects,
    create_lookup_dict,
)

# Columns to be used in the dataset processing
COLUMNS = [
    DEFAULT_USER_COL,                # "user_id"
    DEFAULT_HISTORY_ARTICLE_ID_COL,  # "article_id_fixed"
    DEFAULT_INVIEW_ARTICLES_COL,     # "article_ids_inview"
    DEFAULT_CLICKED_ARTICLES_COL,    # "article_ids_clicked"
    DEFAULT_IMPRESSION_ID_COL,       # "impression_id"
]
DEFAULT_TOKENS_COL = "tokens"
N_SAMPLES_COL = "n_samples"


2024-06-10 17:31:34.511322: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


### Data Loading

In [4]:
class NewsDataset(Dataset):

    behaviors: pl.DataFrame
    history: pl.DataFrame
    articles: pl.DataFrame

    def __init__(
        self,
        tokenizer,
        behaviors: pl.DataFrame,
        history: pl.DataFrame,
        articles: pl.DataFrame,
        history_size: int = 30,
        padding_value: int = 0,
        max_length=128,
        batch_size=32,
        embeddings_path=None,
    ):
        self.behaviors = behaviors
        self.history = history
        self.articles = articles
        self.history_size = history_size
        self.padding_value = padding_value
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.max_length = max_length

        # TODO: I decided to instead only use pre-computed embeddings for now. You might want to look into this later down the line and implement custom embeddings (and e.g. train BERT as well).
        self.embeddings_path = embeddings_path

        # NOTE: Keep an eye on this if memory issues arise
        self.articles = self.articles.select(
            [
                DEFAULT_ARTICLE_ID_COL,     # article_id
                DEFAULT_TITLE_COL,          # title
                DEFAULT_BODY_COL,           # body
                DEFAULT_SUBTITLE_COL,       # subtitle
                DEFAULT_TOPICS_COL,         # topics
                DEFAULT_CATEGORY_STR_COL,   # category_str
            ]
        ).collect()

        self._process_history()
        self._prepare_training_data()

    def _process_history(self):
        self.history = (
            self.history.select(
                [
                    DEFAULT_USER_COL,               # "user_id"
                    DEFAULT_HISTORY_ARTICLE_ID_COL  # article_id_fixed
                ]
            )
            .pipe(
                truncate_history,
                column=DEFAULT_HISTORY_ARTICLE_ID_COL,
                history_size=self.history_size,
                padding_value=self.padding_value,
                enable_warning=False,
            )
            .collect()
        )

    def _prepare_training_data(self):
        self.behaviors = self.behaviors.collect()

        self.data: pl.DataFrame = (
            slice_join_dataframes(
                df1=self.behaviors,
                df2=self.history,
                on=DEFAULT_USER_COL,
                how="left",
            )
            .select(COLUMNS)
            .pipe(create_binary_labels_column, seed=42, label_col=DEFAULT_LABELS_COL)
            .with_columns(
                pl.col(DEFAULT_LABELS_COL).list.len().alias(N_SAMPLES_COL)
            )
        )

        assert self.embeddings_path is not None, "You need to provide a path to the embeddings file."
        embeddings = pl.read_parquet(self.embeddings_path)
        print("Loaded Embeddings Shape:", embeddings.shape)
        print("Sample Embedding:", embeddings.head())

        self.articles = (
            self.articles.lazy()
            .join(embeddings.lazy(), on=DEFAULT_ARTICLE_ID_COL, how="inner")
            .rename({"FacebookAI/xlm-roberta-base": DEFAULT_TOKENS_COL})
            .collect()
        )

        print("Joined Articles with Embeddings:", self.articles.shape)
        print("Sample Article Embedding:", self.articles.head())

        article_dict = create_lookup_dict(
            self.articles.select(DEFAULT_ARTICLE_ID_COL, DEFAULT_TOKENS_COL),
            key=DEFAULT_ARTICLE_ID_COL,
            value=DEFAULT_TOKENS_COL,
        )

        self.lookup_indexes, self.lookup_matrix = create_lookup_objects(
            article_dict, unknown_representation="zeros"
        )

        # print("Embeddings Shape:", self.lookup_matrix.shape)
        # print("Sample Embedding:", self.lookup_matrix[0])

    def __len__(self):
        """
        Number of batch steps in the data
        """
        return int(ceil(self.behaviors.shape[0] / self.batch_size))

    def __getitem__(self, index: int):
        """
        Get the batch of samples for the given index.

        Note: The dataset class provides a single index for each iteration. The batching is done internally in this method
        to utilize and optimize for speed. This can be seen as a mini-batching approach.

        Args:
            index (int): An integer index.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: A tuple containing the input features and labels as torch Tensors.
                Note, the output of the PyTorch DataLoader is (1, *shape), where 1 is the DataLoader's batch_size.
        """
        # Clever way to batch the data:
        batch_indices = range(index * self.batch_size,
                              (index + 1) * self.batch_size)
        batch = self.data[batch_indices]

        x = (
            batch.drop(DEFAULT_LABELS_COL)
            .pipe(
                map_list_article_id_to_value,
                behaviors_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
                mapping=self.lookup_indexes,
                fill_nulls=[0],
            )
            .pipe(
                map_list_article_id_to_value,
                behaviors_column=DEFAULT_INVIEW_ARTICLES_COL,
                mapping=self.lookup_indexes,
                fill_nulls=[0],
            )
        )
        # =>
        repeats = np.array(batch[N_SAMPLES_COL])
        # =>
        history_input = repeat_by_list_values_from_matrix(
            input_array=x[DEFAULT_HISTORY_ARTICLE_ID_COL].to_list(),
            matrix=self.lookup_matrix,
            repeats=repeats,
        ).squeeze(2)
        # =>
        candidate_input = self.lookup_matrix[
            x[DEFAULT_INVIEW_ARTICLES_COL].explode().to_list()
        ]
        # =>
        history_input = torch.tensor(history_input)
        candidate_input = torch.tensor(candidate_input)
        y = torch.tensor(batch[DEFAULT_LABELS_COL].explode(), dtype=torch.float32).view(
            -1, 1
        )
        # ========================
        return history_input, candidate_input, y


def load_data(tokenizer, data_path, split="train", embeddings_path=None):
    _data_path = os.path.join(data_path, split)

    df_behaviors = pl.scan_parquet(_data_path + "/behaviors.parquet")
    df_history = pl.scan_parquet(_data_path + "/history.parquet")
    df_articles = pl.scan_parquet(data_path + "/articles.parquet")

    return NewsDataset(tokenizer, df_behaviors, df_history, df_articles, embeddings_path=embeddings_path)


# Model and tokenizer initialization
MODEL_NAME = "FacebookAI/xlm-roberta-base"

# NOTE: We need the multilingual model for the dataset
# bert = XLMRobertaModel.from_pretrained(MODEL_NAME)
# tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

DATA_PATH = "../data/demo"
EMBEDDINGS_PATH = "../data/FacebookAI-xlm-roberta-base/FacebookAI_xlm_roberta_base/xlm_roberta_base.parquet"

tokenizer = None
dataset = load_data(tokenizer, DATA_PATH, split="train",
                    embeddings_path=EMBEDDINGS_PATH)

Loaded Embeddings Shape: (125541, 2)
Sample Embedding: shape: (5, 2)
┌────────────┬───────────────────────────────────┐
│ article_id ┆ FacebookAI/xlm-roberta-base       │
│ ---        ┆ ---                               │
│ i32        ┆ list[f32]                         │
╞════════════╪═══════════════════════════════════╡
│ 3000022    ┆ [0.102449, 0.101148, … -0.020715… │
│ 3000063    ┆ [0.107297, 0.103073, … 0.004873]  │
│ 3000613    ┆ [0.125139, 0.124621, … -0.05177]  │
│ 3000700    ┆ [0.105697, 0.076335, … -0.034872… │
│ 3000840    ┆ [0.098175, 0.114629, … -0.024436… │
└────────────┴───────────────────────────────────┘
Joined Articles with Embeddings: (11777, 7)
Sample Article Embedding: shape: (5, 7)
┌────────────┬──────────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ article_id ┆ title        ┆ body         ┆ subtitle    ┆ topics      ┆ category_st ┆ tokens      │
│ ---        ┆ ---          ┆ ---          ┆ ---         ┆ ---         ┆ r          

In [230]:
class NewsDatasetV2(NewsDataset):
    def __len__(self):
        return self.data.shape[0]
    def __getitem__(self, index: int):
        sample = dataset.data[index]
        sample = sample.pipe(
            map_list_article_id_to_value,
            behaviors_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
            mapping=dataset.lookup_indexes,
            fill_nulls=[0],

        ).pipe(
            map_list_article_id_to_value,
            behaviors_column=DEFAULT_INVIEW_ARTICLES_COL,
            mapping=dataset.lookup_indexes,
            fill_nulls=[0],
        )

        _history = sample[DEFAULT_HISTORY_ARTICLE_ID_COL].explode().explode().to_list()
        history = torch.from_numpy(dataset.lookup_matrix[_history])
        _candidates = sample[DEFAULT_INVIEW_ARTICLES_COL].explode().explode().to_list()
        candidates = torch.from_numpy(dataset.lookup_matrix[_candidates])
        # dataset.lookup_indexes
        labels = torch.tensor(sample[DEFAULT_LABELS_COL].to_list()[0])
        return history, candidates, labels


In [231]:
hidden_dim = 768
W = nn.Linear(hidden_dim, hidden_dim)
q = nn.Parameter(torch.randn(hidden_dim))
dataset = NewsDatasetV2(tokenizer, dataset.behaviors.lazy(), dataset.history.lazy(), dataset.articles.lazy(), embeddings_path=EMBEDDINGS_PATH)

Loaded Embeddings Shape: (125541, 2)
Sample Embedding: shape: (5, 2)
┌────────────┬───────────────────────────────────┐
│ article_id ┆ FacebookAI/xlm-roberta-base       │
│ ---        ┆ ---                               │
│ i32        ┆ list[f32]                         │
╞════════════╪═══════════════════════════════════╡
│ 3000022    ┆ [0.102449, 0.101148, … -0.020715… │
│ 3000063    ┆ [0.107297, 0.103073, … 0.004873]  │
│ 3000613    ┆ [0.125139, 0.124621, … -0.05177]  │
│ 3000700    ┆ [0.105697, 0.076335, … -0.034872… │
│ 3000840    ┆ [0.098175, 0.114629, … -0.024436… │
└────────────┴───────────────────────────────────┘
Joined Articles with Embeddings: (11777, 7)
Sample Article Embedding: shape: (5, 7)
┌────────────┬──────────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ article_id ┆ title        ┆ body         ┆ subtitle    ┆ topics      ┆ category_st ┆ tokens      │
│ ---        ┆ ---          ┆ ---          ┆ ---         ┆ ---         ┆ r          

In [218]:
history, candidates, labels = dataset[0]
print(history.shape, candidates.shape, labels.shape)

torch.Size([30, 768]) torch.Size([11, 768]) torch.Size([11])


In [152]:
att = q * F.tanh(W(history))
att_weight = F.softmax(att, dim=-1)

user_embedding = torch.sum(history * att_weight, dim=0)

# print(f"{user_embedding.shape=}")
# print(f"{user_embedding.unsqueeze(-1).shape=}")
# score = torch.bmm(candidates, user_embedding.unsqueeze(-1)) # B x M x 1

In [190]:
scores = torch.bmm(candidates.unsqueeze(0), user_embedding.unsqueeze(-1).unsqueeze(0)).squeeze()
scores

tensor([10.9078, 10.8969, 10.8815, 10.8896, 10.8903, 10.8746, 10.9196, 10.8738,
        10.8796, 10.8819, 10.8866], grad_fn=<SqueezeBackward0>)

In [232]:
def positive_nll_loss(preds, labels):
    pos = preds[labels == 1].exp()
    neg = preds[labels == 0].exp().sum(-1)
    return - torch.log(pos / (pos + neg)).mean()

# positive_nll_loss(scores.squeeze(), labels)

In [236]:
optimizer = optim.Adam([q, W.weight, W.bias], lr=0.0005)

for epoch in range(10):
    print(f"Epoch {epoch}")
    losses = []
    q.requires = True
    W.training = True
    with tqdm(dataset) as pbar:
        for batch in pbar:
            history, candidates, labels = batch
            att = q * F.tanh(W(history))
            att_weight = F.softmax(att, dim=-1)
            user_embedding = torch.sum(history * att_weight, dim=0)
            scores = torch.bmm(candidates.unsqueeze(0), user_embedding.unsqueeze(-1).unsqueeze(0)).squeeze()
            loss = positive_nll_loss(scores.squeeze(), labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
            pbar.set_postfix(loss=losses[-1])
    print(f"Average Loss: {np.mean(losses)}")



Epoch 0


  0%|          | 0/24724 [00:00<?, ?it/s, loss=2.41]

  3%|▎         | 675/24724 [00:14<08:28, 47.32it/s, loss=1.94]


KeyboardInterrupt: 

### News Encoder

In [6]:
class NewsEncoder(nn.Module):
    def __init__(self, bert):
        super(NewsEncoder, self).__init__()
        self.bert = bert

    def forward(self, input_ids):
        outputs = self.bert(input_ids)
        outputs = outputs.last_hidden_state[:, 0, :]
        return outputs

### User Encoder
To gain a user representation from the representations of historical clicked news, existing methods usually employ sequential (An et al., 2019) or attentive models (Wu et al., 2019d; Li et al., 2018). In this paper, we adopt additive attention as the user encoder to compress the historical information Rh. The user representation $r^u$ is then denoted as: 

$$ \textbf{r}^u = \sum_{i=1}^I \textbf{a}^u_i \textbf{r}^h_i , \textbf{a}^u_i = \text{softmax}(\textbf{q}^u·\tanh(W^u r^h_i )),$$

 where qu and Wu are trainable parameters. 
 

In [24]:
class MTRec(nn.Module):
    def __init__(self, hidden_dim):
        super(MTRec, self).__init__()

        self.W = nn.Linear(hidden_dim, hidden_dim)
        self.q = nn.Parameter(torch.randn(hidden_dim))

    def forward(self, history, candidates):
        '''
            B - batch size (keep in mind we use an unusual mini-batch approach)
            H - history size (number of articles in the history, usually 30)
            D - hidden size (768)
            history:    B x H x D 
            candidates: B x 1 x D
        '''

        # print(f"{candidates.shape=}")
        att = self.q * F.tanh(self.W(history))
        att_weight = F.softmax(att, dim=1)
        # print(f"{att_weight.shape=}")

        user_embedding = torch.sum(history * att_weight, dim=1)
        # print(f"{user_embedding.shape=}")
        # print(f"{user_embedding.unsqueeze(-1).shape=}")
        score = torch.bmm(candidates, user_embedding.unsqueeze(-1)) # B x M x 1
        # print(score.shape)
        return score.squeeze(-1)

    def reshape(self, batch_news, bz):
        n_news = len(batch_news) // bz
        reshaped_batch = batch_news.reshape(bz, n_news, -1)
        return reshaped_batch
    
model = MTRec(hidden_dim=768)


In [28]:

from tqdm import tqdm
import time

# Assuming model and dataset are already defined
optimizer = optim.Adam(model.parameters(), lr=0.05)
criterion = nn.BCEWithLogitsLoss()
dataloader = DataLoader(dataset, batch_size=1)

num_epochs = 5
writer = SummaryWriter('runs/experiment_1')  # Specify your log directory

for epoch in range(num_epochs):
    epoch_start_time = time.time()
    running_loss = 0.0
    num_batches = len(dataloader)
    losses = []
    with tqdm(total=num_batches, desc=f"Epoch {epoch+1}/{num_epochs}", unit='batch') as pbar:
        for i, batch in enumerate(dataloader):
            history, candidates, labels = batch
            history.squeeze_(0)
            candidates.squeeze_(0)
            labels.squeeze_(0)
            labels = labels.to(torch.bool)

            out = model(history, candidates)
            # out[labels==0] = 0

            # loss = nn.Gaussian(out, labels)
            # loss = positive_nll(out, labels)
            optimizer.zero_grad()
            
            
            pos = labels * out.exp()
            neg = torch.sum((~labels) * out.exp(), dim=0)
            # print(pos, neg)
            loss = -torch.log(pos.sum(1) / (pos.sum(1) + neg + 1e-6)).sum()
            
            loss.backward()
            
            
            optimizer.step()
            losses.append(loss.item())

            pbar.set_postfix({"loss": round(losses[-1], 3)})
            pbar.update(1)
            break
    print("Average loss:", np.mean(losses))

Step: 772:  Loss: 292.7912

In [55]:

#             out = model(history, candidates)
#             loss = criterion(out, labels)

#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
            
#             running_loss += loss.item()
            
#             # Log the current loss value to TensorBoard
#             writer.add_scalar('Loss/Train', loss.item(), epoch * num_batches + i)
            
#             # Update the progress bar
#             pbar.set_postfix(loss=loss.item())
#             pbar.update(1)
    
#     avg_loss = running_loss / num_batches
#     epoch_time = time.time() - epoch_start_time
    
#     # Log the average loss and epoch time to TensorBoard
#     writer.add_scalar('Loss/Avg_Train', avg_loss, epoch)
#     writer.add_scalar('Time/Epoch', epoch_time, epoch)
    
#     # Display summary for the epoch
#     print(f"Epoch {epoch + 1}/{num_epochs}")
#     print(f"Average loss: {avg_loss:.12f}")
#     print(f"Epoch time: {epoch_time // 60:.0f}:{epoch_time % 60:02.0f}\n")

# writer.close()


tensor([[nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [n


### Click Predictor
For each candidate news, we obtain its interest score $s_j$ by matching the candidate news vector $r^c_j$ and the user representation $r^u$ via dot product: $s_j = r^c_j · r^u$. 

### Loss Function
Following previous work (Huang et al., 2013; Wu et al., 2019d), we employ the NCE loss to train the main ranking model. Then the main task loss LM ain is the negative log-likelihood of all positive samples in the training dataset D: 

$$ \mathcal{L}_{Main} = − \sum^{|D|}_{i=1} \log{\exp(s^+_i ) \over \exp(s^+_i ) + \sum^L_{j=1} \exp(s^j_i )} $$ 

where $s^+$ denotes the interest scores of positive news, $L$ indicates the number of negative news.