# Getting started

## Load functionality

In [8]:
from pathlib import Path
from typing import Optional

import polars as pl
import tensorflow as tf
from transformers import AutoTokenizer, AutoModel

from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL, 
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_SUBTITLE_COL,
    DEFAULT_LABELS_COL,
    DEFAULT_TITLE_COL, 
    DEFAULT_USER_COL, 
)
from ebrec.utils._behaviors import (
    create_binary_labels_column, 
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history, 
)
from ebrec.utils._articles import convert_text2encoding_with_transformers
from ebrec.utils._articles import create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._polars import concat_str_columns
from ebrec.models.newsrec.dataloader import NRMSDataLoader
from ebrec.models.newsrec.model_config import hparams_nrms
from ebrec.models.newsrec import NRMSModel

## Load dataset

In [9]:

def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
    """
    Load one EB-NeRD split and attach each user's click history to its behaviors.

    Reads ``history.parquet`` and ``behaviors.parquet`` from ``path``. The
    history is reduced to the user and history-article-id columns and handed to
    ``truncate_history`` with ``history_size`` and a padding value of 0.

    Args:
        path: Directory containing ``history.parquet`` and ``behaviors.parquet``.
        history_size: Value forwarded to ``truncate_history`` as ``history_size``.

    Returns:
        The behaviors inner-joined with the processed history on the user
        column, materialised as an eager DataFrame. Because the join is an
        inner join, behaviors of users without a history row are dropped.
    """
    history_file = path / "history.parquet"
    behaviors_file = path / "behaviors.parquet"

    # Both parquet files are scanned lazily; nothing is read until `.collect()`.
    lazy_history = pl.scan_parquet(history_file).select(
        DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL
    )
    lazy_history = truncate_history(
        lazy_history,
        column=DEFAULT_HISTORY_ARTICLE_ID_COL,
        history_size=history_size,
        padding_value=0,
    )

    lazy_behaviors = pl.scan_parquet(behaviors_file).join(
        lazy_history, on=DEFAULT_USER_COL, how="inner"
    )
    return lazy_behaviors.collect()

### Generate labels
We sample only a few impressions to get started. For the test set we create a dummy labels column of 0s and 1s; these are not the true labels.

In [10]:
path = Path("../downloads/demo")
HISTORY_SIZE = 30
N_SAMPLES = 100
# One seed for every stochastic step in this cell, so that a fresh
# "Restart Kernel -> Run All" selects the same rows each time.
SEED = 123

# Training split: negative sampling (npratio=4), then binary labels, then a
# small seeded subsample to keep the demo fast.
df_train = (
    ebnerd_from_path(path.joinpath("train"), history_size=HISTORY_SIZE)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,
        shuffle=True,
        with_replacement=True,
        seed=SEED,
    )
    .pipe(create_binary_labels_column)
    .sample(n=N_SAMPLES, seed=SEED)
)
# Validation split: binary labels, no negative sampling.
df_validation = (
    ebnerd_from_path(path.joinpath("validation"), history_size=HISTORY_SIZE)
    .pipe(create_binary_labels_column)
    .sample(n=N_SAMPLES, seed=SEED)
)
# Test split: build a DUMMY labels column (these are not the true labels).
# Per impression: turn the in-view list into zeros, drop the first element,
# then append a single 1, giving a list of the same length as the in-view
# list that is all zeros except for a trailing 1.
df_test = (
    ebnerd_from_path(path.joinpath("test"), history_size=HISTORY_SIZE)
    .with_columns(
        pl.col(DEFAULT_INVIEW_ARTICLES_COL)
        .list.eval(pl.element() * 0)
        .list.tail(-1)
        .list.eval(pl.element().extend_constant(1, n=1))
        .alias(DEFAULT_LABELS_COL)
    )
    .sample(n=N_SAMPLES, seed=SEED)
)

### Look at the difference between Training/Validation and Testset
Note that the test set doesn't include labels, and some of the other columns have been removed.

In [11]:
# Show the first two rows of the training frame.
df_train.head(2)

impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage,article_id_fixed,labels
u32,i32,datetime[μs],f32,f32,i8,list[i64],list[i64],u32,bool,i8,i8,i8,bool,u32,f32,f32,list[i32],list[i8]
181760259,,2023-02-26 12:31:32,13.0,,2,"[9651272, 9649677, … 9647661]",[9649290],1966060,False,,,,False,166180,3.0,,"[9644792, 9646176, … 9646402]","[0, 0, … 0]"
42169409,,2023-02-28 19:32:08,33.0,,2,"[9654866, 9653210, … 9654861]",[9654866],1399580,False,,,,False,145769,2.0,48.0,"[9635113, 9645517, … 9646947]","[1, 0, … 0]"


In [12]:
# Show the first two rows of the test frame for comparison with the training frame.
df_test.head(2)

impression_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,article_id_fixed,labels
u32,datetime[μs],f32,f32,i8,list[i32],u32,bool,i8,i8,i8,bool,u32,list[i32],list[i32]
294542904,2023-03-13 10:57:42,12.0,,1,"[9672801, 9673153, … 9667877]",2389317,False,,,,False,57364,"[9663935, 9663605, … 9667329]","[0, 0, … 1]"
320871504,2023-03-14 18:26:38,95.0,100.0,2,"[9675349, 9675372, … 9675304]",2348586,False,,,,False,35530,"[9665934, 9667530, … 9667800]","[0, 0, … 1]"


## Load articles

In [13]:
# Read the article table stored next to the train/validation/test folders
# and show its first two rows.
articles_file = path / "articles.parquet"
df_articles = pl.read_parquet(articles_file)
df_articles.head(n=2)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str
3033563,"""Kniven for str…","""I aftenens udg…",2023-06-29 06:20:47,False,"""Når man ser fj…",2007-03-27 10:22:08,"[3005524, 3005525]","""article_defaul…","""https://ekstra…",[],[],[],414,"[433, 436]","""underholdning"""
3057640,"""Leths pige var…","""Jørgen Leth æn…",2023-06-29 06:21:24,False,"""Filmmanden, fo…",2005-10-09 17:36:00,[3047111],"""article_defaul…","""https://ekstra…",[],[],[],414,[432],"""underholdning"""


## Init model using Hugging Face's tokenizer and word embeddings
In the original implementation, they use the GloVe embeddings and tokenizer. To get going fast, we'll use a multilingual language model from Hugging Face instead.
Its tokenizer is used to tokenize the articles, and its word embeddings are used to initialize NRMS.


In [14]:
# Configuration: which Hugging Face checkpoint to load, which article text
# columns to combine, and the `max_length` passed to the tokenization helper.
TRANSFORMER_MODEL_NAME = "bert-base-multilingual-cased"
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
MAX_TITLE_LENGTH = 30

# LOAD HUGGINGFACE:
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# We'll init the word embeddings using the pretrained transformer model
# (presumably its input word-embedding matrix; confirm in ebrec.utils._nlp).
word2vec_embedding = get_transformers_word_embeddings(transformer_model)
# Combine the selected text columns; the helper returns the updated frame and a
# second value that is passed on as the column argument below (presumably the
# name of the concatenated column).
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
# Encode that column with the tokenizer; the helper returns the updated frame
# and a second value used as `value_col` below (presumably the token column name).
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)
# Build the article lookup that is handed to the dataloaders as `article_dict`.
article_mapping = create_article_id_to_value_mapping(df=df_articles, value_col=token_col_title)

# Initiate the dataloaders
In the implementation we have decoupled the models from the data. Hence, you should build a dataloader that fits your needs.

In [15]:
# Arguments that are identical for the train, validation and test loaders.
shared_loader_kwargs = dict(
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
)

# Training loader: eval_mode off, batch size 64.
train_dataloader = NRMSDataLoader(
    behaviors=df_train, eval_mode=False, batch_size=64, **shared_loader_kwargs
)
# Validation and test loaders: eval_mode on, batch size 32.
val_dataloader = NRMSDataLoader(
    behaviors=df_validation, eval_mode=True, batch_size=32, **shared_loader_kwargs
)
test_dataloader = NRMSDataLoader(
    behaviors=df_test, eval_mode=True, batch_size=32, **shared_loader_kwargs
)

## Train the model


In [19]:
MODEL_NAME = "NRMS"
LOG_DIR = f"downloads/runs/{MODEL_NAME}"
MODEL_WEIGHTS = f"downloads/data/state_dict/{MODEL_NAME}/weights"

# CALLBACKS
# - TensorBoard logs go to LOG_DIR.
# - Early stopping watches val_loss with a patience of 2.
# - The checkpoint callback writes weights only, and only when the result improves.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=MODEL_WEIGHTS, save_best_only=True, save_weights_only=True, verbose=1
)

# The model's history length must match the HISTORY_SIZE used to build the data.
hparams_nrms.history_size = HISTORY_SIZE
model = NRMSModel(
    hparams=hparams_nrms,
    word2vec_embedding=word2vec_embedding,
    seed=42,
)
hist = model.model.fit(
    train_dataloader,
    validation_data=val_dataloader,
    epochs=1,
    callbacks=[tensorboard_callback, early_stopping, modelcheckpoint],
)
# Restore the weights written by the checkpoint callback. `load_weights`
# updates the Keras model in place; its return value is NOT the model, so
# `model` must not be rebound to it (later cells call `model.scorer`).
_ = model.model.load_weights(filepath=MODEL_WEIGHTS)





# Predict on the testset

In [17]:
# Run the model's scorer over the test dataloader to get the predictions.
pred = model.scorer.predict(test_dataloader)



In [32]:
# Attach the predictions to the test frame, then add the known-user column
# using the user ids seen in the training frame.
test_with_scores = add_prediction_scores(df_test, pred.tolist())
df_test = add_known_user_column(
    test_with_scores, known_users=df_train[DEFAULT_USER_COL]
)

# Example of how to compute some metrics:

In [35]:
from ebrec.evaluation import AucScore, MrrScore, MetricEvaluator, NdcgScore
def compute_evaluation_scores(
    df: pl.DataFrame,
    metric_functions: Optional[list[MetricEvaluator]] = None,
    pred_score: str = "scores",
    labels: str = "labels",
) -> dict[str, float]:
    """
    Evaluate the predictions stored in a DataFrame with a set of ranking metrics.

    Args:
        df: Frame holding one column of prediction scores and one of labels.
        metric_functions: Metric objects handed to ``MetricEvaluator``. When
            ``None`` (the default), AUC, MRR, nDCG@5 and nDCG@10 are used.
        pred_score: Name of the column containing the prediction scores.
        labels: Name of the column containing the ground-truth labels.

    Returns:
        The ``evaluations`` attribute of the evaluated ``MetricEvaluator``.
    """
    # Build the default metrics per call rather than in the signature, so the
    # metric objects are not created once at definition time and shared.
    if metric_functions is None:
        metric_functions = [
            AucScore(),
            MrrScore(),
            NdcgScore(k=5),
            NdcgScore(k=10),
        ]
    y_pred = df[pred_score].to_list()
    y_true = df[labels].to_list()
    metr = MetricEvaluator(
        labels=y_true,
        predictions=y_pred,
        metric_functions=metric_functions,
    )
    return metr.evaluate().evaluations

# NOTE: the test-set labels are a dummy column (not the true labels), so the
# numbers below only demonstrate the API and are not a real evaluation.
compute_evaluation_scores(df_test)


{'auc': 0.4516887234900142,
 'mrr': 0.3148457418185679,
 'ndcg@5': 0.35970396106087243,
 'ndcg@10': 0.41935821814111035}