# Hello

This notebook is an example of how to make a beyond-accuracy dataset, and how one could make baselines

# Get started

## Dependencies

In [1]:
from ebrec.utils._python import write_json_file, read_json_file
from pathlib import Path
import polars as pl
import numpy as np

from ebrec.utils._constants import (
    DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL,
    DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_TOTAL_PAGEVIEWS_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_SENTIMENT_SCORE_COL,
    DEFAULT_SENTIMENT_LABEL_COL,
    DEFAULT_TOTAL_INVIEWS_COL,
    DEFAULT_IS_SUBSCRIBER_COL,
    DEFAULT_CATEGORY_STR_COL,
    DEFAULT_IS_SSO_USER_COL,
    DEFAULT_ARTICLE_ID_COL,
    DEFAULT_POSTCODE_COL,
    DEFAULT_TOPICS_COL,
    DEFAULT_GENDER_COL,
    DEFAULT_USER_COL,
    DEFAULT_AGE_COL,
)

from ebrec.evaluation.beyond_accuracy import (
    IntralistDiversity,
    Distribution,
    Serendipity,
    Sentiment,
    Coverage,
    Novelty,
)

from ebrec.utils._articles import create_sort_based_prediction_score
from ebrec.utils._behaviors import truncate_history

## Load dataset

In [2]:
ROOT_FOLDER = "evaluation_artifacts"
DATASET_SPLIT = "test"
DATASET_SIZE = "demo"
PATH = Path(f"../downloads/{DATASET_SIZE}")
PATH_BEYOND_ACCURACY = PATH.joinpath(ROOT_FOLDER)
PATH_BEYOND_ACCURACY.mkdir(exist_ok=True, parents=True)

# LOAD:
df_beyond_accuarcy = pl.scan_parquet(
    PATH.joinpath(DATASET_SPLIT, "behaviors.parquet")
).filter(pl.col("is_beyond_accuracy"))
df_behaviors = pl.scan_parquet(
    PATH.joinpath(DATASET_SPLIT, "behaviors.parquet")
).filter(~pl.col("is_beyond_accuracy"))
df_articles = pl.scan_parquet(PATH.joinpath("articles.parquet"))
df_history = pl.scan_parquet(PATH.joinpath(DATASET_SPLIT, "history.parquet")).select(
    DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL
)

### Output files for Beyond Accuracy

In [3]:
BEYOND_ACCURACY_HISTORY_DICT = "beyond_accuracy_history_dict.json"
BEYOND_ACCURACY_USERS_DICT = "beyond_accuracy_users_dict.json"
CANDIDATE_LIST = "candidate_list.json"
ARTICLES_DICT = "articles_dict.json"
BEHAVIORS_TIMESTAMP_DICT = "behaviors_timestamp_dict.json"

# Make / Dump Metadata

## Make candidate list for beyond-accuracy:

We select the candidate list from the testset

In [4]:
candidate_list = (
    df_beyond_accuarcy.select(pl.col(DEFAULT_INVIEW_ARTICLES_COL).first())
    .collect()
    .to_series()
)[0].to_list()
write_json_file(candidate_list, PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST))

print(f"Number of Candidate IDs: {len(candidate_list)} (example: {candidate_list[:5]})")
print(f"Dump: {PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST)}")

Number of Candidate IDs: 250 (example: [9793163, 9793069, 9792076, 9792749, 9791280])
Dump: ../downloads/demo/evaluation_artifacts/candidate_list.json


#### Sanity check

In [5]:
load_candidate_list = read_json_file(PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST))
if (
    not (
        df_beyond_accuarcy.select(DEFAULT_INVIEW_ARTICLES_COL).collect()
        == candidate_list
    )
    .sum()[DEFAULT_INVIEW_ARTICLES_COL]
    .to_list()[0]
    == df_beyond_accuarcy.select(DEFAULT_INVIEW_ARTICLES_COL).collect().shape[0]
):
    raise ValueError("candidate_list is not identical in the testset")

if not (np.array(candidate_list) - np.array(load_candidate_list)).sum() == 0:
    raise ValueError("candidate_list was not dump correctly")

print("santity check - passed")

santity check - passed


## User meta data: Segments

In [6]:
user_meta_columns = [
    DEFAULT_IS_SUBSCRIBER_COL,
    DEFAULT_IS_SSO_USER_COL,
    DEFAULT_POSTCODE_COL,
    DEFAULT_GENDER_COL,
    DEFAULT_AGE_COL,
]
df_users = df_beyond_accuarcy.select(pl.col(user_meta_columns)).collect()

users_dict = {col: df_users[col].to_list() for col in df_users.columns}
write_json_file(users_dict, PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_USERS_DICT))
print(f"Dump: {PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_USERS_DICT)}")

print(f"#rows: {df_users.shape[0]}")
df_users.head(3)

Dump: ../downloads/demo/evaluation_artifacts/beyond_accuracy_users_dict.json
#rows: 1615


is_subscriber,is_sso_user,postcode,gender,age
bool,bool,i8,i8,i8
True,True,,0.0,30.0
True,True,,,
True,True,,0.0,


## User Histories

In [7]:
HISTORY_SIZE = 10

df_user_histoies = (
    df_beyond_accuarcy.select(DEFAULT_USER_COL)
    .join(df_history, on=DEFAULT_USER_COL, how="left")
    .pipe(
        truncate_history,
        column=DEFAULT_HISTORY_ARTICLE_ID_COL,
        history_size=HISTORY_SIZE,
        padding_value=None,
        enable_warning=False,
    )
    .collect()
)
user_history_dict = {
    DEFAULT_HISTORY_ARTICLE_ID_COL: df_user_histoies[
        DEFAULT_HISTORY_ARTICLE_ID_COL
    ].to_list()
}
write_json_file(
    user_history_dict, PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_HISTORY_DICT)
)
print(f"Dump: {PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_HISTORY_DICT)}")
print(f"#rows: {df_user_histoies.shape[0]}")
df_user_histoies.head(3)

Dump: ../downloads/demo/evaluation_artifacts/beyond_accuracy_history_dict.json
#rows: 1615


user_id,article_id_fixed
u32,list[i32]
1744285,"[9790532, 9790532, … 9790700]"
631807,"[9790756, 9790811, … 9790804]"
1984028,"[9789300, 9790942, … 9789896]"


## Timestamp for Behaviors

Used for computing the AUC as function of time

In [8]:
df_behaviors_timestamp = df_behaviors.select(
    pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).cast(pl.Utf8),
).collect()
behaviors_timestamp_dict = {
    DEFAULT_IMPRESSION_TIMESTAMP_COL: df_behaviors_timestamp[
        DEFAULT_IMPRESSION_TIMESTAMP_COL
    ].to_list()
}
write_json_file(
    behaviors_timestamp_dict, PATH_BEYOND_ACCURACY.joinpath(BEHAVIORS_TIMESTAMP_DICT)
)
print(f"Dump: {PATH_BEYOND_ACCURACY.joinpath(BEHAVIORS_TIMESTAMP_DICT)}")
print(f"#rows: {df_behaviors_timestamp.shape[0]}")
df_behaviors_timestamp.head(3)

Dump: ../downloads/demo/evaluation_artifacts/behaviors_timestamp_dict.json
#rows: 27052


impression_time
str
"""2023-06-05 15:…"
"""2023-06-05 15:…"
"""2023-06-01 10:…"


# Make Candidate lookup dict / Dump lookup dict

### Articles to include: *candidate-list* and *history-articles*

In [9]:
history_article_id = (
    df_user_histoies.lazy()
    .select(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).explode().unique())
    .collect()[DEFAULT_HISTORY_ARTICLE_ID_COL]
    .to_list()
)
article_ids = candidate_list + history_article_id

## Select articles that should be included in the lookup dictionary

In [10]:
# =>
df_lookup_articles = (
    df_articles.filter(pl.col(DEFAULT_ARTICLE_ID_COL).is_in(article_ids))
    .with_columns(
        pl.col(
            DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL,
            DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL,
        ).cast(pl.Utf8)
    )
    # Zeros might cause issues
    .with_columns(
        pl.col(DEFAULT_TOTAL_INVIEWS_COL, DEFAULT_TOTAL_PAGEVIEWS_COL).fill_null(1)
    )
    .collect()
)

### Make normalize popularity-scores

In [11]:
DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MAX = (
    DEFAULT_TOTAL_PAGEVIEWS_COL + "_normalized_max"
)
DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX = (
    DEFAULT_TOTAL_PAGEVIEWS_COL + "_normalized_min_max"
)

MIN_X = df_lookup_articles[DEFAULT_TOTAL_PAGEVIEWS_COL].min()
MAX_X = df_lookup_articles[DEFAULT_TOTAL_PAGEVIEWS_COL].max()
MIN_RANGE = 1e-4
MAX_RANGE = 1.0

df_lookup_articles = df_lookup_articles.with_columns(
    (  # SIMPLE MAX NORMALIZATION: x / max()
        pl.col(DEFAULT_TOTAL_PAGEVIEWS_COL) / pl.col(DEFAULT_TOTAL_PAGEVIEWS_COL).max()
    ).alias(DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MAX)
).with_columns(
    (  #  MIN-MAX NORMALIZATION: ( x_i − X_min ⁡ ) / ( X_max ⁡ − X_min ⁡ ) * (max_range − min_range) + min_range
        ((pl.col(DEFAULT_TOTAL_PAGEVIEWS_COL) - MIN_X) / (MAX_X - MIN_X))
        * (MAX_RANGE - MIN_RANGE)
        + MIN_RANGE
    ).alias(
        DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX
    )
)

df_lookup_articles.select(
    DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MAX,
    DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX,
).describe()

statistic,total_pageviews_normalized_max,total_pageviews_normalized_min_max
str,f64,f64
"""count""",1832.0,1832.0
"""null_count""",0.0,0.0
"""mean""",0.103496,0.103584
"""std""",0.079458,0.07945
"""min""",1e-06,0.0001
"""25%""",0.049242,0.049336
"""50%""",0.089754,0.089844
"""75%""",0.144212,0.144296
"""max""",1.0,1.0


## Add embeddings representations

In [12]:
# => Embeddings:
BERT_VECTOR = "bert_base_multilingual_cased"
ROBERTA_VECTOR = "xlm_roberta_base"

CONTRASTIVE_VECTOR = "contrastive_vector"
DOCUMENT_VECTOR = "document_vector"


def load_join_embeddings(df: pl.DataFrame, emb_path: Path) -> pl.DataFrame:
    emb_contrastive = (
        pl.scan_parquet(PATH.parent.joinpath(emb_path))
        .filter(pl.col(DEFAULT_ARTICLE_ID_COL).is_in(df.select(DEFAULT_ARTICLE_ID_COL)))
        .collect()
    )
    return df.join(emb_contrastive, on=DEFAULT_ARTICLE_ID_COL, how="left")


df_lookup_articles = df_lookup_articles.pipe(
    load_join_embeddings,
    emb_path=f"embeddings/Ekstra_Bladet_contrastive_vector/{CONTRASTIVE_VECTOR}.parquet",
).pipe(
    load_join_embeddings,
    emb_path=f"embeddings/Ekstra_Bladet_word2vec/{DOCUMENT_VECTOR}.parquet",
)
print(f"#rows: {df_lookup_articles.shape[0]}")
df_lookup_articles.head(2)

#rows: 1832


article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label,total_pageviews_normalized_max,total_pageviews_normalized_min_max,contrastive_vector,document_vector
i32,str,str,str,bool,str,str,list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str,f64,f64,list[f32],list[f32]
3196611,"""Zoo-tårnet 100…","""I mange år var…","""2023-06-29 06:…",False,"""I mange år var…","""2005-06-10 05:…","[3067931, 3035588]","""article_defaul…","""https://ekstra…",[],[],"[""Kultur"", ""Museum og seværdighed""]",539,[],"""ferie""",1,1,,0.6275,"""Neutral""",1e-06,0.0001,"[-0.014504, 0.059745, … -0.046285]","[-0.01595, -0.071589, … 0.018402]"
3971783,"""Paradise-Maria…","""Reality-deltag…","""2023-06-29 06:…",False,"""Paradise Hotel…","""2013-04-17 17:…",,"""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Livsstil"", … ""Reality""]",414,[425],"""underholdning""",1,1,,0.959,"""Negative""",1e-06,0.0001,"[-0.008399, 0.025603, … 0.021549]","[0.046812, 0.012343, … 0.005147]"


## Convert to lookup dictionary:

In [13]:
articles_dict = {}
for row in df_lookup_articles.iter_rows(named=True):
    # Note, all keys in dictionaries are converted to strings, when serializing an object to JSON format.
    articles_dict[str(row[DEFAULT_ARTICLE_ID_COL])] = row
# Write it:
write_json_file(articles_dict, PATH_BEYOND_ACCURACY.joinpath(ARTICLES_DICT))
print(f"#articles: {len(articles_dict)}")
print(f"Dump: {PATH_BEYOND_ACCURACY.joinpath(ARTICLES_DICT)}")

#articles: 1832
Dump: ../downloads/demo/evaluation_artifacts/articles_dict.json


# Create Baselines

Make a couple *Baselines* based on the candidate-list:
1. @EditorialPicks: We approximate this based on the number **inview** an articles have recived. Ekstra Bladet is front-page driven, meaning, if an article has a lot of inview-impression (seen) a lot, we believe it has been selected to be in a top priority from the editors. This is static (it does change for our *candidate_list*), i.e., the computation is done once.
2. @Popular: We approximate this based on the number **clicks** an articles have recived. This is static (it does change for our *candidate_list*), i.e., the computation is done once.
3. @Random: Simple baseline and important baseline. We simple pick a set of *top-n* articles from the *candidate-list* and run multiple times.
4. @Dissimilarity / Similarity (will come later): Select top-n articles that are the most similar / dissimilar. 
5. @Newest: Simply pick the newest released articles. We do see newssite where the top banner is *Newest released*. We include it, but note this is very sensitive and might not be meaningful.

### Load the artifacts

In [14]:
def n_items(d) -> int:
    return len(d[list(d)[0]])


# =>
behaviors_timestamp_dict = read_json_file(
    PATH_BEYOND_ACCURACY.joinpath(BEHAVIORS_TIMESTAMP_DICT)
)
print(f"#behaviors_timestamp_dict: {n_items(behaviors_timestamp_dict)}")

# =>
history_dict = read_json_file(
    PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_HISTORY_DICT)
)
print(
    f"#history_dict: {n_items(history_dict)}\n history_dict.keys(): {history_dict.keys()}"
)

# =>
users_dict = read_json_file(PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_USERS_DICT))
print(f"#users_dict {n_items(users_dict)}\n users_dict.keys(): {users_dict.keys()}")

# =>
user_history_dict = read_json_file(
    PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_HISTORY_DICT)
)
print(
    f"#user_history_dict {n_items(user_history_dict)}\n users_dict.keys(): {user_history_dict.keys()}"
)

# =>
articles_dict = {
    int(key): val
    for key, val in read_json_file(PATH_BEYOND_ACCURACY.joinpath(ARTICLES_DICT)).items()
}
aid_keys = articles_dict[list(articles_dict)[0]].keys()
print(f"#articles_dict: {len(articles_dict)}\n articles_dict[ID].keys(): {aid_keys}")

# => Only the once actually found in the dataset (for demo only 154 of 250 are represent)
candidate_list = [
    id
    for id in read_json_file(PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST))
    if id in list(articles_dict)
]
print(f"#candidate_list: {len(candidate_list)}")

df_candidate_articles = df_lookup_articles.filter(
    pl.col(DEFAULT_ARTICLE_ID_COL).is_in(candidate_list)
)
print(f"#candidate-articles (df): {df_candidate_articles.shape[0]}")

#behaviors_timestamp_dict: 27052
#history_dict: 1615
 history_dict.keys(): dict_keys(['article_id_fixed'])
#users_dict 1615
 users_dict.keys(): dict_keys(['is_subscriber', 'is_sso_user', 'postcode', 'gender', 'age'])
#user_history_dict 1615
 users_dict.keys(): dict_keys(['article_id_fixed'])
#articles_dict: 1832
 articles_dict[ID].keys(): dict_keys(['article_id', 'title', 'subtitle', 'last_modified_time', 'premium', 'body', 'published_time', 'image_ids', 'article_type', 'url', 'ner_clusters', 'entity_groups', 'topics', 'category', 'subcategory', 'category_str', 'total_inviews', 'total_pageviews', 'total_read_time', 'sentiment_score', 'sentiment_label', 'total_pageviews_normalized_max', 'total_pageviews_normalized_min_max', 'contrastive_vector', 'document_vector'])
#candidate_list: 154
#candidate-articles (df): 154


## Make Ranked Candidate lists

### Editorical Pick


In [15]:
df_candidates_editorial_picks = create_sort_based_prediction_score(
    df_candidate_articles,
    column=DEFAULT_TOTAL_INVIEWS_COL,
    desc=True,
)
candidates_editorial_picks = np.array(
    [df_candidates_editorial_picks.select(DEFAULT_ARTICLE_ID_COL).to_series()]
)
# =>
print(candidates_editorial_picks[:, :2])
df_candidates_editorial_picks.head(2)

[[9790335 9791587]]


article_id,total_inviews,prediction_score
i32,i32,f64
9790335,1698890,1.0
9791587,1369829,0.5


### Popular

In [16]:
df_candidates_popular = create_sort_based_prediction_score(
    df_candidate_articles,
    column=DEFAULT_TOTAL_PAGEVIEWS_COL,
    desc=True,
)
candidates_popular = np.array(
    [df_candidates_popular.select(DEFAULT_ARTICLE_ID_COL).to_series()]
)
# =>
print(candidates_popular[:, :2])
df_candidates_popular.head(2)

[[9791428 9792719]]


article_id,total_pageviews,prediction_score
i32,i32,f64
9791428,256541,1.0
9792719,209050,0.5


### Newest

In [17]:
df_candidates_newest = create_sort_based_prediction_score(
    df_candidate_articles,
    column=DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL,
    desc=False,
)
candidates_newest = np.array(
    [df_candidates_newest.select(DEFAULT_ARTICLE_ID_COL).to_series()]
)
# =>
print(candidates_newest[:, :2])
df_candidates_newest.head(2)

[[9790515 9791205]]


article_id,published_time,prediction_score
i32,str,f64
9790515,"""2023-06-01 07:…",1.0
9791205,"""2023-06-01 07:…",0.5


## Init Metrics

In [18]:
instralist_diversity = IntralistDiversity()
distribution = Distribution()
serendipity = Serendipity()
sentiment = Sentiment()
coverage = Coverage()
novelty = Novelty()

## Setting Baselines (and your model)

In [19]:
RANDOM_ITER = df_beyond_accuarcy.select(DEFAULT_INVIEW_ARTICLES_COL).collect().shape[0]
TOP_N = 5

np.random.seed(123)
# Make list:
top_n_candidates_random = [
    np.random.choice(candidate_list, size=TOP_N, replace=False)
    for _ in range(RANDOM_ITER)
]
top_n_candidates_editorial_picks = candidates_editorial_picks[:, :TOP_N]
top_n_candidates_popular = candidates_popular[:, :TOP_N]
top_n_candidates_newest = candidates_newest[:, :TOP_N]
#
# Set them as tuples, just to loop through it:
candidates_name_pairs = [
    [top_n_candidates_editorial_picks, "editorial_picks"],
    [top_n_candidates_popular, "popular"],
    [top_n_candidates_random, "random"],
    [top_n_candidates_newest, "newest"],
]
# =>
user_history = user_history_dict[DEFAULT_HISTORY_ARTICLE_ID_COL]

print(f"#random-iterations: {RANDOM_ITER}")
print(f"Top@{TOP_N} ranked articles")

#random-iterations: 1615
Top@5 ranked articles


### Your Model
Try to add your model's prediction of the candidate list. In this notebook we just take a random sample.

In [20]:
candidates_your_model = [
    np.random.choice(candidate_list, size=TOP_N, replace=False)
    for _ in range(RANDOM_ITER)
]
candidates_name_pairs.append([candidates_your_model, "random_2"])

## User-level

### Instralist-Diversity

In [21]:
instralist_diversity_dict = {"name": ["mean", "std"]}

for candidates, list_name in candidates_name_pairs:
    scores = instralist_diversity(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=CONTRASTIVE_VECTOR,
    )
    instralist_diversity_dict[f"{list_name}_{instralist_diversity.name}"] = [
        scores.mean(),
        scores.std(),
    ]

pl.DataFrame(instralist_diversity_dict)

name,editorial_picks_intralist_diversity,popular_intralist_diversity,random_intralist_diversity,newest_intralist_diversity,random_2_intralist_diversity
str,f64,f64,f64,f64,f64
"""mean""",0.790542,0.840236,0.789926,0.91091,0.787597
"""std""",0.0,0.0,0.062761,0.0,0.065585


#### The embedding representation
This might be obvious, but the embedding representation used for computing a metric is very influential. Hence, baselines are important to determine high and low scores. Also, this is why these metrics can be very hard to interpret for us.

In [22]:
ROBERTA_EMB = "FacebookAI/xlm-roberta-base"
BERT_EMB = "google-bert/bert-base-multilingual-cased"
print(
    f"{CONTRASTIVE_VECTOR}: {instralist_diversity(top_n_candidates_editorial_picks, lookup_dict=articles_dict, lookup_key=CONTRASTIVE_VECTOR).mean()}"
)
print(
    f"{DOCUMENT_VECTOR}: {instralist_diversity(top_n_candidates_editorial_picks, lookup_dict=articles_dict, lookup_key=DOCUMENT_VECTOR).mean()}"
)

contrastive_vector: 0.7905415595169083
document_vector: 0.1584846028291677


### Sentiment

In [23]:
sentiment_dict = {"name": ["mean", "std"]}

for candidates, list_name in candidates_name_pairs:
    scores = sentiment(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=DEFAULT_SENTIMENT_SCORE_COL,
    )
    sentiment_dict[f"{list_name}_{sentiment.name}"] = [
        scores.mean(),
        scores.std(),
    ]

pl.DataFrame(sentiment_dict)

name,editorial_picks_sentiment,popular_sentiment,random_sentiment,newest_sentiment,random_2_sentiment
str,f64,f64,f64,f64,f64
"""mean""",0.7294,0.78342,0.840198,0.8303,0.840806
"""std""",0.0,0.0,0.067306,0.0,0.068372


### Novelty

In [24]:
novelty_dict = {"name": ["mean", "std"]}

for candidates, list_name in candidates_name_pairs:
    scores = novelty(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX,
    )
    novelty_dict[f"{list_name}_{novelty.name}"] = [
        scores.mean(),
        scores.std(),
    ]

pl.DataFrame(novelty_dict)

name,editorial_picks_novelty,popular_novelty,random_novelty,newest_novelty,random_2_novelty
str,f64,f64,f64,f64,f64
"""mean""",3.52556,1.968051,4.166654,3.997557,4.237515
"""std""",0.0,0.0,0.869856,0.0,0.853973


### Serendipity
When computing Serendipity it using the user's history; similarity between recommendations and browsed items

In [25]:
serendipity_dict = {"name": ["mean", "std"]}
for candidates, list_name in candidates_name_pairs:
    if len(candidates) == 1:
        candidates = np.tile(candidates, len(user_history)).reshape(-1, TOP_N)
    #
    scores = serendipity(
        candidates,
        H=user_history,
        lookup_dict=articles_dict,
        lookup_key=CONTRASTIVE_VECTOR,
    )
    serendipity_dict[f"{list_name}_{serendipity.name}"] = [scores.mean(), scores.std()]

pl.DataFrame(serendipity_dict)

name,editorial_picks_serendipity,popular_serendipity,random_serendipity,newest_serendipity,random_2_serendipity
str,f64,f64,f64,f64,f64
"""mean""",0.783484,0.789688,0.79126,0.832755,0.791841
"""std""",0.031662,0.031837,0.039926,0.020198,0.040973


## Model-level

### Coverage

In [26]:
coverage_dict = {"name": ["count", "fraction"]}
for candidates, list_name in candidates_name_pairs:
    coverage_dict[f"{list_name}_{coverage.name}"] = coverage(candidates, candidate_list)

pl.DataFrame(coverage_dict)

name,editorial_picks_coverage,popular_coverage,random_coverage,newest_coverage,random_2_coverage
str,f64,f64,f64,f64,f64
"""count""",5.0,5.0,154.0,5.0,154.0
"""fraction""",0.032468,0.032468,1.0,0.032468,1.0


### Distribution - Category

#### Distribution helper function

In [27]:
def concat_distribution_dict(dict_: dict) -> dict:
    return (
        pl.concat(
            [pl.DataFrame(val) for val in dict_.values()],
            how="diagonal",
        )
        .with_row_index(name="name")
        .with_columns(pl.Series(dict_.keys()).alias("name"))
    ).to_dict()

In [28]:
COLUMN = DEFAULT_CATEGORY_STR_COL

distribution_dict = {
    f"{list_name}_{novelty.name}": distribution(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=COLUMN,
    )
    for candidates, list_name in candidates_name_pairs
}

distribution_category_dict = concat_distribution_dict(distribution_dict)
pl.DataFrame(distribution_category_dict)

name,forbrug,nyheder,krimi,sport,penge,musik,underholdning,nationen
str,f64,f64,f64,f64,f64,f64,f64,f64
"""editorial_pick…",0.2,0.2,0.2,0.2,0.2,,,
"""popular_novelt…",,0.2,0.2,0.4,0.2,,,
"""random_novelty…",0.019938,0.263158,0.150217,0.272941,0.062539,0.058452,0.133746,0.039009
"""newest_novelty…",,,0.4,0.2,0.2,,0.2,
"""random_2_novel…",0.022415,0.262167,0.145759,0.27839,0.061176,0.056223,0.134985,0.038885


### Distribution - Sentiment

In [29]:
COLUMN = DEFAULT_SENTIMENT_LABEL_COL

distribution_dict = {
    f"{list_name}_{novelty.name}": distribution(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=COLUMN,
    )
    for candidates, list_name in candidates_name_pairs
}

distribution_sentiment_dict = concat_distribution_dict(distribution_dict)
pl.DataFrame(distribution_sentiment_dict)

name,Negative,Positive,Neutral
str,f64,f64,f64
"""editorial_pick…",0.4,0.6,
"""popular_novelt…",0.2,0.4,0.4
"""random_novelty…",0.552817,0.172136,0.275046
"""newest_novelty…",0.6,0.2,0.2
"""random_2_novel…",0.554303,0.172136,0.27356


### Distribution - Topics

In [30]:
COLUMN = DEFAULT_TOPICS_COL

distribution_dict = {
    f"{list_name}_{novelty.name}": distribution(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=COLUMN,
    )
    for candidates, list_name in candidates_name_pairs
}

distribution_topics_dict = concat_distribution_dict(distribution_dict)
pl.DataFrame(distribution_topics_dict)

name,Livsstil,Sundhed,Sygdom og behandling,Kendt,Bolig,Køb og salg,Kriminalitet,Bandekriminalitet,Sport,Erhverv,Privat virksomhed,Økonomi,Mikro,Politik,International politik,Personfarlig kriminalitet,Offentlig instans,Begivenhed,Fodbold,Sportsbegivenhed,Ketcher- og batsport,Underholdning,Musik og lyd,Underholdningsbegivenhed,Transportmiddel,Katastrofe,Mindre ulykke,Større transportmiddel,Uddannelse,Ungdomsuddannelse,National politik,Ansættelsesforhold,Motorsport,Makro,Samfund,Videnskab,Naturvidenskab,Film og tv,Konflikt og krig,Væbnet konflikt,Familieliv,Udlejning,Bil,Håndbold,Dyr,Større katastrofe,Kultur,Mad og drikke,Bedrageri,Vejr,Bæredygtighed og klima,Kosmetisk behandling,Offentlig transport,Partnerskab,Teknologi,Kunstig intelligens og software,Værdier,Reality,Renovering og indretning,Forbrugerelektronik
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""editorial_pick…",0.133333,0.066667,0.066667,0.133333,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""popular_novelt…",,,,0.157895,,,0.052632,,0.105263,0.105263,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""random_novelty…",0.025235,0.013574,0.011912,0.104201,0.015705,0.01279,0.054577,0.004295,0.067618,0.066426,0.029122,0.027335,0.007367,0.046959,0.020063,0.035266,0.003574,0.044765,0.044796,0.025047,0.00953,0.041881,0.015956,0.016144,0.021944,0.02116,0.014451,0.008056,0.006207,0.006207,0.025266,0.031599,0.003448,0.012351,0.016426,0.001411,0.001411,0.011129,0.010282,0.003542,0.003542,0.001442,0.009154,0.003229,0.001975,0.001411,0.001944,0.004796,0.003009,0.003511,0.003135,0.001693,0.001442,0.003072,0.006364,0.00326,0.003229,0.001693,0.001473,0.001599
"""newest_novelty…",,,,0.066667,,,0.066667,,0.066667,0.066667,0.066667,0.066667,,,,0.066667,,0.066667,,0.066667,0.066667,0.066667,,,,0.066667,0.066667,,0.066667,0.066667,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""random_2_novel…",0.023964,0.012576,0.011168,0.104114,0.015799,0.01242,0.053715,0.004317,0.068763,0.065947,0.029032,0.026498,0.006163,0.047302,0.020554,0.034069,0.003003,0.045081,0.044205,0.025872,0.010762,0.041764,0.015548,0.01583,0.0229,0.021367,0.015298,0.009291,0.006601,0.006601,0.02484,0.032066,0.003598,0.012013,0.016299,0.001533,0.001533,0.011231,0.010136,0.003473,0.003128,0.001721,0.00951,0.003566,0.001283,0.001658,0.00147,0.005005,0.003629,0.002784,0.003254,0.001408,0.00194,0.002972,0.007258,0.003441,0.003785,0.001595,0.001658,0.001689
