# Hello

This notebook is an example of how to make a beyond-accuracy dataset, and how one could make baselines

# Get started

## Dependencies

In [28]:
from ebrec.utils._python import (
    rank_predictions_by_score,
    write_submission_file,
    write_json_file,
    read_json_file,
)
from pathlib import Path
import polars as pl
import numpy as np

from ebrec.utils._constants import *
from ebrec.evaluation.beyond_accuracy import (
    IntralistDiversity,
    Distribution,
    Serendipity,
    Sentiment,
    Coverage,
    Novelty,
)

from ebrec.utils._articles import create_sort_based_prediction_score
from ebrec.utils._behaviors import truncate_history

## Set paths

In [29]:
ROOT_FOLDER = "evaluation_artifacts"
DATASET_SPLIT = "test"
DATASET_SIZE = "demo"
# ROOT PATH:
PATH = Path(f"../downloads/{DATASET_SIZE}")

## We are using the LARGE articles dataset to ensure we have all articles IDs,
# for beyond-accuracy; as only 154 aids are found in the demo.
ARTICLES_PATH = PATH.parent.joinpath("large")

# PATH TO DUMP ARTIFACTS:
PATH_BEYOND_ACCURACY = PATH.joinpath(ROOT_FOLDER)
PATH_BEYOND_ACCURACY.mkdir(exist_ok=True, parents=True)
# BASELINE ARTIFACTS:
PATH_BEYOND_ACCURACY_BASELINES = PATH_BEYOND_ACCURACY.joinpath("baselines")
PATH_BEYOND_ACCURACY_BASELINES.mkdir(parents=True, exist_ok=True)

### Output files

In [30]:
BEYOND_ACCURACY_HISTORY_DICT = "beyond_accuracy_history_dict.json"
BEYOND_ACCURACY_USERS_DICT = "beyond_accuracy_users_dict.json"
CANDIDATE_LIST = "candidate_list.json"
ARTICLES_DICT = "articles_dict.json"
BEHAVIORS_TIMESTAMP_DICT = "behaviors_timestamp_dict.json"
#
BASELINE_DIVERSITY = "instralist_diversity.json"
BASELINE_SENTIMENT_SCORE = "sentiment_score.json"
BASELINE_NOVELTY = "novelty.json"
BASELINE_SERENDIPITY = "serendipity.json"
BASELINE_COVERAGE = "coverage.json"
BASELINE_DISTRIBUTION_CATEGORY = "distribution_category.json"
BASELINE_DISTRIBUTION_SENTIMENT_LABEL = "distribution_sentiment_label.json"
BASELINE_DISTRIBUTION_TOPICS = "distribution_topics.json"

## Load dataset

In [31]:
# Load data:
df_beyond_accuarcy = pl.scan_parquet(
    PATH.joinpath(DATASET_SPLIT, "behaviors.parquet")
).filter(pl.col("is_beyond_accuracy"))
df_behaviors = pl.scan_parquet(
    PATH.joinpath(DATASET_SPLIT, "behaviors.parquet")
).filter(~pl.col("is_beyond_accuracy"))
df_articles = pl.scan_parquet(ARTICLES_PATH.joinpath("articles.parquet"))
df_history = pl.scan_parquet(PATH.joinpath(DATASET_SPLIT, "history.parquet")).select(
    DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL
)

# Make / Dump Metadata

## Candidate list:

We select the candidate list from the testset

In [32]:
candidate_list = (
    df_beyond_accuarcy.select(pl.col(DEFAULT_INVIEW_ARTICLES_COL).first())
    .collect()
    .to_series()
)[0].to_list()


write_json_file(
    candidate_list, PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST), verbose=True
)

print(f"Number of Candidate IDs: {len(candidate_list)} (example: {candidate_list[:5]})")

Writing JSON: '../downloads/demo/evaluation_artifacts/candidate_list.json'
Number of Candidate IDs: 250 (example: [9793163, 9793069, 9792076, 9792749, 9791280])


#### Sanity check

In [33]:
load_candidate_list = read_json_file(PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST))
if (
    not (
        df_beyond_accuarcy.select(DEFAULT_INVIEW_ARTICLES_COL).collect()
        == candidate_list
    )
    .sum()[DEFAULT_INVIEW_ARTICLES_COL]
    .to_list()[0]
    == df_beyond_accuarcy.select(DEFAULT_INVIEW_ARTICLES_COL).collect().shape[0]
):
    raise ValueError("candidate_list is not identical in the testset")

if not (np.array(candidate_list) - np.array(load_candidate_list)).sum() == 0:
    raise ValueError("candidate_list was not dump correctly")

print("santity check - passed")

santity check - passed


## User meta data: Segments

In [34]:
user_meta_columns = [
    DEFAULT_IS_SUBSCRIBER_COL,
    DEFAULT_IS_SSO_USER_COL,
    DEFAULT_POSTCODE_COL,
    DEFAULT_GENDER_COL,
    DEFAULT_AGE_COL,
]
df_users = df_beyond_accuarcy.select(pl.col(user_meta_columns)).collect()

users_dict = {col: df_users[col].to_list() for col in df_users.columns}
write_json_file(
    users_dict, PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_USERS_DICT), verbose=True
)
print(f"#rows: {df_users.shape[0]}")
df_users.head(3)

Writing JSON: '../downloads/demo/evaluation_artifacts/beyond_accuracy_users_dict.json'
#rows: 1615


is_subscriber,is_sso_user,postcode,gender,age
bool,bool,i8,i8,i8
True,True,,0.0,30.0
True,True,,,
True,True,,0.0,


## User Histories

In [35]:
HISTORY_SIZE = 10

df_user_histoies = (
    df_beyond_accuarcy.select(DEFAULT_USER_COL)
    .join(df_history, on=DEFAULT_USER_COL, how="left")
    .pipe(
        truncate_history,
        column=DEFAULT_HISTORY_ARTICLE_ID_COL,
        history_size=HISTORY_SIZE,
        padding_value=None,
        enable_warning=False,
    )
    .collect()
)
user_history_dict = {
    DEFAULT_HISTORY_ARTICLE_ID_COL: df_user_histoies[
        DEFAULT_HISTORY_ARTICLE_ID_COL
    ].to_list()
}
write_json_file(
    user_history_dict,
    PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_HISTORY_DICT),
    verbose=True,
)
print(f"#rows: {df_user_histoies.shape[0]}")
df_user_histoies.head(3)

Writing JSON: '../downloads/demo/evaluation_artifacts/beyond_accuracy_history_dict.json'
#rows: 1615


user_id,article_id_fixed
u32,list[i32]
1744285,"[9790532, 9790532, … 9790700]"
631807,"[9790756, 9790811, … 9790804]"
1984028,"[9789300, 9790942, … 9789896]"


## Timestamp for Behaviors

Used for computing the AUC as function of time

In [36]:
df_behaviors_timestamp = df_behaviors.select(
    pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).cast(pl.Utf8),
).collect()
behaviors_timestamp_dict = {
    DEFAULT_IMPRESSION_TIMESTAMP_COL: df_behaviors_timestamp[
        DEFAULT_IMPRESSION_TIMESTAMP_COL
    ].to_list()
}
write_json_file(
    behaviors_timestamp_dict,
    PATH_BEYOND_ACCURACY.joinpath(BEHAVIORS_TIMESTAMP_DICT),
    verbose=True,
)
print(f"#rows: {df_behaviors_timestamp.shape[0]}")
df_behaviors_timestamp.head(3)

Writing JSON: '../downloads/demo/evaluation_artifacts/behaviors_timestamp_dict.json'
#rows: 27052


impression_time
str
"""2023-06-05 15:…"
"""2023-06-05 15:…"
"""2023-06-01 10:…"


# Make Candidate lookup dict / Dump lookup dict

## Articles to include: *candidate-list* and *history-articles*

In [37]:
history_article_id = (
    df_user_histoies.lazy()
    .select(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).explode().unique())
    .collect()[DEFAULT_HISTORY_ARTICLE_ID_COL]
    .to_list()
)
print(f"#history_article_id: {len(history_article_id)})")

#history_article_id: 1678)


Note, the different datasizes (*demo*, *small*, and *large*) has subset of the total article-catelog. Hence, if you're using *demo*, not all of the articles in the candidate-list may be in the dataset.

In [38]:
aids_in_split = (
    df_articles.select(DEFAULT_ARTICLE_ID_COL)
    .collect()[DEFAULT_ARTICLE_ID_COL]
    .to_list()
)

history_article_id = [id for id in history_article_id if id in aids_in_split]
candidate_list = [id for id in candidate_list if id in aids_in_split]

article_ids = candidate_list + history_article_id
print(
    f"#articles: {len(article_ids)} (#candidate_list: {len(candidate_list)} & #history_article_id: {len(history_article_id)})"
)

#articles: 1928 (#candidate_list: 250 & #history_article_id: 1678)


## Select articles that should be included in the lookup dictionary

In [39]:
# =>
df_lookup_articles = (
    df_articles.filter(pl.col(DEFAULT_ARTICLE_ID_COL).is_in(article_ids))
    .with_columns(
        pl.col(
            DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL,
            DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL,
        ).cast(pl.Utf8)
    )
    # Zeros might cause issues
    .with_columns(
        pl.col(DEFAULT_TOTAL_INVIEWS_COL, DEFAULT_TOTAL_PAGEVIEWS_COL).fill_null(1)
    )
    .collect()
)
print(f"df_lookup_articles shape: {df_lookup_articles.shape}")

df_lookup_articles shape: (1928, 21)


### Make normalize popularity-scores

In [40]:
DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MAX = (
    DEFAULT_TOTAL_PAGEVIEWS_COL + "_normalized_max"
)
DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX = (
    DEFAULT_TOTAL_PAGEVIEWS_COL + "_normalized_min_max"
)

MIN_X = df_lookup_articles[DEFAULT_TOTAL_PAGEVIEWS_COL].min()
MAX_X = df_lookup_articles[DEFAULT_TOTAL_PAGEVIEWS_COL].max()
MIN_RANGE = 1e-4
MAX_RANGE = 1.0

df_lookup_articles = df_lookup_articles.with_columns(
    (  # SIMPLE MAX NORMALIZATION: x / max()
        pl.col(DEFAULT_TOTAL_PAGEVIEWS_COL) / pl.col(DEFAULT_TOTAL_PAGEVIEWS_COL).max()
    ).alias(DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MAX)
).with_columns(
    (  #  MIN-MAX NORMALIZATION: ( x_i − X_min ⁡ ) / ( X_max ⁡ − X_min ⁡ ) * (max_range − min_range) + min_range
        ((pl.col(DEFAULT_TOTAL_PAGEVIEWS_COL) - MIN_X) / (MAX_X - MIN_X))
        * (MAX_RANGE - MIN_RANGE)
        + MIN_RANGE
    ).alias(
        DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX
    )
)

df_lookup_articles.select(
    DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MAX,
    DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX,
).describe()

statistic,total_pageviews_normalized_max,total_pageviews_normalized_min_max
str,f64,f64
"""count""",1928.0,1928.0
"""null_count""",0.0,0.0
"""mean""",0.098342,0.098431
"""std""",0.08066,0.080652
"""min""",1e-06,0.0001
"""25%""",0.042917,0.043012
"""50%""",0.084734,0.084825
"""75%""",0.140855,0.14094
"""max""",1.0,1.0


## Add embeddings representations

In [41]:
# => Embeddings:
BERT_VECTOR = "bert_base_multilingual_cased"
ROBERTA_VECTOR = "xlm_roberta_base"

CONTRASTIVE_VECTOR = "contrastive_vector"
DOCUMENT_VECTOR = "document_vector"


def load_join_embeddings(df: pl.DataFrame, emb_path: Path) -> pl.DataFrame:
    emb_contrastive = (
        pl.scan_parquet(PATH.parent.joinpath(emb_path))
        .filter(pl.col(DEFAULT_ARTICLE_ID_COL).is_in(df.select(DEFAULT_ARTICLE_ID_COL)))
        .collect()
    )
    return df.join(emb_contrastive, on=DEFAULT_ARTICLE_ID_COL, how="left")


df_lookup_articles = df_lookup_articles.pipe(
    load_join_embeddings,
    emb_path=f"embeddings/Ekstra_Bladet_contrastive_vector/{CONTRASTIVE_VECTOR}.parquet",
).pipe(
    load_join_embeddings,
    emb_path=f"embeddings/Ekstra_Bladet_word2vec/{DOCUMENT_VECTOR}.parquet",
)
print(f"#rows: {df_lookup_articles.shape[0]}")
df_lookup_articles.head(2)

#rows: 1928


article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label,total_pageviews_normalized_max,total_pageviews_normalized_min_max,contrastive_vector,document_vector
i32,str,str,str,bool,str,str,list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str,f64,f64,list[f32],list[f32]
3196611,"""Zoo-tårnet 100…","""I mange år var…","""2023-06-29 06:…",False,"""I mange år var…","""2005-06-10 05:…","[3067931, 3035588]","""article_defaul…","""https://ekstra…",[],[],"[""Kultur"", ""Museum og seværdighed""]",539,[],"""ferie""",1,1,,0.6275,"""Neutral""",1e-06,0.0001,"[-0.014504, 0.059745, … -0.046285]","[-0.01595, -0.071589, … 0.018402]"
3971783,"""Paradise-Maria…","""Reality-deltag…","""2023-06-29 06:…",False,"""Paradise Hotel…","""2013-04-17 17:…",,"""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Livsstil"", … ""Reality""]",414,[425],"""underholdning""",1,1,,0.959,"""Negative""",1e-06,0.0001,"[-0.008399, 0.025603, … 0.021549]","[0.046812, 0.012343, … 0.005147]"


## Convert to lookup dictionary:

In [42]:
articles_dict = {}
for row in df_lookup_articles.iter_rows(named=True):
    # Note, all keys in dictionaries are converted to strings, when serializing an object to JSON format.
    articles_dict[str(row[DEFAULT_ARTICLE_ID_COL])] = row
# Write it:
write_json_file(
    articles_dict, PATH_BEYOND_ACCURACY.joinpath(ARTICLES_DICT), verbose=True
)
print(f"#articles: {len(articles_dict)}")

Writing JSON: '../downloads/demo/evaluation_artifacts/articles_dict.json'
#articles: 1928


# Create Baselines

Make a couple *Baselines* based on the candidate-list:
1. @EditorialPicks: We approximate this based on the number **inview** an articles have recived. Ekstra Bladet is front-page driven, meaning, if an article has a lot of inview-impression (seen) a lot, we believe it has been selected to be in a top priority from the editors. This is static (it does change for our *candidate_list*), i.e., the computation is done once.
2. @Popular: We approximate this based on the number **clicks** an articles have recived. This is static (it does change for our *candidate_list*), i.e., the computation is done once.
3. @Random: Simple baseline and important baseline. We simple pick a set of *top-n* articles from the *candidate-list* and run multiple times.
4. @Dissimilarity / Similarity (will come later): Select top-n articles that are the most similar / dissimilar. 
5. @Newest: Simply pick the newest released articles. We do see newssite where the top banner is *Newest released*. We include it, but note this is very sensitive and might not be meaningful.

### Load the artifacts

In [43]:
def n_items(d) -> int:
    return len(d[list(d)[0]])


# =>
behaviors_timestamp_dict = read_json_file(
    PATH_BEYOND_ACCURACY.joinpath(BEHAVIORS_TIMESTAMP_DICT)
)
print(f"#behaviors_timestamp_dict: {n_items(behaviors_timestamp_dict)}")

# =>
history_dict = read_json_file(
    PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_HISTORY_DICT)
)
print(
    f"#history_dict: {n_items(history_dict)}\n history_dict.keys(): {history_dict.keys()}"
)

# =>
users_dict = read_json_file(PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_USERS_DICT))
print(f"#users_dict {n_items(users_dict)}\n users_dict.keys(): {users_dict.keys()}")

# =>
user_history_dict = read_json_file(
    PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_HISTORY_DICT)
)
print(
    f"#user_history_dict {n_items(user_history_dict)}\n users_dict.keys(): {user_history_dict.keys()}"
)

# =>
articles_dict = {
    int(key): val
    for key, val in read_json_file(PATH_BEYOND_ACCURACY.joinpath(ARTICLES_DICT)).items()
}
aid_keys = articles_dict[list(articles_dict)[0]].keys()
print(f"#articles_dict: {len(articles_dict)}\n articles_dict[ID].keys(): {aid_keys}")

# => Only the once actually found in the dataset (for demo only 154 of 250 are represent)
candidate_list = [
    id
    for id in read_json_file(PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST))
    if id in list(articles_dict)
]
print(f"#candidate_list: {len(candidate_list)}")

df_candidate_articles = df_lookup_articles.filter(
    pl.col(DEFAULT_ARTICLE_ID_COL).is_in(candidate_list)
)
print(f"#candidate-articles (df): {df_candidate_articles.shape[0]}")

#behaviors_timestamp_dict: 27052
#history_dict: 1615
 history_dict.keys(): dict_keys(['article_id_fixed'])
#users_dict 1615
 users_dict.keys(): dict_keys(['is_subscriber', 'is_sso_user', 'postcode', 'gender', 'age'])
#user_history_dict 1615
 users_dict.keys(): dict_keys(['article_id_fixed'])
#articles_dict: 1928
 articles_dict[ID].keys(): dict_keys(['article_id', 'title', 'subtitle', 'last_modified_time', 'premium', 'body', 'published_time', 'image_ids', 'article_type', 'url', 'ner_clusters', 'entity_groups', 'topics', 'category', 'subcategory', 'category_str', 'total_inviews', 'total_pageviews', 'total_read_time', 'sentiment_score', 'sentiment_label', 'total_pageviews_normalized_max', 'total_pageviews_normalized_min_max', 'contrastive_vector', 'document_vector'])
#candidate_list: 250
#candidate-articles (df): 250


## Make Ranked Candidate lists

### Editorical Pick


In [44]:
df_candidates_editorial_picks = create_sort_based_prediction_score(
    df_candidate_articles,
    column=DEFAULT_TOTAL_INVIEWS_COL,
    desc=True,
)
candidates_editorial_picks = np.array(
    [df_candidates_editorial_picks.select(DEFAULT_ARTICLE_ID_COL).to_series()]
)
# =>
print(candidates_editorial_picks[:, :2])
df_candidates_editorial_picks.head(2)

[[9790335 9791587]]


article_id,total_inviews,prediction_score
i32,i32,f64
9790335,1698890,1.0
9791587,1369829,0.5


### Popular

In [45]:
df_candidates_popular = create_sort_based_prediction_score(
    df_candidate_articles,
    column=DEFAULT_TOTAL_PAGEVIEWS_COL,
    desc=True,
)
candidates_popular = np.array(
    [df_candidates_popular.select(DEFAULT_ARTICLE_ID_COL).to_series()]
)
# =>
print(candidates_popular[:, :2])
df_candidates_popular.head(2)

[[9791428 9792719]]


article_id,total_pageviews,prediction_score
i32,i32,f64
9791428,256541,1.0
9792719,209050,0.5


### Newest

In [46]:
df_candidates_newest = create_sort_based_prediction_score(
    df_candidate_articles,
    column=DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL,
    desc=False,
)
candidates_newest = np.array(
    [df_candidates_newest.select(DEFAULT_ARTICLE_ID_COL).to_series()]
)
# =>
print(candidates_newest[:, :2])
df_candidates_newest.head(2)

[[9790515 9791205]]


article_id,published_time,prediction_score
i32,str,f64
9790515,"""2023-06-01 07:…",1.0
9791205,"""2023-06-01 07:…",0.5


## Init Metrics

In [47]:
instralist_diversity = IntralistDiversity()
distribution = Distribution()
serendipity = Serendipity()
sentiment = Sentiment()
coverage = Coverage()
novelty = Novelty()

## Setting Baselines (and your model)

In [48]:
RANDOM_ITER = df_beyond_accuarcy.select(DEFAULT_INVIEW_ARTICLES_COL).collect().shape[0]
TOP_N = 5

np.random.seed(123)
# Make list:
top_n_candidates_random = [
    np.random.choice(candidate_list, size=TOP_N, replace=False)
    for _ in range(RANDOM_ITER)
]
top_n_candidates_editorial_picks = candidates_editorial_picks[:, :TOP_N]
top_n_candidates_popular = candidates_popular[:, :TOP_N]
top_n_candidates_newest = candidates_newest[:, :TOP_N]
#
# Set them as tuples, just to loop through it:
candidates_name_pairs = [
    [top_n_candidates_editorial_picks, "editorial_picks"],
    [top_n_candidates_popular, "popular"],
    [top_n_candidates_random, "random"],
    [top_n_candidates_newest, "newest"],
]
# =>
user_history = user_history_dict[DEFAULT_HISTORY_ARTICLE_ID_COL]

print(f"#random-iterations: {RANDOM_ITER}")
print(f"Top@{TOP_N} ranked articles")

#random-iterations: 1615
Top@5 ranked articles


### Your Model
Try to add your model's prediction of the candidate list. In this notebook we just take a random sample.

In [49]:
candidates_your_model = [
    np.random.choice(candidate_list, size=TOP_N, replace=False)
    for _ in range(RANDOM_ITER)
]
candidates_name_pairs.append([candidates_your_model, "random_2"])

## User-level

### Instralist-Diversity

In [50]:
instralist_diversity_dict = {"name": ["mean", "std"]}

for candidates, list_name in candidates_name_pairs:
    scores = instralist_diversity(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=CONTRASTIVE_VECTOR,
    )
    instralist_diversity_dict[f"{list_name}_{instralist_diversity.name}"] = [
        scores.mean(),
        scores.std(),
    ]

write_json_file(
    instralist_diversity_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_DIVERSITY),
    verbose=True,
)

pl.DataFrame(instralist_diversity_dict)

Writing JSON: '../downloads/demo/evaluation_artifacts/baselines/instralist_diversity.json'


name,editorial_picks_intralist_diversity,popular_intralist_diversity,random_intralist_diversity,newest_intralist_diversity,random_2_intralist_diversity
str,f64,f64,f64,f64,f64
"""mean""",0.790542,0.840236,0.751938,0.91091,0.749204
"""std""",0.0,0.0,0.091009,0.0,0.093655


#### The embedding representation
This might be obvious, but the embedding representation used for computing a metric is very influential. Hence, baselines are important to determine high and low scores. Also, this is why these metrics can be very hard to interpret for us.

In [51]:
ROBERTA_EMB = "FacebookAI/xlm-roberta-base"
BERT_EMB = "google-bert/bert-base-multilingual-cased"
print(
    f"{CONTRASTIVE_VECTOR}: {instralist_diversity(top_n_candidates_editorial_picks, lookup_dict=articles_dict, lookup_key=CONTRASTIVE_VECTOR).mean()}"
)
print(
    f"{DOCUMENT_VECTOR}: {instralist_diversity(top_n_candidates_editorial_picks, lookup_dict=articles_dict, lookup_key=DOCUMENT_VECTOR).mean()}"
)

contrastive_vector: 0.7905415595169083
document_vector: 0.1584846028291677


### Sentiment

In [52]:
sentiment_dict = {"name": ["mean", "std"]}

for candidates, list_name in candidates_name_pairs:
    scores = sentiment(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=DEFAULT_SENTIMENT_SCORE_COL,
    )
    sentiment_dict[f"{list_name}_{sentiment.name}"] = [
        scores.mean(),
        scores.std(),
    ]

write_json_file(
    sentiment_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_SENTIMENT_SCORE),
    verbose=True,
)

pl.DataFrame(sentiment_dict)

Writing JSON: '../downloads/demo/evaluation_artifacts/baselines/sentiment_score.json'


name,editorial_picks_sentiment,popular_sentiment,random_sentiment,newest_sentiment,random_2_sentiment
str,f64,f64,f64,f64,f64
"""mean""",0.7294,0.78342,0.820845,0.8303,0.820305
"""std""",0.0,0.0,0.071638,0.0,0.070301


### Novelty

In [53]:
novelty_dict = {"name": ["mean", "std"]}

for candidates, list_name in candidates_name_pairs:
    scores = novelty(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX,
    )
    novelty_dict[f"{list_name}_{novelty.name}"] = [
        scores.mean(),
        scores.std(),
    ]

write_json_file(
    novelty_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_NOVELTY),
    verbose=True,
)

pl.DataFrame(novelty_dict)

Writing JSON: '../downloads/demo/evaluation_artifacts/baselines/novelty.json'


name,editorial_picks_novelty,popular_novelty,random_novelty,newest_novelty,random_2_novelty
str,f64,f64,f64,f64,f64
"""mean""",3.52556,1.968051,7.682532,3.997557,7.751626
"""std""",0.0,0.0,2.051901,0.0,2.117146


### Serendipity
When computing Serendipity it using the user's history; similarity between recommendations and browsed items

In [54]:
serendipity_dict = {"name": ["mean", "std"]}
for candidates, list_name in candidates_name_pairs:
    if len(candidates) == 1:
        candidates = np.tile(candidates, len(user_history)).reshape(-1, TOP_N)
    #
    scores = serendipity(
        candidates,
        H=user_history,
        lookup_dict=articles_dict,
        lookup_key=CONTRASTIVE_VECTOR,
    )
    serendipity_dict[f"{list_name}_{serendipity.name}"] = [scores.mean(), scores.std()]

write_json_file(
    serendipity_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_SERENDIPITY),
    verbose=True,
)

pl.DataFrame(serendipity_dict)

Writing JSON: '../downloads/demo/evaluation_artifacts/baselines/serendipity.json'


name,editorial_picks_serendipity,popular_serendipity,random_serendipity,newest_serendipity,random_2_serendipity
str,f64,f64,f64,f64,f64
"""mean""",0.783484,0.789688,0.804878,0.832755,0.8049
"""std""",0.031662,0.031837,0.037859,0.020198,0.039168


## Model-level

### Coverage

In [55]:
coverage_dict = {"name": ["count", "fraction"]}
for candidates, list_name in candidates_name_pairs:
    coverage_dict[f"{list_name}_{coverage.name}"] = coverage(candidates, candidate_list)

write_json_file(
    coverage_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_COVERAGE),
    verbose=True,
)

pl.DataFrame(coverage_dict)

Writing JSON: '../downloads/demo/evaluation_artifacts/baselines/coverage.json'


name,editorial_picks_coverage,popular_coverage,random_coverage,newest_coverage,random_2_coverage
str,f64,f64,f64,f64,f64
"""count""",5.0,5.0,250.0,5.0,250.0
"""fraction""",0.02,0.02,1.0,0.02,1.0


### Distribution - Category

#### Distribution helper function

In [56]:
def concat_distribution_dict(dict_: dict) -> dict:
    output_results = (
        pl.concat(
            [pl.DataFrame(val) for val in dict_.values()],
            how="diagonal",
        )
        .with_row_index(name="name")
        .with_columns(pl.Series(dict_.keys()).alias("name"))
    ).to_dict()
    return {key: val.to_list() for key, val in output_results.items()}

In [57]:
COLUMN = DEFAULT_CATEGORY_STR_COL
distribution_dict = {
    f"{list_name}_{novelty.name}": distribution(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=COLUMN,
    )
    for candidates, list_name in candidates_name_pairs
}
# =>
distribution_category_dict = concat_distribution_dict(distribution_dict)

write_json_file(
    distribution_category_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_DISTRIBUTION_CATEGORY),
    verbose=True,
)

pl.DataFrame(distribution_category_dict)

Writing JSON: '../downloads/demo/evaluation_artifacts/baselines/distribution_category.json'


name,forbrug,nyheder,krimi,sport,penge,auto,underholdning,musik,nationen,incoming
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""editorial_pick…",0.2,0.2,0.2,0.2,0.2,,,,,
"""popular_novelt…",,0.2,0.2,0.4,0.2,,,,,
"""random_novelty…",0.011765,0.161115,0.097214,0.166192,0.039876,0.379195,0.083591,0.03356,0.024272,0.00322
"""newest_novelty…",,,0.4,0.2,0.2,,0.2,,,
"""random_2_novel…",0.011146,0.162105,0.088545,0.170402,0.039009,0.385635,0.080248,0.038638,0.020186,0.004087


### Distribution - Sentiment

In [58]:
COLUMN = DEFAULT_SENTIMENT_LABEL_COL
distribution_dict = {
    f"{list_name}_{novelty.name}": distribution(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=COLUMN,
    )
    for candidates, list_name in candidates_name_pairs
}
# =>
distribution_sentiment_dict = concat_distribution_dict(distribution_dict)

write_json_file(
    distribution_sentiment_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_DISTRIBUTION_SENTIMENT_LABEL),
    verbose=True,
)

pl.DataFrame(distribution_sentiment_dict)

Writing JSON: '../downloads/demo/evaluation_artifacts/baselines/distribution_sentiment_label.json'


name,Negative,Positive,Neutral
str,f64,f64,f64
"""editorial_pick…",0.4,0.6,
"""popular_novelt…",0.2,0.4,0.4
"""random_novelty…",0.392074,0.313808,0.294118
"""newest_novelty…",0.6,0.2,0.2
"""random_2_novel…",0.394551,0.316037,0.289412


### Distribution - Topics

In [59]:
COLUMN = DEFAULT_TOPICS_COL
distribution_dict = {
    f"{list_name}_{novelty.name}": distribution(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=COLUMN,
    )
    for candidates, list_name in candidates_name_pairs
}
# =>

distribution_topics_dict = concat_distribution_dict(distribution_dict)

write_json_file(
    distribution_topics_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_DISTRIBUTION_TOPICS),
    verbose=True,
)

pl.DataFrame(distribution_topics_dict)

Writing JSON: '../downloads/demo/evaluation_artifacts/baselines/distribution_topics.json'


name,Livsstil,Sundhed,Sygdom og behandling,Kendt,Bolig,Køb og salg,Kriminalitet,Bandekriminalitet,Sport,Erhverv,Privat virksomhed,Økonomi,Mikro,Politik,International politik,Personfarlig kriminalitet,Offentlig instans,Begivenhed,Fodbold,Sportsbegivenhed,Ketcher- og batsport,Ansættelsesforhold,Underholdning,National politik,Film og tv,Makro,Vejr,Katastrofe,Dyr,Musik og lyd,Underholdningsbegivenhed,Motorsport,Familieliv,Mindre ulykke,Større katastrofe,Transportmiddel,Større transportmiddel,Offentlig transport,Teknologi,Kunstig intelligens og software,Forbrugerelektronik,Bil,Mad og drikke,Samfund,Værdier,Konflikt og krig,Bæredygtighed og klima,Bedrageri,Væbnet konflikt,Reality,Partnerskab,Kosmetisk behandling,Kultur,Håndbold,Videnskab,Naturvidenskab,Uddannelse,Ungdomsuddannelse,Renovering og indretning,Udlejning
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""editorial_pick…",0.133333,0.066667,0.066667,0.133333,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""popular_novelt…",,,,0.157895,,,0.052632,,0.105263,0.105263,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""random_novelty…",0.017945,0.008588,0.007506,0.071396,0.071222,0.066543,0.038159,0.00391,0.045491,0.095625,0.070314,0.105401,0.007087,0.032154,0.014209,0.024648,0.002269,0.029676,0.029501,0.017002,0.007192,0.021471,0.028209,0.017177,0.007716,0.008239,0.002793,0.015536,0.000803,0.01016,0.010509,0.002165,0.002199,0.010648,0.001187,0.01613,0.004818,0.000943,0.004469,0.002374,0.001222,0.007227,0.004224,0.012813,0.002514,0.007785,0.002444,0.00206,0.002933,0.001292,0.002584,0.001117,0.00192,0.001885,0.001292,0.001292,0.004259,0.004259,0.001152,0.002339
"""newest_novelty…",,,,0.066667,,,0.066667,,0.066667,0.066667,0.066667,0.066667,,,,0.066667,,0.066667,,0.066667,0.066667,,0.066667,,,,,0.066667,,,,,,0.066667,,,,,,,,,,,,,,,,,,,,,,,0.066667,0.066667,,
"""random_2_novel…",0.017772,0.008415,0.007297,0.071296,0.069446,0.064977,0.035299,0.003107,0.046751,0.09867,0.073147,0.107503,0.007297,0.031493,0.013652,0.02252,0.002025,0.032017,0.031423,0.018121,0.006669,0.022171,0.029084,0.016375,0.007158,0.008065,0.002793,0.013547,0.001432,0.011731,0.011487,0.002758,0.002479,0.009741,0.000943,0.014839,0.004958,0.001222,0.004155,0.00213,0.001257,0.005901,0.004748,0.011976,0.00192,0.006808,0.002025,0.002235,0.002619,0.001257,0.002095,0.001187,0.002828,0.00185,0.001501,0.001501,0.004434,0.004434,0.001222,0.002235


# Make a Submission file
In this example, we randomly rank the articles.

In [60]:
np.random.seed(123)
df = (
    pl.concat([df_behaviors, df_beyond_accuarcy])
    .select(DEFAULT_IMPRESSION_ID_COL, DEFAULT_INVIEW_ARTICLES_COL)
    .with_columns(
        pl.col(DEFAULT_INVIEW_ARTICLES_COL).map_elements(
            lambda x: list(np.random.permutation(len(x)) + 1)
        )
    )
    .collect()
)
df.head(3)

impression_id,article_ids_inview
u32,list[i64]
6453517,"[7, 11, … 3]"
6455350,"[8, 14, … 6]"
6749981,"[5, 4, … 3]"


### Example: how to convert prediction scores to argsorted output format
A quick detour, to see how, you could convert actual prediction-scores to the format

In [61]:
rank_score = df[:100].with_columns(
    pl.col(DEFAULT_INVIEW_ARTICLES_COL).list.eval(1 / pl.element())
)
rank_score.head(5)

impression_id,article_ids_inview
u32,list[f64]
6453517,"[0.142857, 0.090909, … 0.333333]"
6455350,"[0.125, 0.071429, … 0.166667]"
6749981,"[0.2, 0.25, … 0.333333]"
6749983,"[0.5, 0.125, … 0.142857]"
6749986,"[1.0, 0.125, … 0.142857]"


In [62]:
rank_score.with_columns(
    pl.col(DEFAULT_INVIEW_ARTICLES_COL)
    .map_elements(lambda x: list(rank_predictions_by_score(x)))
    .alias("prediction_scores")
).head(5)

impression_id,article_ids_inview,prediction_scores
u32,list[f64],list[i64]
6453517,"[0.142857, 0.090909, … 0.333333]","[7, 11, … 3]"
6455350,"[0.125, 0.071429, … 0.166667]","[8, 14, … 6]"
6749981,"[0.2, 0.25, … 0.333333]","[5, 4, … 3]"
6749983,"[0.5, 0.125, … 0.142857]","[2, 8, … 7]"
6749986,"[1.0, 0.125, … 0.142857]","[1, 8, … 7]"


### Write submission file:

In [63]:
df.head(10)

impression_ids = df[DEFAULT_IMPRESSION_ID_COL].to_list()
prediction_scores = df[DEFAULT_INVIEW_ARTICLES_COL].to_list()

write_submission_file(
    impression_ids=impression_ids,
    prediction_scores=prediction_scores,
    path=PATH_BEYOND_ACCURACY.joinpath("predictions.txt"),
    filename_zip=f"predictions_{DATASET_SIZE}_random.zip",
)

28667it [00:00, 548987.36it/s]




Zipping ../downloads/demo/evaluation_artifacts/predictions.txt to ../downloads/demo/evaluation_artifacts/predictions_demo_random.zip
