# Hello

This notebook is an example of how to make a beyond-accuracy dataset, and how one could make baselines

# Get started

## Dependencies

In [24]:
from pathlib import Path
import polars as pl
import numpy as np

from ebrec.utils._constants import *
from ebrec.evaluation.beyond_accuracy import (
    IntralistDiversity,
    Distribution,
    Serendipity,
    Sentiment,
    Coverage,
    Novelty,
)
from ebrec.utils._articles import create_sort_based_prediction_score
from ebrec.utils._behaviors import truncate_history
from ebrec.utils._polars import slice_join_dataframes
from ebrec.utils._python import (
    rank_predictions_by_score,
    write_submission_file,
    write_json_file,
    read_json_file,
)

## Set paths

In [31]:
DATASPLIT = "ebnerd_large"
# ROOT PATH:
PATH = Path("~/ebnerd_data").expanduser()
#
## We are using the LARGE articles dataset to ensure we have all articles IDs,
# for beyond-accuracy; as only 154 aids are found in the demo.
ARTICLES_PATH = PATH.joinpath("articles.parquet")


# PATH TO DUMP ARTIFACTS:
PATH_BEYOND_ACCURACY = Path("ebnerd_predictions")
PATH_BEYOND_ACCURACY.mkdir(exist_ok=True, parents=True)
# BASELINE ARTIFACTS:
PATH_BEYOND_ACCURACY_BASELINES = PATH_BEYOND_ACCURACY.joinpath("baselines")
PATH_BEYOND_ACCURACY_BASELINES.mkdir(parents=True, exist_ok=True)

### Output files

In [32]:
BEYOND_ACCURACY_HISTORY_DICT = "beyond_accuracy_history_dict.json"
BEYOND_ACCURACY_USERS_DICT = "beyond_accuracy_users_dict.json"
CANDIDATE_LIST = "candidate_list.json"
ARTICLES_DICT = "articles_dict.json"
BEHAVIORS_TIMESTAMP_DICT = "behaviors_timestamp_dict.json"
#
BASELINE_DIVERSITY = "intralist_diversity.json"
BASELINE_SENTIMENT_SCORE = "sentiment_score.json"
BASELINE_NOVELTY = "novelty.json"
BASELINE_SERENDIPITY = "serendipity.json"
BASELINE_COVERAGE = "coverage.json"
BASELINE_DISTRIBUTION_CATEGORY = "distribution_category.json"
BASELINE_DISTRIBUTION_SENTIMENT_LABEL = "distribution_sentiment_label.json"
BASELINE_DISTRIBUTION_TOPICS = "distribution_topics.json"

## Load dataset

In [33]:
# Load data:
df_beyond_accuarcy = pl.scan_parquet(
    PATH.joinpath("ebnerd_testset", "test", "behaviors.parquet")
).filter(pl.col("is_beyond_accuracy"))
df_behaviors = pl.scan_parquet(
    PATH.joinpath("ebnerd_testset", "test", "behaviors.parquet")
).filter(~pl.col("is_beyond_accuracy"))
df_articles = pl.scan_parquet(ARTICLES_PATH)
df_history = pl.scan_parquet(
    PATH.joinpath("ebnerd_testset", "test", "history.parquet")
).select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL)

# Make / Dump Metadata

## Candidate list:

We select the candidate list from the testset

In [34]:
candidate_list = (
    df_beyond_accuarcy.select(pl.col(DEFAULT_INVIEW_ARTICLES_COL).first())
    .collect()
    .to_series()
)[0].to_list()


write_json_file(
    candidate_list, PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST), verbose=True
)

print(f"Number of Candidate IDs: {len(candidate_list)} (example: {candidate_list[:5]})")

Writing JSON: 'ebnerd_predictions/candidate_list.json'
Number of Candidate IDs: 250 (example: [9793163, 9793069, 9792076, 9792749, 9791280])


#### Sanity check

In [36]:
load_candidate_list = read_json_file(PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST))
if (
    not (
        df_beyond_accuarcy.select(DEFAULT_INVIEW_ARTICLES_COL).collect()
        == candidate_list
    )
    .sum()[DEFAULT_INVIEW_ARTICLES_COL]
    .to_list()[0]
    == df_beyond_accuarcy.select(DEFAULT_INVIEW_ARTICLES_COL).collect().shape[0]
):
    raise ValueError("candidate_list is not identical in the testset")

if not (np.array(candidate_list) - np.array(load_candidate_list)).sum() == 0:
    raise ValueError("candidate_list was not dump correctly")

print("santity check - passed")

santity check - passed


## User meta data: Segments

In [37]:
user_meta_columns = [
    DEFAULT_IS_SUBSCRIBER_COL,
    DEFAULT_IS_SSO_USER_COL,
    DEFAULT_POSTCODE_COL,
    DEFAULT_GENDER_COL,
    DEFAULT_AGE_COL,
]
df_users = df_beyond_accuarcy.select(pl.col(user_meta_columns)).collect()

users_dict = {col: df_users[col].to_list() for col in df_users.columns}
write_json_file(
    users_dict, PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_USERS_DICT), verbose=True
)
print(f"#rows: {df_users.shape[0]}")
df_users.head(3)

Writing JSON: 'ebnerd_predictions/beyond_accuracy_users_dict.json'
#rows: 200000


is_subscriber,is_sso_user,postcode,gender,age
bool,bool,i8,i8,i8
True,True,,0.0,
True,True,,,
True,True,,0.0,50.0


## User Histories

In [38]:
HISTORY_SIZE = 20
df_history_truncate = df_history.pipe(
    truncate_history,
    column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    history_size=HISTORY_SIZE,
    padding_value=None,
    enable_warning=False,
)
# =>
df_user_histoies = (
    df_beyond_accuarcy.select(DEFAULT_USER_COL)
    .join(df_history_truncate, on=DEFAULT_USER_COL, how="left")
    .collect()
)
user_history_dict = {
    DEFAULT_HISTORY_ARTICLE_ID_COL: df_user_histoies[
        DEFAULT_HISTORY_ARTICLE_ID_COL
    ].to_list()
}
write_json_file(
    user_history_dict,
    PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_HISTORY_DICT),
    verbose=True,
)
print(f"#rows: {df_user_histoies.shape[0]}")
df_user_histoies.head(3)

Writing JSON: 'ebnerd_predictions/beyond_accuracy_history_dict.json'
#rows: 200000


user_id,article_id_fixed
u32,list[i32]
1049297,"[9788862, 9788067, … 9787586]"
231624,"[9789910, 9789704, … 9790811]"
716356,"[9785062, 9772508, … 9786313]"


## Timestamp for Behaviors

Used for computing the AUC as function of time

In [39]:
df_behaviors_timestamp = df_behaviors.select(
    pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).cast(pl.Utf8),
).collect()
behaviors_timestamp_dict = {
    DEFAULT_IMPRESSION_TIMESTAMP_COL: df_behaviors_timestamp[
        DEFAULT_IMPRESSION_TIMESTAMP_COL
    ].to_list()
}
write_json_file(
    behaviors_timestamp_dict,
    PATH_BEYOND_ACCURACY.joinpath(BEHAVIORS_TIMESTAMP_DICT),
    verbose=True,
)
print(f"#rows: {df_behaviors_timestamp.shape[0]}")
df_behaviors_timestamp.head(3)

Writing JSON: 'ebnerd_predictions/behaviors_timestamp_dict.json'
#rows: 13336710


impression_time
str
"""2023-06-05 15:…"
"""2023-06-05 15:…"
"""2023-06-05 15:…"


# Make Candidate lookup dict / Dump lookup dict

## Articles to include: *candidate-list* and *history-articles*

In [40]:
history_article_id = (
    df_user_histoies.lazy()
    .select(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).explode().unique())
    .collect()[DEFAULT_HISTORY_ARTICLE_ID_COL]
    .to_list()
)
print(f"#history_article_id: {len(history_article_id)})")

#history_article_id: 14228)


Note, the different datasizes (*demo*, *small*, and *large*) has subset of the total article-catelog. Hence, if you're using *demo*, not all of the articles in the candidate-list may be in the dataset.

In [41]:
aids_in_split = (
    df_articles.select(DEFAULT_ARTICLE_ID_COL)
    .collect()[DEFAULT_ARTICLE_ID_COL]
    .to_list()
)

history_article_id = [id for id in history_article_id if id in aids_in_split]
candidate_list = [id for id in candidate_list if id in aids_in_split]

article_ids = candidate_list + history_article_id
print(
    f"#articles: {len(article_ids)} (#candidate_list: {len(candidate_list)} & #history_article_id: {len(history_article_id)})"
)

#articles: 14478 (#candidate_list: 250 & #history_article_id: 14228)


## Select articles that should be included in the lookup dictionary

In [42]:
# =>
df_lookup_articles = (
    df_articles.filter(pl.col(DEFAULT_ARTICLE_ID_COL).is_in(article_ids))
    .with_columns(
        pl.col(
            DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL,
            DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL,
        ).cast(pl.Utf8)
    )
    # Zeros might cause issues
    .with_columns(
        pl.col(DEFAULT_TOTAL_INVIEWS_COL, DEFAULT_TOTAL_PAGEVIEWS_COL).fill_null(1)
    )
    .collect()
)
print(f"df_lookup_articles shape: {df_lookup_articles.shape}")

df_lookup_articles shape: (14478, 21)


### Make normalize popularity-scores

In [43]:
DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MAX = (
    DEFAULT_TOTAL_PAGEVIEWS_COL + "_normalized_max"
)
DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX = (
    DEFAULT_TOTAL_PAGEVIEWS_COL + "_normalized_min_max"
)

MIN_X = df_lookup_articles[DEFAULT_TOTAL_PAGEVIEWS_COL].min()
MAX_X = df_lookup_articles[DEFAULT_TOTAL_PAGEVIEWS_COL].max()
MIN_RANGE = 1e-4
MAX_RANGE = 1.0

df_lookup_articles = df_lookup_articles.with_columns(
    (  # SIMPLE MAX NORMALIZATION: x / max()
        pl.col(DEFAULT_TOTAL_PAGEVIEWS_COL) / pl.col(DEFAULT_TOTAL_PAGEVIEWS_COL).max()
    ).alias(DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MAX)
).with_columns(
    (  #  MIN-MAX NORMALIZATION: ( x_i − X_min ⁡ ) / ( X_max ⁡ − X_min ⁡ ) * (max_range − min_range) + min_range
        ((pl.col(DEFAULT_TOTAL_PAGEVIEWS_COL) - MIN_X) / (MAX_X - MIN_X))
        * (MAX_RANGE - MIN_RANGE)
        + MIN_RANGE
    ).alias(
        DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX
    )
)

df_lookup_articles.select(
    DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MAX,
    DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX,
).describe()

statistic,total_pageviews_normalized_max,total_pageviews_normalized_min_max
str,f64,f64
"""count""",14478.0,14478.0
"""null_count""",0.0,0.0
"""mean""",0.018543,0.018641
"""std""",0.034784,0.034781
"""min""",6.1059e-07,0.0001
"""25%""",6.1059e-07,0.0001
"""50%""",6.1059e-07,0.0001
"""75%""",0.028582,0.028679
"""max""",1.0,1.0


## Add embeddings representations

In [47]:
# => Embeddings:
BERT_VECTOR = "bert_base_multilingual_cased"
ROBERTA_VECTOR = "xlm_roberta_base"

CONTRASTIVE_VECTOR = "contrastive_vector"
DOCUMENT_VECTOR = "document_vector"


def load_join_embeddings(df: pl.DataFrame, emb_path: Path) -> pl.DataFrame:
    emb_contrastive = (
        pl.scan_parquet(PATH.parent.joinpath(emb_path))
        .filter(pl.col(DEFAULT_ARTICLE_ID_COL).is_in(df.select(DEFAULT_ARTICLE_ID_COL)))
        .collect()
    )
    return df.join(emb_contrastive, on=DEFAULT_ARTICLE_ID_COL, how="left")


df_lookup_articles = df_lookup_articles.pipe(
    load_join_embeddings,
    emb_path=PATH.joinpath(
        "artifacts", "Ekstra_Bladet_contrastive_vector", f"{CONTRASTIVE_VECTOR}.parquet"
    ),
).pipe(
    load_join_embeddings,
    emb_path=PATH.joinpath(
        "artifacts", "Ekstra_Bladet_word2vec", f"{DOCUMENT_VECTOR}.parquet"
    ),
)
print(f"#rows: {df_lookup_articles.shape[0]}")
df_lookup_articles.head(2)

#rows: 14478


article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label,total_pageviews_normalized_max,total_pageviews_normalized_min_max,contrastive_vector,document_vector
i32,str,str,str,bool,str,str,list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str,f64,f64,list[f32],list[f32]
3005351,"""Sainz bekræfte…","""FODBOLD: Carlo…","""2023-06-29 06:…",False,"""Michael Laudru…","""2006-05-15 20:…",[3170015],"""article_defaul…","""https://ekstra…",[],[],"[""Erhverv"", ""Kendt"", … ""Politik""]",142,"[196, 227]","""sport""",1,1,,0.8204,"""Neutral""",6.1059e-07,0.0001,"[-0.081303, 0.012837, … 0.025402]","[0.030043, 0.014314, … 0.069532]"
3006206,"""Ny sex-skandal…","""Stjernen fra '…","""2023-06-29 06:…",False,"""Den engelske s…","""2007-03-06 08:…",[3152747],"""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Livsstil"", … ""Kultur""]",414,[432],"""underholdning""",1,1,,0.9295,"""Negative""",6.1059e-07,0.0001,"[-0.050866, 0.063385, … -0.030806]","[0.114115, 0.00947, … 0.04868]"


## Convert to lookup dictionary:

In [48]:
articles_dict = {}
for row in df_lookup_articles.iter_rows(named=True):
    # Note, all keys in dictionaries are converted to strings, when serializing an object to JSON format.
    articles_dict[str(row[DEFAULT_ARTICLE_ID_COL])] = row
# Write it:
write_json_file(
    articles_dict, PATH_BEYOND_ACCURACY.joinpath(ARTICLES_DICT), verbose=True
)
print(f"#articles: {len(articles_dict)}")

Writing JSON: 'ebnerd_predictions/articles_dict.json'
#articles: 14478


# Create Baselines

Make a couple *Baselines* based on the candidate-list:
1. @EditorialPicks: We approximate this based on the number **inview** an articles have recived. Ekstra Bladet is front-page driven, meaning, if an article has a lot of inview-impression (seen) a lot, we believe it has been selected to be in a top priority from the editors. This is static (it does change for our *candidate_list*), i.e., the computation is done once.
2. @Popular: We approximate this based on the number **clicks** an articles have recived. This is static (it does change for our *candidate_list*), i.e., the computation is done once.
3. @Random: Simple baseline and important baseline. We simple pick a set of *top-n* articles from the *candidate-list* and run multiple times.
4. @Dissimilarity / Similarity (will come later): Select top-n articles that are the most similar / dissimilar. 
5. @Newest: Simply pick the newest released articles. We do see newssite where the top banner is *Newest released*. We include it, but note this is very sensitive and might not be meaningful.

### Load the artifacts

In [49]:
def n_items(d) -> int:
    return len(d[list(d)[0]])


# =>
behaviors_timestamp_dict = read_json_file(
    PATH_BEYOND_ACCURACY.joinpath(BEHAVIORS_TIMESTAMP_DICT)
)
print(f"#behaviors_timestamp_dict: {n_items(behaviors_timestamp_dict)}")

# =>
history_dict = read_json_file(
    PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_HISTORY_DICT)
)
print(
    f"#history_dict: {n_items(history_dict)}\n history_dict.keys(): {history_dict.keys()}"
)

# =>
users_dict = read_json_file(PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_USERS_DICT))
print(f"#users_dict {n_items(users_dict)}\n users_dict.keys(): {users_dict.keys()}")

# =>
user_history_dict = read_json_file(
    PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_HISTORY_DICT)
)
print(
    f"#user_history_dict {n_items(user_history_dict)}\n users_dict.keys(): {user_history_dict.keys()}"
)

# =>
articles_dict = {
    int(key): val
    for key, val in read_json_file(PATH_BEYOND_ACCURACY.joinpath(ARTICLES_DICT)).items()
}
aid_keys = articles_dict[list(articles_dict)[0]].keys()
print(f"#articles_dict: {len(articles_dict)}\n articles_dict[ID].keys(): {aid_keys}")

# => Only the once actually found in the dataset (for demo only 154 of 250 are represent)
candidate_list = [
    id
    for id in read_json_file(PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST))
    if id in list(articles_dict)
]
print(f"#candidate_list: {len(candidate_list)}")

df_candidate_articles = df_lookup_articles.filter(
    pl.col(DEFAULT_ARTICLE_ID_COL).is_in(candidate_list)
)
print(f"#candidate-articles (df): {df_candidate_articles.shape[0]}")

#behaviors_timestamp_dict: 13336710
#history_dict: 200000
 history_dict.keys(): dict_keys(['article_id_fixed'])
#users_dict 200000
 users_dict.keys(): dict_keys(['is_subscriber', 'is_sso_user', 'postcode', 'gender', 'age'])
#user_history_dict 200000
 users_dict.keys(): dict_keys(['article_id_fixed'])
#articles_dict: 14478
 articles_dict[ID].keys(): dict_keys(['article_id', 'title', 'subtitle', 'last_modified_time', 'premium', 'body', 'published_time', 'image_ids', 'article_type', 'url', 'ner_clusters', 'entity_groups', 'topics', 'category', 'subcategory', 'category_str', 'total_inviews', 'total_pageviews', 'total_read_time', 'sentiment_score', 'sentiment_label', 'total_pageviews_normalized_max', 'total_pageviews_normalized_min_max', 'contrastive_vector', 'document_vector'])
#candidate_list: 250
#candidate-articles (df): 250


## Make Ranked Candidate lists

### Editorial (Top Inview Articles)


In [50]:
df_candidates_editorial_picks = create_sort_based_prediction_score(
    df_candidate_articles,
    column=DEFAULT_TOTAL_INVIEWS_COL,
    desc=True,
)
candidates_editorial_picks = np.array(
    [df_candidates_editorial_picks.select(DEFAULT_ARTICLE_ID_COL).to_series()]
)
# =>
print(candidates_editorial_picks[:, :2])
df_candidates_editorial_picks.head(2)

[[9790335 9791587]]


article_id,total_inviews,prediction_score
i32,i32,f64
9790335,1698890,1.0
9791587,1369829,0.5


### Popular

In [51]:
df_candidates_popular = create_sort_based_prediction_score(
    df_candidate_articles,
    column=DEFAULT_TOTAL_PAGEVIEWS_COL,
    desc=True,
)
candidates_popular = np.array(
    [df_candidates_popular.select(DEFAULT_ARTICLE_ID_COL).to_series()]
)
# =>
print(candidates_popular[:, :2])
df_candidates_popular.head(2)

[[9791428 9792719]]


article_id,total_pageviews,prediction_score
i32,i32,f64
9791428,256541,1.0
9792719,209050,0.5


## Init Metrics

In [52]:
intralist_diversity = IntralistDiversity()
distribution = Distribution()
serendipity = Serendipity()
sentiment = Sentiment()
coverage = Coverage()
novelty = Novelty()

## Setting Baselines (and your model)

### Random-baseline
Note, we're just running a random baseline with 200,000 iterations. This means that, based on the distribution, we would expect segments to be equally represented. However, you need to increase this number before you see them truly balance out.

In [56]:
RANDOM_ITER = df_beyond_accuarcy.select(DEFAULT_INVIEW_ARTICLES_COL).collect().shape[0]
TOP_N = 5

np.random.seed(123)
# Make list:
top_n_candidates_random = [
    np.random.choice(candidate_list, size=TOP_N, replace=False)
    for _ in range(RANDOM_ITER)
]
top_n_candidates_editorial_picks = candidates_editorial_picks[:, :TOP_N]
top_n_candidates_popular = candidates_popular[:, :TOP_N]
#
# Set them as tuples, just to loop through it:
candidates_name_pairs = [
    [top_n_candidates_editorial_picks, "top-inview-articles"],
    [top_n_candidates_popular, "popular"],
    [top_n_candidates_random, "random"],
]
# =>
user_history = user_history_dict[DEFAULT_HISTORY_ARTICLE_ID_COL]

print(f"#random-iterations: {RANDOM_ITER}")
print(f"Top@{TOP_N} ranked articles")

#random-iterations: 200000
Top@5 ranked articles


### Your Model
Try to add your model's prediction of the candidate list. In this notebook we just take a random sample.

In [58]:
candidates_your_model = [
    np.random.choice(candidate_list, size=TOP_N, replace=False)
    for _ in range(RANDOM_ITER)
]
## Just uncomment and add the prediction and name:
# candidates_name_pairs.append([candidates_your_model, "random_2"])

## User-level

### Instralist-Diversity

In [59]:
intralist_diversity_dict = {"name": ["mean", "std"]}

for candidates, list_name in candidates_name_pairs:
    scores = intralist_diversity(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=CONTRASTIVE_VECTOR,
    )
    intralist_diversity_dict[f"{list_name}_{intralist_diversity.name}"] = [
        scores.mean(),
        scores.std(),
    ]

write_json_file(
    intralist_diversity_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_DIVERSITY),
    verbose=True,
)

pl.DataFrame(intralist_diversity_dict)

Writing JSON: 'ebnerd_predictions/baselines/intralist_diversity.json'


name,top-inview-articles_intralist_diversity,popular_intralist_diversity,random_intralist_diversity
str,f64,f64,f64
"""mean""",0.790542,0.840236,0.754795
"""std""",0.0,0.0,0.090213


#### The embedding representation
This might be obvious, but the embedding representation used for computing a metric is very influential. Hence, baselines are important to determine high and low scores. Also, this is why these metrics can be very hard to interpret for us.

In [60]:
ROBERTA_EMB = "FacebookAI/xlm-roberta-base"
BERT_EMB = "google-bert/bert-base-multilingual-cased"
print(
    f"{CONTRASTIVE_VECTOR}: {intralist_diversity(top_n_candidates_editorial_picks, lookup_dict=articles_dict, lookup_key=CONTRASTIVE_VECTOR).mean()}"
)
print(
    f"{DOCUMENT_VECTOR}: {intralist_diversity(top_n_candidates_editorial_picks, lookup_dict=articles_dict, lookup_key=DOCUMENT_VECTOR).mean()}"
)

contrastive_vector: 0.7905415595169083
document_vector: 0.1584846028291677


### Sentiment

In [61]:
sentiment_dict = {"name": ["mean", "std"]}

for candidates, list_name in candidates_name_pairs:
    scores = sentiment(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=DEFAULT_SENTIMENT_SCORE_COL,
    )
    sentiment_dict[f"{list_name}_{sentiment.name}"] = [
        scores.mean(),
        scores.std(),
    ]

write_json_file(
    sentiment_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_SENTIMENT_SCORE),
    verbose=True,
)

pl.DataFrame(sentiment_dict)

Writing JSON: 'ebnerd_predictions/baselines/sentiment_score.json'


name,top-inview-articles_sentiment,popular_sentiment,random_sentiment
str,f64,f64,f64
"""mean""",0.7294,0.78342,0.821769
"""std""",0.0,0.0,0.071807


### Novelty

In [62]:
novelty_dict = {"name": ["mean", "std"]}

for candidates, list_name in candidates_name_pairs:
    scores = novelty(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX,
    )
    novelty_dict[f"{list_name}_{novelty.name}"] = [
        scores.mean(),
        scores.std(),
    ]

write_json_file(
    novelty_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_NOVELTY),
    verbose=True,
)

pl.DataFrame(novelty_dict)

Writing JSON: 'ebnerd_predictions/baselines/novelty.json'


name,top-inview-articles_novelty,popular_novelty,random_novelty
str,f64,f64,f64
"""mean""",4.625792,3.069874,8.36168
"""std""",0.0,0.0,1.849578


### Serendipity
When computing Serendipity it using the user's history; similarity between recommendations and browsed items

In [63]:
serendipity_dict = {"name": ["mean", "std"]}
for candidates, list_name in candidates_name_pairs:
    if len(candidates) == 1:
        candidates = np.tile(candidates, len(user_history)).reshape(-1, TOP_N)
    #
    scores = serendipity(
        candidates,
        H=user_history,
        lookup_dict=articles_dict,
        lookup_key=CONTRASTIVE_VECTOR,
    )
    serendipity_dict[f"{list_name}_{serendipity.name}"] = [scores.mean(), scores.std()]

write_json_file(
    serendipity_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_SERENDIPITY),
    verbose=True,
)

pl.DataFrame(serendipity_dict)

Writing JSON: 'ebnerd_predictions/baselines/serendipity.json'


name,top-inview-articles_serendipity,popular_serendipity,random_serendipity
str,f64,f64,f64
"""mean""",0.786146,0.791526,0.806839
"""std""",0.026169,0.026284,0.03369


## Model-level

### Coverage

In [64]:
coverage_dict = {"name": ["count", "fraction"]}
for candidates, list_name in candidates_name_pairs:
    coverage_dict[f"{list_name}_{coverage.name}"] = coverage(candidates, candidate_list)

write_json_file(
    coverage_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_COVERAGE),
    verbose=True,
)

pl.DataFrame(coverage_dict)

Writing JSON: 'ebnerd_predictions/baselines/coverage.json'


name,top-inview-articles_coverage,popular_coverage,random_coverage
str,f64,f64,f64
"""count""",5.0,5.0,250.0
"""fraction""",0.02,0.02,1.0


### Distribution - Category

#### Distribution helper function

In [65]:
def concat_distribution_dict(dict_: dict) -> dict:
    output_results = (
        pl.concat(
            [pl.DataFrame(val) for val in dict_.values()],
            how="diagonal",
        )
        .with_row_index(name="name")
        .with_columns(pl.Series(dict_.keys()).alias("name"))
    ).to_dict()
    return {key: val.to_list() for key, val in output_results.items()}

In [66]:
COLUMN = DEFAULT_CATEGORY_STR_COL
distribution_dict = {
    f"{list_name}_{novelty.name}": distribution(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=COLUMN,
    )
    for candidates, list_name in candidates_name_pairs
}
# =>
distribution_category_dict = concat_distribution_dict(distribution_dict)

write_json_file(
    distribution_category_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_DISTRIBUTION_CATEGORY),
    verbose=True,
)

pl.DataFrame(distribution_category_dict)

Writing JSON: 'ebnerd_predictions/baselines/distribution_category.json'


name,forbrug,nyheder,krimi,sport,penge,auto,underholdning,musik,nationen,incoming
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""top-inview-art…",0.2,0.2,0.2,0.2,0.2,,,,,
"""popular_novelt…",,0.2,0.2,0.4,0.2,,,,,
"""random_novelty…",0.011886,0.160379,0.095914,0.16759,0.036201,0.37992,0.084096,0.035976,0.023992,0.004046


### Distribution - Sentiment

In [67]:
COLUMN = DEFAULT_SENTIMENT_LABEL_COL
distribution_dict = {
    f"{list_name}_{novelty.name}": distribution(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=COLUMN,
    )
    for candidates, list_name in candidates_name_pairs
}
# =>
distribution_sentiment_dict = concat_distribution_dict(distribution_dict)

write_json_file(
    distribution_sentiment_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_DISTRIBUTION_SENTIMENT_LABEL),
    verbose=True,
)

pl.DataFrame(distribution_sentiment_dict)

Writing JSON: 'ebnerd_predictions/baselines/distribution_sentiment_label.json'


name,Negative,Positive,Neutral
str,f64,f64,f64
"""top-inview-art…",0.4,0.6,
"""popular_novelt…",0.2,0.4,0.4
"""random_novelty…",0.396154,0.312102,0.291744


### Distribution - Topics

In [68]:
COLUMN = DEFAULT_TOPICS_COL
distribution_dict = {
    f"{list_name}_{novelty.name}": distribution(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=COLUMN,
    )
    for candidates, list_name in candidates_name_pairs
}
# =>

distribution_topics_dict = concat_distribution_dict(distribution_dict)

write_json_file(
    distribution_topics_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_DISTRIBUTION_TOPICS),
    verbose=True,
)

pl.DataFrame(distribution_topics_dict)

Writing JSON: 'ebnerd_predictions/baselines/distribution_topics.json'


name,Livsstil,Sundhed,Sygdom og behandling,Kendt,Bolig,Køb og salg,Kriminalitet,Bandekriminalitet,Sport,Erhverv,Privat virksomhed,Økonomi,Mikro,Politik,International politik,Personfarlig kriminalitet,Offentlig instans,Begivenhed,Fodbold,Sportsbegivenhed,Ketcher- og batsport,Ansættelsesforhold,Underholdning,National politik,Film og tv,Makro,Vejr,Katastrofe,Dyr,Musik og lyd,Underholdningsbegivenhed,Motorsport,Familieliv,Mindre ulykke,Større katastrofe,Transportmiddel,Større transportmiddel,Offentlig transport,Teknologi,Kunstig intelligens og software,Forbrugerelektronik,Bil,Mad og drikke,Samfund,Værdier,Konflikt og krig,Bæredygtighed og klima,Bedrageri,Væbnet konflikt,Reality,Partnerskab,Kosmetisk behandling,Kultur,Håndbold,Videnskab,Naturvidenskab,Uddannelse,Ungdomsuddannelse,Renovering og indretning,Udlejning
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""top-inview-art…",0.133333,0.066667,0.066667,0.133333,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""popular_novelt…",,,,0.157895,,,0.052632,,0.105263,0.105263,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""random_novelty…",0.016972,0.008972,0.007861,0.07182,0.069587,0.06503,0.038267,0.003397,0.045939,0.095745,0.07094,0.104557,0.00675,0.031369,0.013429,0.024699,0.002267,0.031291,0.030231,0.017845,0.00678,0.021373,0.029226,0.016832,0.007851,0.007883,0.002222,0.015724,0.00112,0.011237,0.011198,0.002232,0.002253,0.011225,0.001136,0.016908,0.005648,0.001156,0.004462,0.002265,0.001123,0.00675,0.004573,0.012429,0.00221,0.006748,0.002306,0.002265,0.002253,0.001114,0.002229,0.001145,0.002285,0.002227,0.001138,0.001138,0.00448,0.00448,0.001153,0.002251


# Make a Submission file
In this example, we randomly rank the articles.

In [69]:
np.random.seed(123)
df = (
    pl.concat([df_behaviors, df_beyond_accuarcy])
    .select(DEFAULT_IMPRESSION_ID_COL, DEFAULT_INVIEW_ARTICLES_COL)
    .with_columns(
        pl.col(DEFAULT_INVIEW_ARTICLES_COL).map_elements(
            lambda x: list(np.random.permutation(len(x)) + 1)
        )
    )
    .collect()
)
df.head(3)

impression_id,article_ids_inview
u32,list[i64]
6451339,"[8, 1, … 3]"
6451363,"[5, 4, … 2]"
6451382,"[1, 5, … 2]"


### Example: how to convert prediction scores to argsorted output format
A quick detour, to see how, you could convert actual prediction-scores to the format

In [70]:
rank_score = df[:100].with_columns(
    pl.col(DEFAULT_INVIEW_ARTICLES_COL).list.eval(1 / pl.element())
)
rank_score.head(5)

impression_id,article_ids_inview
u32,list[f64]
6451339,"[0.125, 1.0, … 0.333333]"
6451363,"[0.2, 0.25, … 0.5]"
6451382,"[1.0, 0.2, … 0.5]"
6451383,"[0.142857, 0.166667, … 0.2]"
6451385,"[0.2, 0.333333, … 1.0]"


In [71]:
rank_score.with_columns(
    pl.col(DEFAULT_INVIEW_ARTICLES_COL)
    .map_elements(lambda x: list(rank_predictions_by_score(x)))
    .alias("prediction_scores")
).head(5)

impression_id,article_ids_inview,prediction_scores
u32,list[f64],list[i64]
6451339,"[0.125, 1.0, … 0.333333]","[8, 1, … 3]"
6451363,"[0.2, 0.25, … 0.5]","[5, 4, … 2]"
6451382,"[1.0, 0.2, … 0.5]","[1, 5, … 2]"
6451383,"[0.142857, 0.166667, … 0.2]","[7, 6, … 5]"
6451385,"[0.2, 0.333333, … 1.0]","[5, 3, … 1]"


### Write submission file:

In [72]:
impression_ids = df[DEFAULT_IMPRESSION_ID_COL].to_list()
prediction_scores = df[DEFAULT_INVIEW_ARTICLES_COL].to_list()

write_submission_file(
    impression_ids=impression_ids,
    prediction_scores=prediction_scores,
    path=PATH_BEYOND_ACCURACY.joinpath("predictions.txt"),
    filename_zip=f"predictions_{DATASPLIT}_random.zip",
)

13536710it [00:19, 712081.65it/s]


Zipping ebnerd_predictions/predictions.txt to ebnerd_predictions/predictions_ebnerd_large_random.zip


## The End 🚀