# Example of how to make a beyond-accuracy dataset

## Get started

### Dependencies

In [165]:
from ebrec.utils._python import write_json_file, read_json_file
from pathlib import Path
import polars as pl
import numpy as np
import json

from ebrec.utils._constants import (
    DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL,
    DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_TOTAL_PAGEVIEWS_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_TOTAL_INVIEWS_COL,
    DEFAULT_IS_SUBSCRIBER_COL,
    DEFAULT_ARTICLE_ID_COL,
    DEFAULT_POSTCODE_COL,
    DEFAULT_GENDER_COL,
    DEFAULT_AGE_COL,
)

### Set paths

In [166]:
# Dataset split to process; all inputs are read from its download folder.
dataset_split = "demo"
path = Path("../downloads") / dataset_split
# Folder that receives every beyond-accuracy artifact written by this notebook.
path_beyond = path / "beyond_accuracy"
# Create the folder (and missing parents); does nothing if it already exists.
path_beyond.mkdir(parents=True, exist_ok=True)

### Output files:

In [167]:
# File names of the JSON artifacts written into `path_beyond`,
# listed in the order the notebook produces them.
CANDIDATE_LIST = "candidate_list.json"  # list of candidate article IDs
USERS_DICT = "users_dict.json"  # user meta columns, one list per column
BEHAVIORS_TIMESTAMP_DICT = "behaviors_timestamp_dict.json"  # impression timestamps
CANDIDATE_DICT = "candidate_dict.json"  # article ID -> article row lookup

### Load Dataset

In [168]:
# All three frames are lazy; nothing is read until `.collect()` is called.
df_test_behaviors = pl.scan_parquet(path.joinpath("test", "behaviors.parquet"))
is_beyond_accuracy = pl.col("is_beyond_accuracy")

# Split the test behaviors into the beyond-accuracy rows and the remaining rows.
df_beyond_accuarcy = df_test_behaviors.filter(is_beyond_accuracy)
df_behaviors = df_test_behaviors.filter(~is_beyond_accuracy)

df_articles = pl.scan_parquet(path.joinpath("articles.parquet"))

## Dump Metadata

### Make candidate list for beyond-accuracy:

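We select the candidate list from the test set. The in-view articles of the first beyond-accuracy row are used; the sanity check below verifies that every beyond-accuracy row carries the same list.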

In [169]:
# Take the in-view articles of the first beyond-accuracy row as the candidate list.
first_inview = (
    df_beyond_accuarcy.select(pl.col(DEFAULT_INVIEW_ARTICLES_COL).first())
    .collect()
    .to_series()
)
candidate_list = first_inview[0].to_list()

# Dump the list and report where it was written.
candidate_list_file = path_beyond.joinpath(CANDIDATE_LIST)
write_json_file(candidate_list, candidate_list_file)

print(f"Number of Candidate IDs: {len(candidate_list)} (example: {candidate_list[:5]})")
print(f"Dump: {candidate_list_file}")

Number of Candidate IDs: 250 (example: [9793163, 9793069, 9792076, 9792749, 9791280])
Dump: ../downloads/demo/beyond_accuracy/candidate_list.json


#### Sanity check

In [170]:
load_candidate_list = read_json_file(path_beyond.joinpath(CANDIDATE_LIST))

# Check 1: every beyond-accuracy row must carry exactly the candidate list.
# The frame is collected once and reused for both the match count and the row count.
inview_frame = df_beyond_accuarcy.select(DEFAULT_INVIEW_ARTICLES_COL).collect()
n_matching_rows = (
    (inview_frame == candidate_list)
    .sum()[DEFAULT_INVIEW_ARTICLES_COL]
    .to_list()[0]
)
if n_matching_rows != inview_frame.shape[0]:
    raise ValueError("candidate_list is not identical in the testset")

# Check 2: the dumped file must round-trip to the same list.
# Compare the lists directly: summing element-wise differences can cancel out
# (e.g. a permuted list) and silently broadcasts against a length-1 list.
if load_candidate_list != candidate_list:
    raise ValueError("candidate_list was not dump correctly")

print("santity check - passed")

santity check - passed


### User metadata: Segments

In [171]:
# User meta columns to export for the beyond-accuracy rows.
user_meta_columns = [
    DEFAULT_IS_SUBSCRIBER_COL,
    DEFAULT_POSTCODE_COL,
    DEFAULT_GENDER_COL,
    DEFAULT_AGE_COL,
]
df_users = df_beyond_accuarcy.select(user_meta_columns).collect()

# Column name -> list of values, in row order.
users_dict = df_users.to_dict(as_series=False)
users_dict_file = path_beyond.joinpath(USERS_DICT)
write_json_file(users_dict, users_dict_file)
print(f"Dump: {users_dict_file}")
df_users.head(3)


Dump: ../downloads/demo/beyond_accuracy/users_dict.json


is_subscriber,postcode,gender,age
bool,i8,i8,i8
True,,0.0,30.0
True,,,
True,,0.0,


### Timestamp for Behaviors

Can be used to compute the AUC as a function of time.

In [172]:
# Impression timestamps of the non-beyond-accuracy rows, cast to strings
# before being written to JSON.
timestamp_col = DEFAULT_IMPRESSION_TIMESTAMP_COL
df_behaviors_timestamp = df_behaviors.select(
    pl.col(timestamp_col).cast(pl.Utf8)
).collect()

# Single-key dict: column name -> list of timestamp strings in row order.
behaviors_timestamp_dict = {
    timestamp_col: df_behaviors_timestamp.get_column(timestamp_col).to_list()
}
behaviors_timestamp_file = path_beyond.joinpath(BEHAVIORS_TIMESTAMP_DICT)
write_json_file(behaviors_timestamp_dict, behaviors_timestamp_file)
print(f"Dump: {behaviors_timestamp_file}")
df_behaviors_timestamp.head(3)

Dump: ../downloads/demo/beyond_accuracy/behaviors_timestamp_dict.json


impression_time
str
"""2023-06-05 15:…"
"""2023-06-05 15:…"
"""2023-06-01 10:…"


## Make Candidate lookup dict

### Select Candidate articles

In [173]:
### Make candidate lookup dictionary for beyond-accuracy:
# Keep only the candidate articles, stringify their timestamps and fill
# missing view counts.
timestamp_cols = [
    DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL,
    DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL,
]
view_count_cols = [DEFAULT_TOTAL_INVIEWS_COL, DEFAULT_TOTAL_PAGEVIEWS_COL]

candidate_articles = (
    df_articles.filter(pl.col(DEFAULT_ARTICLE_ID_COL).is_in(candidate_list))
    .with_columns(
        [
            pl.col(timestamp_cols).cast(pl.Utf8),
            # Missing counts become 1 rather than 0; the original note says
            # zeros might cause issues (presumably downstream - TODO confirm).
            pl.col(view_count_cols).fill_null(1),
        ]
    )
    .collect()
)

### Add embeddings representations

In [174]:
# => Embeddings:
def load_join_embeddings(df:pl.DataFrame, emb_path:Path) -> pl.DataFrame:
    """Left-join the embedding columns stored at ``emb_path`` onto ``df``.

    Args:
        df: Frame containing a ``DEFAULT_ARTICLE_ID_COL`` column.
        emb_path: Parquet file with the embeddings, resolved relative to
            ``path.parent`` (``path`` is the notebook-level dataset folder).
            Anything accepted by ``Path.joinpath`` works, including plain strings.

    Returns:
        ``df`` with the columns of the embedding file added. Because this is a
        left join, every row of ``df`` is kept; articles without an embedding
        get nulls.
    """
    # Pass the IDs as a plain list, the same way `is_in` is used for
    # `candidate_list` elsewhere in this notebook, instead of a whole DataFrame.
    article_ids = df.get_column(DEFAULT_ARTICLE_ID_COL).to_list()
    embeddings = (
        pl.scan_parquet(path.parent.joinpath(emb_path))
        # Only read the embeddings of articles that are present in `df`.
        .filter(pl.col(DEFAULT_ARTICLE_ID_COL).is_in(article_ids))
        .collect()
    )
    return df.join(embeddings, on=DEFAULT_ARTICLE_ID_COL, how="left")

# Embedding files to join onto the candidate articles, in this order.
# Paths are resolved relative to `path.parent` inside `load_join_embeddings`.
embedding_files = [
    "embeddings/Ekstra_Bladet_contrastive_vector/contrastive_vector.parquet",
    "embeddings/FacebookAI_xlm_roberta_base/xlm_roberta_base.parquet",
    "embeddings/google_bert_base_multilingual_cased/bert_base_multilingual_cased.parquet",
    "embeddings/Ekstra_Bladet_word2vec/document_vector.parquet",
]
for embedding_file in embedding_files:
    candidate_articles = candidate_articles.pipe(
        load_join_embeddings, emb_path=embedding_file
    )
candidate_articles.head(2)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label,contrastive_vector,FacebookAI/xlm-roberta-base,google-bert/bert-base-multilingual-cased,document_vector
i32,str,str,str,bool,str,str,list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str,list[f32],list[f32],list[f32],list[f32]
9777912,"""Stort galleri:…","""Den tyske topm…","""2023-10-11 05:…",True,"""Heidi Klums la…","""2023-06-01 11:…","[9777875, 9777930, … 9777889]","""article_defaul…","""https://ekstra…","[""Bergisch Gladbach"", ""Flavio Briatore"", … ""Heidi Klums""]","[""LOC"", ""PER"", … ""PER""]","[""Kendt"", ""Livsstil"", ""Underholdning""]",414,[432],"""underholdning""",793993,40407,1742984.0,0.5703,"""Neutral""","[-0.046561, -0.017556, … 0.003914]","[0.095054, 0.096886, … -0.013514]","[-0.097854, 0.062035, … -0.098488]","[0.071191, 0.016312, … 0.020225]"
9780773,"""Afsløring: Hvi…","""Hovsa:""","""2023-06-29 06:…",False,"""Ved du noget o…","""2023-06-01 18:…","[9789766, 9791805]","""article_defaul…","""https://ekstra…","[""Allan Melander"", ""Christian Bartholdy"", … ""Strandgade""]","[""PER"", ""PER"", … ""LOC""]","[""Kriminalitet"", ""Bedrageri"", … ""Økonomi""]",118,[133],"""nyheder""",343369,63807,5806831.0,0.9254,"""Negative""","[-0.006871, 0.040639, … 0.142761]","[0.106638, 0.110743, … -0.012386]","[-0.083905, 0.007199, … 0.000927]","[0.000026, -0.042221, … 0.011401]"


### Convert to lookup dictionary:

In [175]:
# Map each article ID to its full row. IDs are stringified up front because
# JSON serialization converts all dictionary keys to strings anyway.
candidate_dict = {
    str(article[DEFAULT_ARTICLE_ID_COL]): article
    for article in candidate_articles.iter_rows(named=True)
}
# Dump the lookup dictionary and report where it was written.
candidate_dict_file = path_beyond.joinpath(CANDIDATE_DICT)
write_json_file(candidate_dict, candidate_dict_file)
print(f"Dump: {candidate_dict_file}")

Dump: ../downloads/demo/beyond_accuracy/candidate_dict.json


## Make Baselines

Make a couple of *baselines* based on the candidate list:
1. @EditorialPicks
2. @Random
3. @Popular
4. @Newest
<!-- 4. @5 sim/dis similarity -->