# Hello

This notebook is an example of how to make a beyond-accuracy dataset, and how one could make baselines

# Get started

## Dependencies

### Output files

In [257]:
BEHAVIORS_TIMESTAMP_DICT = "behaviors_timestamp_dict.json"
CANDIDATE_LIST = "candidate_list.json"
CANDIDATE_DICT = "candidate_dict.json"
USERS_DICT = "users_dict.json"

In [258]:
from ebrec.utils._python import write_json_file, read_json_file
from pathlib import Path
import polars as pl
import numpy as np

from ebrec.utils._constants import (
    DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL,
    DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_TOTAL_PAGEVIEWS_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_SENTIMENT_SCORE_COL,
    DEFAULT_SENTIMENT_LABEL_COL,
    DEFAULT_TOTAL_INVIEWS_COL,
    DEFAULT_IS_SUBSCRIBER_COL,
    DEFAULT_CATEGORY_STR_COL,
    DEFAULT_IS_SSO_USER_COL,
    DEFAULT_ARTICLE_ID_COL,
    DEFAULT_POSTCODE_COL,
    DEFAULT_TOPICS_COL,
    DEFAULT_GENDER_COL,
    DEFAULT_AGE_COL,
)

from ebrec.evaluation.beyond_accuracy import (
    IntralistDiversity,
    Distribution,
    Serendipity,
    Sentiment,
    Coverage,
    Novelty,
)

from ebrec.utils._articles import create_sort_based_prediction_score

np.random.seed(123)

## Load dataset

In [259]:
dataset_split = "large"

PATH = Path(f"../downloads/{dataset_split}")
PATH_BEYOND_ACCURACY = PATH.joinpath("beyond_accuracy")
PATH_BEYOND_ACCURACY.mkdir(exist_ok=True, parents=True)

df_beyond_accuarcy = pl.scan_parquet(PATH.joinpath("test", "behaviors.parquet")).filter(pl.col("is_beyond_accuracy"))
df_behaviors = pl.scan_parquet(PATH.joinpath("test", "behaviors.parquet")).filter(~pl.col("is_beyond_accuracy"))
df_articles = pl.scan_parquet(PATH.joinpath("articles.parquet"))

# Make / Dump Metadata

## Make candidate list for beyond-accuracy:

We select the candidate list from the testset

In [260]:
candidate_list = (
    df_beyond_accuarcy.select(pl.col(DEFAULT_INVIEW_ARTICLES_COL).first())
    .collect()
    .to_series()
)[0].to_list()
write_json_file(candidate_list, PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST))

print(f"Number of Candidate IDs: {len(candidate_list)} (example: {candidate_list[:5]})")
print(f"Dump: {PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST)}")

Number of Candidate IDs: 250 (example: [9793163, 9793069, 9792076, 9792749, 9791280])
Dump: ../downloads/large/beyond_accuracy/candidate_list.json


#### Sanity check

In [261]:
load_candidate_list = read_json_file(PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST))
if (
    not (
        df_beyond_accuarcy.select(DEFAULT_INVIEW_ARTICLES_COL).collect()
        == candidate_list
    )
    .sum()[DEFAULT_INVIEW_ARTICLES_COL]
    .to_list()[0]
    == df_beyond_accuarcy.select(DEFAULT_INVIEW_ARTICLES_COL).collect().shape[0]
):
    raise ValueError("candidate_list is not identical in the testset")

if not (np.array(candidate_list) - np.array(load_candidate_list)).sum() == 0:
    raise ValueError("candidate_list was not dump correctly")

print("santity check - passed")

santity check - passed


## User meta data: Segments

In [262]:
user_meta_columns = [
    DEFAULT_IS_SUBSCRIBER_COL,
    DEFAULT_IS_SSO_USER_COL,
    DEFAULT_POSTCODE_COL,
    DEFAULT_GENDER_COL,
    DEFAULT_AGE_COL,
]
df_users = df_beyond_accuarcy.select(pl.col(user_meta_columns)).collect()

users_dict = {col : df_users[col].to_list() for col in df_users.columns}
write_json_file(users_dict, PATH_BEYOND_ACCURACY.joinpath(USERS_DICT))
print(f"Dump: {PATH_BEYOND_ACCURACY.joinpath(USERS_DICT)}")
df_users.head(3)


Dump: ../downloads/large/beyond_accuracy/users_dict.json


is_subscriber,is_sso_user,postcode,gender,age
bool,bool,i8,i8,i8
True,True,,0.0,
True,True,,,
True,True,,0.0,50.0


## Timestamp for Behaviors

Can be used for making the AUC as function of time

In [263]:
df_behaviors_timestamp = (
    df_behaviors.select(
        pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).cast(pl.Utf8),
    )
    .collect()
)
behaviors_timestamp_dict = {
    DEFAULT_IMPRESSION_TIMESTAMP_COL: df_behaviors_timestamp[
        DEFAULT_IMPRESSION_TIMESTAMP_COL
    ].to_list()
}
write_json_file(
    behaviors_timestamp_dict, PATH_BEYOND_ACCURACY.joinpath(BEHAVIORS_TIMESTAMP_DICT)
)
print(f"Dump: {PATH_BEYOND_ACCURACY.joinpath(BEHAVIORS_TIMESTAMP_DICT)}")
df_behaviors_timestamp.head(3)

Dump: ../downloads/large/beyond_accuracy/behaviors_timestamp_dict.json


impression_time
str
"""2023-06-05 15:…"
"""2023-06-05 15:…"
"""2023-06-05 15:…"


# Make Candidate lookup dict / Dump lookup dict

## Select Candidate articles

In [264]:
### Make candidate lookup dictionary for beyond-accuracy:
# =>
df_candidate_articles = (
    df_articles.filter(pl.col(DEFAULT_ARTICLE_ID_COL).is_in(candidate_list))
    .with_columns(
        pl.col(
            DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL,
            DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL,
        ).cast(pl.Utf8)
    )
    # Zeros might cause issues
    .with_columns(
        pl.col(DEFAULT_TOTAL_INVIEWS_COL, DEFAULT_TOTAL_PAGEVIEWS_COL).fill_null(1)
    )
    .collect()
)

## Add embeddings representations

In [265]:
# => Embeddings:
BERT_VECTOR = "bert_base_multilingual_cased"
CONTRASTIVE_VECTOR = "contrastive_vector"
DOCUMENT_VECTOR = "document_vector"
ROBERTA_VECTOR = "xlm_roberta_base"

def load_join_embeddings(df:pl.DataFrame, emb_path:Path) -> pl.DataFrame:
    emb_contrastive = (
        pl.scan_parquet(
            PATH.parent.joinpath(emb_path)
        )
        .filter(
            pl.col(DEFAULT_ARTICLE_ID_COL).is_in(df.select(DEFAULT_ARTICLE_ID_COL))
        ).collect()
    )
    return df.join(emb_contrastive, on=DEFAULT_ARTICLE_ID_COL, how = "left")

df_candidate_articles = (
    df_candidate_articles.pipe(
        load_join_embeddings,
        emb_path=f"embeddings/Ekstra_Bladet_contrastive_vector/{CONTRASTIVE_VECTOR}.parquet",
    )
    .pipe(
        load_join_embeddings,
        emb_path=f"embeddings/FacebookAI_xlm_roberta_base/{ROBERTA_VECTOR}.parquet",
    )
    .pipe(
        load_join_embeddings,
        emb_path=f"embeddings/google_bert_base_multilingual_cased/{BERT_VECTOR}.parquet"
    )
    .pipe(
        load_join_embeddings,
        emb_path=f"embeddings/Ekstra_Bladet_word2vec/{DOCUMENT_VECTOR}.parquet"
    )
)
df_candidate_articles.head(2)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label,contrastive_vector,FacebookAI/xlm-roberta-base,google-bert/bert-base-multilingual-cased,document_vector
i32,str,str,str,bool,str,str,list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str,list[f32],list[f32],list[f32],list[f32]
9777912,"""Stort galleri:…","""Den tyske topm…","""2023-10-11 05:…",True,"""Heidi Klums la…","""2023-06-01 11:…","[9777875, 9777930, … 9777889]","""article_defaul…","""https://ekstra…","[""Bergisch Gladbach"", ""Flavio Briatore"", … ""Heidi Klums""]","[""LOC"", ""PER"", … ""PER""]","[""Kendt"", ""Livsstil"", ""Underholdning""]",414,[432],"""underholdning""",793993,40407,1742984.0,0.5703,"""Neutral""","[-0.046561, -0.017556, … 0.003914]","[0.095054, 0.096886, … -0.013514]","[-0.097854, 0.062035, … -0.098488]","[0.071191, 0.016312, … 0.020225]"
9780773,"""Afsløring: Hvi…","""Hovsa:""","""2023-06-29 06:…",False,"""Ved du noget o…","""2023-06-01 18:…","[9789766, 9791805]","""article_defaul…","""https://ekstra…","[""Allan Melander"", ""Christian Bartholdy"", … ""Strandgade""]","[""PER"", ""PER"", … ""LOC""]","[""Kriminalitet"", ""Bedrageri"", … ""Økonomi""]",118,[133],"""nyheder""",343369,63807,5806831.0,0.9254,"""Negative""","[-0.006871, 0.040639, … 0.142761]","[0.106638, 0.110743, … -0.012386]","[-0.083905, 0.007199, … 0.000927]","[0.000026, -0.042221, … 0.011401]"


## Convert to lookup dictionary:

In [266]:
candidate_dict = {}
for row in df_candidate_articles.iter_rows(named=True):
    # Note, all keys in dictionaries are converted to strings, when serializing an object to JSON format.
    candidate_dict[str(row[DEFAULT_ARTICLE_ID_COL])] = row
# Write it:
write_json_file(candidate_dict, PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_DICT))
print(f"Dump: {PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_DICT)}")

Dump: ../downloads/large/beyond_accuracy/candidate_dict.json


# Make Baselines

Make a couple *Baselines* based on the candidate-list:
1. @EditorialPicks: We approximate this based on the number **inview** an articles have recived. Ekstra Bladet is front-page driven, meaning, if an article has a lot of inview-impression (seen) a lot, we believe it has been selected to be in a top priority from the editors. This is static (it does change for our *candidate_list*), i.e., the computation is done once.
2. @Popular: We approximate this based on the number **clicks** an articles have recived. This is static (it does change for our *candidate_list*), i.e., the computation is done once.
3. @Random: Simple baseline and important baseline. We simple pick a set of *top-n* articles from the *candidate-list* and run multiple times.
4. @Dissimilarity / Similarity (will come later): Select top-n articles that are the most similar / dissimilar. 
5. @Newest: Simply pick the newest released articles. We do see newssite where the top banner is *Newest released*. We include it, but note this is very sensitive and might not be meaningful.

### Load the artifacts

In [267]:
behaviors_timestamp_dict = read_json_file(PATH_BEYOND_ACCURACY.joinpath(BEHAVIORS_TIMESTAMP_DICT))
candidate_list = read_json_file(PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST))
candidate_dict = read_json_file(PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_DICT))
users_dict = read_json_file(PATH_BEYOND_ACCURACY.joinpath(USERS_DICT))

# Only the once actually found in the dataset (for demo only 154 of 250 are represent)
candidate_list = [str(id) for id in candidate_list if str(id) in list(candidate_dict)]

print(candidate_dict[list(candidate_dict)[0]].keys())

dict_keys(['article_id', 'title', 'subtitle', 'last_modified_time', 'premium', 'body', 'published_time', 'image_ids', 'article_type', 'url', 'ner_clusters', 'entity_groups', 'topics', 'category', 'subcategory', 'category_str', 'total_inviews', 'total_pageviews', 'total_read_time', 'sentiment_score', 'sentiment_label', 'contrastive_vector', 'FacebookAI/xlm-roberta-base', 'google-bert/bert-base-multilingual-cased', 'document_vector'])


## Make Ranked Candidate lists

### Editorical Pick


In [268]:
df_candidates_editorial_picks = create_sort_based_prediction_score(df_candidate_articles, column=DEFAULT_TOTAL_INVIEWS_COL, desc=True)
candidates_editorial_picks = np.array([df_candidates_editorial_picks.select(DEFAULT_ARTICLE_ID_COL).cast(pl.Utf8).to_series()])
# =>
print(df_candidates_editorial_picks.head(2))
print(candidates_editorial_picks[:, :2])

shape: (2, 3)
┌────────────┬───────────────┬──────────────────┐
│ article_id ┆ total_inviews ┆ prediction_score │
│ ---        ┆ ---           ┆ ---              │
│ i32        ┆ i32           ┆ f64              │
╞════════════╪═══════════════╪══════════════════╡
│ 9790335    ┆ 1698890       ┆ 1.0              │
│ 9791587    ┆ 1369829       ┆ 0.5              │
└────────────┴───────────────┴──────────────────┘
[['9790335' '9791587']]


### Popular

In [269]:
df_candidates_popular = create_sort_based_prediction_score(df_candidate_articles, column=DEFAULT_TOTAL_PAGEVIEWS_COL, desc=True)
candidates_popular = np.array([df_candidates_popular.select(DEFAULT_ARTICLE_ID_COL).cast(pl.Utf8).to_series()])
# => 
print(df_candidates_popular.head(2))
print(candidates_popular[:, :2])

shape: (2, 3)
┌────────────┬─────────────────┬──────────────────┐
│ article_id ┆ total_pageviews ┆ prediction_score │
│ ---        ┆ ---             ┆ ---              │
│ i32        ┆ i32             ┆ f64              │
╞════════════╪═════════════════╪══════════════════╡
│ 9791428    ┆ 256541          ┆ 1.0              │
│ 9792719    ┆ 209050          ┆ 0.5              │
└────────────┴─────────────────┴──────────────────┘
[['9791428' '9792719']]


### Newest

In [270]:
df_candidates_newest = create_sort_based_prediction_score(df_candidate_articles, column=DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL, desc=False)
candidates_newest = np.array([df_candidates_newest.select(DEFAULT_ARTICLE_ID_COL).cast(pl.Utf8).to_series()])
# => 
print(df_candidates_newest.head(2))
print(candidates_newest[:, :2])

shape: (2, 3)
┌────────────┬────────────────────────────┬──────────────────┐
│ article_id ┆ published_time             ┆ prediction_score │
│ ---        ┆ ---                        ┆ ---              │
│ i32        ┆ str                        ┆ f64              │
╞════════════╪════════════════════════════╪══════════════════╡
│ 9790515    ┆ 2023-06-01 07:02:14.000000 ┆ 1.0              │
│ 9791205    ┆ 2023-06-01 07:06:57.000000 ┆ 0.5              │
└────────────┴────────────────────────────┴──────────────────┘
[['9790515' '9791205']]


In [271]:


candidates_random = np.array([df_candidates_newest.select(DEFAULT_ARTICLE_ID_COL).cast(pl.Utf8).to_series()])
# => 
print(df_candidates_newest.head(2))
print(candidates_newest[:, :2])

shape: (2, 3)
┌────────────┬────────────────────────────┬──────────────────┐
│ article_id ┆ published_time             ┆ prediction_score │
│ ---        ┆ ---                        ┆ ---              │
│ i32        ┆ str                        ┆ f64              │
╞════════════╪════════════════════════════╪══════════════════╡
│ 9790515    ┆ 2023-06-01 07:02:14.000000 ┆ 1.0              │
│ 9791205    ┆ 2023-06-01 07:06:57.000000 ┆ 0.5              │
└────────────┴────────────────────────────┴──────────────────┘
[['9790515' '9791205']]


## Init Metrics

In [272]:
instralist_diversity = IntralistDiversity()
distribution = Distribution()
serendipity = Serendipity()
sentiment = Sentiment()
coverage = Coverage()
novelty = Novelty()

## Select @n Candidates

In [273]:
RANDOM_ITER = 10_000
TOP_N = 5

n_candidates_random = [np.random.choice(list(candidate_dict), size=TOP_N, replace=False) for _ in range(RANDOM_ITER)]
n_candidates_editorial_picks = candidates_editorial_picks[:, :TOP_N]
n_candidates_popular = candidates_popular[:, :TOP_N]
n_candidates_newest = candidates_newest[:, :TOP_N]

## User-level

### Instralist-Diversity

In [290]:
instralist_diversity_dict = {
    f"{instralist_diversity.name}_editorial_picks" : instralist_diversity(n_candidates_editorial_picks, lookup_dict=candidate_dict, lookup_key=CONTRASTIVE_VECTOR)[0],
    f"{instralist_diversity.name}_popular" : instralist_diversity(n_candidates_popular, lookup_dict=candidate_dict, lookup_key=CONTRASTIVE_VECTOR)[0],
    f"{instralist_diversity.name}_random" : instralist_diversity(n_candidates_random, lookup_dict=candidate_dict, lookup_key=CONTRASTIVE_VECTOR)[0],
    f"{instralist_diversity.name}_diversity_newest" : instralist_diversity(candidates_newest, lookup_dict=candidate_dict, lookup_key=CONTRASTIVE_VECTOR)[0],
}
pl.DataFrame(instralist_diversity_dict)

intralist_diversity_editorial_picks,intralist_diversity_popular,intralist_diversity_random,intralist_diversity_diversity_newest
f64,f64,f64,f64
0.790542,0.840236,0.730538,0.754899


#### The embedding representation
This might be obvious, but the embedding representation used for computing a metric is very influential. Hence, baselines are important to determine high and low scores. Also, this is why these metrics can be very hard to interpret for us.

In [275]:
ROBERTA_EMB = "FacebookAI/xlm-roberta-base"
BERT_EMB = "google-bert/bert-base-multilingual-cased"

print(f"{CONTRASTIVE_VECTOR}: {instralist_diversity(n_candidates_editorial_picks, lookup_dict=candidate_dict, lookup_key=CONTRASTIVE_VECTOR)[0]}")
print(f"{DOCUMENT_VECTOR}: {instralist_diversity(n_candidates_editorial_picks, lookup_dict=candidate_dict, lookup_key=DOCUMENT_VECTOR)[0]}")
print(f"{ROBERTA_EMB}: {instralist_diversity(n_candidates_editorial_picks, lookup_dict=candidate_dict, lookup_key=ROBERTA_EMB)[0]}")
print(f"{BERT_EMB}: {instralist_diversity(n_candidates_editorial_picks, lookup_dict=candidate_dict, lookup_key=BERT_EMB)[0]}")

contrastive_vector: 0.7905415595169083
document_vector: 0.1584846028291677
FacebookAI/xlm-roberta-base: 0.0007398918387222619
google-bert/bert-base-multilingual-cased: 0.028635621408856893


### Sentiment

In [289]:
sentiment_dict = {
    f"{sentiment.name}_editorial_picks" : sentiment(n_candidates_editorial_picks, lookup_dict=candidate_dict, lookup_key=DEFAULT_SENTIMENT_SCORE_COL)[0],
    f"{sentiment.name}_popular" : sentiment(n_candidates_popular, lookup_dict=candidate_dict, lookup_key=DEFAULT_SENTIMENT_SCORE_COL)[0],
    f"{sentiment.name}_random" : sentiment(n_candidates_random, lookup_dict=candidate_dict, lookup_key=DEFAULT_SENTIMENT_SCORE_COL)[0],
    f"{sentiment.name}_diversity_newest" : sentiment(candidates_newest, lookup_dict=candidate_dict, lookup_key=DEFAULT_SENTIMENT_SCORE_COL)[0],
}
pl.DataFrame(sentiment_dict)

sentiment_editorial_picks,sentiment_popular,sentiment_random,sentiment_diversity_newest
f64,f64,f64,f64
0.7294,0.78342,0.82532,0.821488


### Serendipity [MISSING]

In [277]:
### ADD USER HISTORY

### Novelty [Novelty SCORE]

In [291]:
novelty_dict = {
    f"{novelty.name}_editorial_picks" : novelty(n_candidates_editorial_picks, lookup_dict=candidate_dict, lookup_key=DEFAULT_TOTAL_PAGEVIEWS_COL)[0],
    f"{novelty.name}_popular" : novelty(n_candidates_popular, lookup_dict=candidate_dict, lookup_key=DEFAULT_TOTAL_PAGEVIEWS_COL)[0],
    f"{novelty.name}_random" : novelty(n_candidates_random, lookup_dict=candidate_dict, lookup_key=DEFAULT_TOTAL_PAGEVIEWS_COL)[0],
    f"{novelty.name}_diversity_newest" : novelty(candidates_newest, lookup_dict=candidate_dict, lookup_key=DEFAULT_TOTAL_PAGEVIEWS_COL)[0],
}
pl.DataFrame(novelty_dict)

novelty_editorial_picks,novelty_popular,novelty_random,novelty_diversity_newest
f64,f64,f64,f64
-16.01345,-17.572331,-9.651651,-9.543215


## Model-level

### Coverage

In [305]:
coverage_dict = {
    f"{coverage.name}_editorial_picks" : coverage(n_candidates_editorial_picks, candidate_list),
    f"{coverage.name}_popular" : coverage(n_candidates_popular, candidate_list),
    f"{coverage.name}_random" : coverage(n_candidates_random, candidate_list),
    f"{coverage.name}_newest" : coverage(n_candidates_newest, candidate_list),
}
pl.DataFrame(coverage_dict)

coverage_editorial_picks,coverage_popular,coverage_random,coverage_newest
f64,f64,f64,f64
5.0,5.0,250.0,5.0
0.02,0.02,1.0,0.02


### Distribution Helper Function

In [306]:
def compute_transform_distribution(
    R,
    lookup_dict: dict,
    lookup_key: str,
    suffix: str,
):
    # =>
    distribution = Distribution()
    return {
        **{"name": f"{distribution.name}{suffix}"},
        **distribution(
            R,
            lookup_dict=lookup_dict,
            lookup_key=lookup_key,
        ),
    }

### Distribution - Category

In [307]:
df_distribution_category = pl.concat([
    pl.DataFrame(compute_transform_distribution(n_candidates_editorial_picks, lookup_dict=candidate_dict, lookup_key=DEFAULT_CATEGORY_STR_COL, suffix="_editorial_picks")),
    pl.DataFrame(compute_transform_distribution(n_candidates_popular, lookup_dict=candidate_dict, lookup_key=DEFAULT_CATEGORY_STR_COL, suffix="_popular")),
    pl.DataFrame(compute_transform_distribution(n_candidates_random, lookup_dict=candidate_dict, lookup_key=DEFAULT_CATEGORY_STR_COL, suffix="_random")),
    pl.DataFrame(compute_transform_distribution(n_candidates_newest, lookup_dict=candidate_dict, lookup_key=DEFAULT_CATEGORY_STR_COL, suffix="_newest")),
], how="diagonal")
# =>
df_distribution_category

name,forbrug,nyheder,krimi,sport,penge,auto,underholdning,musik,nationen,incoming
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""distribution_e…",0.2,0.2,0.2,0.2,0.2,,,,,
"""distribution_p…",,0.2,0.2,0.4,0.2,,,,,
"""distribution_r…",0.01148,0.16282,0.09896,0.16754,0.03454,0.38024,0.08304,0.03522,0.02226,0.0039
"""distribution_n…",,,0.4,0.2,0.2,,0.2,,,


### Distribution - Sentiment

In [308]:
df_distribution_sentiment = pl.concat([
    pl.DataFrame(compute_transform_distribution(n_candidates_editorial_picks, lookup_dict=candidate_dict, lookup_key=DEFAULT_SENTIMENT_LABEL_COL, suffix="_editorial_picks")),
    pl.DataFrame(compute_transform_distribution(n_candidates_popular, lookup_dict=candidate_dict, lookup_key=DEFAULT_SENTIMENT_LABEL_COL, suffix="_popular")),
    pl.DataFrame(compute_transform_distribution(n_candidates_random, lookup_dict=candidate_dict, lookup_key=DEFAULT_SENTIMENT_LABEL_COL, suffix="_random")),
    pl.DataFrame(compute_transform_distribution(n_candidates_newest, lookup_dict=candidate_dict, lookup_key=DEFAULT_SENTIMENT_LABEL_COL, suffix="_newest")),
], how="diagonal")
# =>
df_distribution_sentiment

name,Negative,Positive,Neutral
str,f64,f64,f64
"""distribution_e…",0.4,0.6,
"""distribution_p…",0.2,0.4,0.4
"""distribution_r…",0.39534,0.3143,0.29036
"""distribution_n…",0.6,0.2,0.2


### Distribution - Topics

In [309]:
df_distribution_topics = pl.concat([
    pl.DataFrame(compute_transform_distribution(n_candidates_editorial_picks, lookup_dict=candidate_dict, lookup_key=DEFAULT_SENTIMENT_LABEL_COL, suffix="_editorial_picks")),
    pl.DataFrame(compute_transform_distribution(n_candidates_popular, lookup_dict=candidate_dict, lookup_key=DEFAULT_SENTIMENT_LABEL_COL, suffix="_popular")),
    pl.DataFrame(compute_transform_distribution(n_candidates_random, lookup_dict=candidate_dict, lookup_key=DEFAULT_SENTIMENT_LABEL_COL, suffix="_random")),
    pl.DataFrame(compute_transform_distribution(n_candidates_newest, lookup_dict=candidate_dict, lookup_key=DEFAULT_SENTIMENT_LABEL_COL, suffix="_newest")),
], how="diagonal")
# =>
df_distribution_topics


name,Negative,Positive,Neutral
str,f64,f64,f64
"""distribution_e…",0.4,0.6,
"""distribution_p…",0.2,0.4,0.4
"""distribution_r…",0.39534,0.3143,0.29036
"""distribution_n…",0.6,0.2,0.2
