In [3]:
import polars as pl

from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import tensorflow as tf
import polars as pl

from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_SUBTITLE_COL,
    DEFAULT_LABELS_COL,
    DEFAULT_TITLE_COL,
    DEFAULT_USER_COL,
)

from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history,
)
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
from ebrec.utils._articles import convert_text2encoding_with_transformers
from ebrec.utils._polars import concat_str_columns, slice_join_dataframes
from ebrec.utils._articles import create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._python import write_submission_file, rank_predictions_by_score

from ebrec.models.newsrec.dataloader import NRMSDataLoader
from ebrec.models.newsrec.model_config import hparams_nrms
from ebrec.models.newsrec import NRMSModel

2024-06-16 18:56:03.115102: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
PATH = "/Users/datoapanta/Desktop/1. Coding/MTRec-RecSys/data/articles.parquet"
DEFAULT_NER_COL = "ner_clusters"

df_articles = pl.read_parquet(PATH)

# Explode the 'entity_groups' column to get a flattened list of all entity groups
entity_groups_exploded = df_articles.explode("entity_groups")

# Get unique entity groups
unique_entity_groups = entity_groups_exploded.select("entity_groups").unique()

df_articles.head(60)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3037230,"""Ishockey-spill…","""ISHOCKEY: Isho…",2023-06-29 06:20:57,false,"""Ambitionerne o…",2003-08-28 08:55:00,,"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Kendt"", … ""Mindre ulykke""]",142,"[327, 334]","""sport""",,,,0.9752,"""Negative"""
3044020,"""Prins Harry tv…","""Hoffet tvang P…",2023-06-29 06:21:16,false,"""Den britiske t…",2005-06-29 08:47:00,"[3097307, 3097197, 3104927]","""article_defaul…","""https://ekstra…","[""Harry"", ""James Hewitt""]","[""PER"", ""PER""]","[""Kriminalitet"", ""Kendt"", … ""Personfarlig kriminalitet""]",414,[432],"""underholdning""",,,,0.7084,"""Negative"""
3057622,"""Rådden kørsel …","""Kan ikke straf…",2023-06-29 06:21:24,false,"""Slingrende spr…",2005-10-10 07:20:00,[3047102],"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Transportmiddel"", ""Bil""]",118,[133],"""nyheder""",,,,0.9236,"""Negative"""
3073151,"""Mærsk-arvinger…","""FANGET I FLODB…",2023-06-29 06:21:38,false,"""To oldebørn af…",2005-01-04 06:59:00,"[3067474, 3067478, 3153705]","""article_defaul…","""https://ekstra…",[],[],"[""Erhverv"", ""Privat virksomhed"", … ""Rejse""]",118,[133],"""nyheder""",,,,0.9945,"""Negative"""
3193383,"""Skød svigersøn…","""44-årig kvinde…",2023-06-29 06:22:57,false,"""En 44-årig mor…",2003-09-15 15:30:00,,"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9966,"""Negative"""
3196611,"""Zoo-tårnet 100…","""I mange år var…",2023-06-29 06:23:02,false,"""I mange år var…",2005-06-10 05:40:00,"[3067931, 3035588]","""article_defaul…","""https://ekstra…",[],[],"[""Kultur"", ""Museum og seværdighed""]",539,[],"""ferie""",,,,0.6275,"""Neutral"""
3200325,"""Tævet ihjel på…","""Sadomasochisti…",2023-06-29 06:23:13,false,""". Skolepige vi…",2002-06-25 05:10:00,"[3200179, 3186817]","""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Livsstil"", … ""Samfund""]",140,[],"""krimi""",,,,0.9913,"""Negative"""
3200913,"""Denne kæp kan …","""Nye spor i den…",2023-06-29 06:23:15,false,"""Den usædvanlig…",2003-09-11 08:55:00,,"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9839,"""Negative"""
3209311,"""Morder truer m…","""En morder er b…",2023-06-29 06:23:35,false,"""En morder er i…",2003-03-20 12:50:00,,"""article_defaul…","""https://ekstra…","[""Torben Pedersen""]","[""PER""]","[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9975,"""Negative"""
3209357,"""Pædofil må sta…","""Lærer havde 70…",2023-06-29 06:23:35,false,"""En 56-årig søn…",2005-02-26 04:45:00,[3069815],"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet"", … ""Uddannelse""]",140,[],"""krimi""",,,,0.7929,"""Negative"""


In [7]:
# Display the unique entity groups
print("Unique entity groups:", unique_entity_groups)

Unique entity groups: shape: (7, 1)
┌───────────────┐
│ entity_groups │
│ ---           │
│ str           │
╞═══════════════╡
│ MISC          │
│ ORG           │
│ PER           │
│ PROD          │
│ LOC           │
│ null          │
│ EVENT         │
└───────────────┘


In [12]:
import polars as pl

# Load the dataset
PATH = "/Users/datoapanta/Desktop/1. Coding/MTRec-RecSys/data/articles.parquet"
df_articles = pl.read_parquet(PATH)

# Count the total number of articles
total_articles = df_articles.height

# Count the number of rows where ner_clusters is an empty list
empty_ner_clusters_count = df_articles.filter(pl.col("ner_clusters") == pl.lit([])).height

# Calculate the percentage of articles with empty ner_clusters
percentage_empty_ner_clusters = (empty_ner_clusters_count / total_articles) * 100

# Print the results
print(f"Total number of articles: {total_articles}")
print(f"Number of articles with empty ner_clusters: {empty_ner_clusters_count}")
print(f"Percentage of articles with empty ner_clusters: {percentage_empty_ner_clusters:.2f}%")


Total number of articles: 11777
Number of articles with empty ner_clusters: 1687
Percentage of articles with empty ner_clusters: 14.32%


In [17]:
import polars as pl

# Load the dataset
PATH = "/Users/datoapanta/Desktop/1. Coding/MTRec-RecSys/data/articles.parquet"
df_articles = pl.read_parquet(PATH)

# Print the schema to confirm column names and types
print(df_articles.schema)

# Filter the DataFrame by article_id
article_id = 3044020
filtered_article = df_articles.filter(pl.col("article_id") == article_id)

# Check if the article was found and print the title
if not filtered_article.is_empty():
    title = filtered_article.select("title").to_numpy()[0, 0]
    print(f"Title of article ID {article_id}: {title}")
else:
    print(f"Article with ID {article_id} not found.")


OrderedDict([('article_id', Int32), ('title', String), ('subtitle', String), ('last_modified_time', Datetime(time_unit='us', time_zone=None)), ('premium', Boolean), ('body', String), ('published_time', Datetime(time_unit='us', time_zone=None)), ('image_ids', List(Int64)), ('article_type', String), ('url', String), ('ner_clusters', List(String)), ('entity_groups', List(String)), ('topics', List(String)), ('category', Int16), ('subcategory', List(Int16)), ('category_str', String), ('total_inviews', Int32), ('total_pageviews', Int32), ('total_read_time', Float32), ('sentiment_score', Float32), ('sentiment_label', String)])
Title of article ID 3044020: Prins Harry tvunget til dna-test


In [1]:
import polars as pl

# Load the dataset
PATH = "/Users/datoapanta/Desktop/1. Coding/MTRec-RecSys/data/articles.parquet"
df_articles = pl.read_parquet(PATH)

# Inspect the schema of the DataFrame
print(df_articles.schema)

def bio_tagging(title, entities, entity_groups):
    # Tokenize the title
    tokens = title.split()
    
    # Initialize tags as 'O' (outside)
    tags = ['O'] * len(tokens)
    
    # Iterate through each entity and assign B/I tags
    for entity, group in zip(entities, entity_groups):
        if group == 'null':
            continue
        entity_tokens = entity.split()
        entity_length = len(entity_tokens)
        
        # Find the start position of the entity in the title tokens
        for i in range(len(tokens) - entity_length + 1):
            if tokens[i:i + entity_length] == entity_tokens:
                # Assign B tag to the first token
                tags[i] = f'B-{group}'
                # Assign I tags to the remaining tokens
                for j in range(1, entity_length):
                    tags[i + j] = f'I-{group}'
                break
    
    return tags

# Define a function to apply the bio_tagging function to the DataFrame
def apply_bio_tagging(row):
    return bio_tagging(row['title'], row['ner_clusters'], row['entity_groups'])

# Convert DataFrame to a list of dictionaries, process each row, and create a new DataFrame
rows = df_articles.to_dicts()
for row in rows:
    row['bio_tags'] = apply_bio_tagging(row)

# Convert the list of dictionaries back to a DataFrame with a controlled schema
df_articles = pl.DataFrame(
    rows,
    schema={
        "title": pl.Utf8,
        "ner_clusters": pl.List(pl.Utf8),
        "entity_groups": pl.List(pl.Utf8),
        "bio_tags": pl.List(pl.Utf8)
    }
)

selected = df_articles.select(['title', 'ner_clusters', 'entity_groups', 'bio_tags'])
# Display the result
display(selected.head(10))


OrderedDict([('article_id', Int32), ('title', String), ('subtitle', String), ('last_modified_time', Datetime(time_unit='us', time_zone=None)), ('premium', Boolean), ('body', String), ('published_time', Datetime(time_unit='us', time_zone=None)), ('image_ids', List(Int64)), ('article_type', String), ('url', String), ('ner_clusters', List(String)), ('entity_groups', List(String)), ('topics', List(String)), ('category', Int16), ('subcategory', List(Int16)), ('category_str', String), ('total_inviews', Int32), ('total_pageviews', Int32), ('total_read_time', Float32), ('sentiment_score', Float32), ('sentiment_label', String)])


title,ner_clusters,entity_groups,bio_tags
str,list[str],list[str],list[str]
"""Ishockey-spill…",[],[],"[""O"", ""O"", … ""O""]"
"""Prins Harry tv…","[""Harry"", ""James Hewitt""]","[""PER"", ""PER""]","[""O"", ""B-PER"", … ""O""]"
"""Rådden kørsel …",[],[],"[""O"", ""O"", … ""O""]"
"""Mærsk-arvinger…",[],[],"[""O"", ""O"", ""O""]"
"""Skød svigersøn…",[],[],"[""O"", ""O"", … ""O""]"
"""Zoo-tårnet 100…",[],[],"[""O"", ""O"", ""O""]"
"""Tævet ihjel på…",[],[],"[""O"", ""O"", … ""O""]"
"""Denne kæp kan …",[],[],"[""O"", ""O"", … ""O""]"
"""Morder truer m…","[""Torben Pedersen""]","[""PER""]","[""O"", ""O"", … ""O""]"
"""Pædofil må sta…",[],[],"[""O"", ""O"", … ""O""]"


In [4]:
import polars as pl

# Load the dataset
PATH = "/Users/datoapanta/Desktop/1. Coding/MTRec-RecSys/data/articles.parquet"
df_articles = pl.read_parquet(PATH)

def bio_tagging(title, entities, entity_groups):
    # Tokenize the title
    tokens = title.split()
    
    # Initialize tags as 'O' (outside)
    tags = ['O'] * len(tokens)
    
    # Iterate through each entity and assign B/I tags
    for entity, group in zip(entities, entity_groups):
        if group == 'null':
            continue
        entity_tokens = entity.split()
        entity_length = len(entity_tokens)
        
        # Find the start position of the entity in the title tokens
        for i in range(len(tokens) - entity_length + 1):
            if tokens[i:i + entity_length] == entity_tokens:
                # Assign B tag to the first token
                tags[i] = f'B-{group}'
                # Assign I tags to the remaining tokens
                for j in range(1, entity_length):
                    tags[i + j] = f'I-{group}'
                break
    
    return tags

# Define a function to apply the bio_tagging function to the DataFrame
def apply_bio_tagging(row):
    return bio_tagging(row['title'], row['ner_clusters'], row['entity_groups'])

# Convert DataFrame to a list of dictionaries, process each row, and create a new DataFrame
rows = df_articles.to_dicts()
for row in rows:
    row['bio_tags'] = apply_bio_tagging(row)

# Convert the list of dictionaries back to a DataFrame with a controlled schema
df_articles = pl.DataFrame(
    rows,
    schema={
        "title": pl.Utf8,
        "ner_clusters": pl.List(pl.Utf8),
        "entity_groups": pl.List(pl.Utf8),
        "bio_tags": pl.List(pl.Utf8)
    }
)



In [7]:
PATH = "/Users/datoapanta/Desktop/1. Coding/MTRec-RecSys/data/train/behaviors.parquet"

df_articles = pl.read_parquet(PATH)
df_articles.head(10)

impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
u32,i32,datetime[μs],f32,f32,i8,list[i32],list[i32],u32,bool,i8,i8,i8,bool,u32,f32,f32
48401,,2023-05-21 21:06:50,21.0,,2,"[9774516, 9771051, … 9759966]",[9759966],22779,False,,,,False,21,16.0,27.0
152513,9778745.0,2023-05-24 07:31:26,30.0,100.0,1,"[9778669, 9778736, … 9777397]",[9778661],150224,False,,,,False,298,2.0,48.0
155390,,2023-05-24 07:30:33,45.0,,1,"[9778369, 9777856, … 9778448]",[9777856],160892,False,,,,False,401,215.0,100.0
214679,,2023-05-23 05:25:40,33.0,,2,"[9776715, 9776406, … 9776855]",[9776566],1001055,False,,,,False,1357,40.0,47.0
214681,,2023-05-23 05:31:54,21.0,,2,"[9775202, 9776855, … 9776570]",[9776553],1001055,False,,,,False,1358,5.0,49.0
214684,,2023-05-23 05:32:21,10.0,,2,"[9776508, 9767490, … 9774840]",[9776508],1001055,False,,,,False,1358,52.0,100.0
214691,,2023-05-23 05:30:46,18.0,,2,"[9759955, 9776449, … 9775985]",[9776691],1001055,False,,,,False,1358,4.0,37.0
369958,,2023-05-24 14:25:56,16.0,,2,"[9776023, 9778158, … 7594265]",[9778158],1469458,False,,,,False,1623,0.0,
369959,,2023-05-24 14:23:14,161.0,,2,"[9779186, 9779289, … 9779071]",[9779071],1469458,False,,,,False,1623,16.0,
370414,,2023-05-24 14:48:54,9.0,,2,"[9779408, 9779377, … 9779007]",[9777182],1470585,False,,,,False,1678,13.0,41.0


In [13]:
import polars as pl

# Load the dataset
PATH = "/Users/datoapanta/Desktop/1. Coding/MTRec-RecSys/data/articles.parquet"
df_articles = pl.read_parquet(PATH)

# Define the bio_tagging function
def bio_tagging(title, entities, entity_groups):
    tokens = title.split()
    tags = ['O'] * len(tokens)
    for entity, group in zip(entities, entity_groups):
        if group == 'null':
            continue
        entity_tokens = entity.split()
        entity_length = len(entity_tokens)
        for i in range(len(tokens) - entity_length + 1):
            if tokens[i:i + entity_length] == entity_tokens:
                tags[i] = f'B-{group}'
                for j in range(1, entity_length):
                    tags[i + j] = f'I-{group}'
                break
    return tags

# Apply the bio_tagging function to the DataFrame
rows = df_articles.to_dicts()
for row in rows:
    row['bio_tags'] = apply_bio_tagging(row)

df_articles = pl.DataFrame(
    rows,
    schema={
        "title": pl.Utf8,
        "ner_clusters": pl.List(pl.Utf8),
        "entity_groups": pl.List(pl.Utf8),
        "bio_tags": pl.List(pl.Utf8)
    }
)

# Select the columns of interest
selected = df_articles.select(['title', 'ner_clusters', 'entity_groups', 'bio_tags'])

# Filter rows where bio_tags contains only 'O'
only_os_count = selected.filter(pl.col('bio_tags').apply(lambda tags: all(tag == 'O' for tag in tags))).height

# Total number of rows
total_rows = selected.height

# Calculate the proportion
proportion_only_os = (only_os_count / total_rows) * 100

# Print the results
print(f"Total number of rows: {total_rows}")
print(f"Number of rows with only 'O' tags: {only_os_count}")
print(f"Proportion of rows with only 'O' tags: {proportion_only_os:.2f}%")


Total number of rows: 11777
Number of rows with only 'O' tags: 10558
Proportion of rows with only 'O' tags: 89.65%


  only_os_count = selected.filter(pl.col('bio_tags').apply(lambda tags: all(tag == 'O' for tag in tags))).height
