In [4]:
import numpy as np
import pandas as pd
import altair as alt
import pickle as pkl

In [5]:
# Load the ranked predictions of every recommender (one JSON object per line).
naml_predictions, nrms_predictions, lstur_predictions, random_predictions = (
    pd.read_json(f"data/recommendations/ebnerd/{model}_prediction.json", lines=True)
    for model in ("naml", "nrms", "lstur", "random")
)

# Header-less, tab-separated validation behaviours; NaN cells become None so
# that later truthiness checks treat missing values as "empty".
behaviors = pd.read_csv(
    'data/ebnerd/val/behaviors_parsed.tsv',
    sep='\t',
    header=None,
).replace({np.nan: None})

# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it comes from a trusted source.
with open('data/ebnerd/articles_ebnerd.pickle', 'rb') as articles_file:
    articles = pkl.load(articles_file)

# Index the article table by article id so rows can be fetched with .loc.
articles.index = articles.article_id

In [6]:
recommenders = ['naml', 'nrms', 'lstur', 'random']

# Per-recommender accumulators, filled across all processed impressions.
categories = {recommender: [] for recommender in recommenders}
subcategories = {recommender: [] for recommender in recommenders}
activations = {recommender: [] for recommender in recommenders}

recommender_to_df = {
    'naml': naml_predictions,
    'nrms': nrms_predictions,
    'lstur': lstur_predictions,
    'random': random_predictions
}

# Number of leading entries of every ranking that are analysed.
radio_cutoff = 10

# Build an impression -> ranking lookup once per recommender instead of
# re-scanning the whole prediction frame with a boolean mask for every
# impression. setdefault keeps the first row seen for an impression index,
# which is the row the previous mask-and-take-first lookup selected.
ranks_by_impression = {}
for recommender, predictions in recommender_to_df.items():
    lookup = {}
    for impr_index, pred_rank in zip(predictions['impr_index'], predictions['pred_rank']):
        lookup.setdefault(impr_index, pred_rank)
    ranks_by_impression[recommender] = lookup

for progress_index, behavior in behaviors[:1000].iterrows():
    behavior_id, behavior_user, behavior_datetime, behavior_history, behavior_impressions, behavior_candidates, labels = behavior
    # Skip impressions without any reading history.
    if not behavior_history:
        continue

    # Prediction rows are matched through impr_index == row label + 1.
    # A missing impression raises KeyError here.
    impression_index = progress_index + 1
    recommendations_collection = {
        recommender: ranks_by_impression[recommender][impression_index]
        for recommender in recommenders
    }

    # Candidates are space-separated tokens whose part before '-' is the article id.
    candidates = behavior_candidates.split(' ')
    candidates_nids = [int(candidate.split('-')[0]) for candidate in candidates]

    # Keep exactly one slot per candidate (None when the article is unknown).
    # Dropping unknown articles here would shift every later position and make
    # the positional lookups below point at the wrong article.
    candidate_articles = [
        articles.loc[article_id] if article_id in articles.index else None
        for article_id in candidates_nids
    ]

    for recommender, recommendations in recommendations_collection.items():
        # Entries of the ranking are used as 1-based positions into the
        # candidate list; only the first `radio_cutoff` entries are analysed.
        # NOTE(review): this presumes pred_rank lists candidate positions in
        # recommendation order -- confirm against the prediction file format.
        top_articles = [
            candidate_articles[position - 1]
            for position in recommendations[:radio_cutoff]
        ]
        # Articles without metadata cannot contribute any attribute.
        top_articles = [article for article in top_articles if article is not None]

        categories[recommender] += [article.category_str for article in top_articles]
        subcategories[recommender] += [
            subcat for article in top_articles for subcat in article.subcategory
        ]
        activations[recommender] += [abs(article.sentiment) for article in top_articles]


In [7]:
# Tally how often every category occurs in each method's collected recommendations.
naml_counts, nrms_counts, lstur_counts, random_counts = (
    pd.Series(categories[name]).value_counts()
    for name in ('naml', 'nrms', 'lstur', 'random')
)

# One column per method; a category a method never produced counts as 0.
counts_df = pd.DataFrame({
    'NAML': naml_counts,
    'NRMS': nrms_counts,
    'LSTUR': lstur_counts,
    'Random': random_counts,
})
counts_df = counts_df.fillna(0).sort_index()

# Normalise every column by its own total so that it sums to 1.
column_totals = counts_df.sum(axis=0)
prop_df = counts_df.div(column_totals, axis=1)

# Long format (category, method, proportion) as expected by Altair.
df = prop_df.reset_index().melt(id_vars='index', var_name='recommender', value_name='proportion')

# Axis and legend styling, pulled out of the encoding for readability.
x_axis = alt.Axis(
    labelFontSize=14,
    titleFontSize=16,
    labelAngle=25,      # slanted labels take less vertical room
    labelAlign='left',
)
# The proportion is plotted as a raw fraction; no percentage format is set.
y_axis = alt.Axis(labelFontSize=14, titleFontSize=16)
method_legend = alt.Legend(
    titleFontSize=16,
    labelFontSize=14,
    symbolSize=150,
    padding=10,
    orient='right',
    direction='vertical',
    # NOTE(review): per the Vega-Lite legend docs, legendX/legendY position the
    # legend only when orient is 'none' -- confirm they have an effect here.
    legendX=20,
    legendY=20,
    fillColor='white',
)

# Grouped bars: one group per category, one bar per method.
chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('index:N', title='Category', axis=x_axis),
        y=alt.Y('proportion:Q', title='Proportion', axis=y_axis),
        color=alt.Color(
            'recommender:N',
            scale=alt.Scale(scheme='category10'),
            title='Method',
            legend=method_legend,
        ),
        xOffset='recommender:N',
    )
    .properties(width=500, height=200)
)

# Optional high-resolution export (scale_factor raises the resolution):
# chart.save('results/ebnerd_categories.png', scale_factor=3.0)
chart


In [8]:
# Tally how often every subcategory occurs in each method's collected recommendations.
naml_counts, nrms_counts, lstur_counts, random_counts = (
    pd.Series(subcategories[name]).value_counts()
    for name in ('naml', 'nrms', 'lstur', 'random')
)

# One column per method; a subcategory a method never produced counts as 0.
counts_df = pd.DataFrame({
    'NAML': naml_counts,
    'NRMS': nrms_counts,
    'LSTUR': lstur_counts,
    'Random': random_counts,
})
counts_df = counts_df.fillna(0).sort_index()

# Rank subcategories by their count summed over all methods and keep the 20 largest.
total_counts = counts_df.sum(axis=1)
top_20_categories = total_counts.nlargest(20).index
counts_df = counts_df.loc[top_20_categories]

# Normalise every column by its own total. This happens after the top-20
# filter, so each column sums to 1 over the displayed subcategories only.
column_totals = counts_df.sum(axis=0)
prop_df = counts_df.div(column_totals, axis=1)

# Long format (subcategory, method, proportion) as expected by Altair.
df = prop_df.reset_index().melt(id_vars='index', var_name='recommender', value_name='proportion')

# Axis and legend styling, pulled out of the encoding for readability.
x_axis = alt.Axis(
    labelFontSize=14,
    titleFontSize=16,
    labelAngle=25,      # slanted labels take less vertical room
    labelAlign='left',
)
# The proportion is plotted as a raw fraction; no percentage format is set.
y_axis = alt.Axis(labelFontSize=14, titleFontSize=16)
method_legend = alt.Legend(
    titleFontSize=16,
    labelFontSize=14,
    symbolSize=150,
    padding=10,
    orient='right',
    direction='vertical',
    # NOTE(review): per the Vega-Lite legend docs, legendX/legendY position the
    # legend only when orient is 'none' -- confirm they have an effect here.
    legendX=20,
    legendY=20,
    fillColor='white',
)

# Grouped bars: one group per subcategory, one bar per method.
chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('index:N', title='Subcategory', axis=x_axis),
        y=alt.Y('proportion:Q', title='Proportion', axis=y_axis),
        color=alt.Color(
            'recommender:N',
            scale=alt.Scale(scheme='category10'),
            title='Method',
            legend=method_legend,
        ),
        xOffset='recommender:N',
    )
    .properties(width=500, height=200)
)

# Optional high-resolution export (scale_factor raises the resolution):
# chart.save('results/ebnerd_subcategories.png', scale_factor=3.0)
chart


In [9]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.preprocessing import KBinsDiscretizer

# Discretise the activation values into equally wide, ordinally encoded bins.
n_bins = 20
strategy = 'uniform'
discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy)

# Column vectors (n_samples, 1), the 2-D layout the discretizer works on.
a_naml = np.array(activations['naml']).reshape(-1, 1)
a_nrms = np.array(activations['nrms']).reshape(-1, 1)
a_lstur = np.array(activations['lstur']).reshape(-1, 1)
a_random = np.array(activations['random']).reshape(-1, 1)

# Fit the shared bin edges on the pooled activations of all methods. Fitting on
# a single method would derive the edges from that method's value range only,
# and values of the other methods outside that range would be clipped into the
# first/last bin.
pooled_activations = np.array(
    activations['naml'] + activations['nrms'] + activations['lstur'] + activations['random']
).reshape(-1, 1)
discretizer.fit(pooled_activations)

naml_bins = discretizer.transform(a_naml)
nrms_bins = discretizer.transform(a_nrms)
lstur_bins = discretizer.transform(a_lstur)
random_bins = discretizer.transform(a_random)

# Count how many values fall into each bin, per method; bins a method never
# hits count as 0.
counts_df = pd.DataFrame({
    'NAML': pd.Series(naml_bins.flatten()).value_counts().sort_index(),
    'NRMS': pd.Series(nrms_bins.flatten()).value_counts().sort_index(),
    'LSTUR': pd.Series(lstur_bins.flatten()).value_counts().sort_index(),
    'Random': pd.Series(random_bins.flatten()).value_counts().sort_index()
}).fillna(0)

# Convert counts to proportions (each column sums to 1)
prop_df = counts_df.div(counts_df.sum(axis=0), axis=1)

# Long format (bin index, method, proportion) as expected by Altair.
df = prop_df.reset_index().melt('index', var_name='recommender', value_name='proportion')

# Descriptive title text; it is not attached to the chart below.
title_text = f'Activation value proportions (binned into {n_bins} uniform bins)'

# Grouped bars: one group per bin index, one bar per method.
chart = alt.Chart(df).mark_bar().encode(
    x=alt.X('index:N', title='Activation', axis=alt.Axis(
        labelFontSize=14,
        titleFontSize=16,
        labelAngle=25,  # Angled labels to reduce vertical space
        labelAlign='left'  # Align labels for better readability
    )),
    # The proportion is plotted as a raw fraction; no percentage format is set.
    y=alt.Y('proportion:Q', title='Proportion', axis=alt.Axis(labelFontSize=14, titleFontSize=16)),
    color=alt.Color(
        'recommender:N',
        scale=alt.Scale(scheme='category10'),
        title='Method',
        legend=alt.Legend(
            titleFontSize=16,
            labelFontSize=14,
            symbolSize=150,
            padding=10,
            orient='right',
            direction='vertical',
            # NOTE(review): per the Vega-Lite legend docs, legendX/legendY
            # position the legend only when orient is 'none' -- confirm they
            # have an effect here.
            legendX=20,
            legendY=20,
            fillColor='white'
        )
    ),
    xOffset='recommender:N'
).properties(
    width=500,
    height=200,
)

# Save with higher resolution (scale factor increases the resolution)
# chart.save('results/ebnerd_activations.png', scale_factor=3.0)
chart


