In [2]:
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import altair as alt
import pickle as pkl

In [3]:
def harmonic_number(n):
    """Return the n-th harmonic number H_n = 1/1 + 1/2 + ... + 1/n.

    Parameters
    ----------
    n : int
        Number of terms to add up.

    Returns
    -------
    float or int
        The sum of the reciprocals of 1..n. When ``n`` is smaller than 1 the
        range is empty and the integer ``0`` is returned.
    """
    reciprocals = [1 / denominator for denominator in range(1, n + 1)]
    return sum(reciprocals)

In [4]:
# Simulation: how quickly does the gap between two sampled divergences shrink
# as more samples are averaged?
N_REPEATS = 500        # independent repetitions of the whole experiment
SEED_STOP = 500        # seeds run over 1 .. SEED_STOP - 1, i.e. 499 samples per repeat
N_ITEMS = 10           # length of every simulated score vector
RANK_BIAS_WEIGHT = 5   # weight of the 1/rank term added to the uniform noise
VARIANCE_SCALE = 5     # NOTE(review): the reason for scaling the variance is not visible here - confirm

sum_one_over_ranks = harmonic_number(N_ITEMS)

# (N_ITEMS, 1) column holding RANK_BIAS_WEIGHT * (1 / rank) / H_N for rank = 1..N_ITEMS.
rank_bias = (RANK_BIAS_WEIGHT / np.arange(1, N_ITEMS + 1) / sum_one_over_ranks).reshape(-1, 1)


def _draw_biased_scores():
    """Draw N_ITEMS uniform [0, 1) values from the global NumPy RNG and add the rank bias."""
    return np.random.rand(N_ITEMS, 1) + rank_bias


def _js_divergence(p, q):
    """Jensen-Shannon style divergence (log base 2) between two positive arrays.

    NOTE(review): ``p`` and ``q`` are used exactly as passed in; they are not
    normalised to sum to 1 here, so this is only a true Jensen-Shannon
    divergence if the caller supplies probability distributions - confirm
    whether normalisation is intended.
    """
    midpoint = (p + q) / 2
    return (0.5 * np.sum(p * np.log2(p / midpoint))
            + 0.5 * np.sum(q * np.log2(q / midpoint)))


sample_counts = np.arange(1, SEED_STOP)  # 1, 2, ..., number of samples per repeat
repeat_differences = []

for repeat in range(N_REPEATS):

    js1s = []
    js2s = []

    for seed in range(1, SEED_STOP):
        # Every (repeat, seed) pair gets its own integer seed. Concatenating the
        # two numbers as strings is ambiguous ("1" + "11" == "11" + "1"), which
        # made different repeats reuse identical random draws.
        np.random.seed(repeat * SEED_STOP + seed)

        # Draw order matters for reproducibility: context, then the two rankings.
        context = _draw_biased_scores()
        ranking1 = _draw_biased_scores()
        ranking2 = _draw_biased_scores()

        js1s.append(_js_divergence(context, ranking1))
        js2s.append(_js_divergence(context, ranking2))

    # Running mean after 1, 2, ..., n samples, computed in one pass.
    js1s_samples = np.cumsum(js1s) / sample_counts
    js2s_samples = np.cumsum(js2s) / sample_counts

    differences = np.abs(js1s_samples - js2s_samples)
    repeat_differences.append(differences)

repeat_differences = np.array(repeat_differences)  # shape: (N_REPEATS, SEED_STOP - 1)

# Mean and (scaled) variance of the absolute difference across repeats,
# for every number of samples.
mean_differences = np.mean(repeat_differences, axis=0)
variance_differences = np.var(repeat_differences, axis=0) * VARIANCE_SCALE

# Plot the mean difference with Altair (the variance is kept in the frame but not drawn).
df = pd.DataFrame({
    'x': sample_counts,
    'mean': mean_differences,
    'variance': variance_differences
})
base = alt.Chart(df).mark_line().encode(
    x=alt.X('x', title='Number of samples', axis=alt.Axis(labelFontSize=14, titleFontSize=16)),
    y=alt.Y('mean', title='Mean of differences', axis=alt.Axis(labelFontSize=14, titleFontSize=16)),
    # df has no 'metric' column to encode, so the single line gets a fixed
    # colour (the first colour of the category10 palette).
    color=alt.value('#1f77b4')
).properties(width=500, height=200,
title=alt.TitleParams(
        text='Converging divergences',
        fontSize=20,
        fontWeight='bold',
        anchor='middle',  # Centres the title over the chart
        offset=10        # Adds space between the title and the chart
    )
)

base


In [7]:
def _read_predictions(path):
    """Load a JSON-lines file (one JSON record per line) into a DataFrame."""
    return pd.read_json(path, lines=True)


# One predictions file per recommender.
naml_predictions = _read_predictions("data/recommendations/naml_prediction.json")
nrms_predictions = _read_predictions("data/recommendations/nrms_prediction.json")
lstur_predictions = _read_predictions("data/recommendations/lstur_prediction.json")
npa_predictions = _read_predictions("data/recommendations/npa_prediction.json")

random_predictions = _read_predictions("data/recommendations/random_prediction.json")

# behaviors.tsv is tab-separated without a header row; NaN cells are turned
# into None so that missing values are falsy in plain truth tests.
behaviors = pd.read_csv(
    'data/MIND/MINDlarge_dev/behaviors.tsv', delimiter='\t', header=None
).replace({np.nan: None})

# CAUTION: unpickling can execute arbitrary code - only load this file if it
# comes from a trusted source.
with open('data/articles.pickle', 'rb') as pickle_file:
    articles = pkl.load(pickle_file)

In [18]:
recommenders = ['naml', 'nrms', 'lstur', 'npa', 'random']

# Per-recommender accumulators, filled by the loop below.
categories = {recommender: [] for recommender in recommenders}
activations = {recommender: [] for recommender in recommenders}

recommender_to_df = {
    'naml': naml_predictions,
    'nrms': nrms_predictions,
    'lstur': lstur_predictions,
    'npa': npa_predictions,
    'random': random_predictions
}

radio_cutoff = 10      # number of leading list positions kept per impression
N_BEHAVIORS = 10000    # only the first N_BEHAVIORS rows of `behaviors` are processed

# Build one impr_index -> pred_rank lookup per recommender up front, instead of
# scanning every predictions frame again for each impression. Keeping the first
# row per impr_index matches taking element [0] of the filtered frame.
pred_rank_by_impression = {
    recommender: (
        predictions.drop_duplicates('impr_index')
        .set_index('impr_index')['pred_rank']
        .to_dict()
    )
    for recommender, predictions in recommender_to_df.items()
}

for _, behavior in behaviors[:N_BEHAVIORS].iterrows():
    # Each row is expected to hold exactly these five columns.
    behavior_id, behavior_user, behavior_datetime, behavior_history, behavior_candidates = behavior

    # Skip impressions without any reading history.
    if not behavior_history:
        continue

    # Raises KeyError when a recommender has no prediction for this impression.
    recommendations_collection = {
        recommender: pred_rank_by_impression[recommender][behavior_id]
        for recommender in recommenders
    }

    # Candidates are space-separated tokens; the part before '-' is the article id.
    candidates = behavior_candidates.split(' ')
    candidates_nids = [candidate.split('-')[0] for candidate in candidates]

    # NOTE(review): candidates missing from `articles` are dropped here, which
    # shifts the positions of all later candidates. The index-based lookups
    # below assume positions still line up with the candidate list - confirm.
    filtered_articles = [articles.loc[article_id] for article_id in candidates_nids if article_id in articles.index]

    for recommender, recommendations in recommendations_collection.items():
        # Each entry i of `recommendations` is used as a 1-based position into
        # filtered_articles; only the first `radio_cutoff` results are kept.
        # NOTE(review): presumably pred_rank lists article positions in
        # recommended order; if it instead stores the rank of each candidate,
        # this selection is not the top of the ranking - verify the file format.
        article_categories = [filtered_articles[i - 1].category for i in recommendations][:radio_cutoff]
        article_activations = [abs(filtered_articles[i - 1].sentiment) for i in recommendations][:radio_cutoff]

        categories[recommender] += article_categories
        activations[recommender] += article_activations


In [19]:
def _category_counts(recommender):
    """Return how often each category occurs in ``categories[recommender]``."""
    return pd.Series(categories[recommender]).value_counts()


# Count occurrences by converting each list to a Series and using value_counts
naml_counts = _category_counts('naml')
nrms_counts = _category_counts('nrms')
lstur_counts = _category_counts('lstur')
npa_counts = _category_counts('npa')

random_counts = _category_counts('random')

# Align the counts on the union of all categories; a category a model never
# produced gets a count of 0. The axis is named explicitly so that the
# reset_index/melt step below does not depend on pandas' default column name.
data = (
    pd.DataFrame({'NAML': naml_counts, 'NRMS': nrms_counts, 'LSTUR': lstur_counts, 'NPA': npa_counts, 'Random': random_counts})
    .fillna(0)
    .sort_index()
    .rename_axis('index')
)

# Long format: one row per (category, recommender) pair.
df = data.reset_index().melt('index', var_name='recommender', value_name='count')

axis_style = alt.Axis(labelFontSize=14, titleFontSize=16)
model_legend = alt.Legend(
    title='Model',
    orient='top-right',
    fillColor='white',
    labelFontSize=14,
    titleFontSize=14,
    strokeColor='lightgrey',  # 'light-grey' is not a valid CSS colour name
    padding=10,
)

# Grouped bar chart: within each category the bars of all models sit side by side.
base = alt.Chart(df).mark_bar().encode(
    x=alt.X('index:N', title='Category', axis=axis_style),
    y=alt.Y('count:Q', title='Count', axis=axis_style),
    color=alt.Color('recommender:N', scale=alt.Scale(scheme='category10'), legend=model_legend),
    xOffset='recommender:N'  # Offset bars by the 'recommender' type to place them side by side
).properties(
    width=500,
    height=200,
    title=alt.TitleParams(
        text='Category counts in top-10 recommendations',
        fontSize=20,
        fontWeight='bold',
        anchor='middle',
        offset=10
    )
)

base

In [21]:
# Discretise activation values into 20 equal-width bins, encoded as ordinal bin indices.
discretizer = KBinsDiscretizer(n_bins=20, encode='ordinal', strategy='uniform')


def _activation_column(recommender):
    """Return ``activations[recommender]`` as an (n, 1) array, the 2-D input shape used below."""
    return np.array(activations[recommender]).reshape(-1, 1)


def _bin_frequencies(bin_indices):
    """Count how many values fell into each bin, ordered by bin index."""
    return pd.Series(bin_indices.flatten()).value_counts().sort_index()


# Despite the "_counts" suffix these arrays hold one bin index per activation value.
# NOTE(review): the bin edges are fitted on the NAML values only and then reused
# for every other model - confirm that NAML covers the full value range.
naml_activation_counts = discretizer.fit_transform(_activation_column('naml'))
nrms_activation_counts = discretizer.transform(_activation_column('nrms'))
lstur_activation_counts = discretizer.transform(_activation_column('lstur'))
npa_activation_counts = discretizer.transform(_activation_column('npa'))

random_activation_counts = discretizer.transform(_activation_column('random'))

# Create DataFrame and compute frequencies; bins a model never hit get a count of 0.
# The axis is named explicitly so that the reset_index/melt step below does not
# depend on pandas' default column name.
data = pd.DataFrame({
    'NAML': _bin_frequencies(naml_activation_counts),
    'NRMS': _bin_frequencies(nrms_activation_counts),
    'LSTUR': _bin_frequencies(lstur_activation_counts),
    'NPA': _bin_frequencies(npa_activation_counts),
    'Random': _bin_frequencies(random_activation_counts)
}).fillna(0).rename_axis('index')

# Long format: one row per (bin, recommender) pair.
df = data.reset_index().melt('index', var_name='recommender', value_name='count')

axis_style = alt.Axis(labelFontSize=14, titleFontSize=16)
model_legend = alt.Legend(
    title='Model',
    orient='top-right',
    fillColor='white',
    labelFontSize=14,
    titleFontSize=14,
    strokeColor='lightgrey',  # 'light-grey' is not a valid CSS colour name
    padding=10,
)

# Grouped bar chart: within each bin the bars of all models sit side by side.
# The x axis shows the ordinal bin index, not the raw activation value.
base = alt.Chart(df).mark_bar().encode(
    x=alt.X('index:N', title='Activation Value', axis=axis_style),
    y=alt.Y('count:Q', title='Count', axis=axis_style),
    color=alt.Color('recommender:N', scale=alt.Scale(scheme='category10'), legend=model_legend),
    xOffset='recommender:N'  # Offset bars by the 'recommender' type to place them side by side
).properties(
    width=500,
    height=200,
    title=alt.TitleParams(
        text='Activation counts in top-10 recommendations',
        fontSize=20,
        fontWeight='bold',
        anchor='middle',
        offset=10
    )
)

base

